In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from functools import reduce
from pyspark.sql.functions import sum as _sum
from math import log

In [2]:
spark = SparkSession.builder.appName("SentimentAnalysis").getOrCreate()

In [3]:
spark

# Load the data

In [4]:
movie_reviews_df = spark.read.csv("IMDB Dataset.csv", header=True, inferSchema=True, quote='"', escape='"')

In [5]:
movie_reviews_df.printSchema()

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)



In [6]:
weights = [0.01, 0.99] # 70% for training, 30% for testing
training_data, testing_data = movie_reviews_df.randomSplit(weights)

# Count the labels

In [7]:
def count_labels(data):
    counts = data.groupBy('sentiment').count()
    return counts.rdd.collectAsMap()

In [8]:
#Training
training_labels_counts = count_labels(training_data)
print("Training labels counts:", training_labels_counts)

#Testing
testing_labels_counts = count_labels(testing_data)
print("Testing labels counts:", testing_labels_counts)

Training labels counts: {'positive': 241, 'negative': 234}
Testing labels counts: {'positive': 24759, 'negative': 24766}


# Count the words

In [9]:
def get_words_count_in_a_class(reviews_data_frame, class_label):
    class_reviews = reviews_data_frame.filter(reviews_data_frame.sentiment == class_label)
    words_column = explode(split(class_reviews.review, "\s+")).alias("word")
    words_counts = class_reviews.select(words_column).groupBy("word").count()
    total_count = words_counts.agg(_sum('count')).collect()[0][0]
    words_counts = words_counts.rdd.collectAsMap()
    return {"total-count":total_count, "words-counts":words_counts}

In [10]:
# Training
pos_counts = get_words_count_in_a_class(training_data, "positive")
pos_counts["class-prob"] = training_labels_counts['positive']/(training_labels_counts['positive'] + training_labels_counts['negative'])
neg_counts = get_words_count_in_a_class(training_data, "negative")
neg_counts["class-prob"] = training_labels_counts['negative']/(training_labels_counts['positive'] + training_labels_counts['negative'])

# Testing
test_pos_counts = get_words_count_in_a_class(testing_data, "positive")
test_pos_counts["class-prob"] = testing_labels_counts['positive']/(testing_labels_counts['positive'] + testing_labels_counts['negative'])
test_neg_counts = get_words_count_in_a_class(testing_data, "negative")
test_neg_counts["class-prob"] = testing_labels_counts['negative']/(testing_labels_counts['positive'] + testing_labels_counts['negative'])

# Naive Bayes

In [11]:
def log_prob_calculator(review, class_count):
        words = review.split()
        probs_list = [class_count["words-counts"].get(word, 1)/class_count["total-count"] for word in words]
        return log(class_count['class-prob']) + reduce(lambda a, b: a + log(b), probs_list, 0)

def create_predictor(pos_count_bc, neg_count_bc):
    def predictor(review):
        pos_log_prob = log_prob_calculator(review, pos_count_bc.value)
        neg_log_prob = log_prob_calculator(review, neg_count_bc.value)
        return 'positive' if pos_log_prob > neg_log_prob else 'negative'
    return predictor

In [12]:
def cal_accuracy(data, pos_counts, neg_counts):
    pos_count_bc = spark.sparkContext.broadcast(pos_counts)
    neg_count_bc = spark.sparkContext.broadcast(neg_counts)

    predictor_udf = udf(create_predictor(pos_count_bc, neg_count_bc), StringType())

    predictions = training_data.withColumn('prediction', predictor_udf(data.review))

    accuracy = predictions.filter(predictions.sentiment == predictions.prediction).count()/training_data.count()
    return accuracy

In [None]:
# Training predictions and accuracy
cal_accuracy(training_data, pos_counts, neg_counts)

In [12]:
# Test predictions and accuracy
pos_count_bc = spark.sparkContext.broadcast(test_pos_counts)
neg_count_bc = spark.sparkContext.broadcast(test_neg_counts)

predictor_udf = udf(create_predictor(pos_count_bc, neg_count_bc), StringType())

predictions = training_data.withColumn('prediction', predictor_udf(testing_data.review))

accuracy = predictions.filter(predictions.sentiment == predictions.prediction).count()/training_data.count()
print(accuracy)

0.8363709078443695
