In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode

In [2]:
# Reuse the active Spark session if one exists; otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("SentimentAnalysis")
    .getOrCreate()
)

In [3]:
# Bare expression as the cell's last line: the notebook displays the session object.
spark

# Load the data

In [4]:
# Read the IMDB reviews CSV. The first row holds the column names, column types
# are inferred, and a double quote inside a quoted field is escaped by another
# double quote.
movie_reviews_df = spark.read.csv(
    "IMDB Dataset.csv",
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
)

In [5]:
# Print the column names, types and nullability of the loaded DataFrame.
movie_reviews_df.printSchema()

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)



# Count the labels

In [6]:
# Tally the reviews per sentiment label and print the result as a table.
(
    movie_reviews_df
    .groupBy('sentiment')
    .count()
    .show()
)

+---------+-----+
|sentiment|count|
+---------+-----+
| positive|25000|
| negative|25000|
+---------+-----+



In [7]:
# Collect the per-label review counts to the driver as a plain
# {sentiment: count} dict.
counts = (
    movie_reviews_df
    .groupBy('sentiment')
    .count()
    .rdd
    .collectAsMap()
)
print(counts)

{'positive': 25000, 'negative': 25000}


# Count the words in each sentiment class

In [11]:
def get_words_count_in_a_class(reviews_data_frame, class_label):
    """Count how often each whitespace-separated token occurs in one class.

    Reviews whose ``sentiment`` equals ``class_label`` are split on runs of
    whitespace and the resulting tokens are counted. Tokens are used exactly
    as they appear in the text: no lowercasing or punctuation stripping is
    applied.

    Args:
        reviews_data_frame: Spark DataFrame with a ``review`` text column and a
            ``sentiment`` label column.
        class_label: Value of ``sentiment`` selecting the reviews to count.

    Returns:
        dict mapping each token to its number of occurrences across the
        selected reviews. The whole result is collected to the driver.
    """
    class_reviews = reviews_data_frame.filter(reviews_data_frame.sentiment == class_label)
    # Raw string: "\s" is not a valid Python escape sequence, so the non-raw
    # literal triggers an invalid-escape warning. The regex itself is unchanged.
    # NOTE(review): reviews with leading/trailing whitespace presumably yield an
    # empty-string token here -- confirm whether it should be filtered out.
    words_column = explode(split(class_reviews.review, r"\s+")).alias("word")
    words_counts = class_reviews.select(words_column).groupBy("word").count()
    # collectAsMap() turns the (word, count) rows into a dict on the driver.
    return words_counts.rdd.collectAsMap()

In [12]:
# Build one token -> occurrence-count dict per sentiment class.
pos_words_counts = get_words_count_in_a_class(
    reviews_data_frame=movie_reviews_df,
    class_label="positive",
)
neg_words_counts = get_words_count_in_a_class(
    reviews_data_frame=movie_reviews_df,
    class_label="negative",
)