In [1]:
import pandas as pd
import re 
import numpy as np
import spacy

from numpy import asarray
from numpy import zeros
from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load both corpora: the sentiment reviews, and the genre reviews
# (for the latter only the review text and its category are kept).
sentiment = pd.read_csv('IMDB Dataset.csv')
genres = pd.read_csv('IMDB_dataset_320.000_reviews.csv').loc[:, ['review', 'category']]

In [3]:
# Keep only reviews in the 6 major genre categories and drop rows with missing
# values. Filter, dropna and copy are chained (no inplace=True), so `genres`
# ends up as an independent frame rather than an in-place-mutated result of a
# boolean filter, which pandas may flag with SettingWithCopyWarning.
MAJOR_GENRES = ['Drama', 'Action', 'Adventure', 'Thriller', 'Horror', 'Comedy']
genres = genres.loc[genres['category'].isin(MAJOR_GENRES)].dropna().copy()

In [4]:
# Professor's function: concatenates the two keys together
def concat_keys(x):
    """Join the first two elements of ``x`` into a single string.

    The element that compares smaller (``<=``) is written first; any further
    elements of ``x`` are ignored.
    """
    left, right = x[0], x[1]
    if not left <= right:
        left, right = right, left
    return f"{left}{right}"

In [5]:
# Function that removes unwanted patterns from both review columns
def replace(replacements):
    r"""Delete every pattern in ``replacements`` from the review text.

    Each pattern is replaced by the empty string, ignoring case, in the
    ``review`` column of the module-level ``genres`` and ``sentiment``
    frames; both columns are overwritten.

    Patterns are applied in the order given and are interpreted as regular
    expressions. ``regex=True`` is passed explicitly because the default of
    ``Series.str.replace`` changed to ``regex=False`` in pandas 2.0, under
    which an escaped pattern such as ``\.`` is treated as literal text and
    no longer removes dots.

    Parameters
    ----------
    replacements : iterable of str
        Regular-expression patterns to remove.
    """
    # The loop variable is deliberately not called `replace`, so it does not
    # shadow this function's own name.
    for pattern in replacements:
        genres['review'] = genres['review'].str.replace(pattern, '', case=False, regex=True)
        sentiment['review'] = sentiment['review'].str.replace(pattern, '', case=False, regex=True)

In [6]:
# Lowercase the review text of both datasets (the columns are overwritten)
for frame in (sentiment, genres):
    frame['review'] = frame['review'].str.lower()

In [7]:
# Lists the distinct '&#<digits>' sequences found in the sentiment reviews.
# Note: str.extract keeps only the first match in each review.
sentiment['review'].str.extract(r'(&#[0-9]+)', expand=False).unique()

array([nan, '&#8217', '&#8230', '&#345', '&#269', '&#61514', '&#305'],
      dtype=object)

In [8]:
# Lists the distinct '&#<digits>' sequences found in the genre reviews
# (first match per review only).
genres['review'].str.extract(r'(&#[0-9]+)', expand=False).unique()

array([nan, '&#0'], dtype=object)

In [9]:
# Removes all unnecessary characters from the datasets: the '&#<digits>' codes
# listed by the two checks above, '<br' / '/><br' fragments and assorted
# punctuation. The dot pattern is written as a raw string (r'\.') because '\.'
# is an invalid escape sequence in a normal string literal; the duplicated
# '\n' entry was dropped.
replace(['&#8217', '&#8230', '&#345', '&#269', '&#61514', '&#305', '&#0',
         '/><br', '<br', '"', "'", '/', '=',
         '<', '>', ',', '_', '\n', r'\.', '-'])

In [10]:
potential_duplicates = 'abcdefghijklmnopqrstuvwxyz!?'

# Collapses any character from `potential_duplicates` that appears 3+ times in
# a row into one occurrence (e.g. 'sooo' -> 'so', '!!!' -> '!').
# A single character-class regex with a backreference handles every character
# in one pass per column instead of one Python-level apply per letter.
# re.escape takes care of '?', replacing the hand-written '\?' (an invalid
# string escape). The pattern is pre-compiled so pandas applies it with
# Python's `re` module, which supports backreferences.
repeat_pattern = re.compile('([' + re.escape(potential_duplicates) + r'])\1{2,}')
genres['review'] = genres['review'].str.replace(repeat_pattern, r'\1', regex=True)
sentiment['review'] = sentiment['review'].str.replace(repeat_pattern, r'\1', regex=True)

In [11]:
# spaCy pipeline used for the lemmatization step; the 'ner' and 'parser'
# components are disabled at load time.
nlp = spacy.load(
    "en_core_web_sm",
    disable=['ner', 'parser'],
)

In [12]:
# Performs lemmatization on both datasets
def _lemmatize(reviews):
    """Return ``reviews`` with each text replaced by its space-joined lemmas.

    The texts are streamed through ``nlp.pipe`` so spaCy can process them in
    batches instead of one ``nlp(text)`` call per review. The result carries
    the index of ``reviews`` so it can be assigned straight back to the frame.
    """
    docs = nlp.pipe(reviews.tolist())
    lemmatized = [' '.join(token.lemma_ for token in doc) for doc in docs]
    return pd.Series(lemmatized, index=reviews.index)


sentiment['review'] = _lemmatize(sentiment['review'])
genres['review'] = _lemmatize(genres['review'])

In [13]:
# Stop-word list: NLTK's English stop words plus four additional words
stop_words = [*stopwords.words('english'), 'also', 'well', 'much', 'get']

In [14]:
# Removes specific words from the stop words that are of interest for analysis.
# The list is rebuilt by filtering rather than by calling list.remove per word:
# remove() raises ValueError when a word is absent, which happens as soon as
# this cell is run a second time.
words_to_keep = {'before', 'after', 'again',
                 'few', 'not', 'don', "don't", 'didn', 'doesn',
                 "doesn't", "didn't"}
stop_words = [word for word in stop_words if word not in words_to_keep]

In [15]:
# Removes stop words from both datasets.
# Membership is tested against a frozenset built once from `stop_words`, so
# each token lookup is a hash lookup instead of a scan of the whole list.
_stop_word_set = frozenset(stop_words)


def _drop_stop_words(text):
    """Return ``text`` without the whitespace-separated tokens that are stop words."""
    return ' '.join(token for token in text.split() if token not in _stop_word_set)


genres['review'] = genres['review'].apply(_drop_stop_words)
sentiment['review'] = sentiment['review'].apply(_drop_stop_words)

In [16]:
# Fits a TFIDF Vectorizer on the data
def FitTFIDF(data, max_features=100, top_n=20):
    """Rank the bigrams of ``data['review']`` by their summed TF-IDF weight.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``review`` column of text.
    max_features : int, default 100
        Vocabulary size cap passed to ``TfidfVectorizer``.
    top_n : int, default 20
        Number of highest-scoring bigrams to return.

    Returns
    -------
    pandas.DataFrame
        A single ``score`` column indexed by bigram, sorted from highest to
        lowest score, with at most ``top_n`` rows.
    """
    # Bigrams only; a token is a run of at least three alphanumeric characters.
    # The stop words come from the module-level `stop_words` list.
    vectorizer = TfidfVectorizer(ngram_range=(2, 2),
                                 token_pattern=r'\b[a-zA-Z0-9]{3,}\b',
                                 max_features=max_features, stop_words=stop_words)

    tfidf_matrix = vectorizer.fit_transform(data['review'])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # Column sums give each bigram's total TF-IDF weight over all reviews.
    # Summing the sparse matrix directly avoids building the dense
    # (documents x features) array.
    totals = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    score = pd.DataFrame({'score': totals}, index=terms)

    return score.sort_values(by='score', ascending=False).head(top_n)

In [17]:
# TF-IDF bigram rankings for the positive and negative sentiment classes
tf_idf_positive = FitTFIDF(sentiment[sentiment['sentiment'].eq('positive')])
tf_idf_negative = FitTFIDF(sentiment[sentiment['sentiment'].eq('negative')])

# TF-IDF bigram rankings per genre group
# (Action and Adventure are pooled, as are Thriller and Horror)
genre_col = genres['category']
tf_idf_drama = FitTFIDF(genres[genre_col.eq('Drama')])
tf_idf_action = FitTFIDF(genres[genre_col.isin(['Action', 'Adventure'])])
tf_idf_horror = FitTFIDF(genres[genre_col.isin(['Thriller', 'Horror'])])
tf_idf_comedy = FitTFIDF(genres[genre_col.eq('Comedy')])

In [18]:
# Display the highest-scoring bigrams for the Comedy reviews
tf_idf_comedy

Unnamed: 0,score
see movie,2191.154719
watch movie,1925.489298
good movie,1712.045775
one good,1644.99603
movie not,1559.360895
romantic comedy,1532.083739
ever see,1505.417968
main character,1486.777382
look like,1486.00048
make movie,1437.646829


In [19]:
# Reload the genres dataset (all columns this time) and reclassify genre groups.
# The filter and the regrouped `category` column are built in one chain with
# .loc/.assign, so the new values are never assigned onto a filtered slice of
# the frame (the pattern pandas reports as SettingWithCopyWarning).
GENRE_GROUPS = {
    'Action': 'Action/Adventure',
    'Adventure': 'Action/Adventure',
    'Thriller': 'Thriller/Horror',
    'Horror': 'Thriller/Horror',
}

genres_eda = pd.read_csv('IMDB_dataset_320.000_reviews.csv')
genres_eda = (
    genres_eda
    .loc[genres_eda['category'].isin(['Drama', 'Action', 'Adventure', 'Thriller', 'Horror', 'Comedy'])]
    # Categories without an entry in GENRE_GROUPS (Drama, Comedy) are kept as-is.
    .assign(category=lambda frame: frame['category'].map(lambda genre: GENRE_GROUPS.get(genre, genre)))
)

In [20]:
# Display the filtered, regrouped genres frame
genres_eda

Unnamed: 0,id number,Greek title,original title,category,director/creator,movie lenght,movie date,author,review date,review title,review,label,mean of stars,number of reviews,full reviews average stars,url
0,0,Lekin...,Lekin...,Drama,Gulzar,2 hours 51 minutes,1990,dwnpiyush-336-40603,8 April 2012,For those who don't mind a slow pacing to the...,"""Yaara Sili Sili Virah Ki Raat Ka Jalna""'Lekin...",8,9.16,6,7.5,https://www.imdb.com/title/tt00100002/?ref_=tt...
1,1,Lekin...,Lekin...,Drama,Gulzar,2 hours 51 minutes,1990,cseabhi,10 July 2021,Lekin - Gulzar's haunting masterpiece,Gulzar is at his best when he is telling such ...,9,9.16,6,7.5,https://www.imdb.com/title/tt00100002/?ref_=tt...
2,2,Lekin...,Lekin...,Drama,Gulzar,2 hours 51 minutes,1990,abbott32,2 July 2004,Haunting film but would love to have been abl...,I was completely mesmerized by Lekin and espec...,9,9.16,6,7.5,https://www.imdb.com/title/tt00100002/?ref_=tt...
3,3,Lekin...,Lekin...,Drama,Gulzar,2 hours 51 minutes,1990,monoglot,29 November 2005,An intriguing story well told.,Greatly enjoyed the development of the story l...,9,9.16,6,7.5,https://www.imdb.com/title/tt00100002/?ref_=tt...
4,4,Lekin...,Lekin...,Drama,Gulzar,2 hours 51 minutes,1990,Kammu,27 November 1999,It's a classic,"The lines of time are very blurry. Past, prese...",10,9.16,6,7.5,https://www.imdb.com/title/tt00100002/?ref_=tt...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320731,320731,Mario Bros.,Mario Bros.,Action/Adventure,Shigeru Miyamoto,,1983,Olicosmic,22 May 2019,A decently fun arcade classic,"After Mario's debut as ""Jumpman"" in the origin...",7,6.88,8,7.5,https://www.imdb.com/title/tt00176971/?ref_=tt...
320732,320732,Mario Bros.,Mario Bros.,Action/Adventure,Shigeru Miyamoto,,1983,Mzo-3,28 August 2000,One of the classics,This game must be very confusing to the younge...,7,6.88,8,7.5,https://www.imdb.com/title/tt00176971/?ref_=tt...
320733,320733,Mario Bros.,Mario Bros.,Action/Adventure,Shigeru Miyamoto,,1983,celticdragonlrd,21 October 2007,Super Mario Brothers,A classic Nintendo game for old-school video g...,8,6.88,8,7.5,https://www.imdb.com/title/tt00176971/?ref_=tt...
320734,320734,Mario Bros.,Mario Bros.,Action/Adventure,Shigeru Miyamoto,,1983,hellraiser7,15 November 2021,Plumbing Heroes,This is another honorable mention in my favori...,9,6.88,8,7.5,https://www.imdb.com/title/tt00176971/?ref_=tt...


In [21]:
# Gather summarized statistics (per-column means) on each genre group.
# numeric_only=True restricts the mean to numeric columns; since pandas 2.0
# the default is False, and the text columns of this frame would make
# .mean() raise a TypeError.
genres_eda.groupby('category').mean(numeric_only=True)

Unnamed: 0_level_0,id number,label,mean of stars,number of reviews,full reviews average stars
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Action/Adventure,151965.988092,6.188097,6.188123,116.198838,6.103403
Comedy,155459.696338,6.786963,6.786992,83.554296,6.382677
Drama,159154.388051,6.856301,6.85612,81.349932,6.712467
Thriller/Horror,162761.385469,5.540229,5.540124,89.136779,5.144849


In [22]:
# Gather summarized statistics on each genre group, without duplicates:
# rows are first reduced to the distinct (title, category, mean of stars,
# number of reviews) combinations, so repeated rows with identical values are
# counted once. numeric_only=True keeps the text column 'original title' out
# of the mean, which would otherwise raise a TypeError on pandas >= 2.0.
(
    genres_eda[['original title', 'category', 'mean of stars', 'number of reviews']]
    .drop_duplicates()
    .groupby('category')
    .mean(numeric_only=True)
)

Unnamed: 0_level_0,mean of stars,number of reviews
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Action/Adventure,6.187887,22.073882
Comedy,6.880495,15.539251
Drama,7.125577,11.288983
Thriller/Horror,5.570958,15.924152
