In [1]:
import os
import yaml
import nltk
import string
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the AWS credentials from the YAML file one directory above the working directory.
credentials_file = os.path.abspath(os.path.join('..', 'credentials.yaml'))
with open(credentials_file, mode='r') as credentials_stream:
    # safe_load builds plain Python containers only, never arbitrary objects.
    credentials = yaml.safe_load(credentials_stream)

# Key/secret pair handed to pandas through its `storage_options` argument.
storage_options = dict(
    key=credentials['aws']['access_key'],
    secret=credentials['aws']['secret_access_key'],
)

In [3]:
# Load one partition of the reviews dataset from S3 and derive the binary sentiment label.
partition_uri = f's3://{credentials["aws"]["bucket"]}/reviews/reviews_partition_1.csv'
partition = (
    pd.read_csv(
        partition_uri,
        compression='gzip',  # requested explicitly; the object key itself ends in '.csv'
        storage_options=storage_options,
    )
    .drop(columns='author')
    .assign(
        # pd.to_datetime replaces astype('datetime64[D]'): day resolution is not a
        # supported datetime unit in pandas >= 2.0 and raises there.
        review_date=lambda df: pd.to_datetime(df['review_date']),
        # Ratings above 5 count as positive. A missing rating compares as False,
        # so rows without a rating must be filtered out before the label is used.
        positive_sentiment=lambda df: df['rating'] > 5,
    )
)

# Deep memory footprint (object columns included), converted from bytes by 1024 * 1024.
mem = partition.memory_usage(deep=True).sum() / 1024 / 1024
print('Number of reviews:', len(partition))
print(f'Memory usage: {mem:.1f} Mb')
partition.head()

Number of reviews: 242940
Memory usage: 319.9 Mb


Unnamed: 0,text,rating,title,movie_id,upvotes,total_votes,review_date,positive_sentiment
0,"Chucky (the murderous doll from ""Child's Play""...",9.0,Silly but fun,/title/tt0144120/,33,40,2006-07-06,True
1,"They obviously made ""Bride of Chucky"" with the...",10.0,glass ceiling has a new meaning,/title/tt0144120/,17,20,2009-03-21,True
2,Well my opinion has changed for this one becau...,10.0,Who The (Beep) Is Martha Stewart?(**** Out Of...,/title/tt0144120/,19,22,2012-12-13,True
3,Clever is the word that comes to mind when I t...,6.0,A rough ride to Hackensack for Chucky and his...,/title/tt0144120/,11,15,2008-10-29,True
4,Realizing he needs to turn back into human for...,10.0,The best of the series,/title/tt0144120/,11,15,2012-08-23,True


In [34]:
# Class-balanced sample: drop reviews without a rating, sort ascending by
# (label, total_votes), then keep the last 5000 rows of each label group,
# i.e. the reviews with the highest vote counts per class.
partition_sample = (
    partition.loc[partition['rating'].notna()]
    .sort_values(by=['positive_sentiment', 'total_votes'], ascending=True)
    .groupby(by='positive_sentiment')
    .tail(n=5000)
)
partition_sample.head()

Unnamed: 0,text,rating,title,movie_id,upvotes,total_votes,review_date,positive_sentiment
231528,ZERO chemistry between the lead actors. My soc...,2.0,Here's the problem...,/title/tt1563742/,43,62,2020-04-13,False
231575,So this movie is terrible i was miserable whil...,2.0,horrible,/title/tt1563742/,29,62,2018-05-15,False
231593,"If you waste your time and money to see this, ...",1.0,One of the worst movies of all time,/title/tt1563742/,24,62,2018-05-05,False
237444,"""Crash"" is not the best picture of the year; t...",1.0,Despicable,/title/tt0375679/,36,62,2006-03-06,False
237446,I don't think this movie deserves to be nomine...,5.0,Best Motion Picture of the Year? C'Mon!,/title/tt0375679/,34,62,2006-02-03,False


In [42]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [36]:
# Default-configured vectorizer. NOTE(review): this binding is never fitted; the
# following cell rebinds `count_vect` with ngram_range=(1, 2), max_features=500,
# so this cell looks like a leftover and is a candidate for removal.
count_vect = CountVectorizer()

In [50]:
# Bag-of-words features: raw counts of unigrams and bigrams, vocabulary capped at 500 entries.
count_vect = CountVectorizer(max_features=500, ngram_range=(1, 2))
count_matrix = count_vect.fit_transform(partition_sample['text'])

# Report the shape of the document-term matrix, one figure per line.
print('\n'.join((
    f'Number of samples: {count_matrix.shape[0]}',
    f'Number of features: {count_matrix.shape[1]}',
)))

Number of samples: 10000
Number of features: 500


In [52]:
# Logistic regression on the count features; fit() returns the estimator itself,
# so construction and fitting are chained and the fitted model is displayed last.
logreg = LogisticRegression(max_iter=1000).fit(
    X=count_matrix,
    y=partition_sample['positive_sentiment'],
)
logreg

In [54]:
# Predict labels for `count_matrix`, the same matrix `logreg` was fitted on.
# NOTE(review): these are in-sample predictions, so any score computed from them
# measures fit to the training data rather than generalization.
prediction = logreg.predict(count_matrix)

In [55]:
from sklearn.metrics import classification_report

In [59]:
# Per-class precision / recall / F1 of the count-based model against the sample labels.
report = classification_report(
    y_true=partition_sample['positive_sentiment'],
    y_pred=prediction,
)
print(report)

              precision    recall  f1-score   support

       False       0.84      0.86      0.85      5000
        True       0.85      0.83      0.84      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [61]:
# Term-frequency features: same n-gram range and vocabulary cap as the count model,
# with the inverse-document-frequency reweighting switched off (use_idf=False).
tf_vect = TfidfVectorizer(max_features=500, ngram_range=(1, 2), use_idf=False)
tf_matrix = tf_vect.fit_transform(partition_sample['text'])

# Report the shape of the document-term matrix, one figure per line.
print('\n'.join((
    f'Number of samples: {tf_matrix.shape[0]}',
    f'Number of features: {tf_matrix.shape[1]}',
)))

Number of samples: 10000
Number of features: 500


In [62]:
# Logistic regression on the term-frequency features; fit() returns the estimator,
# which is then displayed as the cell result.
tf_logreg = LogisticRegression(max_iter=1000).fit(
    X=tf_matrix,
    y=partition_sample['positive_sentiment'],
)
tf_logreg

In [67]:
# Score the term-frequency model. The predictions are made on `tf_matrix`, the
# matrix `tf_logreg` was fitted on, so the report reflects in-sample performance.
tf_prediction = tf_logreg.predict(X=tf_matrix)
tf_report = classification_report(
    y_true=partition_sample['positive_sentiment'],
    y_pred=tf_prediction,
)
print(tf_report)

              precision    recall  f1-score   support

       False       0.84      0.81      0.82      5000
        True       0.82      0.84      0.83      5000

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000



In [65]:
# TF-IDF features: same n-gram range and vocabulary cap as the other two models,
# this time with inverse-document-frequency reweighting enabled (use_idf=True).
tfidf_vect = TfidfVectorizer(max_features=500, ngram_range=(1, 2), use_idf=True)
tfidf_matrix = tfidf_vect.fit_transform(partition_sample['text'])

# Report the shape of the document-term matrix, one figure per line.
print('\n'.join((
    f'Number of samples: {tfidf_matrix.shape[0]}',
    f'Number of features: {tfidf_matrix.shape[1]}',
)))

Number of samples: 10000
Number of features: 500


In [66]:
# Logistic regression on the TF-IDF features; fit() returns the estimator,
# which is then displayed as the cell result.
tfidf_logreg = LogisticRegression(max_iter=1000).fit(
    X=tfidf_matrix,
    y=partition_sample['positive_sentiment'],
)
tfidf_logreg

In [68]:
# Score the TF-IDF model. The predictions are made on `tfidf_matrix`, the matrix
# `tfidf_logreg` was fitted on, so the report reflects in-sample performance.
tfidf_prediction = tfidf_logreg.predict(X=tfidf_matrix)
tfidf_report = classification_report(
    y_true=partition_sample['positive_sentiment'],
    y_pred=tfidf_prediction,
)
print(tfidf_report)

              precision    recall  f1-score   support

       False       0.85      0.84      0.84      5000
        True       0.84      0.85      0.84      5000

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

