In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk
import string

In [16]:
# Load the critic reviews from disk.
df = pd.read_csv('critics.csv')
# Keep only rows that have review text AND a "fresh"/"rotten" rating;
# rows with any other rating value are dropped.
has_quote = df['quote'].notnull()
has_rating = df['fresh'].isin(['fresh', 'rotten'])
# Renumber the surviving rows 0..n-1.
df = df[has_quote & has_rating].reset_index(drop=True)
# Create the reset point for df. A copy (not an alias) is stored so that
# in-place changes made to df later cannot leak into the snapshot.
starter = df.copy()

In [174]:
# Reset df to "undo" modifications and start over.
# Restore from a copy so that subsequent in-place edits to df cannot alter
# the saved reset point itself.
df = starter.copy()

In [139]:
# Baseline: stock sklearn CountVectorizer -- lowercase normalization and the
# built-in default tokenizer, no custom preprocessing.
vectorizer_baseline = CountVectorizer(lowercase=True)

In [138]:
# 1. Preprocessing Modifications to Count Vectorizer
#    Each vectorizer (1a, 1b, 1c) adds an additional preprocessing step to the previous version

In [17]:
# 1a. Count Vectorizer with the following preprocessing modification(s):
#     normalize with stemmer

class StemTokenizer:
    """Callable tokenizer: NLTK word tokenization followed by Porter stemming."""

    def __init__(self):
        self.stemmer = nltk.PorterStemmer()

    def __call__(self, doc):
        """Return the Porter stem of every token NLTK finds in ``doc``."""
        stem = self.stemmer.stem
        return [stem(token) for token in nltk.word_tokenize(doc)]


# Lowercasing enabled; tokenizing and stemming delegated to StemTokenizer.
vectorizer_1a = CountVectorizer(tokenizer=StemTokenizer(), lowercase=True)

In [18]:
# 1b. Count Vectorizer with the following preprocessing modification(s):
#     normalize with stemmer
#     clean stop words and punctuation

class StemStopWordsTokenizer(object):
    """Callable tokenizer that drops stop words and punctuation, then stems.

    Meant to be passed as ``tokenizer=`` to ``CountVectorizer``.
    """

    def __init__(self):
        self.stemmer = nltk.PorterStemmer()
        # The NLTK stop word file id is lowercase 'english'; 'English' only
        # resolves on case-insensitive file systems.
        # Each stop word is passed through the same tokenizer as the documents
        # so both sides are compared in the same tokenized form.
        stop_word_tokens = {
            token
            for word in nltk.corpus.stopwords.words('english')
            for token in nltk.word_tokenize(word)
        }
        # Stored as a frozenset: membership is tested once per document token.
        self.stop_words = frozenset(stop_word_tokens | set(string.punctuation))

    def __call__(self, doc):
        """Tokenize ``doc``, discard stop words / punctuation, stem the rest.

        Filtering is done on the raw token, before it is stemmed.
        """
        return [
            self.stemmer.stem(t)
            for t in nltk.word_tokenize(doc)
            if t not in self.stop_words
        ]

vectorizer_1b = CountVectorizer(
    lowercase=True,
    # tokenize, stem, remove stop words / punctuation
    tokenizer=StemStopWordsTokenizer()
)

In [30]:
# 1c. Count Vectorizer with the following preprocessing modification(s):
#     normalize with stemmer
#     clean stop words and punctuation
#     limit document frequency

# NOTE(review): a class with this name is also defined in other cells of this
# notebook; whichever definition was executed last is the one in effect.
class StemStopWordsTokenizer(object):
    """Callable tokenizer that drops stop words and punctuation, then stems.

    Meant to be passed as ``tokenizer=`` to ``CountVectorizer``.
    """

    def __init__(self):
        self.stemmer = nltk.PorterStemmer()
        # The NLTK stop word file id is lowercase 'english'; 'English' only
        # resolves on case-insensitive file systems.
        # Each stop word is passed through the same tokenizer as the documents
        # so both sides are compared in the same tokenized form.
        stop_word_tokens = {
            token
            for word in nltk.corpus.stopwords.words('english')
            for token in nltk.word_tokenize(word)
        }
        # Stored as a frozenset: membership is tested once per document token.
        self.stop_words = frozenset(stop_word_tokens | set(string.punctuation))

    def __call__(self, doc):
        """Tokenize ``doc``, discard stop words / punctuation, stem the rest.

        Filtering is done on the raw token, before it is stemmed.
        """
        return [
            self.stemmer.stem(t)
            for t in nltk.word_tokenize(doc)
            if t not in self.stop_words
        ]

vectorizer_1c = CountVectorizer(
    lowercase=True,
    # tokenize, stem, remove stop words / punctuation
    tokenizer=StemStopWordsTokenizer(),
    # set document frequency minimum
    min_df=15
)

In [None]:
# 3. Context Incorporation to Count Vectorizer with N-grams

In [146]:
# 3a. Count Vectorizer (1c) with the following modification(s):
#     normalize with stemmer
#     clean stop words and punctuation
#     limit document frequency
#     incorporate word context with n-grams

# NOTE(review): a class with this name is also defined in other cells of this
# notebook; whichever definition was executed last is the one in effect.
class StemStopWordsTokenizer(object):
    """Callable tokenizer that drops stop words and punctuation, then stems.

    Meant to be passed as ``tokenizer=`` to ``CountVectorizer``.
    """

    def __init__(self):
        self.stemmer = nltk.PorterStemmer()
        # The NLTK stop word file id is lowercase 'english'; 'English' only
        # resolves on case-insensitive file systems.
        # Each stop word is passed through the same tokenizer as the documents
        # so both sides are compared in the same tokenized form.
        stop_word_tokens = {
            token
            for word in nltk.corpus.stopwords.words('english')
            for token in nltk.word_tokenize(word)
        }
        # Stored as a frozenset: membership is tested once per document token.
        self.stop_words = frozenset(stop_word_tokens | set(string.punctuation))

    def __call__(self, doc):
        """Tokenize ``doc``, discard stop words / punctuation, stem the rest.

        Filtering is done on the raw token, before it is stemmed.
        """
        return [
            self.stemmer.stem(t)
            for t in nltk.word_tokenize(doc)
            if t not in self.stop_words
        ]

vectorizer_3a = CountVectorizer(
    lowercase=True,
    # set n-grams: (1, 2) = unigrams and bigrams
    ngram_range=(1, 2),
    # tokenize, stem, remove stop words / punctuation
    tokenizer=StemStopWordsTokenizer(),
    # set document frequency minimum
    min_df=15
)

In [175]:
# 3b. Count Vectorizer with the following preprocessing modification(s):
#     normalize with stemmer
#     limit document frequency
#     !!! Stop words included !!!
#     incorporate word context with n-grams

class StemTokenizer:
    """Callable tokenizer: NLTK word tokenization followed by Porter stemming."""

    def __init__(self):
        self.stemmer = nltk.PorterStemmer()

    def __call__(self, doc):
        """Return the Porter stem of every token NLTK finds in ``doc``."""
        stem = self.stemmer.stem
        return [stem(token) for token in nltk.word_tokenize(doc)]


vectorizer_3b = CountVectorizer(
    # tokenize and stem (stop words are kept)
    tokenizer=StemTokenizer(),
    lowercase=True,
    # n-gram sizes to extract: (1, 2) = unigrams and bigrams
    ngram_range=(1, 2),
    # minimum document frequency for a term to be kept
    min_df=5,
)

In [176]:
# Choose the vectorizer to evaluate here (single place to change):
# vectorizer_baseline, vectorizer_1a, vectorizer_1b, vectorizer_1c,
# vectorizer_3a or vectorizer_3b.
vectorizer = vectorizer_3b

vectorized = vectorizer.fit_transform(df['quote'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
X = pd.DataFrame.sparse.from_spmatrix(vectorized, columns=feature_names)
# Target: True for reviews rated "fresh", False otherwise.
y = df['fresh'] == 'fresh'
# Number of feature columns produced by the chosen vectorizer.
print(len(X.columns))

# Split data into train (75%) and test (25%) sets; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

14749


In [186]:
# Modification 2: adjust the Naive Bayes alpha hyperparameter
# (originally 0.01, per the author's note).
NB_ALPHA = 0.7

nb = MultinomialNB(alpha=NB_ALPHA)
# Left as the final expression so the cell displays the fitted estimator.
nb.fit(X_train, y_train)

MultinomialNB(alpha=0.7)

In [185]:
# Predict on the held-out test set and print per-class precision / recall / F1.
predictions = nb.predict(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

       False       0.68      0.71      0.70      1480
        True       0.82      0.80      0.81      2404

    accuracy                           0.77      3884
   macro avg       0.75      0.75      0.75      3884
weighted avg       0.77      0.77      0.77      3884

