In [1]:
import os
import string
from nltk.tokenize import word_tokenize
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk import pos_tag, bigrams
from nltk.stem.porter import PorterStemmer
import numpy as np

In [2]:
def clean_review(review, stem):
    """Clean a given movie review

    Replaces styling marks (dashes and underscores) with spaces, tokenizes
    the text, optionally stems every token, and finally drops the tokens
    that consist solely of punctuation characters.

    Args:
        review (str): movie review text string
        stem (boolean): indicator whether to stem or just tokenize

    Returns:
        cleaned_review (list): tokenized, cleaned movie review
    """
    # Remove styling. Dashes and underscores become spaces (never the empty
    # string) so that "well--made" splits into two tokens instead of fusing
    # into "wellmade".
    review = review.replace("_", " ").replace("-", " ")
    tokens = word_tokenize(review)
    if stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    # Remove punctuation. string.punctuation contains regex metacharacters
    # (backslash, ']', '^', '-'), so it must be escaped before being placed in
    # a character class; unescaped, the backslash only escapes the ']' that
    # follows it and is itself never matched.
    punctuation_only = re.compile('[' + re.escape(string.punctuation) + ']+')
    cleaned_review = [token for token in tokens if not punctuation_only.fullmatch(token)]
    return cleaned_review

In [3]:
def read_reviews(dir_path, stem=False):
    """Reads in all reviews from directory and performs train/test split

    Entries are visited in sorted filename order so the resulting lists are
    reproducible across platforms (os.listdir returns entries in arbitrary
    order). Anything in the directory that is not a regular file, such as a
    sub-directory, is skipped.

    Args:
        dir_path (str): path to directory containing reviews
        stem (boolean): indicator whether to stem or just tokenize

    Returns:
        train (list): list of cleaned, tokenized movie reviews for training
        test (list): list of cleaned, tokenized movie reviews for testing
    """
    train = []
    test = []
    for filename in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, filename)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be opened as reviews
            continue
        # Only hold the file open while reading; cleaning happens afterwards
        with open(file_path, 'r') as f:
            raw_text = f.read()
        review = clean_review(raw_text, stem)
        # Files beginning with cv9 indicate that it's part of the test set (cross-validation fold 9)
        if filename.startswith('cv9'):
            test.append(review)
        else:
            train.append(review)
    return train, test

In [4]:
def show_most_informative_features(vectorizer, classifier, n=10):
    """Prints the n highest-count features for each class of a fitted classifier

    Features are ranked by the classifier's accumulated per-class feature
    counts (``feature_count_``), so the lists show the most frequent features
    in each class rather than the most discriminative ones. Row 1 of
    ``feature_count_`` is reported as the positive class and row 0 as the
    negative class.

    Args:
        vectorizer: fitted vectorizer exposing get_feature_names_out() or the
            legacy get_feature_names()
        classifier: fitted classifier exposing classes_ and feature_count_
        n (int): number of features to print per class

    Returns:
        None; the ranked lists are printed
    """
    class_labels = classifier.classes_
    # Prefer get_feature_names_out(); the legacy get_feature_names() no longer
    # exists on recent scikit-learn releases. Fall back for old installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Sorting (count, name) pairs in reverse puts the largest counts first
    topn_pos_class = sorted(zip(classifier.feature_count_[1], feature_names), reverse=True)[:n]
    topn_neg_class = sorted(zip(classifier.feature_count_[0], feature_names), reverse=True)[:n]

    print("Important words in positive reviews")
    for count, feature in topn_pos_class:
        print(class_labels[1], count, feature)
    print("-----------------------------------------")
    print("Important words in negative reviews")
    for count, feature in topn_neg_class:
        print(class_labels[0], count, feature)

In [6]:
def get_adj_adv(reviews):
    """Keeps only the adjectives and adverbs of every review

    Each review is POS-tagged and only the tokens whose tag is one of the
    adjective (JJ, JJR, JJS) or adverb (RB, RBR, RBS) tags are retained.

    Args:
        reviews (list): list of cleaned, tokenized movie reviews

    Returns:
        reviews_adj_adv (list): list of cleaned, tokenized movie reviews with only adverbs/adjectives
    """
    # POS tags for adjectives and adverbs
    wanted_tags = ('JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS')
    return [
        [token for token, tag in pos_tag(review) if tag in wanted_tags]
        for review in reviews
    ]

In [7]:
def calc_pos_neg(review, NRC_emotion):
    """Calculates the positive/negative word ratio of a review from NRC_emotion

    A token counts as positive (negative) when NRC_emotion contains at least
    one row pairing that word with the "positive" ("negative") affect; a
    token carrying both affects is counted on both sides. Rows are used as
    given: no filtering on the Indicator column happens here.

    Args:
        review (list): tokenized, cleaned movie review
        NRC_emotion (df): dataframe of word, affect, and indicator

    Returns:
        ratio (float): positive/negative ratio given tokens in review; when
            the review has no negative tokens the raw positive count is
            returned instead, which avoids a division by zero
    """
    # Build the lookup sets once per call so each token costs a hash lookup
    # instead of a scan over the whole lexicon.
    words = NRC_emotion["Word"]
    affects = NRC_emotion["Affect"]
    positive_words = set(words[affects == "positive"])
    negative_words = set(words[affects == "negative"])

    pos = sum(1 for token in review if token in positive_words)
    neg = sum(1 for token in review if token in negative_words)
    if neg != 0:
        ratio = pos / neg
    else:
        ratio = pos
    return ratio

# Read in and preprocess data

Since the data has already been tokenized and downcased, our preprocessing consists of removing styling (words surrounded by `_` or `-`, as found through visual inspection) and removing punctuation. For some experiments, words are also stemmed and/or stop words are removed.

In [8]:
# Load each polarity class; read_reviews sends the cv9* files to the test split
neg_train, neg_test = read_reviews('review_polarity.v2/txt_sentoken/neg')
pos_train, pos_test = read_reviews('review_polarity.v2/txt_sentoken/pos')

# Labels: 1 = positive review, 0 = negative review
pos_train_labels = [1] * len(pos_train)
neg_train_labels = [0] * len(neg_train)
pos_test_labels = [1] * len(pos_test)
neg_test_labels = [0] * len(neg_test)

# Positive reviews first, then negative, with the labels in the same order
train = pos_train + neg_train
train_labels = pos_train_labels + neg_train_labels
test = pos_test + neg_test
test_labels = pos_test_labels + neg_test_labels

# Problem 1

## M1

In [46]:
# M1: binary (presence/absence) unigram features
vectorizer = CountVectorizer(binary=True)
# The vectorizer expects strings, so each token list is re-joined with spaces
train_features = vectorizer.fit_transform(map(' '.join, train))
test_features = vectorizer.transform(map(' '.join, test))
nb_clf = MultinomialNB().fit(train_features, train_labels)
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
accuracy

0.865

In [10]:
# Print the highest-count features per class for the current vectorizer/classifier.
# NOTE(review): class_labels is not passed to the call below.
class_labels = nb_clf.classes_
show_most_informative_features(vectorizer, nb_clf)

Important words in positive reviews
1 900.0 the
1 900.0 of
1 899.0 to
1 899.0 is
1 899.0 and
1 898.0 in
1 891.0 it
1 887.0 that
1 879.0 with
1 873.0 as
-----------------------------------------
Important words in negative reviews
0 899.0 the
0 899.0 of
0 899.0 and
0 898.0 to
0 898.0 is
0 896.0 in
0 880.0 that
0 879.0 it
0 873.0 with
0 860.0 this


## M2

In [52]:
# M2: raw unigram frequency counts
vectorizer = CountVectorizer()
# The vectorizer expects strings, so each token list is re-joined with spaces
train_features = vectorizer.fit_transform(map(' '.join, train))
test_features = vectorizer.transform(map(' '.join, test))
nb_clf = MultinomialNB().fit(train_features, train_labels)
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
accuracy

0.85

In [12]:
# Print the highest-count features per class for the current vectorizer/classifier.
# NOTE(review): class_labels is not passed to the call below.
class_labels = nb_clf.classes_
show_most_informative_features(vectorizer, nb_clf)

Important words in positive reviews
1 37123.0 the
1 17778.0 and
1 16692.0 of
1 14798.0 to
1 12885.0 is
1 10535.0 in
1 7467.0 it
1 7282.0 that
1 5785.0 as
1 5273.0 with
-----------------------------------------
Important words in negative reviews
0 31470.0 the
0 14027.0 and
0 13857.0 to
0 13857.0 of
0 10407.0 is
0 9074.0 in
0 7044.0 it
0 6988.0 that
0 4421.0 this
0 4420.0 as


## M3

In [13]:
# Reduce every review to its adjectives and adverbs (POS-tag based filter)
train_adj_adv = get_adj_adv(train)
test_adj_adv = get_adj_adv(test)

In [60]:
# M3: frequency counts over adjectives and adverbs only
vectorizer = CountVectorizer()
# The vectorizer expects strings, so each token list is re-joined with spaces
train_features = vectorizer.fit_transform(map(' '.join, train_adj_adv))
test_features = vectorizer.transform(map(' '.join, test_adj_adv))
nb_clf = MultinomialNB().fit(train_features, train_labels)
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
accuracy

0.87

In [15]:
# Print the highest-count features per class for the current vectorizer/classifier.
# NOTE(review): class_labels is not passed to the call below.
class_labels = nb_clf.classes_
show_most_informative_features(vectorizer, nb_clf)

Important words in positive reviews
1 2666.0 not
1 1633.0 more
1 1316.0 so
1 1247.0 most
1 1200.0 just
1 1111.0 good
1 1072.0 also
1 1045.0 even
1 1027.0 very
1 1014.0 only
-----------------------------------------
Important words in negative reviews
0 2430.0 not
0 1533.0 so
0 1391.0 just
0 1362.0 more
0 1221.0 even
0 1205.0 only
0 1031.0 good
0 935.0 bad
0 915.0 much
0 805.0 most


## M4

In [19]:
# M4: sublinear tf-idf weights, English stop words removed, document-frequency bounds applied
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, stop_words='english', sublinear_tf=True)
# The vectorizer expects strings, so each token list is re-joined with spaces
train_features = vectorizer.fit_transform(map(' '.join, train))
test_features = vectorizer.transform(map(' '.join, test))
nb_clf = MultinomialNB().fit(train_features, train_labels)
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
accuracy

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.865

In [20]:
# Print the highest-weight features per class; with tf-idf input the
# classifier's feature_count_ holds summed weights rather than integer counts.
# NOTE(review): class_labels is not passed to the call below.
class_labels = nb_clf.classes_
show_most_informative_features(vectorizer, nb_clf)

Important words in positive reviews
1 22.344073604866896 movie
1 18.75741514140869 like
1 17.743868809614842 does
1 16.896703821132636 story
1 16.854901947165093 life
1 16.76517275594625 just
1 16.656367315117194 good
1 16.355788961060934 time
1 15.085297228325906 character
1 14.486111572711723 best
-----------------------------------------
Important words in negative reviews
0 27.247686886244978 movie
0 21.205307710450015 like
0 19.79476920211381 just
0 18.706809390109427 bad
0 17.641005245526756 does
0 17.091878055954655 good
0 16.611207729535327 plot
0 16.55398161042137 time
0 15.030208169308109 character
0 14.87075481260817 story


## M5

In [35]:
# M5: binary (presence/absence) bigram features
vectorizer = CountVectorizer(binary=True, ngram_range=(2, 2))
# The vectorizer expects strings, so each token list is re-joined with spaces
train_features = vectorizer.fit_transform(map(' '.join, train))
test_features = vectorizer.transform(map(' '.join, test))
nb_clf = MultinomialNB().fit(train_features, train_labels)
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
accuracy

0.86

In [36]:
# Print the highest-count bigrams per class for the current vectorizer/classifier.
# NOTE(review): class_labels is not passed to the call below.
class_labels = nb_clf.classes_
show_most_informative_features(vectorizer, nb_clf)

Important words in positive reviews
1 845.0 of the
1 787.0 in the
1 682.0 the film
1 657.0 to the
1 634.0 and the
1 577.0 to be
1 570.0 on the
1 533.0 with the
1 521.0 for the
1 515.0 is the
-----------------------------------------
Important words in negative reviews
0 828.0 of the
0 796.0 in the
0 641.0 the film
0 624.0 to be
0 577.0 to the
0 560.0 and the
0 545.0 on the
0 481.0 with the
0 477.0 for the
0 457.0 the movie


## Results Analysis
The best model for problem 1 is M3 with **87% accuracy**. M3 probably saw the highest performance given that most positive/negative words that would capture sentiment are adjectives and adverbs.

When looking at the most informative words, M1, M2, and M5 do not have informative lists because stop words were not removed. Most of the listed words are just frequent words (e.g. of, the, and, that, it). M3 is slightly better because it captures words like `good` for positive and `bad` for negative, but the two classes also share a lot of words. M4 is the best, which makes sense given that its formula emphasizes high-value words, but again the positive and negative lists share words. This shared list of words indicates that we need to create a custom stop words list and run the experiments again.

# Problem 2

In [21]:
# Same loading and train/test split as before, but with Porter stemming enabled
stem_neg_train, stem_neg_test = read_reviews('review_polarity.v2/txt_sentoken/neg', stem=True)
stem_pos_train, stem_pos_test = read_reviews('review_polarity.v2/txt_sentoken/pos', stem=True)

# Labels: 1 = positive review, 0 = negative review
stem_pos_train_labels = [1] * len(stem_pos_train)
stem_neg_train_labels = [0] * len(stem_neg_train)
stem_pos_test_labels = [1] * len(stem_pos_test)
stem_neg_test_labels = [0] * len(stem_neg_test)

# Positive reviews first, then negative, with the labels in the same order
stem_train = stem_pos_train + stem_neg_train
stem_train_labels = stem_pos_train_labels + stem_neg_train_labels
stem_test = stem_pos_test + stem_neg_test
stem_test_labels = stem_pos_test_labels + stem_neg_test_labels

## M1

In [68]:
# M1 (stemmed): binary (presence/absence) unigram features
vectorizer = CountVectorizer(binary=True)
# The vectorizer expects strings, so each token list is re-joined with spaces
stem_train_features = vectorizer.fit_transform(map(' '.join, stem_train))
stem_test_features = vectorizer.transform(map(' '.join, stem_test))
nb_clf = MultinomialNB().fit(stem_train_features, stem_train_labels)
predictions = nb_clf.predict(stem_test_features)
accuracy = accuracy_score(stem_test_labels, predictions)
accuracy

0.85

In [23]:
# Print the highest-count stemmed features per class for the current vectorizer/classifier.
# NOTE(review): class_labels is not passed to the call below.
class_labels = nb_clf.classes_
show_most_informative_features(vectorizer, nb_clf)

Important words in positive reviews
1 900.0 the
1 900.0 of
1 899.0 to
1 899.0 is
1 899.0 and
1 898.0 in
1 894.0 it
1 887.0 that
1 879.0 with
1 873.0 as
-----------------------------------------
Important words in negative reviews
0 899.0 the
0 899.0 of
0 899.0 and
0 898.0 to
0 898.0 is
0 896.0 in
0 889.0 it
0 880.0 that
0 873.0 with
0 860.0 thi


## M2

In [74]:
# M2 (stemmed): raw unigram frequency counts
vectorizer = CountVectorizer()
# The vectorizer expects strings, so each token list is re-joined with spaces
stem_train_features = vectorizer.fit_transform(map(' '.join, stem_train))
stem_test_features = vectorizer.transform(map(' '.join, stem_test))
nb_clf = MultinomialNB().fit(stem_train_features, stem_train_labels)
predictions = nb_clf.predict(stem_test_features)
accuracy = accuracy_score(stem_test_labels, predictions)
accuracy

0.84

In [25]:
# Print the highest-count stemmed features per class for the current vectorizer/classifier.
# NOTE(review): class_labels is not passed to the call below.
class_labels = nb_clf.classes_
show_most_informative_features(vectorizer, nb_clf)

Important words in positive reviews
1 37122.0 the
1 17778.0 and
1 16696.0 of
1 14799.0 to
1 12885.0 is
1 10541.0 in
1 8604.0 it
1 7283.0 that
1 5785.0 as
1 5536.0 film
-----------------------------------------
Important words in negative reviews
0 31470.0 the
0 14027.0 and
0 13858.0 of
0 13857.0 to
0 10407.0 is
0 9085.0 in
0 7949.0 it
0 6991.0 that
0 4512.0 film
0 4423.0 thi


## M3

In [26]:
# Reduce every stemmed review to its adjectives and adverbs.
# NOTE(review): the POS tagger runs on already-stemmed tokens here, which
# presumably hurts tagging quality — consider tagging before stemming.
stem_train_adj_adv = get_adj_adv(stem_train)
stem_test_adj_adv = get_adj_adv(stem_test)

In [80]:
# M3 (stemmed): frequency counts over adjectives and adverbs only
vectorizer = CountVectorizer()
# The vectorizer expects strings, so each token list is re-joined with spaces
stem_train_features = vectorizer.fit_transform(map(' '.join, stem_train_adj_adv))
stem_test_features = vectorizer.transform(map(' '.join, stem_test_adj_adv))
nb_clf = MultinomialNB().fit(stem_train_features, stem_train_labels)
predictions = nb_clf.predict(stem_test_features)
accuracy = accuracy_score(stem_test_labels, predictions)
accuracy

0.82

In [28]:
# Print the highest-count stemmed adjective/adverb features per class.
# NOTE(review): class_labels is not passed to the call below.
class_labels = nb_clf.classes_
show_most_informative_features(vectorizer, nb_clf)

Important words in positive reviews
1 2679.0 not
1 1894.0 hi
1 1634.0 more
1 1293.0 so
1 1248.0 most
1 1240.0 thi
1 1200.0 just
1 1138.0 good
1 1129.0 other
1 1072.0 also
-----------------------------------------
Important words in negative reviews
0 2434.0 not
0 1495.0 so
0 1391.0 just
0 1369.0 hi
0 1361.0 more
0 1304.0 thi
0 1241.0 even
0 1051.0 good
0 943.0 bad
0 915.0 much


## M4

In [29]:
# M4 (stemmed): sublinear tf-idf weights, English stop words removed, document-frequency bounds applied
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, stop_words='english', sublinear_tf=True)
# The vectorizer expects strings, so each token list is re-joined with spaces
stem_train_features = vectorizer.fit_transform(map(' '.join, stem_train))
stem_test_features = vectorizer.transform(map(' '.join, stem_test))
nb_clf = MultinomialNB().fit(stem_train_features, stem_train_labels)
predictions = nb_clf.predict(stem_test_features)
accuracy = accuracy_score(stem_test_labels, predictions)
accuracy

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.845

In [30]:
# Print the highest-weight stemmed features per class; with tf-idf input the
# classifier's feature_count_ holds summed weights rather than integer counts.
# NOTE(review): class_labels is not passed to the call below.
class_labels = nb_clf.classes_
show_most_informative_features(vectorizer, nb_clf)

Important words in positive reviews
1 23.28718465338969 wa
1 21.123628977220868 charact
1 21.028249830714365 like
1 19.956752083075894 make
1 19.730727920758845 time
1 18.902287883181813 doe
1 18.644755330309536 stori
1 18.214209246092075 veri
1 18.13782430551459 scene
1 18.011183542812528 good
-----------------------------------------
Important words in negative reviews
0 25.82322463722663 wa
0 23.65874998551258 like
0 21.59474054878139 charact
0 21.032481785881604 just
0 19.97537887820938 bad
0 19.965091970184897 make
0 19.837018532768607 onli
0 19.170617578873305 time
0 18.768513416423385 doe
0 18.49071156796062 scene


## M5

In [37]:
# M5 (stemmed): binary (presence/absence) bigram features
vectorizer = CountVectorizer(binary=True, ngram_range=(2, 2))
# The vectorizer expects strings, so each token list is re-joined with spaces
stem_train_features = vectorizer.fit_transform(map(' '.join, stem_train))
stem_test_features = vectorizer.transform(map(' '.join, stem_test))
nb_clf = MultinomialNB().fit(stem_train_features, stem_train_labels)
predictions = nb_clf.predict(stem_test_features)
accuracy = accuracy_score(stem_test_labels, predictions)
accuracy

0.85

## Results Analysis
The best models for problem 2 are M1 and M5, tied at **85% accuracy**. However, all models saw a drop in performance when using Porter stemming, indicating that we should not use it.

When looking at the most informative words, M1 and M2 do not have informative lists because stop words were not removed. Most of the listed words are just frequent words (e.g. of, the, and, that, it). M3 is slightly better because it captures words like `good` for positive and `bad` for negative, but the two classes also share a lot of words. M4 is the best, which makes sense given that its formula emphasizes high-value words, but again the positive and negative lists share words. This shared list of words indicates that we need to create a custom stop words list and run the experiments again.

# Problem 3

In [32]:
# Load the tab-separated NRC emotion lexicon as (Word, Affect, Indicator) rows.
# NOTE(review): skiprows=22 presumably skips a preamble at the top of the
# file — confirm against the actual NRC_Emotion.txt.
NRC_emotion = pd.read_csv("NRC_Emotion.txt", sep="\t", skiprows=22, names=["Word", "Affect", "Indicator"])
# Keep only the rows whose Indicator is 1
NRC_emotion = NRC_emotion[NRC_emotion.Indicator==1]

In [33]:
# One positive/negative word ratio per (unstemmed) review, used as the only feature below
train_ratios = [calc_pos_neg(review, NRC_emotion) for review in train]
test_ratios = [calc_pos_neg(review, NRC_emotion) for review in test]

In [34]:
# The ratios are reshaped into a single-column matrix, i.e. one feature per review.
# NOTE(review): with only one feature, MultinomialNB's smoothed per-class
# feature probability appears to be 1 for every class, so predictions would
# depend on the class priors alone — verify, and consider GaussianNB or
# separate positive/negative count features instead.
nb_clf = MultinomialNB()
nb_clf.fit(np.array(train_ratios).reshape(-1, 1), train_labels)
predictions = nb_clf.predict(np.array(test_ratios).reshape(-1, 1))
accuracy = accuracy_score(test_labels, predictions)
accuracy

0.5

In [39]:
# Number of distinct words left in the lexicon after the Indicator filter
len(set(NRC_emotion.Word.values))

6467

## Results Analysis
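This model had the worst performance at **50% accuracy**. Part of this is likely an artifact of the setup: with only a single input feature, `MultinomialNB` assigns that feature the same likelihood under both classes, so its predictions are driven by the class priors alone, which yields chance-level accuracy on a balanced test set. Beyond that, given that the NRC Emotions lexicon only has 6,467 words with a labeled affect, the vocabulary is too small to capture the sentiment of a movie review. Additionally, the word `scary` may have a negative connotation in general, but may be a positive attribute when reviewing a horror film. Even with a larger vocabulary or a more domain-specific NRC Emotions dataset, reducing all the information from a review down to a single ratio may not capture enough of the signal in the data.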

# Question 4
If I were to further iterate on this model, the first thing I would do is remove stop words using a custom stop word list. From looking at the significant words lists, we can see they include a lot of English stop words, like `the` or `and`, or words that have no value in a movie review due to their high frequency, like `movie`, `film`, `story`, or `character`. As I continue to iterate, I would add any words that appear in both the positive and negative lists to my custom stop words because they are clearly not indicative of sentiment.

When vectorizing for my models, I would also tune the `min_df` and `max_df` parameters. Given the assignment briefing, I only set them on tf-idf and used the defaults on all the `CountVectorizer` models. However, when playing with these parameters, I often saw a boost in performance on most models. As seen below, I was able to improve on my best performance of **87% accuracy**, reaching as high as **88% accuracy**, by tuning my hyperparameters:

In [45]:
# Problem 1 M1
# Binary unigram features, now with document-frequency bounds
vectorizer = CountVectorizer(binary=True, max_df=0.8, min_df=5)
# The vectorizer expects strings, so each token list is re-joined with spaces
train_features = vectorizer.fit_transform(map(' '.join, train))
test_features = vectorizer.transform(map(' '.join, test))
nb_clf = MultinomialNB().fit(train_features, train_labels)
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
accuracy

0.875

In [59]:
# Problem 1 M3
# Adjective/adverb frequency counts, now with document-frequency bounds
vectorizer = CountVectorizer(max_df=0.8, min_df=2)
# The vectorizer expects strings, so each token list is re-joined with spaces
train_features = vectorizer.fit_transform(map(' '.join, train_adj_adv))
test_features = vectorizer.transform(map(' '.join, test_adj_adv))
nb_clf = MultinomialNB().fit(train_features, train_labels)
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
accuracy

0.875

In [67]:
# Problem 2 M1
# Binary unigram features on stemmed text, now with document-frequency bounds
vectorizer = CountVectorizer(binary=True, max_df=0.8, min_df=4)
# The vectorizer expects strings, so each token list is re-joined with spaces
stem_train_features = vectorizer.fit_transform(map(' '.join, stem_train))
stem_test_features = vectorizer.transform(map(' '.join, stem_test))
nb_clf = MultinomialNB().fit(stem_train_features, stem_train_labels)
predictions = nb_clf.predict(stem_test_features)
accuracy = accuracy_score(stem_test_labels, predictions)
accuracy

0.88

Other experiments to try include normalizing frequency counts so review length does not have an impact, since review length is likely more indicative of the writer's style than of sentiment. I would also want to see the impact of lemmatization versus stemming or tokenization only. Finally, I would also want to experiment with pre-trained word vectors. I would expect a model incorporating word vectors to be the most performant given its ability to capture semantic meaning, which is highly informative for sentiment.