In [89]:
# installing appropiate libraries and packages

# %pip install pandas
# %pip install numpy
# %pip install rouge-score
# %pip install matplotlib
# %pip install scikit-learn
# %pip install torch

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rouge_score import rouge_scorer

import csv
import itertools as it
import numpy as np
import sklearn.decomposition
from tqdm import tqdm
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [2]:
# loading full dataset in
movies = pd.read_csv('rotten_tomatoes_movies.csv')
reviews = pd.read_csv('rotten_tomatoes_critic_reviews.csv')

In [3]:
# dropping movies that don't have ground truth
filteredMovies = movies.dropna(subset = "critics_consensus")

In [4]:
# parameter: how many movies to conduct the analysis for (CHANGE THIS)
to_keep = 5

# furtherFilteredMovies = filteredMovies[:to_keep]
furtherFilteredMovies= filteredMovies.sample(n = to_keep)
furtherFilteredMovies

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
7533,m/harvard_beats_yale_29_29,Harvard Beats Yale 29-29,"In November 1968, undefeated Ivy League footba...",Harvard Beats Yale 29-29 is compelling viewing...,PG,"Documentary, Sports & Fitness",Kevin Rafferty,,"Tommy Lee Jones, Brian Dowling, Vic Gatto, Fra...",2008-09-05,...,Emerging Pictures,Fresh,92.0,38.0,Upright,76.0,553.0,19,35,3
11853,m/pioneer_2013,Pioneer,The ultimate adventure becomes a nightmare whe...,Pioneer boasts strong acting and a throwback c...,R,"Drama, Mystery & Suspense",Erik Skjoldbjaerg,"Nikolaj Frobenius, Hans Gunnarsson, Cathinka N...","Wes Bentley, Stephen Lang, Aksel Hennie, Steph...",2014-12-05,...,Magnolia Pictures,Rotten,55.0,44.0,Spilled,33.0,423.0,18,24,20
6983,m/ghostbusters_2,Ghostbusters 2,After saving New York City from a ghost attack...,"Thanks to the cast, Ghostbusters 2 is reasonab...",PG,"Comedy, Science Fiction & Fantasy",Ivan Reitman,"Harold Ramis, Dan Aykroyd","Bill Murray, Dan Aykroyd, Sigourney Weaver, Ha...",1989-06-16,...,Sony Pictures Home Entertainment,Rotten,54.0,39.0,Upright,61.0,406948.0,7,21,18
10459,m/miss_congeniality_2,Miss Congeniality 2 - Armed and Fabulous,Gracie Hart (Sandra Bullock) was never thrille...,Sandra Bullock is still as appealing as ever; ...,PG-13,Comedy,John Pasquin,"Marc Lawrence (II), Marc Lawrence","Sandra Bullock, Regina King, William Shatner, ...",2005-03-23,...,Warner Bros.,Rotten,15.0,147.0,Spilled,43.0,447190.0,36,22,125
2839,m/antonias_line,Antonia (Antonia's Line),"After World War II, Antonia (Willeke van Ammel...","Magical and morbid, Antonia picturesque landsc...",R,"Art House & International, Comedy, Drama, Romance",Marleen Gorris,Marleen Gorris,"Willeke van Ammelrooy, Els Dottermans, Jan Dec...",1995-09-12,...,BMG,Fresh,67.0,49.0,Upright,91.0,6224.0,21,33,16


In [5]:
movies_to_keep = furtherFilteredMovies[['rotten_tomatoes_link']]

# data table for each critic review that has ground truth
combined_data = movies_to_keep.merge(reviews, how = 'left')

In [6]:
combined_data.head()

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/harvard_beats_yale_29_29,John Anderson,True,Variety,Fresh,,2008-09-15,How many thrillers could put the outcome in th...
1,m/harvard_beats_yale_29_29,,True,New York Magazine/Vulture,Fresh,,2008-11-17,"This touching, exciting film works less as a c..."
2,m/harvard_beats_yale_29_29,Paul Brenner,False,Filmcritic.com,Fresh,4/5,2008-11-18,Ideas were flying around like bullets. And so ...
3,m/harvard_beats_yale_29_29,Manohla Dargis,True,New York Times,Fresh,4.5/5,2008-11-19,Kevin Rafferty makes the case for remembrance ...
4,m/harvard_beats_yale_29_29,Kyle Smith,True,New York Post,Rotten,2/4,2008-11-19,"The movie, which absurdly tries to paint the H..."


In [7]:
# dropping data that don't have review values
filtered_combined_data = combined_data.dropna(subset = "review_content")

In [8]:
import numpy as np
import re
import string
import random

def show_similar_words(tokenizer, reps, tokens):
    reps = reps / (np.sqrt((reps ** 2).sum(axis=1, keepdims=True)))
    #for i, (word, token) in enumerate(tokenizer.word_to_token.items()):
    for token in tokens:
        word = tokenizer.token_to_word[token]
        rep = reps[token, :]
        sims = ((reps - rep) ** 2).sum(axis=1)
        nearest = np.argsort(sims)
        print(word, token)
        for j in nearest[1:6]:
            print(" ", tokenizer.token_to_word[j], "%.3f" % sims[j])

class Tokenizer:
  def __init__(self, min_occur=10):
    self.word_to_token = {}
    self.token_to_word = {}
    self.word_count = {}

    self.word_to_token['<unk>'] = 0
    self.token_to_word[0] = '<unk>'
    self.vocab_size = 1

    self.min_occur = min_occur

  def fit(self, corpus):
    for review in corpus:
      review = review.strip().lower()
      words = re.findall(r"[\w']+|[.,!?;]", review)
      for word in words:
          if word not in self.word_count:
              self.word_count[word] = 0
          self.word_count[word] += 1

    for review in corpus:
      review = review.strip().lower()
      words = re.findall(r"[\w']+|[.,!?;]", review)
      for word in words:
        if self.word_count[word] < self.min_occur:
          continue
        if word in self.word_to_token:
          continue
        self.word_to_token[word] = self.vocab_size
        self.token_to_word[self.vocab_size] = word
        self.vocab_size += 1

  def tokenize(self, corpus):
    tokenized_corpus = []
    for review in corpus:
      review = review.strip().lower()
      words = re.findall(r"[\w']+|[.,!?;]", review)
      tokenized_review = []
      for word in words:
        if word not in self.word_to_token:
          tokenized_review.append(0)
        else:
          tokenized_review.append(self.word_to_token[word])
      tokenized_corpus.append(tokenized_review)
    return tokenized_corpus

  def de_tokenize(self, tokenized_corpus):
    corpus = []
    for tokenized_review in tokenized_corpus:
      review = []
      for token in tokenized_review:
        review.append(self.token_to_word[token])
      corpus.append(" ".join(review))
    return corpus


class CountVectorizer:
  def __init__(self, min_occur=10):
    self.tokenizer = Tokenizer(min_occur)

  def fit(self, corpus):
    self.tokenizer.fit(corpus)

  def transform(self, corpus):
    n = len(corpus)
    X = np.zeros((n, self.tokenizer.vocab_size))
    for i, review in enumerate(corpus):
      review = review.strip().lower()
      words = re.findall(r"[\w']+|[.,!?;]", review)
      for word in words:
        if word not in self.tokenizer.word_count or self.tokenizer.word_count[word] < self.tokenizer.min_occur:
          X[i][0] += 1
        else:
          X[i][self.tokenizer.word_to_token[word]] += 1
    return X

def get_ngrams(tokenized_corpus, window_size, pad_idx=2006):
    ngrams = []
    for i, review in enumerate(tokenized_corpus):
        for j, word in enumerate(review):
            min_ind = max(0, j-window_size)
            max_ind = min(len(review), j+window_size+1)
            ctx = np.zeros(2 * window_size, dtype=np.int64) + pad_idx
            for ik, k in enumerate(range(min_ind, j)):
                ctx[ik] = review[k]
            for ik, k in enumerate(range(j+1, max_ind)):
                ctx[window_size+ik] = review[k]
            ngrams.append((ctx, review[j]))
    return ngrams

In [9]:
percent_train = 0.7
num_train = int(percent_train*to_keep)

# TODO add validation data
training_movies, testing_movies = furtherFilteredMovies[:num_train], furtherFilteredMovies[num_train:]

training_movies['movie_title']

reviews = combined_data["review_content"]
rev_type = combined_data["review_type"]
labels = rev_type.apply(lambda x: 1 if x == "Fresh" else 0)

In [10]:
combined_data.dropna(subset = ["review_content", 'review_type'], inplace=True)

train_data = combined_data[combined_data['rotten_tomatoes_link'].isin(training_movies['rotten_tomatoes_link'])]
test_data = combined_data[~combined_data['rotten_tomatoes_link'].isin(training_movies['rotten_tomatoes_link'])]

train_reviews, train_labels = train_data["review_content"], train_data["review_type"].apply(lambda x: 1 if x == "Fresh" else 0)
test_reviews, test_labels = test_data["review_content"], test_data["review_type"].apply(lambda x: 1 if x == "Fresh" else 0)

In [11]:
vectorizer = CountVectorizer()
vectorizer.fit(train_reviews)
train_bow_matrix = vectorizer.transform(train_reviews)
test_bow_matrix = vectorizer.transform(test_reviews)
print(f"BoW matrix is {train_bow_matrix.shape[0]} x {train_bow_matrix.shape[1]}")

BoW matrix is 105 x 31


In [12]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
model = gnb.fit(train_bow_matrix, train_labels)

test_pred = model.predict(test_bow_matrix)

In [13]:
train_dataset = pd.DataFrame(train_reviews).reset_index().merge(pd.DataFrame(train_labels).reset_index())
test_dataset = pd.DataFrame(test_reviews).reset_index().merge(pd.DataFrame(test_labels).reset_index())
train_dataset

Unnamed: 0,index,review_content,review_type
0,0,How many thrillers could put the outcome in th...,1
1,1,"This touching, exciting film works less as a c...",1
2,2,Ideas were flying around like bullets. And so ...,1
3,3,Kevin Rafferty makes the case for remembrance ...,1
4,4,"The movie, which absurdly tries to paint the H...",0
...,...,...,...
100,116,An almost uniformly substandard follow-up that...,0
101,117,"The sequel is, on the whole, a fairly mechanic...",0
102,118,A rather underwhelming sequel...,1
103,119,Murray's in his prime here as one of four fear...,1


In [14]:
s1 = "A sentence"
s2 = "A sentence. Another sentence."

def sentence_splitter(s):
    return [i for i in s.split(".") if i != ""]

print(sentence_splitter(s1))
print(sentence_splitter(s2))

['A sentence']
['A sentence', ' Another sentence']


In [15]:
train_dataset["sentences"] = train_dataset["review_content"].apply(sentence_splitter)
train_dataset = train_dataset.explode("sentences")
test_dataset["sentences"] = test_dataset["review_content"].apply(sentence_splitter)
test_dataset = test_dataset.explode("sentences")

In [105]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torch_data

class Word2VecModel(nn.Module):
    # A torch module implementing a word2vec predictor. The `forward` function
    # should take a batch of context word ids as input and predict the word 
    # in the middle of the context as output, as in the CBOW model from lecture.
    # Hint: look at how padding is handled in lab_util.get_ngrams when
    # initializing `ctx`: vocab_size is used as the padding token for contexts
    # near the beginning and end of sequences. If you use an embedding module
    # in your Word2Vec implementation, make sure to account for this extra
    # padding token in the input dimension and include the `padding_idx` kwarg.

    def __init__(self, vocab_size, embedding_size, padding_idx=467):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size+1, embedding_size, padding_idx=padding_idx)
        self.linear = nn.Linear(embedding_size, vocab_size)
        self.softmax = nn.Softmax()
        
    def forward(self, context):
        # Context is an `n_batch x n_context` matrix of integer word ids.
        # In this case, n_context = 2 * window_size where window_size is defined
        # in lab_util.py. This is because each word has both left and right context.
        # This function should return an `n_batch x vocab_size` matrix with
        # element i, j being the (possibly log) probability of the middle word
        # in context i being word j.

        return self.softmax(self.linear(torch.mean(self.embeddings(context), axis=1)))

In [106]:
def learn_reps_word2vec(corpus, window_size, rep_size, n_epochs, n_batch):
    # This method takes in a corpus of training sentences. It returns a matrix of
    # word embeddings with the same structure as used in the previous section of 
    # the assignment. (You can extract this matrix from the parameters of the 
    # Word2VecModel.)

    tokenizer = Tokenizer()
    tokenizer.fit(corpus)
    tokenized_corpus = tokenizer.tokenize(corpus)

    ngrams = get_ngrams(tokenized_corpus, window_size, pad_idx=467)

    # device = torch.device('cuda')  # run on colab gpu
    model = Word2VecModel(tokenizer.vocab_size, rep_size)
    opt = optim.Adam(model.parameters(), lr=0.001)

    loader = torch_data.DataLoader(ngrams, batch_size=n_batch, shuffle=True)

    # What loss function should we use for Word2Vec?
    loss_fn = torch.nn.CrossEntropyLoss()  # Your code here!

    losses = []  # Potentially useful for debugging (loss should go down!)
    for epoch in tqdm(range(n_epochs)):
        epoch_loss = 0
        for context, label in loader:
            # As described above, `context` is a batch of context word ids, and
            # `label` is a batch of predicted word labels.

            # Here, perform a forward pass to compute predictions for the model.
            # Your code here!
            context = context
            label = label
            preds = model.forward(context)


            # Now finish the backward pass and gradient update.
            # Remember, you need to compute the loss, zero the gradients
            # of the model parameters, perform the backward pass, and
            # update the model parameters.
            # Your code here!
            loss = loss_fn(preds, label)
            model.zero_grad()
            loss.backward()
            opt.step()


            epoch_loss += loss.item()
        losses.append(epoch_loss)

    # Hint: you want to return a `vocab_size x embedding_size` numpy array
    embedding_matrix = model.linear.weight.cpu().detach().numpy()  # Your code here!

    return embedding_matrix

In [107]:
reps_word2vec = learn_reps_word2vec(train_dataset["sentences"], 2, 300, 10, 100)

AssertionError: Padding_idx must be within num_embeddings

In [16]:
def train_model(featurizer, xs, ys):
    xs_featurized = featurizer(xs)
    model = sklearn.linear_model.LogisticRegression(penalty='none', max_iter=1000)
    model.fit(xs_featurized, ys)
    return model

def eval_model(model, featurizer, xs, ys):
    xs_featurized = featurizer(xs)
    pred_ys = model.predict(xs_featurized)
    return np.mean(pred_ys == ys)

def training_experiment(name, featurizer, n_train):
    print(f"{name} features, {n_train} examples")
    train_xs = vectorizer.transform(train_reviews[:n_train])
    train_ys = train_labels[:n_train]
    test_xs = vectorizer.transform(test_reviews)
    test_ys = test_labels
    model = train_model(featurizer, train_xs, train_ys)
    acc = eval_model(model, featurizer, test_xs, test_ys)
    print(acc, '\n')
    return acc

def w2v_featurizer(xs):
    # This function takes in a matrix in which each row contains the word counts
    # for the given review. It should return a matrix in which each row contains
    # the average Word2Vec embedding of each review (hint: this will be very
    # similar to `lsa_featurizer` from above, just using Word2Vec embeddings 
    # instead of LSA).

    feats = np.matmul(xs, reps_word2vec) # Your code here!

    # normalize
    return feats / np.sqrt((feats ** 2).sum(axis=1, keepdims=True))

In [17]:
def sentence_mean(sentence):
    sentence_list = sentence.split()
    cur_total = None
    count = 0
    for word in sentence_list:
        try:
            word_vec = wv[word]
            if cur_total is None:
                cur_total = word_vec
            else:
                cur_total = np.copy(cur_total) + word_vec
            count += 1
        except KeyError:
            pass
    if cur_total is None:
        return wv['</s>']
    return cur_total/count

In [18]:
train_dataset["sentence_vec"] = train_dataset["sentences"].apply(sentence_mean)
test_dataset["sentence_vec"] = test_dataset["sentences"].apply(sentence_mean)
train_dataset

Unnamed: 0,index,review_content,review_type,sentences,sentence_vec
0,0,How many thrillers could put the outcome in th...,1,How many thrillers could put the outcome in th...,"[0.025644403, 0.06503778, 0.026081687, 0.11363..."
1,1,"This touching, exciting film works less as a c...",1,"This touching, exciting film works less as a c...","[-0.008740594, 0.03962977, 0.023433909, 0.0952..."
2,2,Ideas were flying around like bullets. And so ...,1,Ideas were flying around like bullets,"[0.018697103, 0.098836266, 0.05561574, 0.14388..."
2,2,Ideas were flying around like bullets. And so ...,1,And so were the footballs,"[-0.033984374, 0.06347656, 0.048077393, 0.0577..."
3,3,Kevin Rafferty makes the case for remembrance ...,1,Kevin Rafferty makes the case for remembrance ...,"[0.04748535, 0.06387939, 0.030846024, 0.098077..."
...,...,...,...,...,...
100,116,An almost uniformly substandard follow-up that...,0,An almost uniformly substandard follow-up that...,"[0.02564538, 0.031082816, 0.046438467, 0.05816..."
101,117,"The sequel is, on the whole, a fairly mechanic...",0,"The sequel is, on the whole, a fairly mechanic...","[0.0792923, 0.05810547, -0.012290107, 0.079050..."
102,118,A rather underwhelming sequel...,1,A rather underwhelming sequel,"[0.11645508, 0.0501709, 0.034423828, 0.1708374..."
103,119,Murray's in his prime here as one of four fear...,1,Murray's in his prime here as one of four fear...,"[0.062149048, 0.018037276, 0.06344223, 0.07019..."


In [21]:
train_data["sentences"] = train_data["review_content"].apply(sentence_splitter)
train_data = train_data.explode("sentences")
test_data["sentences"] = test_data["review_content"].apply(sentence_splitter)
test_data = test_data.explode("sentences")

In [22]:
train_data["sentence_vec"] = train_data["sentences"].apply(sentence_mean)
test_data["sentence_vec"] = test_data["sentences"].apply(sentence_mean)
train_data

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content,sentences,sentence_vec
0,m/harvard_beats_yale_29_29,John Anderson,True,Variety,Fresh,,2008-09-15,How many thrillers could put the outcome in th...,How many thrillers could put the outcome in th...,"[0.025644403, 0.06503778, 0.026081687, 0.11363..."
1,m/harvard_beats_yale_29_29,,True,New York Magazine/Vulture,Fresh,,2008-11-17,"This touching, exciting film works less as a c...","This touching, exciting film works less as a c...","[-0.008740594, 0.03962977, 0.023433909, 0.0952..."
2,m/harvard_beats_yale_29_29,Paul Brenner,False,Filmcritic.com,Fresh,4/5,2008-11-18,Ideas were flying around like bullets. And so ...,Ideas were flying around like bullets,"[0.018697103, 0.098836266, 0.05561574, 0.14388..."
2,m/harvard_beats_yale_29_29,Paul Brenner,False,Filmcritic.com,Fresh,4/5,2008-11-18,Ideas were flying around like bullets. And so ...,And so were the footballs,"[-0.033984374, 0.06347656, 0.048077393, 0.0577..."
2,m/harvard_beats_yale_29_29,Paul Brenner,False,Filmcritic.com,Fresh,4/5,2008-11-18,Ideas were flying around like bullets. And so ...,Ideas were flying around like bullets,"[0.018697103, 0.098836266, 0.05561574, 0.14388..."
...,...,...,...,...,...,...,...,...,...,...
116,m/ghostbusters_2,Tim Brayton,False,Antagony & Ecstasy,Rotten,5/10,2011-05-30,An almost uniformly substandard follow-up that...,An almost uniformly substandard follow-up that...,"[0.02564538, 0.031082816, 0.046438467, 0.05816..."
117,m/ghostbusters_2,Kathleen Carroll,True,New York Daily News,Rotten,,2015-06-15,"The sequel is, on the whole, a fairly mechanic...","The sequel is, on the whole, a fairly mechanic...","[0.0792923, 0.05810547, -0.012290107, 0.079050..."
118,m/ghostbusters_2,David Nusair,False,Reel Film Reviews,Fresh,2.5/4,2016-07-19,A rather underwhelming sequel...,A rather underwhelming sequel,"[0.11645508, 0.0501709, 0.034423828, 0.1708374..."
119,m/ghostbusters_2,Colette DeDonato,False,Common Sense Media,Fresh,4/5,2017-06-20,Murray's in his prime here as one of four fear...,Murray's in his prime here as one of four fear...,"[0.062149048, 0.018037276, 0.06344223, 0.07019..."


In [23]:
from numpy import dot
from numpy.linalg import norm

n_rows = train_data.shape[0]
graph_matrix = np.zeros((n_rows, n_rows))
for i in tqdm(range(n_rows)):
    for j in range(i+1, n_rows):
        vec1, vec2 = train_data["sentence_vec"].iloc[i], train_data["sentence_vec"].iloc[j]
        cos_sim = dot(vec1, vec2)/(norm(vec1)*norm(vec2))
        if cos_sim > 0 and cos_sim <= 0.5:
            graph_matrix[i][j] = cos_sim
            graph_matrix[j][i] = cos_sim

100%|██████████| 427/427 [00:03<00:00, 113.71it/s]


In [26]:
n = n_rows
def win(matrix, m, o):
    k = 0
    for i in range(0, n):
        if(matrix[i][m] != 0):
            k = k+matrix[i][m]
    l = 0
    for i in range(0, n):
        if(matrix[o][i] != 0):
            for j in range(0, n):
                if(matrix[j][i] != 0):
                    l = l+matrix[j][i]
    return float(k/l)
  
  
def wout(matrix, m, o):
    k = 0
    for i in range(0, n):
        if(matrix[0][i] != 0):
            k = k+matrix[0][i]
    l = 0
    for i in range(0, n):
        if(matrix[o][i] != 0):
            for j in range(0, n):
                if(matrix[i][j] != 0):
                    l = l+matrix[i][j]
    return float(k/l)
  
  
def graphrank(matrix, o, n, p):
    a = 0
    for i in range(0, n):
        if(matrix[i][o] != 0):
            k = 0
            for s in range(0, n):
                if(matrix[i][s] != 0):
                    k = k+matrix[i][s]
            a = a+float((p[i]/k)*win(matrix, i, o)*wout(matrix, i, o))
    return a

In [28]:
d = 0.85
o = 10
p = []
  
for i in tqdm(range(0, n)):
    p.append(1)
for k in tqdm(range(0, o)):
    for u in tqdm(range(0, n)):
        g = graphrank(graph_matrix, u, n, p)
        p[u] = (1-d)+d*g
for i in tqdm(range(0, n)):
    print("Page rank of node ", i+1, "is : ", p[i])

100%|██████████| 427/427 [00:00<00:00, 212476.90it/s]
  0%|          | 0/10 [00:00<?, ?it/s]