In [202]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [244]:
df = pd.read_csv('dataset/critics.csv')
df = df[~df.quote.isnull()]
df.head()

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
1,Derek Adams,fresh,114709,Time Out,"So ingenious in concept, design and execution ...",2009-10-04,9559,Toy story
2,Richard Corliss,fresh,114709,TIME Magazine,The year's most inventive comedy.,2008-08-31,9559,Toy story
3,David Ansen,fresh,114709,Newsweek,A winning animated feature that has something ...,2008-08-18,9559,Toy story
4,Leonard Klady,fresh,114709,Variety,The film sports a provocative and appealing st...,2008-06-09,9559,Toy story
5,Jonathan Rosenbaum,fresh,114709,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10,9559,Toy story


In [245]:
n_quotes = len(df)
n_critics = df.rtid.unique().size
n_movie = df.critic.unique().size

print('Number of quotes: %d' %(n_quotes))
print('Number of critics: %d' %(n_critics))
print('Number of movies: %d' %(n_movie))

Number of quotes: 15561
Number of critics: 1921
Number of movies: 623


In [246]:
y = (df.fresh == 'fresh').values.astype(np.int64)

#### Test Bag of Word function in Python

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
sample_text = ['dota too', 'bad dota', 'stupid too']
print('Original Text is:\n', '\n'.join(sample_text))

vectorizer = CountVectorizer(min_df = 0)
vec = vectorizer.fit_transform(sample_text)
vec = vec.toarray()

print('')
print('transformed text vector is: \n', vec)

print('')
print('feature names are: \n', vectorizer.get_feature_names())

Original Text is:
 dota too
bad dota
stupid too

transformed text vector is: 
 [[0 1 0 1]
 [1 1 0 0]
 [0 0 1 1]]

feature names are: 
 ['bad', 'dota', 'stupid', 'too']


In [64]:
def make_bow(df, min_score):
    vectorizer = CountVectorizer(min_df = min_score)
    X = vectorizer.fit_transform(df.quote)
    X = X.toarray()
    y = (df.fresh == 'fresh').values.astype(np.int64)
    
    return X,y    

In [95]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

X, y = make_bow(df, 0)
xtrain, xtest, ytrain, ytest = train_test_split(X, y, train_size = 0.8, random_state = 66)

clf = MultinomialNB()
clf.fit(xtrain, ytrain)

print('training accuracy: %0.2f%%' %(clf.score(xtrain, ytrain)*100))
print('training accuracy: %0.2f%%' %(clf.score(xtest, ytest)*100))

training accuracy: 91.87%
training accuracy: 78.00%


In [259]:
from sklearn.model_selection import KFold

def log_likelihood(clf, x, y):
    prob = clf.predict_log_proba(x)
    rotten = y == 0
    fresh = ~rotten
    
    return prob[rotten, 0].sum() + prob[fresh, 1].sum()

def cv_optimize(clf, x, y, score, n_fold):
    
    cv_score = 0
    kf = KFold(n_splits=n_fold)
    for train, valid in kf.split(x):
        clf.fit(x[train], y[train])
        cv_score += score(clf, x[valid], y[valid])
        
    return cv_score / n_fold
    

In [134]:
train, test = train_test_split(range(df.shape[0]), train_size = 0.8, random_state = 66)
mask = np.zeros(df.shape[0])
mask[train] = 1
mask = mask == 1

In [264]:
alpha = [0, 1, 5, 10, 50]
min_dfs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

max_score = -np.inf
best_alpha = 0
best_mindf = 0

for a in alpha:
    for mindf in min_dfs:
        
        vectorize = CountVectorizer(min_df = mindf)
        X = vectorize.fit_transform(df.quote)
        X = X.toarray()
        Y = y
        
        xtrain = X[mask]
        ytrain = Y[mask]
        
        clf = MultinomialNB(alpha = a)

        cvscore = cv_optimize(clf, xtrain, ytrain, log_likelihood, 5)
        
        if cvscore > max_score:
            max_score = cvscore
            best_alpha, best_mindf = a, mindf
            

  self.feature_log_prob_ = (np.log(smoothed_fc) -


In [266]:
cvscore, best_alpha, best_mindf

(-1651.5514694824374, 5, 0.001)