In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the labeled training reviews from a tab-separated file into a DataFrame.
df = pd.read_csv('./train_set/labeledTrainData.tsv', sep='\t')

In [3]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words count vectorizer, created with its default settings.
vectorizer = CountVectorizer()

In [5]:
from sklearn.model_selection import train_test_split
# Hold out a test split for evaluation. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], random_state=42
)

In [18]:
# Fit the vectorizer on the training reviews and encode them in one step.
features = vectorizer.fit_transform(X_train)
# Encode the test reviews with the already-fitted vectorizer (no refitting).
features_test = vectorizer.transform(X_test)

In [19]:
# Grab ten vocabulary terms to label the columns of the preview below.
# Newer scikit-learn releases expose get_feature_names_out() and dropped
# get_feature_names(); try the new name first and fall back to the old one.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()
words = list(feature_names[45000:45010])

In [20]:
# Show the counts of the selected vocabulary terms for training rows 10-16.
pd.DataFrame(
    features[10:17, 45000:45010].toarray(),
    columns=words,
)

Unnamed: 0,plutonium,ply,plying,plymouth,plympton,plywood,plz,pm,pmrc,pms
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0


In [22]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training count vectors
# (fit returns the estimator itself, so construction and fitting are chained).
model1 = MultinomialNB().fit(features, y_train)
# Per-class probability estimates for every test review.
pred1 = model1.predict_proba(features_test)

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def get_metrics(predictions, y_true=None, threshold=0.5):
    """Format accuracy and ROC AUC for binary class-probability predictions.

    Parameters
    ----------
    predictions : 2-D array
        Per-class probabilities as returned by ``predict_proba``; column 1 is
        used as the score of the positive class.
    y_true : array-like, optional
        True labels. When omitted, the notebook-level ``y_test`` is used, which
        keeps the original single-argument call working unchanged.
    threshold : float, optional
        A score strictly greater than this value counts as a positive
        prediction when computing accuracy (default 0.5).

    Returns
    -------
    str
        A multi-line report with accuracy and AUC, each shown to 4 decimals.
    """
    if y_true is None:
        # Backward-compatible fallback to the held-out labels defined in the notebook.
        y_true = y_test
    prob_positive = predictions[:, 1]
    acc = accuracy_score(y_true, prob_positive > threshold)
    # AUC is computed from the raw scores, so it does not depend on the threshold.
    auc = roc_auc_score(y_true, prob_positive)

    return """
    Accuracy: {:6.4f}
    AUC:      {:6.4f}
    """.format(acc, auc)

# Call form of print: same output on Python 2 for a single argument, and valid on Python 3.
print(get_metrics(pred1))


    Accuracy: 0.8483
    AUC:      0.9183
    


In [25]:
# A hand-written example review used to spot-check the trained model.
review = """I was going to say something awesome, but I simply can't
     because the movie is so bad."""

In [27]:
# Predicted class label for the example review (call form of print works on Python 2 and 3).
print(model1.predict(vectorizer.transform([review])))

[0]


In [32]:
# Per-class probabilities for the example review (call form of print works on Python 2 and 3).
print(model1.predict_proba(vectorizer.transform([review])))

[[0.91383954 0.08616046]]


In [37]:
class MyMoviePredictor:
    """Bundle a vectorizer and a classifier behind a raw-text prediction API.

    "Hard" predictions are whatever ``model.predict`` returns (class labels);
    "soft" predictions are column 1 of ``model.predict_proba``, i.e. the
    probability of the review being positive.
    """

    def __init__(self, model, vectorizer):
        """Store the collaborators.

        model: object exposing ``predict`` and ``predict_proba``.
        vectorizer: object exposing ``transform`` for a list of review texts.
        """
        self.model = model
        self.vectorizer = vectorizer

    def _transform_single_review(self, review_text):
        """Vectorize one review, passed to ``transform`` as a one-element list."""
        return self.vectorizer.transform([review_text])

    def _transform_multiple_reviews(self, list_of_reviews):
        """Vectorize a list of reviews in a single ``transform`` call."""
        return self.vectorizer.transform(list_of_reviews)

    def make_single_hard_prediction(self, review_text):
        """
        Returns the output of ``model.predict`` for the single review
        (one predicted label, still wrapped in the model's return container)
        """
        return self.model.predict(self._transform_single_review(review_text))

    def make_single_soft_prediction(self, review_text):
        """
        Returns a single number:
          the probability of the review being positive
        """
        return self.model.predict_proba(self._transform_single_review(review_text))[0, 1]

    def make_multiple_hard_predictions(self, list_of_reviews):
        """
        Returns one predicted label per review, in input order.

        This previously returned positive-class probabilities despite its
        name; use make_multiple_soft_predictions for those.
        """
        return self.model.predict(self._transform_multiple_reviews(list_of_reviews))

    def make_multiple_soft_predictions(self, list_of_reviews):
        """
        Returns one number per review, in input order:
          the probability of that review being positive
        """
        return self.model.predict_proba(self._transform_multiple_reviews(list_of_reviews))[:, 1]
        

In [38]:
# Wrap the trained classifier and the fitted vectorizer in one predictor object.
movie_predictor = MyMoviePredictor(model1, vectorizer)

In [39]:
# Positive-class probability for the example review, obtained through the wrapper.
movie_predictor.make_single_soft_prediction(review)

0.08616046265364664

In [41]:
try:
    import cPickle as pickle  # Python 2: C-accelerated pickle implementation
except ImportError:
    import pickle  # Python 3: cPickle no longer exists as a separate module

In [42]:
# Persist the combined predictor. The context manager closes the file even if
# pickling raises, so the handle is never leaked.
# NOTE(review): MyMoviePredictor is defined in this notebook; whatever loads this
# pickle presumably needs the same class to be importable - confirm in the consumer.
with open('./train_set/model.pkl', 'wb') as model_file:
    pickle.dump(movie_predictor, model_file)

In [44]:
# Also persist the two components separately; each file is closed deterministically
# by its context manager instead of relying on garbage collection.
with open('./train_set/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open('./train_set/naivebayes.pkl', 'wb') as classifier_file:
    pickle.dump(model1, classifier_file)