In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection

In [2]:
raw = pd.read_csv("data/imdb.csv")

In [3]:
raw.head(5)

Unnamed: 0,text,sentiment
0,What a waste of talent -- although it appears ...,0
1,No matter how you feel about Michael Jackson h...,1
2,Contrary to what many may believe as this movi...,1
3,10/10 for this film.<br /><br />i'm a british ...,1
4,"In theory, 'Director's Commentary' should have...",0


## TFIDF + MNB (using movie review data as trainning data)

In [4]:
# split data to training set and testing set
def review_series_to_list(review_series):
    review_list=[]
    n_review = len(review_series)
    for i in range(0,n_review):
        review_list.append(review_series[i])
    return review_list  

train_review_list = review_series_to_list(raw['text'])

X_train, X_test, y_train, y_test = train_test_split(
    train_review_list, raw['sentiment'], test_size=0.33, random_state=42)

In [5]:
nb_model = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

nb_fit = nb_model.fit(X_train, y_train)

In [6]:
nb_model_bow = Pipeline([('vect', CountVectorizer()),
                    
                     ('clf', MultinomialNB()),
])

nb_fit_bow = nb_model_bow.fit(X_train, y_train)

In [7]:
nb_predicted2 = nb_model_bow.predict(X_test)
nb_accuracy2 = np.mean(nb_predicted2 == y_test) 

In [8]:
###Prediction and evaluation
nb_predicted = nb_model.predict(X_test)
nb_accuracy = np.mean(nb_predicted == y_test) 
print (nb_accuracy)
print(metrics.classification_report(y_test, nb_predicted,
    target_names=['Rating < 5(0)','Rating >=7(1)']))
metrics.confusion_matrix(y_test, nb_predicted)


0.859515151515
               precision    recall  f1-score   support

Rating < 5(0)       0.83      0.89      0.86      8171
Rating >=7(1)       0.89      0.83      0.86      8329

  avg / total       0.86      0.86      0.86     16500



array([[7299,  872],
       [1446, 6883]])

## TFIDF + logistics (using movie review data as trainning data)

In [9]:
from sklearn.linear_model import LogisticRegression as LR
lr_model = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LR()),
])

lr_fit = lr_model.fit(X_train, y_train)


In [10]:
lr_predicted = lr_model.predict(X_test)
lr_accuracy = np.mean(lr_predicted == y_test) 
print (lr_accuracy)
print(metrics.classification_report(y_test, lr_predicted,
    target_names=['Rating < 5(0)','Rating >=7(1)']))
metrics.confusion_matrix(y_test, lr_predicted)

0.892484848485
               precision    recall  f1-score   support

Rating < 5(0)       0.90      0.89      0.89      8171
Rating >=7(1)       0.89      0.90      0.89      8329

  avg / total       0.89      0.89      0.89     16500



array([[7234,  937],
       [ 837, 7492]])

In [11]:
lrl2_model = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LR(penalty = 'l2', dual = True, random_state = 0)),
])

lrl2_fit = lrl2_model.fit(X_train, y_train)


In [12]:
lrl2_predicted = lrl2_model.predict(X_test)
lrl2_accuracy = np.mean(lrl2_predicted == y_test) 
print (lrl2_accuracy)
print(metrics.classification_report(y_test, lr_predicted,
    target_names=['Rating < 5(0)','Rating >=7(1)']))
metrics.confusion_matrix(y_test, lr_predicted)

0.892484848485
               precision    recall  f1-score   support

Rating < 5(0)       0.90      0.89      0.89      8171
Rating >=7(1)       0.89      0.90      0.89      8329

  avg / total       0.89      0.89      0.89     16500



array([[7234,  937],
       [ 837, 7492]])

In [13]:
scores5 = model_selection.cross_val_score(lrl2_fit, X_train, y_train, cv=5)
scores5   

array([ 0.89031488,  0.89238806,  0.88865672,  0.8880597 ,  0.89386476])

### Grid search 

In [14]:
grid_values = {'C':[30]}

In [15]:
from sklearn.grid_search import GridSearchCV
model_LR = GridSearchCV(LR(penalty = 'l2', dual = True, random_state = 0), 
                        grid_values, scoring = 'roc_auc', cv = 20) 



## BOF + MNB

In [16]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", raw_review) #revew_text 
    
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(meaningful_words ))   

In [17]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords # Import the stop word list
clean_review = review_to_words(raw["text"][0] )



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [18]:
# Get the number of reviews based on the dataframe column size
num_reviews = raw["text"].size

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in range( 0, num_reviews ):
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_train_reviews.append( review_to_words( raw["text"][i] ) )



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [19]:
clean_train_reviews = []
for i in range( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print (  "Review %d of %d\n" % ( i+1, num_reviews ) )                                                                   
    clean_train_reviews.append( review_to_words( raw["text"][i] ))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Review 1000 of 50000

Review 2000 of 50000

Review 3000 of 50000

Review 4000 of 50000

Review 5000 of 50000

Review 6000 of 50000

Review 7000 of 50000

Review 8000 of 50000

Review 9000 of 50000

Review 10000 of 50000

Review 11000 of 50000

Review 12000 of 50000

Review 13000 of 50000

Review 14000 of 50000

Review 15000 of 50000

Review 16000 of 50000

Review 17000 of 50000

Review 18000 of 50000

Review 19000 of 50000

Review 20000 of 50000

Review 21000 of 50000

Review 22000 of 50000

Review 23000 of 50000

Review 24000 of 50000

Review 25000 of 50000

Review 26000 of 50000

Review 27000 of 50000

Review 28000 of 50000

Review 29000 of 50000

Review 30000 of 50000

Review 31000 of 50000

Review 32000 of 50000

Review 33000 of 50000

Review 34000 of 50000

Review 35000 of 50000

Review 36000 of 50000

Review 37000 of 50000

Review 38000 of 50000

Review 39000 of 50000

Review 40000 of 50000

Review 41000 of 50000

Review 42000 of 50000

Review 43000 of 50000

Review 44000 of 5000

In [20]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

In [21]:
train_data_features = vectorizer.fit_transform(clean_train_reviews)

In [22]:
train_data_features = train_data_features.toarray()

In [23]:
vocab = vectorizer.get_feature_names()

In [24]:
train_data_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [27]:
from sklearn.model_selection import cross_val_score
NB_bow = MultinomialNB()
NB_bow_fit = NB_bow.fit( train_data_features, raw["sentiment"] )
scores5 = cross_val_score(NB_bow_fit, train_data_features, raw["sentiment"], cv=5)
scores5   

array([ 0.8449,  0.8504,  0.8439,  0.8566,  0.8498])

In [28]:
scores10 = cross_val_score(NB_bow_fit, train_data_features, raw["sentiment"], cv=10)
scores10

array([ 0.8438,  0.847 ,  0.854 ,  0.8464,  0.8486,  0.8444,  0.8528,
        0.8614,  0.8518,  0.8466])

In [29]:
print("Accuracy 5cv : %0.2f (+/- %0.2f)" % (scores5.mean(), scores5.std() * 2))
print("Accuracy 10cv : %0.2f (+/- %0.2f)" % (scores10.mean(), scores10.std() * 2))

Accuracy 5cv : 0.85 (+/- 0.01)
Accuracy 10cv : 0.85 (+/- 0.01)


## BOF + Logistics

In [30]:
LR_bow_fit = LR()
LR_bow_fit = LR_bow_fit.fit( train_data_features, raw["sentiment"] )
scores5 = cross_val_score(LR_bow_fit, train_data_features, raw["sentiment"], cv=5)
scores5   

array([ 0.8723,  0.8733,  0.8755,  0.8735,  0.8674])

In [31]:
scores10 = cross_val_score(LR_bow_fit, train_data_features, raw["sentiment"], cv=10)
scores10

array([ 0.87  ,  0.8764,  0.8766,  0.8736,  0.8752,  0.8842,  0.8758,
        0.8816,  0.8782,  0.866 ])

In [32]:
print("Accuracy 5cv : %0.2f (+/- %0.2f)" % (scores5.mean(), scores5.std() * 2))
print("Accuracy 10cv : %0.2f (+/- %0.2f)" % (scores10.mean(), scores10.std() * 2))

Accuracy 5cv : 0.87 (+/- 0.01)
Accuracy 10cv : 0.88 (+/- 0.01)
