# Noonum Sentiment Analysis

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA, TruncatedSVD
#import nltk
#from nltk.corpus import wordnet as wn
#from nltk.corpus import stopwords
#from nltk.corpus.reader.wordnet import WordNetError
#stopset = set(stopwords.words('english'))

## Process Data

Two datasets were used in this project. 

<b> 1.IMDB Dataset (65.9 MB)</b> 
<br>IMDB Dataset consists of 50,000 IMDB movie reviews, specially selected for sentiment analysis. The sentiment of reviews is binary, meaning the IMDB rating < 5 results in a sentiment score of 0, and rating >=7 have a sentiment score of 1. 25,000 of them are labeled with sentiment score, therefore used as training corpus. </br>

<b> 2.Twitter Data (157 MB)</b>
<br>Twitter Data is based on data from two sources:  </br>
<br>1.University of Michigan Sentiment Analysis competition on Kaggle (https://inclass.kaggle.com/c/si650winter11)</br>
<br>2.Twitter Sentiment Corpus by Niek Sanders (http://www.sananalytics.com/lab/twitter-sentiment/) </br>
<br>This dataset contains 1,578,627 classified tweets, each row is marked as 1 for positive sentiment and 0 for negative sentiment. 

In [2]:
# IMDB Movie Review Dataset
IMDB = pd.read_csv("labeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)
# Twitter Dataset
twitter = pd.read_csv('all.csv', error_bad_lines=False)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


In [3]:
def review_series_to_list(review_series):
    review_list=[]
    n_review = len(review_series)
    for i in range(0,n_review):
        review_list.append(review_series[i])
    return review_list  

In [4]:
tw_review_list = review_series_to_list(twitter['SentimentText'])
tw_X_train, tw_X_test, tw_y_train, tw_y_test = train_test_split(
    tw_review_list, twitter['Sentiment'], test_size=0.33, random_state=42)

train_review_list = review_series_to_list(IMDB['review'])
X_train, X_test, y_train, y_test = train_test_split(
    train_review_list, IMDB['sentiment'], test_size=0.33, random_state=42)

## Metrics

### 1.Accuracy
### 2.f1 score
### 3.ROC_AUC

## Model Selection 

1. Logistics Regression
2. Naive Bayes 
3. SVM

###  Since IMDB is a smaller dataset, we will use this dataset to test which combination has the highest accuracy. We will just use default settings here. 

In [5]:
'''
train = {'IMDB','twitter'}
test = {'Twitter'}   tw_y_test
'''
training_model = ['LR','NB','SVM']
vectorizer = ['BOW','TFIDF']

def model_training(training_model,vectorizer):
    if training_model == 'LR':
        print ("Training Logistics model...")
        if vectorizer == 'BOW':
            print ("Using Bag of Words...")
            model_LR_BOW = Pipeline([('vect', CountVectorizer()),
                     ('clf', LR()),])
            return model_LR_BOW
           
        elif vectorizer == 'TFIDF':
            print ("Using TFIDF...")
            model_LR_TFIDF = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LR()),])
            return model_LR_TFIDF
            
    elif training_model == 'NB':
        print ("Training Naive Bayes model...")
        if vectorizer == 'BOW':
            print ("Using Bag of Words...")
            model_NB_BOW = Pipeline([('vect', CountVectorizer()),
                     ('clf', MultinomialNB()),])
            return model_NB_BOW
                        
        elif vectorizer == 'TFIDF':
            print ("Using TFIDF...")
            model_NB_TFIDF = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])
            return model_NB_TFIDF
    
    elif training_model == 'SVM':
        print ("Training SVM model...")
        if vectorizer == 'BOW':
            print ("Using Bag of Words...")
            model_SVM_BOW = Pipeline([('vect', CountVectorizer()),
                     ('clf', SGDClassifier()),])
            return model_SVM_BOW
                        
        elif vectorizer == 'TFIDF':
            print ("Using TFIDF...")
            model_SVM_TFIDF = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier()),])
            return model_SVM_TFIDF
                    
def model_fitting_in_different_dataset(train,test):
    if train == 'IMDB' and test == 'Twitter':       
        print ('Training model on IMDB data and testing on Twitter data...') 
        for i in training_model:
            for j in vectorizer:
                print (i,j)
                fit_model1 = model_training(i,j).fit(X_train, y_train)
                predicted1 = fit_model1.predict(tw_X_test)
                accuracy1 = np.mean(predicted1 == tw_y_test)
                print (accuracy1)
            
    elif train == 'Twitter' and test == 'Twitter':
        print ('Training model on Twitter data and testing on Twitter data...')
        for i in training_model:
            for j in vectorizer:
                print (i,j)
                fit_model2 = model_training(i,j).fit(tw_X_train, tw_y_train)
                predicted2 = fit_model2.predict(tw_X_test)
                accuracy2 = np.mean(predicted2 == tw_y_test)
                print (accuracy2)
                
       
    elif train == 'IMDB' and test == 'IMDB':
        print ('Training model on IMDB data and testing on IMDB data...')
        for i in training_model:
            for j in vectorizer:
                print (i,j)
                fit_model3 = model_training(i,j).fit(X_train, y_train)
                #cv5 = cross_val_score(fit_model3, X_train, y_train, cv=5)
                predicted3 = fit_model3.predict(X_test)
                accuracy3 = np.mean(predicted3 == y_test)
                scores10 = cross_val_score(fit_model3, X_train, y_train, cv=10)
                print (accuracy3)
                print("Accuracy 10cv : %0.2f (+/- %0.2f)" % (scores10.mean(), scores10.std() * 2))
                #print ("5 fold cross validation for training data: ",cv5)
                
    
    

model_fitting_in_different_dataset('IMDB','IMDB')
#print (cross_val_score(fit_model3, train, test, cv=5) )

# Can also use this to test the model combination on Twitter dataset, but take longer time.
# model_fitting_in_different_dataset('Twitter','Twitter')


Training model on IMDB data and testing on IMDB data...
LR BOW
Training Logistics model...
Using Bag of Words...
0.88496969697
Accuracy 10cv : 0.88 (+/- 0.01)
LR TFIDF
Training Logistics model...
Using TFIDF...
0.889818181818
Accuracy 10cv : 0.88 (+/- 0.01)
NB BOW
Training Naive Bayes model...
Using Bag of Words...
0.854545454545
Accuracy 10cv : 0.85 (+/- 0.03)
NB TFIDF
Training Naive Bayes model...
Using TFIDF...
0.864848484848
Accuracy 10cv : 0.86 (+/- 0.03)
SVM BOW
Training SVM model...
Using Bag of Words...
0.841333333333
Accuracy 10cv : 0.83 (+/- 0.07)
SVM TFIDF
Training SVM model...
Using TFIDF...
0.895757575758
Accuracy 10cv : 0.89 (+/- 0.01)


In [36]:
model = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LR()),])
model =model.fit(X_train, y_train)
cross_val_score(model, X_train, y_train, cv=10)

array([ 0.87529833,  0.8872315 ,  0.88424821,  0.87708831,  0.88961814,
        0.87335723,  0.88590203,  0.88351254,  0.87275986,  0.87037037])

Conclusion: In IMDB dataset, we can find out the TFIDF works better than BOW. LR and SVM works better than Naive Bayes model in default parameter setting. Therefore, we can try to use grid search to find the best parameters.

### Fine-tune LR + TFIDF

In [14]:
# Tuning parameters
def logistics_model_withGS(train_X, train_y,val_X,val_y):# may take several hours to run
    LR_withGS = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         #Dimensionality reduction using truncated SVD
                         ('tsvd', TruncatedSVD()),
                         ('clf', LR()),
                         ])

    parameters = {'vect__ngram_range': [(1, 1),(1,2)],
                  #'vect__stop_words': (None, 'english'),
                  'vect__lowercase': (True, False),
                  #'tfidf__use_idf': (True, False),
                  #'tsvd__n_components': (1100, ),
                  #'clf__penalty': ('l1', 'l2'),
                  #'clf__C': [0.1, 1]
                  }
    print ('Start grid search' )
    gs_clf = GridSearchCV(LR_withGS, parameters,scores = 'accuracy')
    fit_gs_clf = gs_clf.fit(train_X, train_y)
    pred_gs_clf = fit_gs_clf.predict(val_X)
    accuracy = 
    
    print (gs_clf)
    gs_clf.fit(data, target)
    return gs_clf.best_params_ 

In [None]:
def logistics_model_default(train_X, train_y,val_X,val_y):
    LR_default = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         #Dimensionality reduction using truncated SVD
                         #('tsvd', TruncatedSVD()),
                         ('clf', LR()),
                         ])
    LR_default = LR_default.fit(data,target)
    pre_LR_default = 
    print (gs_clf)
    gs_clf.fit(data, target)
    return gs_clf.best_params_ 

In [10]:
logistics_model_withGS(X_train, y_train)

Start grid search
GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'vect__lowercase': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngr

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'vect__lowercase': (True, False)},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

## Testing

In [5]:
# nb model train IMDB test TFIDF
nb_model = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

nb_fit = nb_model.fit(X_train, y_train)
tw_nb_predicted = nb_model.predict(X_test)
tw_nb_accuracy = np.mean(tw_nb_predicted == y_test) 
print (tw_nb_accuracy)

0.864848484848


In [24]:
# remove stopwords and add ngram range 
nb_model = Pipeline([('vect', TfidfVectorizer(stop_words=stopset,ngram_range = (1,3))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

nb_fit = nb_model.fit(X_train, y_train)
tw_nb_predicted = nb_model.predict(X_test)
tw_nb_accuracy = np.mean(tw_nb_predicted == y_test) 
print (tw_nb_accuracy)

0.886666666667
