# Movie reviews - Final Assignment Iteration #4
### Heew Kim | Nov 2020

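Fourth iteration of Final Assignment 1 on the movie review dataset. This iteration combines modified feature extraction (hashed unigrams and bigrams with TF-IDF weighting, plus word-count and period-count features) with an SVM model whose C parameter has been tuned. 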

In [1]:
# all imports and magic commands
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from my_measures import BinaryClassificationPerformance
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

### IMPORTANT!!! Make sure you are using `BinaryClassificationPerformance` v1.02

In [2]:
# Print the class documentation so the reported version string can be checked against v1.02
help(BinaryClassificationPerformance)

Help on class BinaryClassificationPerformance in module my_measures:

class BinaryClassificationPerformance(builtins.object)
 |  BinaryClassificationPerformance(predictions, labels, desc, probabilities=None)
 |  
 |  Performance measures to evaluate the fit of a binary classification model, v1.02
 |  
 |  Methods defined here:
 |  
 |  __init__(self, predictions, labels, desc, probabilities=None)
 |      Initialize attributes: predictions-vector of predicted values for Y, labels-vector of labels for Y
 |  
 |  compute_measures(self)
 |      Compute performance measures defined by Flach p. 57
 |  
 |  img_indices(self)
 |      Get the indices of true and false positives to be able to locate the corresponding images in a list of image names
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the obj

## MODIFIED feature building and extraction functions 

In [3]:
# function that takes raw data and completes all preprocessing required before model fits
def process_raw_data_modified(fn, my_random_seed, test=False):
    """Read a tab-separated movie-review file and turn it into a scaled sparse feature matrix.

    Processing steps, in order:
      1. HashingVectorizer over the `review` column (2**16 features, unigrams and bigrams).
      2. TfidfTransformer on the hashed counts.
      3. Two quantitative columns appended: `word_count` and `punc_count` (number of periods).
      4. StandardScaler(with_mean=False) on the combined sparse matrix.

    In training mode (`test=False`) each fitted transformer is appended to the
    module-level list `fitted_transformations`, in the order
    [HashingVectorizer, TfidfTransformer, StandardScaler]. In test mode
    (`test=True`) the objects at indices 0, 1 and 2 of that list are reused, so
    the list must have been emptied and then filled by a training call first.

    Parameters
    ----------
    fn : str
        Path of the tab-separated input file. It must contain a `review` column,
        and a `sentiment` column when `test` is False.
    my_random_seed : int
        Passed as `random_state` to `train_test_split`; not used when `test` is True.
    test : bool, default False
        False fits the transformers and returns a train/test split;
        True only applies the stored transformers.

    Returns
    -------
    tuple
        `test=False`: (X_train, X_test, y_train, y_test, X_raw_train, X_raw_test),
        split with `test_size=0.2`.
        `test=True`: (movie_data, X_submission_test), where `movie_data` is the
        loaded frame including the two added count columns.
    """
    # read and summarize data
    movie_data = pd.read_csv(fn, sep='\t')
    print("movie_data is:", type(movie_data))
    print("movie_data has", movie_data.shape[0], "rows and", movie_data.shape[1], "columns", "\n")
    print("the data types for each of the columns in movie_data:")
    print(movie_data.dtypes, "\n")
    # label corrected to match the five rows that head(5) actually prints
    print("the first 5 rows in movie_data:")
    print(movie_data.head(5))
    if not test:
        print("The rate of 'good' movie reviews in the dataset: ")
        print(movie_data['sentiment'].mean())

    # vectorize Bag of Words from review text as a sparse matrix, using uni- and bigrams.
    # No stop_words argument is passed, so stop words are kept.
    if not test:  # fit_transform()
        hv = HashingVectorizer(n_features=2 ** 16, ngram_range=(1, 2), alternate_sign=False)
        X_hv = hv.fit_transform(movie_data.review)
        fitted_transformations.append(hv)
    else:  # transform() with the vectorizer stored by the training call
        X_hv = fitted_transformations[0].transform(movie_data.review)
    print("Shape of HashingVectorizer X:")
    print(X_hv.shape)

    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
    if not test:
        transformer = TfidfTransformer()
        X_tfidf = transformer.fit_transform(X_hv)
        fitted_transformations.append(transformer)
    else:
        X_tfidf = fitted_transformations[1].transform(X_hv)

    # create additional quantitative features from the review text
    movie_data['word_count'] = movie_data['review'].str.split(' ').str.len()
    # raw string: the pattern is a regular expression for a literal period, and a
    # non-raw "\." is an invalid escape sequence that newer Python versions warn about
    movie_data['punc_count'] = movie_data['review'].str.count(r"\.")

    X_quant_features = movie_data[["word_count", "punc_count"]]
    print("Look at a few rows of the new quantitative features: ")
    print(X_quant_features.head(10))

    # Combine the TF-IDF matrix and the quantitative features into a single sparse matrix
    X_quant_features_csr = csr_matrix(X_quant_features)
    X_combined = hstack([X_tfidf, X_quant_features_csr])
    X_matrix = csr_matrix(X_combined)  # convert to sparse matrix
    print("Size of combined bag of words and new quantitative variables matrix:")
    print(X_matrix.shape)

    # Create `X`, scaled matrix of features
    # with_mean=False: scale only, no centering, as configured for this sparse input
    if not test:
        sc = StandardScaler(with_mean=False)
        X = sc.fit_transform(X_matrix)
        fitted_transformations.append(sc)
        print(X.shape)
        y = movie_data['sentiment']
    else:
        X = fitted_transformations[2].transform(X_matrix)
        print(X.shape)

    # Submission data is returned whole; labelled data is split into training and test sets
    if test:
        X_submission_test = X
        print("Shape of X_test for submission:")
        print(X_submission_test.shape)
        print('SUCCESS!')
        return (movie_data, X_submission_test)

    # the raw frame is split alongside X and y so rows can be traced back to their reviews
    X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = train_test_split(
        X, y, movie_data, test_size=0.2, random_state=my_random_seed)
    print("Shape of X_train and X_test:")
    print(X_train.shape)
    print(X_test.shape)
    print("Shape of y_train and y_test:")
    print(y_train.shape)
    print(y_test.shape)
    print("Shape of X_raw_train and X_raw_test:")
    print(X_raw_train.shape)
    print(X_raw_test.shape)
    print('SUCCESS!')
    return (X_train, X_test, y_train, y_test, X_raw_train, X_raw_test)

## MODIFIED training and test sets from function

In [36]:
# Global registry of fitted feature transformers: the training call below appends to it
# (fit_transform) and a later test-mode call reads from it (transform).
# Re-assigning an empty list here gives every run of this cell a clean registry.
fitted_transformations = []

# CHANGE FILE PATH and my_random_seed number (any integer other than 74 will do):
(X_train, X_test,
 y_train, y_test,
 X_raw_train, X_raw_test) = process_raw_data_modified(
    fn='/Users/heewoong.kim/Documents/GitHub/ml/ProjectDataset/moviereviews_train.tsv',
    my_random_seed=99,
)

# report how many fitted objects the call stored
print("Number of fits stored in `fitted_transformations` list: ", len(fitted_transformations), sep="\n")

movie_data is: <class 'pandas.core.frame.DataFrame'>
movie_data has 25000 rows and 3 columns 

the data types for each of the columns in movie_data:
id           object
sentiment     int64
review       object
dtype: object 

the first 10 rows in movie_data:
       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...
The rate of 'good' movie reviews in the dataset: 
0.5
Shape of HashingVectorizer X:
(25000, 65536)
Look at a few rows of the new quantitative features: 
   word_count  punc_count
0         433          20
1         158          16
2         378          20
3         379           8
4         367           9
5

### MODEL: Support-Vector Machine (SVM) linear model with C 8e-6

In [37]:
from sklearn.svm import LinearSVC 

# Linear SVM with hinge loss and C=8e-6.
# random_state is fixed because the hinge-loss (dual) solver uses pseudo-random
# shuffling; without a seed the scores below, and the voting ensemble that refits
# this estimator, can change from run to run. The value 1 matches the seed already
# used for the random forest in this notebook; any fixed integer works.
m1 = LinearSVC(loss='hinge', C=8e-6, random_state=1)
m1.fit(X_train, y_train)
# train set performance
m1_train = BinaryClassificationPerformance(m1.predict(X_train), y_train, 'svm-tr')
m1_train.compute_measures()
# test set performance
m1_test = BinaryClassificationPerformance(m1.predict(X_test), y_test, 'svm-tt')
m1_test.compute_measures()

print(m1_train.performance_measures)
print(m1_test.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 9836, 'TN': 9859, 'FP': 153, 'FN': 152, 'Accuracy': 0.98475, 'Precision': 0.9846831514666132, 'Recall': 0.9847817380857028, 'desc': 'svm-tr'}
{'Pos': 2512, 'Neg': 2488, 'TP': 2236, 'TN': 2170, 'FP': 318, 'FN': 276, 'Accuracy': 0.8812, 'Precision': 0.87548942834769, 'Recall': 0.8901273885350318, 'desc': 'svm-tt'}


### MODEL: Ridge Regression Model alpha 9.5e+4

In [38]:
from sklearn import linear_model

# Ridge classifier with regularisation strength alpha = 9.5e+4
# (per the original note, the value comes from previous model tuning)
m2 = linear_model.RidgeClassifier(alpha=9.5e+4).fit(X_train, y_train)

# wrap the predictions for the training and held-out splits, then compute both sets of measures
m2_train = BinaryClassificationPerformance(m2.predict(X_train), y_train, 'rgm-tr')
m2_test = BinaryClassificationPerformance(m2.predict(X_test), y_test, 'rgm-tt')
m2_train.compute_measures()
m2_test.compute_measures()

# training measures first, test measures second, one dictionary per line
print(m2_train.performance_measures, m2_test.performance_measures, sep="\n")

{'Pos': 9988, 'Neg': 10012, 'TP': 9965, 'TN': 10002, 'FP': 10, 'FN': 23, 'Accuracy': 0.99835, 'Precision': 0.9989974937343359, 'Recall': 0.9976972366840208, 'desc': 'rgm-tr'}
{'Pos': 2512, 'Neg': 2488, 'TP': 2243, 'TN': 2170, 'FP': 318, 'FN': 269, 'Accuracy': 0.8826, 'Precision': 0.8758297540023429, 'Recall': 0.8929140127388535, 'desc': 'rgm-tt'}


### MODEL: Naive Bayes 

In [39]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings
m3 = MultinomialNB().fit(X_train, y_train)

# training-split measures
m3_train = BinaryClassificationPerformance(m3.predict(X_train), y_train, 'nbs-tr')
m3_train.compute_measures()

# held-out-split measures
m3_test = BinaryClassificationPerformance(m3.predict(X_test), y_test, 'nbs-tt')
m3_test.compute_measures()

# training measures first, test measures second, one dictionary per line
print(m3_train.performance_measures, m3_test.performance_measures, sep="\n")


{'Pos': 9988, 'Neg': 10012, 'TP': 9659, 'TN': 9685, 'FP': 327, 'FN': 329, 'Accuracy': 0.9672, 'Precision': 0.9672541558181454, 'Recall': 0.9670604725670805, 'desc': 'nbs-tr'}
{'Pos': 2512, 'Neg': 2488, 'TP': 2146, 'TN': 2085, 'FP': 403, 'FN': 366, 'Accuracy': 0.8462, 'Precision': 0.8418987838367987, 'Recall': 0.8542993630573248, 'desc': 'nbs-tt'}


### MODEL: Perceptron

In [40]:
from sklearn import linear_model

# Perceptron trained by stochastic gradient descent.
# SGDClassifier shuffles the training data by default, so random_state is fixed to
# make the fit, and therefore the printed measures, repeatable across runs.
# The value 1 matches the seed already used for the random forest in this notebook.
m4 = linear_model.SGDClassifier(loss='perceptron', random_state=1)
m4.fit(X_train, y_train)

# train set performance
m4_train = BinaryClassificationPerformance(m4.predict(X_train), y_train, 'prc-tr')
m4_train.compute_measures()
print(m4_train.performance_measures)

# test set performance
m4_test = BinaryClassificationPerformance(m4.predict(X_test), y_test, 'prc-tt')
m4_test.compute_measures()
print(m4_test.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 9988, 'TN': 10012, 'FP': 0, 'FN': 0, 'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'desc': 'prc-tr'}
{'Pos': 2512, 'Neg': 2488, 'TP': 2103, 'TN': 2089, 'FP': 399, 'FN': 409, 'Accuracy': 0.8384, 'Precision': 0.8405275779376499, 'Recall': 0.8371815286624203, 'desc': 'prc-tt'}


### MODEL: Random Forest Classifier

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 500 trees, each limited to depth 10, seeded for repeatable fits
m5 = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    random_state=1,
).fit(X_train, y_train)

# training-split measures
m5_train = BinaryClassificationPerformance(m5.predict(X_train), y_train, 'rdf_tr')
m5_train.compute_measures()

# held-out-split measures
m5_test = BinaryClassificationPerformance(m5.predict(X_test), y_test, 'rdf_tt')
m5_test.compute_measures()

# training measures first, test measures second, one dictionary per line
print(m5_train.performance_measures, m5_test.performance_measures, sep="\n")

{'Pos': 9988, 'Neg': 10012, 'TP': 9371, 'TN': 8646, 'FP': 1366, 'FN': 617, 'Accuracy': 0.90085, 'Precision': 0.8727763807394989, 'Recall': 0.9382258710452543, 'desc': 'rdf_tr'}
{'Pos': 2512, 'Neg': 2488, 'TP': 2195, 'TN': 1994, 'FP': 494, 'FN': 317, 'Accuracy': 0.8378, 'Precision': 0.8162885831164002, 'Recall': 0.8738057324840764, 'desc': 'rdf_tt'}


### MODEL: logistic regression

In [42]:
from sklearn import linear_model

# Logistic regression trained by stochastic gradient descent.
# SGDClassifier shuffles the training data by default, so random_state is fixed to
# make the fit, and therefore the printed measures, repeatable across runs.
# The value 1 matches the seed already used for the random forest in this notebook.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' -- confirm
# against the installed version before upgrading.
m6 = linear_model.SGDClassifier(loss='log', random_state=1)
m6.fit(X_train, y_train)

# train set performance
m6_train = BinaryClassificationPerformance(m6.predict(X_train), y_train, 'log-tr')
m6_train.compute_measures()
print(m6_train.performance_measures)

# test set performance
m6_test = BinaryClassificationPerformance(m6.predict(X_test), y_test, 'log-tt')
m6_test.compute_measures()
print(m6_test.performance_measures)

{'Pos': 9988, 'Neg': 10012, 'TP': 9988, 'TN': 10012, 'FP': 0, 'FN': 0, 'Accuracy': 1.0, 'Precision': 1.0, 'Recall': 1.0, 'desc': 'log-tr'}
{'Pos': 2512, 'Neg': 2488, 'TP': 2088, 'TN': 2068, 'FP': 420, 'FN': 424, 'Accuracy': 0.8312, 'Precision': 0.8325358851674641, 'Recall': 0.8312101910828026, 'desc': 'log-tt'}


## Voting Ensemble using m1 (SVM) and m2 (ridge); m3-m6 are currently disabled

In [45]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs of the sub-models that vote; the commented entries are switched off
estimators = [
    ('svm', m1),
    ('rgm', m2),
    # ('nbs', m3),
    # ('prc', m4),
    # ('rdf', m5),
    # ('log', m6),
]

# create and fit the ensemble model.
# VotingClassifier defaults to hard (majority) voting; with only two voters a
# disagreement is a tie, which scikit-learn resolves toward the lower class label.
ensemble = VotingClassifier(estimators).fit(X_train, y_train)

# training-split measures
ensemble_train = BinaryClassificationPerformance(ensemble.predict(X_train), y_train, 'esb-tr')
ensemble_train.compute_measures()

# held-out-split measures
ensemble_test = BinaryClassificationPerformance(ensemble.predict(X_test), y_test, 'esb-tt')
ensemble_test.compute_measures()

# training measures first, test measures second, one dictionary per line
print(ensemble_train.performance_measures, ensemble_test.performance_measures, sep="\n")


{'Pos': 9988, 'Neg': 10012, 'TP': 9836, 'TN': 10002, 'FP': 10, 'FN': 152, 'Accuracy': 0.9919, 'Precision': 0.9989843591306115, 'Recall': 0.9847817380857028, 'desc': 'esb-tr'}
{'Pos': 2512, 'Neg': 2488, 'TP': 2226, 'TN': 2196, 'FP': 292, 'FN': 286, 'Accuracy': 0.8844, 'Precision': 0.8840349483717236, 'Recall': 0.8861464968152867, 'desc': 'esb-tt'}


---

# <span style="color:red">SUBMISSION</span>

---

In [46]:
# read in test data for submission; test=True reuses the transformers stored in
# `fitted_transformations` by the training call instead of fitting new ones
# CHANGE FILE PATH and my_random_seed number (any integer other than 74 will do): 
raw_data, X_test_submission = process_raw_data_modified(fn='/Users/heewoong.kim/Documents/GitHub/ml/ProjectDataset/moviereviews_test.tsv', my_random_seed=99, test=True)
print("Number of rows in the submission test set (should be 25,000): ")
# the label above was previously printed without the value it announces
print(X_test_submission.shape[0])

movie_data is: <class 'pandas.core.frame.DataFrame'>
movie_data has 25000 rows and 2 columns 

the data types for each of the columns in movie_data:
id        object
review    object
dtype: object 

the first 10 rows in movie_data:
         id                                             review
0  12311_10  Naturally in a film who's main themes are of m...
1    8348_2  This movie is a disaster within a disaster fil...
2    5828_4  All in all, this is a movie for kids. We saw i...
3    7186_2  Afraid of the Dark left me with the impression...
4   12128_7  A very accurate depiction of small time mob li...
Shape of HashingVectorizer X:
(25000, 65536)
Look at a few rows of the new quantitative features: 
   word_count  punc_count
0         131           5
1         169          15
2         176          18
3         112           5
4         133           8
5         331          20
6         121          18
7         230          22
8          59           3
9         224          14
Size 

In [47]:
# start from the id column of the raw submission data and attach the ensemble's
# predicted label for each row as a `prediction` column
my_submission = raw_data["id"].to_frame().assign(
    prediction=ensemble.predict(X_test_submission)
)
# look at the proportion of positive predictions
positive_rate = my_submission['prediction'].mean()
print(positive_rate)

0.48776


In [48]:
# Preview the submission data frame, including the added word_count and punc_count columns
raw_data.head()

Unnamed: 0,id,review,word_count,punc_count
0,12311_10,Naturally in a film who's main themes are of m...,131,5
1,8348_2,This movie is a disaster within a disaster fil...,169,15
2,5828_4,"All in all, this is a movie for kids. We saw i...",176,18
3,7186_2,Afraid of the Dark left me with the impression...,112,5
4,12128_7,A very accurate depiction of small time mob li...,133,8


In [49]:
# Preview the id / prediction pairs that make up the submission
my_submission.head()

Unnamed: 0,id,prediction
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,1


In [50]:
# Check the submission frame's dimensions as (rows, columns)
my_submission.shape

(25000, 2)

In [51]:
# write the submission frame to a CSV file, leaving out the row index
# CHANGE FILE PATH: 
my_submission.to_csv(
    path_or_buf='/Users/heewoong.kim/Documents/GitHub/ml/final_assignment_1/moviereviews_submission4.csv',
    index=False,
)

# Submit to Canvas: 1) the CSV file that was written in the previous cell and 2) the url to the repository (GitHub or other) that contains your code and documentation