In [1]:
import pandas as pd
df = pd.read_csv("new_data.csv")

In [2]:
#Creating document term matrix

from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(max_df=0.9,
                                min_df=20,
                                stop_words='english')
tf_master = tf_vectorizer.fit_transform(df['text'])
print("Vocabulary Size:",tf_master.shape[1])

Vocabulary Size: 2967


In [3]:
#Creating 1/0 label for "sentiment"

def f(row):
    if row['stars'] == 5 or row['stars'] == 4:
        val = 1
    else:
        val = 0
    return val

df['labels'] = df.apply(f,axis=1)
labels = df['labels']

In [4]:
#Training test split

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split( 
tf_master, labels, test_size = 0.25, random_state = 95828)



In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Building an unpruned decision tree
clf1 = DecisionTreeClassifier(criterion="entropy",random_state=95825)
clf1.fit(X_train,y_train)
ypred_train = clf1.predict(X_train)
ypred_test = clf1.predict(X_test)
print("Training Accuracy: ",str(accuracy_score(y_train,ypred_train)))
print("Test Accuracy: ",str(accuracy_score(y_test,ypred_test)))

Training Accuracy:  1.0
Test Accuracy:  0.7612


In [6]:
import numpy as np
from sklearn.model_selection import cross_validate
num_folds = 5
fold_scores=[]
param_values = np.arange(5,90,5)
arg_max = None
max_cross_val_score = -np.inf
for C in param_values:
    nums=[]
    #print("C = " + str(C))
    clf = DecisionTreeClassifier(criterion="entropy",max_depth=C,random_state=95825)
    cv_results = cross_validate(clf, X_train, y_train,cv=num_folds ,return_train_score=False)
    nums.append(list(cv_results['test_score']))
    fold_scores.append(np.mean(nums))
    
    cross_val_score = np.mean(nums)
    if cross_val_score > max_cross_val_score:
            max_cross_val_score = cross_val_score
            arg_max = C


best_C = arg_max
print("Best C:",best_C)
print("Cross Validation Score:",max_cross_val_score)


Best C: 40
Cross Validation Score: 0.772928808531


In [7]:
#Using best hyperparameter to predict test accuracy

clf2 = DecisionTreeClassifier(criterion="entropy",max_depth=best_C,random_state=95825)
clf2.fit(X_train,y_train)
ypred_train = clf2.predict(X_train)
ypred_test = clf2.predict(X_test)
print("Training Accuracy: ",str(accuracy_score(y_train,ypred_train)))
print("Test Accuracy: ",str(accuracy_score(y_test,ypred_test)))

Training Accuracy:  0.9676
Test Accuracy:  0.7664


In [8]:
#Random Forest

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

clf3 = DecisionTreeClassifier(criterion="entropy")
num_trees = 100
m = int (0.2 *tf_master.shape[1])
model = BaggingClassifier(base_estimator=clf3, n_estimators=num_trees, max_features = m, random_state =95828 )
model = model.fit(X_train, y_train)

param_grid = [{'base_estimator__max_depth':np.arange(5,90,5)}]
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_score_)
print("best depth is : ",grid.best_params_)

0.869733333333
best depth is :  {'base_estimator__max_depth': 80}


In [9]:
#RF on test data with best depth

clf4 = DecisionTreeClassifier(criterion="entropy",max_depth=grid.best_params_)
num_trees = 100
m = int (0.2 *tf_master.shape[1])

model_new = BaggingClassifier(base_estimator=clf4, n_estimators=num_trees, max_features = m, random_state =95828 )
model_new = model.fit(X_train, y_train)
ypred_train = model_new.predict(X_train)
ypred_test = model_new.predict(X_test)
print("Training Accuracy: ",str(accuracy_score(y_train,ypred_train)))
print("Test Accuracy: ",str(accuracy_score(y_test,ypred_test)))


Training Accuracy:  0.9996
Test Accuracy:  0.8512


In [10]:
#Unregularized Logistic Regression

from sklearn.linear_model import LogisticRegression
LogReg = LogisticRegression(penalty='l1',C=1e10)
LogReg.fit(X_train, y_train)
ypred_train = LogReg.predict(X_train)
ypred_test = LogReg.predict(X_test)
print("Training Accuracy: ",str(accuracy_score(y_train,ypred_train)))
print("Test Accuracy: ",str(accuracy_score(y_test,ypred_test)))


Training Accuracy:  1.0
Test Accuracy:  0.8436


In [11]:
#Regularized Logisitc Regression

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
fold_scores=[]
num_folds = 5
k_fold = KFold(num_folds)
param_values = np.linspace(0.04,1,num=20)
arg_max = None
max_cross_val_score = -np.inf
for C in param_values:
    nums=[]
    
    print("C = " + str(C))
    clf = LogisticRegression(penalty='l1',C=C)        
    cv_results = cross_validate(clf, X_train, y_train,cv=num_folds ,return_train_score=False)
    nums.append(list(cv_results['test_score']))
    fold_scores.append(np.mean(nums))
    
    cross_val_score = np.mean(nums)
    if cross_val_score > max_cross_val_score:
            max_cross_val_score = cross_val_score
            arg_max = C


best_C = arg_max
print("Best C:",best_C)
print("Cross Validation Score:",max_cross_val_score)


C = 0.04
C = 0.0905263157895
C = 0.141052631579
C = 0.191578947368
C = 0.242105263158
C = 0.292631578947
C = 0.343157894737
C = 0.393684210526
C = 0.444210526316
C = 0.494736842105
C = 0.545263157895
C = 0.595789473684
C = 0.646315789474
C = 0.696842105263
C = 0.747368421053
C = 0.797894736842
C = 0.848421052632
C = 0.898947368421
C = 0.949473684211
C = 1.0
Best C: 0.545263157895
Cross Validation Score: 0.890668226964


In [12]:
LogReg_regularized = LogisticRegression(penalty='l1',C=best_C,)
LogReg_regularized.fit(X_train, y_train)
ypred_train = LogReg_regularized.predict(X_train)
ypred_test = LogReg_regularized.predict(X_test)
print("Training Accuracy: ",str(accuracy_score(y_train,ypred_train)))
print("Test Accuracy: ",str(accuracy_score(y_test,ypred_test)))


Training Accuracy:  0.957733333333
Test Accuracy:  0.8816


In [13]:
#SVM

from sklearn import svm

fold_scores=[]
num_folds = 5
k_fold = KFold(num_folds)
param_values = np.logspace(-4, 2, 10)
arg_max = None
max_cross_val_score = -np.inf
for C in param_values:
    nums=[]
    
    print("C = " + str(C))
    clf = svm.LinearSVC(C=C)
    cv_results = cross_validate(clf, X_train, y_train,cv=num_folds ,return_train_score=False)
    nums.append(list(cv_results['test_score']))
    fold_scores.append(np.mean(nums))
    
    cross_val_score = np.mean(nums)
    if cross_val_score > max_cross_val_score:
            max_cross_val_score = cross_val_score
            arg_max = C


best_C = arg_max
print("Best C:",best_C)
print("Cross Validation Score:",max_cross_val_score)


C = 0.0001
C = 0.000464158883361
C = 0.00215443469003
C = 0.01
C = 0.0464158883361
C = 0.215443469003
C = 1.0
C = 4.64158883361
C = 21.5443469003
C = 100.0
Best C: 0.01
Cross Validation Score: 0.897201829156


In [14]:
clf_SVM = svm.LinearSVC(C=C)
clf_SVM.fit(X_train, y_train)
ypred_train = clf_SVM.predict(X_train)
ypred_test = clf_SVM.predict(X_test)
print("Training Accuracy: ",str(accuracy_score(y_train,ypred_train)))
print("Test Accuracy: ",str(accuracy_score(y_test,ypred_test)))


Training Accuracy:  0.999333333333
Test Accuracy:  0.844


In [15]:
# KNN 

from sklearn.neighbors import KNeighborsClassifier

fold_scores=[]
num_folds = 5
k_fold = KFold(num_folds)
param_values = [1,3,5,7,9,11]
arg_max = None
max_cross_val_score = -np.inf
for C in param_values:
    nums=[]
    
    print("C = " + str(C))
    knn_model = KNeighborsClassifier(n_neighbors=C)
    cv_results = cross_validate(knn_model, X_train, y_train,cv=num_folds ,return_train_score=False)
    nums.append(list(cv_results['test_score']))
    fold_scores.append(np.mean(nums))
    
    cross_val_score = np.mean(nums)
    print(cross_val_score)
    if cross_val_score > max_cross_val_score:
            max_cross_val_score = cross_val_score
            arg_max = C


best_C = arg_max
print("Best C:",best_C)
print("Cross Validation Score:",max_cross_val_score)


C = 1
0.624129997391
C = 3
0.629466532741
C = 5
0.633327245331
C = 7
0.62772973268
C = 9
0.620528574102
C = 11
0.613860927227
Best C: 5
Cross Validation Score: 0.633327245331


In [16]:
clf_KNN = KNeighborsClassifier(n_neighbors=3)
clf_KNN.fit(X_train, y_train)
ypred_train = clf_KNN.predict(X_train)
ypred_test = clf_KNN.predict(X_test)
print("Training Accuracy: ",str(accuracy_score(y_train,ypred_train)))
print("Test Accuracy: ",str(accuracy_score(y_test,ypred_test)))


Training Accuracy:  0.7628
Test Accuracy:  0.6188
