In [151]:
import numpy as np
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer # copied over from ipynb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import nltk
from nltk.corpus import stopwords # tried to use this vocab but ended up with lower accuracy than the built in stopwords
nltk.download('stopwords')

SEED = 12345

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliewang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [152]:
from matplotlib import pyplot as plt

import seaborn as sns
#This sets the default style for all figures. 
sns.set('notebook', font_scale=1.25, style='whitegrid')

In [153]:
# load the csv files
if __name__ == '__main__':
    data_dir = '../data_reviews'
    x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
    y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

    N, n_cols = x_train_df.shape
    print("Shape of x_train_df: (%d, %d)" % (N,n_cols))
    print("Shape of y_train_df: %s" % str(y_train_df.shape))

    # Print out the first five rows and last five rows
    tr_text_list = x_train_df['text'].values.tolist()
    rows = np.arange(0, 5)
    for row_id in rows:
        text = tr_text_list[row_id]
        print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id,0], text))

    print("...")
    rows = np.arange(N - 5, N)
    for row_id in rows:
        text = tr_text_list[row_id]
        print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id,0], text))

Shape of x_train_df: (2400, 2)
Shape of y_train_df: (2400, 1)
row     0 | y = 0 | Oh and I forgot to also mention the weird color effect it has on your phone.
row     1 | y = 0 | THAT one didn't work either.
row     2 | y = 0 | Waste of 13 bucks.
row     3 | y = 0 | Product is useless, since it does not have enough charging current to charge the 2 cellphones I was planning to use it with.
row     4 | y = 0 | None of the three sizes they sent with the headset would stay in my ears.
...
row  2395 | y = 1 | The sweet potato fries were very good and seasoned well.
row  2396 | y = 1 | I could eat their bruschetta all day it is devine.
row  2397 | y = 1 | Ambience is perfect.
row  2398 | y = 1 | We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.
row  2399 | y = 1 | Service was good and the company was better!


# Problem 2:

In [154]:
# Splitting up training data into training set and validation set
x_train, x_va, y_train, y_va = train_test_split(x_train_df, y_train_df, test_size=0.2, random_state=SEED)
y_train = y_train.values.ravel()
y_va = y_va.values.ravel()

In [155]:
# define logistic regression pipeline function
def make_rfc_pipeline():
    pipeline = Pipeline(
        steps=[
         ('vectorizer', TfidfVectorizer(
             lowercase=True, # make the text uniformly lowercase
             stop_words='english', # remove filler words ('a', 'the', etc.) present in the stopwords nltk library
             analyzer='word', # breakdown text into words for feature analysis
             min_df = 0.05, 
             max_df=0.70, # ignore words with a frequency strictly higher than 50%
             token_pattern=r'\b\w+\b', # removes punctuation and numbers
             ngram_range=(1,2) # extracts unigrams and bigrams
             )
            ),
         ('knn', KNeighborsClassifier()),
        ])
    return pipeline

In [156]:
# initialize pipeline
pipe = make_rfc_pipeline()

# make hyperparameter C grid (regularization strength), 20 logspaced values from 10e-6 to 10e6
N_grid = np.linspace(1, 20, 2, dtype=int)
param_grid = {'knn__n_neighbors': N_grid}

In [157]:
# make hyperparameter C grid (regularization strength), 20 logspaced values from 10e-6 to 10e6
# n_estimators_grid = np.logspace(-6, 6, 20)
# param_grid = {'rfc__C': C_grid}

# perform grid search and fit the model
grid_search = GridSearchCV(
    pipe, # estimator
    param_grid=param_grid, # hyperparameter C
    cv=5, # 5-fold cross validation
    scoring='roc_auc' # calculates AUROC to compare the hyperparameter(s)
)

grid_search.fit(x_train['text'], y_train)
grid_predictions_va = grid_search.predict(x_va['text'])
grid_predictions_tr = grid_search.predict(x_train['text'])

# calculate and print accuracy for training and validation datasets
print("Training Accuracy:", accuracy_score(y_train, grid_predictions_tr))
print("Validation Accuracy:", accuracy_score(y_va, grid_predictions_va))

# best parameter found in the grid search
print("Best C:", grid_search.best_params_)

Training Accuracy: 0.553125
Validation Accuracy: 0.5520833333333334
Best C: {'knn__n_neighbors': 20}


In [158]:
_, fp, fn, _ = confusion_matrix(y_va, grid_predictions_va).ravel()
FP = []
FN = []

for i, pred in enumerate(grid_predictions_va):
    if pred == 1 and y_va[i] == 0:
        FP.append(i)
    if pred == 0 and y_va[i] == 1:
        FN.append(i)

# print(FP)

# print(x_va.iloc[FP[0]])

# pd.set_option('display.max_colwidth', None)
# pd.set_option("display.colheader_justify","left")
pd.set_option('display.max_colwidth', 10000)

va_idx_FP = FP[0:10]
va_sent_predict_FP = grid_predictions_va[va_idx_FP]
va_sent_true_FP = y_va[va_idx_FP]
FP_styled_df = x_va.iloc[va_idx_FP].style.set_properties(**{'text_align': 'right'})

print("FP data:")
print("FP indices chosen:", va_idx_FP)
print("FP predictions and true sentiments", va_sent_predict_FP, va_sent_true_FP)
print("Sentences that were FP: \n")
FP_styled_df

FP data:
FP indices chosen: [4, 6, 7, 11, 12, 13, 14, 19, 22, 25]
FP predictions and true sentiments [1 1 1 1 1 1 1 1 1 1] [0 0 0 0 0 0 0 0 0 0]
Sentences that were FP: 



Unnamed: 0,website_name,text
1907,yelp,"for 40 bucks a head, i really expect better food."
300,amazon,worthless product.
1964,yelp,I also decided not to send it back because our waitress looked like she was on the verge of having a heart attack.
1659,yelp,I've never been treated so bad.
1663,yelp,"The burger had absolutely no flavor - the meat itself was totally bland, the burger was overcooked and there was no charcoal flavor."
1743,yelp,I was disgusted because I was pretty sure that was human hair.
1991,yelp,I think this restaurant suffers from not trying hard enough.
374,amazon,My only complaint is the standard sound volume is a little low even when turned up to 5(of 5)
1946,yelp,The bus boy on the other hand was so rude.
1766,yelp,A lady at the table next to us found a live green caterpillar In her salad.


In [159]:
va_idx_FN = FN[0:10]
va_sent_predict_FN = grid_predictions_va[va_idx_FN]
va_sent_true_FN = y_va[va_idx_FN]
FN_styled_df = x_va.iloc[va_idx_FN].style.set_properties(**{'text_align': 'right'})

print("FN data:")
print("FN indices chosen:", va_idx_FN)
print("FN predictions and true sentiments", va_sent_predict_FN, va_sent_true_FN)
print("Sentences that were FN: \n")
FN_styled_df

FN data:
FN indices chosen: [16, 24, 58, 67, 69, 95, 102, 122, 124, 132]
FN predictions and true sentiments [0 0 0 0 0 0 0 0 0 0] [1 1 1 1 1 1 1 1 1 1]
Sentences that were FN: 



Unnamed: 0,website_name,text
606,amazon,Restored my phone to like new performance.
1313,imdb,Thanks good a movie like this was done and released.
511,amazon,"I have used several phone in two years, but this one is the best."
2374,yelp,The goat taco didn't skimp on the meat and wow what FLAVOR!
1563,imdb,I don't think you will be disappointed.
1215,imdb,I rate this movie 9/10.
544,amazon,Phone is sturdy as all nokia bar phones are.
502,amazon,"I love this phone , It is very handy and has a lot of features ."
483,amazon,This BlueAnt Supertooth hands-free phone speaker is AWESOME.
1477,imdb,"I won't spoil it, but the ending in pretty amazing."


In [160]:
 # print best parameter and best score
print(f'Best C: {grid_search.best_params_["log_regr__C"]}')
print(f'Best AUROC score: {grid_search.best_score_}')

# plot the performance of different regularization strengths
scores = grid_search.cv_results_['mean_test_score']

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(N_grid, scores, marker='o')
ax.set_xscale('log')
ax.set_xlabel('C (Inverse Regularization Strength)')
ax.set_ylabel('Mean AUROC')
ax.set_title('Effect of C on AUROC (5-fold CV)')
plt.show()

KeyError: 'log_regr__C'

The peak of the graph indicates the best C value - this will prevent over- and under-fitting to the training data. 

In [40]:
x_test = pd.read_csv(os.path.join(data_dir, 'x_test.csv'))
grid_predictions_test = grid_search.predict(x_test['text'])
grid_predictions_test_T = np.transpose(grid_predictions_test)
print(str(grid_predictions_test_T))

file = open("yproba1_test.txt", "w+")
for value in grid_predictions_test:
    line = str(value) + "\n"
    file.write(line)
file.close()



[0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0
 0 0 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 1 0 1 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0
 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0
 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0
 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
 1 1 0 0 0 0 0 0 0 1 0 0 

Problem 2: 