In [17]:
import time
start_time = time.time()

In [18]:
import scipy
from scipy.sparse import hstack, coo_matrix
import sklearn
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
import nltk
import pandas as pd
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ashbu\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [19]:
# helper function: write out prediction values into a csv format file
# params:
#     df: dataframe, where each row is a test example, with column 'id' as data id
#     pred: a list or 1-d array of prediction values
#     filepath: the output file path
# return:
#     None

def write_test_prediction(df, pred, filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('id', 'label'))
        for index, row in df.iterrows():
            outfile.write('{},{}\n'.format(row['id'], pred[index]))
    print (len(df), 'predictions are written to', filepath)

In [20]:
dataframe = pd.read_csv('./train.tsv', sep = '\t')

In [21]:
yelp_data = pd.read_csv('./extra data/yelp_prepped.tsv', sep = '\t', header=None)

In [22]:
yelp_data.rename(columns={0: "label",1: "review"}, inplace=True)

In [23]:
data_combined = dataframe.append(yelp_data)

In [24]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [25]:
%%time
data_combined["rating"] = data_combined["review"].apply(analyzer.polarity_scores)

Wall time: 32.5 s


In [26]:
data_combined = pd.concat([data_combined.drop(['rating'], axis=1), data_combined['rating'].apply(pd.Series)], axis=1)

In [27]:
train_ratio = 0.8 # 80% for training, 20% for validation
random_seed = 100

train_dataframe = data_combined.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = data_combined.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))

training set size: 30828
validation set size: 4979


In [28]:
# reading in the test dataframe
test_dataframe = pd.read_csv('./test.tsv', sep = '\t')

In [29]:
%%time
# runnign Vader on the test_dataframe
test_dataframe["rating"] = test_dataframe["review"].apply(analyzer.polarity_scores)
test_dataframe = pd.concat([test_dataframe.drop(['rating'], axis=1), test_dataframe['rating'].apply(pd.Series)], axis=1)

Wall time: 5.69 s


In [30]:
vectorizer = TfidfVectorizer(stop_words={"english"},ngram_range=(1,3))
vectorizer.fit(train_dataframe["review"])

TfidfVectorizer(ngram_range=(1, 3), stop_words={'english'})

In [31]:
%%time
train_X = vectorizer.transform(train_dataframe["review"])
valid_X = vectorizer.transform(valid_dataframe["review"])


Wall time: 8.81 s


In [32]:
sparse_train = coo_matrix(train_dataframe[["neg","neu","pos","compound"]])
sparse_valid = coo_matrix(valid_dataframe[["neg","neu","pos","compound"]])

In [33]:
# stacking the scores onto the sparse matrix
train_X = hstack((sparse_train, train_X))
valid_X =  hstack((sparse_valid, valid_X))

In [34]:
# stacking test dataframe
sparse_test = coo_matrix(test_dataframe[["neg","neu","pos","compound"]])
test_X = vectorizer.transform(test_dataframe["review"])
test_X = hstack((sparse_test, test_X))

In [35]:
train_Y = train_dataframe["label"]
model = LogisticRegression(C = 1, solver='liblinear')
model.fit(train_X, train_Y)

LogisticRegression(C=1, solver='liblinear')

In [36]:
train_Y_hat = model.predict(train_X)
accuracy = accuracy_score(train_dataframe["label"], train_Y_hat)
print ('Logistic regression, accuracy on training set:', accuracy)

Logistic regression, accuracy on training set: 0.9215323731672506


In [37]:
valid_Y_hat = model.predict(valid_X)
accuracy = accuracy_score(valid_dataframe["label"], valid_Y_hat)
print ('Logistic regression, accuracy on validation set:', accuracy)

Logistic regression, accuracy on validation set: 0.8656356698132155


In [38]:
end_time = time.time()

run_time = end_time - start_time

print(run_time)

79.96820759773254


In [25]:
%%time
all_train_Y = data_combined['label']
all_train_X = vectorizer.transform(data_combined['review'])
all_train_sparse = coo_matrix(data_combined[["neg","neu","pos","compound"]])
all_train_X = hstack((all_train_sparse, all_train_X))

Wall time: 2.64 s


In [26]:
model.fit(all_train_X, all_train_Y)
test_Y_hat = model.predict(test_X)
write_test_prediction(test_dataframe, test_Y_hat, './logistic_regression_vader_yelp.csv')

6000 predictions are written to ./logistic_regression_vader_yelp.csv
