In [1]:
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter

In [3]:
# helper function: write out prediction values into a csv format file
# params:
#     df: dataframe, where each row is a test example, with column 'id' as data id
#     pred: a list or 1-d array of prediction values
#     filepath: the output file path
# return:
#     None

def write_test_prediction(df, pred, filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('id', 'label'))
        for index, row in df.iterrows():
            outfile.write('{},{}\n'.format(row['id'], pred[index]))
    print (len(df), 'predictions are written to', filepath)

In [2]:
dataframe = pd.read_csv('./train.tsv', sep = '\t')

In [4]:
yelp_data = pd.read_csv('./extra data/yelp_prepped.tsv', sep = '\t', header=None)

In [5]:
yelp_data.rename(columns={0: "label",1: "review"}, inplace=True)

In [6]:
data_combined = dataframe.append(yelp_data)

In [7]:
train_ratio = 0.8 # 80% for training, 20% for validation
random_seed = 100

train_dataframe = data_combined.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = data_combined.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))

training set size: 30828
validation set size: 4979


In [8]:
test_dataframe = pd.read_csv('./test.tsv', sep = '\t')

In [9]:
# switching to bigram and trigrams along with the default stopwords
vectorizer = TfidfVectorizer(stop_words={"english"}, ngram_range=(1,3))
vectorizer.fit(train_dataframe["review"])

TfidfVectorizer(ngram_range=(1, 3), stop_words={'english'})

In [10]:
train_X = vectorizer.transform(train_dataframe["review"])
valid_X = vectorizer.transform(valid_dataframe["review"])


In [11]:
test_X = vectorizer.transform(test_dataframe["review"])


In [12]:
train_Y = train_dataframe["label"]
model = LogisticRegression(C = 1, solver='liblinear')
model.fit(train_X, train_Y)

LogisticRegression(C=1, solver='liblinear')

In [13]:
train_Y_hat = model.predict(train_X)
accuracy = accuracy_score(train_dataframe["label"], train_Y_hat)
print ('Logistic regression, accuracy on training set:', accuracy)

Logistic regression, accuracy on training set: 0.9636369534189698


In [18]:
valid_Y_hat = model.predict(valid_X)
accuracy = accuracy_score(valid_dataframe['label'], valid_Y_hat)
print ('Logistic regression, accuracy on validation set:', accuracy)

Logistic regression, accuracy on validation set: 0.8837115886724242
