In [17]:
%%time
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


# helper function: write out prediction values into a csv format file
# params:
#     df: dataframe, where each row is a test example, with column 'id' as data id
#     pred: a list or 1-d array of prediction values
#     filepath: the output file path
# return:
#     None

def write_test_prediction(df, pred, filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('id', 'label'))
        for index, row in df.iterrows():
            outfile.write('{},{}\n'.format(row['id'], pred[index]))
    print (len(df), 'predictions are written to', filepath)

dataframe = pd.read_csv('./train.tsv', sep = '\t')

yelp_data = pd.read_csv('./extra data/yelp_prepped.tsv', sep = '\t', header=None)

yelp_data.rename(columns={0: "label",1: "review"}, inplace=True)

data_combined = dataframe.append(yelp_data)

train_ratio = 0.8 # 80% for training, 20% for validation
random_seed = 100

train_dataframe = data_combined.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = data_combined.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))

test_dataframe = pd.read_csv('./test.tsv', sep = '\t')

vectorizer = TfidfVectorizer(stop_words={"english"})
vectorizer.fit(train_dataframe["review"])

train_X = vectorizer.transform(train_dataframe["review"])
valid_X = vectorizer.transform(valid_dataframe["review"])


test_X = vectorizer.transform(test_dataframe["review"])


train_Y = train_dataframe["label"]
model = LogisticRegression(C = 1, solver='liblinear')
model.fit(train_X, train_Y)

train_Y_hat = model.predict(train_X)
accuracy = accuracy_score(train_dataframe["label"], train_Y_hat)
print ('Logistic regression, accuracy on training set:', accuracy)

valid_Y_hat = model.predict(valid_X)
accuracy = accuracy_score(valid_dataframe["label"], valid_Y_hat)
print ('Logistic regression, accuracy on validation set:', accuracy)






training set size: 30828
validation set size: 4979
Logistic regression, accuracy on training set: 0.9285714285714286
Logistic regression, accuracy on validation set: 0.8817031532436232
Wall time: 5.25 s


In [1]:
# transforming by choosing the best features in the feature set
selector = SelectKBest(k=1000)
x_new = selector.fit_transform(train_X, train_Y)
x_new_valid = selector.transform(valid_X)

model.fit(x_new,train_Y)
valid_Y_hat = model.predict(x_new_valid)
accuracy = accuracy_score(valid_dataframe["label"], valid_Y_hat)
print ('Logistic regression, accuracy on validation set:', accuracy)


NameError: name 'SelectKBest' is not defined

62939

In [48]:
k_list = np.arange(1000, train_X.shape[1],500)
k_dict = {}
    

In [49]:
%%time
for i in k_list:
    selector = SelectKBest(chi2, k=i)
    x_new = selector.fit_transform(train_X, train_Y)
    x_new_valid = selector.transform(valid_X)

    model.fit(x_new,train_Y)
    valid_Y_hat = model.predict(x_new_valid)
    accuracy = accuracy_score(valid_dataframe["label"], valid_Y_hat)
    k_dict.setdefault(i, accuracy)


Wall time: 47 s


In [50]:
dict(sorted(k_dict.items(), key=lambda item: item[1], reverse=True))

{12000: 0.8845149628439446,
 6500: 0.8841132757581843,
 7000: 0.8841132757581843,
 8000: 0.8839124322153042,
 11500: 0.8837115886724242,
 7500: 0.8835107451295441,
 9500: 0.8835107451295441,
 9000: 0.883309901586664,
 10000: 0.883309901586664,
 6000: 0.8831090580437839,
 10500: 0.8831090580437839,
 8500: 0.8829082145009038,
 11000: 0.8829082145009038,
 5000: 0.8827073709580237,
 15000: 0.8825065274151436,
 17500: 0.8825065274151436,
 5500: 0.8823056838722635,
 17000: 0.8823056838722635,
 18000: 0.8823056838722635,
 62500: 0.8823056838722635,
 16000: 0.8821048403293834,
 16500: 0.8821048403293834,
 62000: 0.8821048403293834,
 12500: 0.8819039967865033,
 14000: 0.8819039967865033,
 14500: 0.8819039967865033,
 18500: 0.8819039967865033,
 20000: 0.8819039967865033,
 60000: 0.8819039967865033,
 60500: 0.8819039967865033,
 61500: 0.8819039967865033,
 13500: 0.8817031532436232,
 15500: 0.8817031532436232,
 19000: 0.8817031532436232,
 19500: 0.8817031532436232,
 28000: 0.8817031532436232,
 565