In [2]:
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [4]:
# Directory and file-name constants used for reading inputs and writing outputs.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

# Read the training CSV into a DataFrame.
train_data = pd.read_csv(os.path.join(DATA_IN_PATH, TRAIN_CLEAN_DATA))

# Pull the text and label columns out as plain Python lists.
reviews = train_data['review'].tolist()
sentiments = train_data['sentiment'].tolist()

In [5]:
# TF-IDF over character n-grams of length 1 to 3, capped at 5000 features.
vectorizer = TfidfVectorizer(
    analyzer="char",       # build features from characters rather than words
    ngram_range=(1, 3),    # n-gram lengths 1 through 3
    max_features=5000,     # upper bound on vocabulary size
    min_df=0.0,            # no minimum document-frequency cutoff
    sublinear_tf=True,     # sublinear term-frequency scaling
)

# Learn the vocabulary/IDF weights from the training reviews and vectorize them.
X = vectorizer.fit_transform(reviews)

# Display the resulting sparse matrix summary as the cell output.
X

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 17862871 stored elements in Compressed Sparse Row format>

In [6]:
# Names of the learned TF-IDF features, kept as a list.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` when available and fall back on older
# releases that only provide the legacy method.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# Labels as a NumPy array, aligned row-for-row with X.
y = np.array(sentiments)

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Logistic regression with class weights balanced against class frequencies.
lgs = LogisticRegression(class_weight='balanced')

# Fit on the training split; left as the last expression so the estimator is displayed.
lgs.fit(X=x_train, y=y_train)

LogisticRegression(class_weight='balanced')

In [8]:
# Class predictions for the validation split.
predicted = lgs.predict(x_val)

# Score of the fitted model on the validation split.
val_score = lgs.score(x_val, y_val)
print(val_score)

0.8598


In [9]:
# File name of the test CSV inside DATA_IN_PATH.
TEST_CLEAN_DATA = 'test_clean.csv'

# Read the test CSV into a DataFrame.
test_data = pd.read_csv(os.path.join(DATA_IN_PATH, TEST_CLEAN_DATA))

In [10]:
# Vectorize the test reviews with the already-fitted vectorizer (transform only, no refit).
test_reviews = test_data['review']
testDataVecs = vectorizer.transform(test_reviews)

In [11]:
# Predict a class label for every vectorized test review and show the result.
test_predicted = lgs.predict(X=testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [12]:
# Ensure the output directory exists. `exist_ok=True` replaces the separate
# existence check, which could race if the directory appears between the check
# and the create call.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Pair each test id with its predicted sentiment.
answer_dataset = pd.DataFrame({'id': test_data['id'], 'sentiment': test_predicted})

# Write the predictions without the index column.
# quoting=3 is the numeric value of csv.QUOTE_NONE (fields are never quoted).
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv', index=False, quoting=3)