In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [5]:
# Directories for the input data and for generated output.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
# The training set is loaded with pd.read_csv, so this must name the CSV file
# (the previous value pointed at a '.npy' file).
TRAIN_CLEAN_DATA = 'train_clean.csv'

# Seed and hold-out fraction for the train/eval split.
RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [6]:
# Read the cleaned training set from the input directory into a DataFrame.
train_csv_path = os.path.join(DATA_IN_PATH, TRAIN_CLEAN_DATA)
train_data = pd.read_csv(train_csv_path)

In [9]:
# Echo the loaded training DataFrame as the cell output.
train_data

Unnamed: 0,review,sentiment
0,stuff going moment mj started listening music ...,1
1,classic war worlds timothy hines entertaining ...,1
2,film starts manager nicholas bell giving welco...,0
3,must assumed praised film greatest filmed oper...,0
4,superbly trashy wondrously unpretentious explo...,1
...,...,...
24995,seems like consideration gone imdb reviews fil...,0
24996,believe made film completely unnecessary first...,0
24997,guy loser get girls needs build picked stronge...,0
24998,minute documentary bu uel made early one spain...,0


In [8]:
# Pull the review texts and their sentiment labels out of the DataFrame as plain lists.
reviews, sentiments = (list(train_data[column]) for column in ('review', 'sentiment'))

In [10]:
# Vectorize the reviews into TF-IDF features.
#   min_df       : tokens whose document frequency is below this value are dropped
#   analyzer     : unit of analysis ("word" = word level, "char" = character level)
#   sublinear_tf : apply sublinear scaling to the term frequency (tf -> 1 + log(tf))
#   ngram_range  : (min_n, max_n) range of n-gram sizes to extract
#   max_features : upper bound on the vocabulary size, i.e. the vector length
tfidf_options = dict(
    min_df=0.0,
    analyzer="char",
    sublinear_tf=True,
    ngram_range=(1, 3),
    max_features=5000,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Feature matrix learned from the reviews, and the labels as a NumPy array.
X = vectorizer.fit_transform(reviews)
y = np.array(sentiments)

In [11]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() keeps `features` a plain list, matching the old return type.
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()



In [12]:
# Echo the extracted feature names as the cell output.
features

[' ',
 ' a',
 ' aa',
 ' ab',
 ' ac',
 ' ad',
 ' ae',
 ' af',
 ' ag',
 ' ah',
 ' ai',
 ' ak',
 ' al',
 ' am',
 ' an',
 ' ap',
 ' ar',
 ' as',
 ' at',
 ' au',
 ' av',
 ' aw',
 ' ax',
 ' az',
 ' b',
 ' b ',
 ' ba',
 ' bb',
 ' be',
 ' bi',
 ' bl',
 ' bo',
 ' br',
 ' bu',
 ' by',
 ' c',
 ' c ',
 ' ca',
 ' ce',
 ' cg',
 ' ch',
 ' ci',
 ' cl',
 ' co',
 ' cr',
 ' cu',
 ' cy',
 ' d',
 ' da',
 ' de',
 ' di',
 ' do',
 ' dr',
 ' du',
 ' dv',
 ' dw',
 ' dy',
 ' e',
 ' e ',
 ' ea',
 ' eb',
 ' ec',
 ' ed',
 ' ee',
 ' ef',
 ' eg',
 ' ei',
 ' el',
 ' em',
 ' en',
 ' ep',
 ' eq',
 ' er',
 ' es',
 ' et',
 ' eu',
 ' ev',
 ' ex',
 ' ey',
 ' f',
 ' f ',
 ' fa',
 ' fb',
 ' fe',
 ' fi',
 ' fl',
 ' fo',
 ' fr',
 ' fu',
 ' fx',
 ' g',
 ' g ',
 ' ga',
 ' ge',
 ' gh',
 ' gi',
 ' gl',
 ' go',
 ' gr',
 ' gu',
 ' gw',
 ' gy',
 ' h',
 ' h ',
 ' ha',
 ' hb',
 ' he',
 ' hi',
 ' hm',
 ' ho',
 ' hu',
 ' hy',
 ' i',
 ' ia',
 ' ic',
 ' id',
 ' ig',
 ' ii',
 ' il',
 ' im',
 ' in',
 ' ir',
 ' is',
 ' it',
 ' iv',
 ' j',
 ' j

In [13]:
# Hold out TEST_SPLIT of the samples for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)
X_train, X_eval, y_train, y_eval = split_parts

In [14]:
# class_weight='balanced' : 각 레이블에 대해 균형있게 학습
lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

In [15]:
# Predict labels for the held-out evaluation features.
predicted = lgs.predict(X=X_eval)

In [16]:
# Mean accuracy of the classifier on the evaluation split.
eval_accuracy = lgs.score(X_eval, y_eval)
print("Accuracy: %f" % eval_accuracy)

Accuracy: 0.859800


In [17]:
# File name of the cleaned test set inside the input directory.
TEST_CLEAN_DATA = 'test_clean.csv'

# Read the cleaned test set into a DataFrame.
test_csv_path = os.path.join(DATA_IN_PATH, TEST_CLEAN_DATA)
test_data = pd.read_csv(test_csv_path)

In [18]:
# Transform the test reviews with the already-fitted vectorizer (no refitting here).
test_reviews = test_data['review']
testDataVecs = vectorizer.transform(test_reviews)

In [19]:
# Predict sentiment labels for the vectorized test reviews and print the array.
test_predicted = lgs.predict(X=testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [20]:
# Create the output directory if it is missing; exist_ok=True replaces the
# separate existence check and avoids the check-then-create race.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Pair each test id with its predicted sentiment label.
answer_dataset = pd.DataFrame({'id': test_data['id'], 'sentiment': test_predicted})

# quoting=3 is csv.QUOTE_NONE: fields are written without added quote characters.
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv', index=False, quoting=3)

In [21]:
# Echo the id/sentiment prediction DataFrame as the cell output.
answer_dataset

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",0
4,"""12128_7""",1
...,...,...
24995,"""2155_10""",1
24996,"""59_10""",1
24997,"""2531_1""",0
24998,"""7772_8""",1
