In [1]:
import pandas as pd
import numpy as np

DATA_IN_PATH = "./processed_data/"
TRAIN_CLEAN_DATA = "train_clean.csv"

# Read the cleaned training set: tab-separated, column names on the first row.
# quoting=3 is csv.QUOTE_NONE, i.e. quote characters are not treated specially.
train_data = pd.read_csv(
    DATA_IN_PATH + TRAIN_CLEAN_DATA,
    sep="\t",
    header=0,
    quoting=3,
)
print(train_data.head())

# Pull the text and label columns out as plain Python lists for later cells.
sentiments = list(train_data["sentiment"])
reviews = list(train_data["review"])

                                              review  sentiment
0  stuff going moment mj started listening music ...          1
1  classic war worlds timothy hines entertaining ...          1
2  film starts manager nicholas bell giving welco...          0
3  must assumed praised film greatest filmed oper...          0
4  superbly trashy wondrously unpretentious explo...          1


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vectorizer

# Vectorizer settings:
#   analyzer     - tokenization unit, 'word' or 'char' (characters here)
#   ngram_range  - range of n-gram sizes used as the basic counting unit
#   min_df       - minimum document frequency; tokens that appear less often
#                  than this are excluded from vectorization
#   sublinear_tf - whether to apply sublinear (smoothing) scaling to the
#                  in-document term frequency
#   max_features - maximum length of each vector, i.e. the number of features
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    min_df=0.0,
    sublinear_tf=True,
    max_features=5000,
)

# NOTE(review): the vectorizer is fitted on every review here, before the
# train/validation split performed in the following cell, so validation
# documents contribute to the fitted vocabulary/IDF statistics. Consider
# fitting on the training portion only -- TODO confirm whether this matters.
X = vectorizer.fit_transform(reviews)

In [3]:
from sklearn.model_selection import train_test_split  # splits data into training and validation sets
import numpy as np

TEST_SPLIT = 0.2  # hold out 20% of the data for validation
RANDOM_SEED = 42  # fixed seed so the split is the same on every run

# Labels as a NumPy array, then a seeded split of features and labels together.
y = np.array(sentiments)
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    random_state=RANDOM_SEED,
    test_size=TEST_SPLIT,
)

In [4]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression sentiment classifier trained on the training split.
# class_weight="balanced" requests scikit-learn's automatic per-class weighting
# (see the LogisticRegression documentation for the exact formula).
lgs = LogisticRegression(class_weight="balanced")
# fit() is the last expression of the cell, so the notebook displays the
# fitted estimator's repr as the cell output.
lgs.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [5]:
# Measure performance on the held-out validation data (X_eval / y_eval).
print("Accuracy: {}".format(lgs.score(X_eval, y_eval)))

Accuracy: 0.8596


In [11]:
# Load the evaluation (test) data.
TEST_CLEAN_DATA = "test_clean.csv"

# Tab-separated file with column names on the first row.
# NOTE(review): quoting=0 (csv.QUOTE_MINIMAL) differs from the quoting=3 used
# when loading the training file -- confirm this matches how each file was written.
test_data = pd.read_csv(
    DATA_IN_PATH + TEST_CLEAN_DATA,
    sep="\t",
    header=0,
    quoting=0,
)

In [14]:
# Vectorize the evaluation data with the transformer that was already fitted
# via vectorizer.fit_transform (transform only -- the vectorizer is not refitted).
testDataVecs = vectorizer.transform(test_data["review"])

In [15]:
# Predict a sentiment label for every vectorized test review with the trained
# logistic-regression model, then print the resulting label array.
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [18]:
import os
import csv

DATA_OUT_PATH = "./result_data/"

# Create the output directory if it does not exist yet. exist_ok=True makes
# this a single call and avoids the check-then-create race of testing
# os.path.exists() before calling os.makedirs().
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Pair every test id with its predicted sentiment label and write the result
# as a CSV file without the DataFrame index.
ids = list(test_data["id"])
answer_dataset = pd.DataFrame({"id": ids, "sentiment": test_predicted})
answer_dataset.to_csv(
    os.path.join(DATA_OUT_PATH, "lgs_tfidf_answer.csv"),
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,  # quote every non-numeric field
)

In [13]:
# Display the loaded evaluation DataFrame to inspect its columns and rows.
test_data

Unnamed: 0,review,id
0,naturally film main themes mortality nostalgia...,12311_10
1,movie disaster within disaster film full great...,8348_2
2,movie kids saw tonight child loved one point k...,5828_4
3,afraid dark left impression several different ...,7186_2
4,accurate depiction small time mob life filmed ...,12128_7
5,valuable king tut tomb ok maybe valuable worth...,2913_8
6,one biggest misfires ever script nice could en...,4396_1
7,one movies watched wondered watch find interes...,395_2
8,worst movie seen years seen lot movies acting ...,10616_1
9,five medical students kevin bacon david labrac...,9074_9
