In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Locations and file names of the pre-cleaned review data and of the output.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_CLEAN_DATA = 'train_clean.csv'
TEST_CLEAN_DATA = 'test_clean.csv'

# Fraction of the training rows held out for evaluation, and the seed shared
# by every randomized step so that a re-run reproduces the same numbers.
TEST_SIZE = 0.2
RANDOM_SEED = 42

train_data = pd.read_csv(os.path.join(DATA_IN_PATH, TRAIN_CLEAN_DATA))
test_data = pd.read_csv(os.path.join(DATA_IN_PATH, TEST_CLEAN_DATA))

# An empty review cell is read back by pandas as NaN, which TfidfVectorizer
# rejects as an invalid document; map missing values to '' so such rows become
# all-zero feature vectors instead of aborting the run.
reviews = list(train_data['review'].fillna(''))
y = np.array(train_data['sentiment'])

test_reviews = list(test_data['review'].fillna(''))
ids = list(test_data['id'])

# Vectorization: TF-IDF feature extraction over word 1- to 3-grams, keeping
# the 5000 most frequent terms and using sublinear (1 + log(tf)) scaling.
# NOTE(review): the vectorizer is fitted on ALL training reviews before the
# train/eval split below, so the held-out rows influence the vocabulary and
# IDF weights; the reported accuracy may therefore be slightly optimistic.
vectorizer = TfidfVectorizer(min_df=0.0,
                             analyzer='word',
                             sublinear_tf=True,
                             ngram_range=(1, 3),
                             max_features=5000)
train_data_features = vectorizer.fit_transform(reviews)

# Split into training and evaluation sets.
train_input, eval_input, train_label, eval_label = train_test_split(
    train_data_features, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# Model setup. Seeding the forest (not only the split) makes the accuracy
# below and the submission file reproducible across runs.
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

# Model training.
forest.fit(train_input, train_label)

# Accuracy on the held-out evaluation split.
print("Accuracy: %f" % forest.score(eval_input, eval_label))

# Reuse the already-fitted vectorizer: transform only, do not fit again, so
# the test features share the training vocabulary and IDF weights.
test_data_features = vectorizer.transform(test_reviews)

# Predict with the trained random forest classifier.
result = forest.predict(test_data_features)
# e.g. array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

# Write the submission file. The output directory may not exist on a fresh
# checkout, so create it first.
os.makedirs(DATA_OUT_PATH, exist_ok=True)
output = pd.DataFrame(data={'id': ids, 'sentiment': result})
output.to_csv(os.path.join(DATA_OUT_PATH, "randomforest_tfidf.csv"),
              index=False, quoting=3)  # quoting=3 is csv.QUOTE_NONE

Accuracy: 0.850200


- TF-IDF + 랜덤 포레스트: 0.8461
- BoW + 랜덤 포레스트: 0.8443
    - 두 방식의 점수 차이는 약 0.0018로, 거의 차이가 나지 않는다.