# IMDB 리뷰 감성 분류 : TF-IDF 활용 로지스틱 회귀모델
- 데이터 전처리 : imdb_preprocessing.ipynb

## 라이브러리 임포트

In [1]:
import csv
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Colab-only step: attach Google Drive under /content/drive so the CSV files
# referenced later in the notebook can be read and written.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


### TF-IDF 값으로 벡터화를 진행하므로 텍스트 데이터(train_clean.csv) 사용

In [3]:
# Folder on the mounted Drive that holds the IMDB CSV files. The trailing
# slash matters: file names are appended to this string directly.
DATA_PATH = "/content/drive/MyDrive/nlpdata/imdb/"

# File name of the training set (presumably the output of
# imdb_preprocessing.ipynb -- confirm against that notebook).
TRAIN_CLEAN_DATA = "train_clean.csv"

In [4]:
# Read the training CSV from the Drive data folder into a DataFrame.
train_data = pd.read_csv(f"{DATA_PATH}{TRAIN_CLEAN_DATA}")

In [5]:
# Pull the two columns used for modelling out of the DataFrame as plain
# Python lists: the review texts and their sentiment labels.
reviews, sentiments = (
    train_data[column].tolist() for column in ("review", "sentiment")
)

In [6]:
# TF-IDF over character n-grams (1 to 3 characters long), capped at the
# 5000 features kept by max_features, with sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    min_df=0.0,
    sublinear_tf=True,
    max_features=5000,
)

# Learn the vocabulary/IDF weights from the reviews and vectorize them.
X = vectorizer.fit_transform(reviews)
# Labels as a NumPy array so they can be split alongside X.
y = np.asarray(sentiments)

In [7]:
# Names of the n-gram features learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on old versions.
if hasattr(vectorizer, "get_feature_names_out"):
    # The new API returns an ndarray; convert to a list to match the
    # list that the old API returned.
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

## 학습과 검증 데이터셋 분리

In [8]:
# Hold out 30% of the vectorized reviews for validation; the fixed
# random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 로지스틱 회귀 모델 선언 및 학습

In [None]:
# Logistic regression classifier; class_weight="balanced" reweights the
# classes inversely to their frequency in the training labels.
lr = LogisticRegression(class_weight="balanced")

# Fit on the training split only (the held-out split is used for scoring).
lr.fit(X=X_train, y=y_train)

In [10]:
# Predict labels for the held-out split once and reuse them for the metric.
# lr.score(X_test, y_test) would run the same prediction a second time; the
# mean of exact label matches is the same accuracy value.
predicted = lr.predict(X_test)
accuracy = np.mean(predicted == y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.86


In [11]:
# File name of the test set inside DATA_PATH.
TEST_CLEAN_DATA = "test_clean.csv"

test_data = pd.read_csv(f"{DATA_PATH}{TEST_CLEAN_DATA}")

# transform() only (no re-fit): the test reviews are encoded with the
# vocabulary and IDF weights the vectorizer already holds.
testDataVecs = vectorizer.transform(test_data["review"])

# Predict a sentiment label for every test review and show the result.
test_predicted = lr.predict(testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [12]:
# Build the two-column result table: the test row ids paired with the
# predicted sentiment labels.
answer_dataset = pd.DataFrame(
    {"id": test_data["id"], "sentiment": test_predicted}
)

# Write the table next to the input data, without the DataFrame index.
# csv.QUOTE_NONE is the named form of the former magic number 3: fields are
# written verbatim, with no quote characters added.
answer_dataset.to_csv(
    os.path.join(DATA_PATH, "answer_lr_tfidf.csv"),
    index=False,
    quoting=csv.QUOTE_NONE,
)

## Kaggle에 결과 제출 및 스코어 확인
- https://www.kaggle.com/c/word2vec-nlp-tutorial