## 4.1.4 Logistic Regression Example with TF-IDF

### TF-IDF Feature Example

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [26]:
# Input/output directories and the training file name used by later cells.
DATA_IN_PATH = '../dataset/' 
DATA_OUT_PATH = './data_out/'
TRAIN_CLEAN_DATA = 'train_clean.csv' # preprocessed review text

# Seed and hold-out fraction handed to train_test_split when splitting the data.
RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [3]:
# Load the preprocessed training reviews from the input directory.
train_csv_path = DATA_IN_PATH + TRAIN_CLEAN_DATA
train_data = pd.read_csv(train_csv_path)

In [4]:
# Display the loaded training frame to check its columns and row count.
train_data

Unnamed: 0,review,sentiment
0,stuff going moment mj started listening music ...,1
1,classic war worlds timothy hines entertaining ...,1
2,film starts manager nicholas bell giving welco...,0
3,must assumed praised film greatest filmed oper...,0
4,superbly trashy wondrously unpretentious explo...,1
...,...,...
24995,seems like consideration gone imdb reviews fil...,0
24996,believe made film completely unnecessary first...,0
24997,guy loser get girls needs build picked stronge...,0
24998,minute documentary bu uel made early one spain...,0


In [5]:
# Pull the text and label columns out of the frame as plain Python lists.
reviews = train_data['review'].tolist()
sentiments = train_data['sentiment'].tolist()


In [7]:
# TfidfVectorizer settings:
#   min_df       - ignore terms whose document frequency is below this
#                  threshold (0.0 keeps every term)
#   analyzer     - build features from characters ('char') or words ('word')
#   sublinear_tf - apply sublinear term-frequency scaling (1 + log(tf))
#   ngram_range  - smallest and largest n-gram size to extract
#   max_features - cap on the vocabulary size, i.e. the feature-vector length
tfidf_options = dict(
    min_df=0.0,
    analyzer='char',
    sublinear_tf=True,
    ngram_range=(1, 3),
    max_features=5000,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary/IDF weights from the reviews and vectorize them.
X = vectorizer.fit_transform(reviews)
y = np.asarray(sentiments)


In [10]:
# Inspect the values held in the feature matrix's `.data` array
# (presumably the explicitly stored TF-IDF weights of a sparse matrix).
X.data

array([0.01912439, 0.01438654, 0.02347039, ..., 0.05237479, 0.05875291,
       0.06170144])

In [33]:
# Number of entries held in the feature matrix's `.data` array.
X.data.shape[0]

17862871

In [14]:
# Split the features and labels into training and evaluation sets.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [15]:
# Fit a logistic-regression classifier on the training split.
lgs = LogisticRegression(class_weight='balanced') # train with balanced weighting across labels
lgs.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [18]:
# Predict sentiment labels for the evaluation split and show them.
predicted = lgs.predict(X_eval)
print(predicted)

[0 1 0 ... 0 0 0]


In [17]:
# Score the fitted model on the evaluation split and report it.
eval_accuracy = lgs.score(X_eval, y_eval)
print("Accuracy: %f" % eval_accuracy)

Accuracy: 0.859800


In [19]:
# File name of the test set, read from the same input directory as the
# training data.
TEST_CLEAN_DATA = 'test_clean.csv'

test_csv_path = DATA_IN_PATH + TEST_CLEAN_DATA
test_data = pd.read_csv(test_csv_path)

In [20]:
# Vectorize the test reviews with the already-fitted vectorizer
# (transform only, no refit) so they share the training feature space.
testDataVecs = vectorizer.transform(test_data['review'])

In [21]:
# Predict sentiment labels for the vectorized test reviews and show them.
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [22]:
# Display the test frame; it carries an `id` column instead of labels.
test_data

Unnamed: 0,review,id
0,naturally film main themes mortality nostalgia...,"""12311_10"""
1,movie disaster within disaster film full great...,"""8348_2"""
2,movie kids saw tonight child loved one point k...,"""5828_4"""
3,afraid dark left impression several different ...,"""7186_2"""
4,accurate depiction small time mob life filmed ...,"""12128_7"""
...,...,...
24995,sony pictures classics looking sony got rights...,"""2155_10"""
24996,always felt ms merkerson never gotten role fit...,"""59_10"""
24997,disappointed movie familiar case read mark fuh...,"""2531_1"""
24998,opening sequence filled black white shots remi...,"""7772_8"""


In [27]:
# Create the output directory if it is missing. exist_ok=True makes this a
# no-op when the directory already exists and avoids the race between a
# separate existence check and the creation call.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Pair every test id with its predicted label and write the submission CSV.
# quoting=3 is csv.QUOTE_NONE: fields are written verbatim with no added
# quote characters (the ids appear to already contain literal double quotes
# -- confirm against the displayed test frame).
answer_dataset = pd.DataFrame({'id': test_data['id'], 'sentiment': test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv', index=False, quoting=3)

In [28]:
# Display the submission frame (id / predicted sentiment) that was written out.
answer_dataset

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",0
4,"""12128_7""",1
...,...,...
24995,"""2155_10""",1
24996,"""59_10""",1
24997,"""2531_1""",0
24998,"""7772_8""",1


In [29]:
# Show the directory the submission file was written to.
DATA_OUT_PATH

'./data_out/'

### 캐글에 올리기
```
kaggle competitions submit -c word2vec-nlp-tutorial -f ./data_out/lgs_tfidf_answer.csv -m "Message"
```