# IMDB 영화평 감성분석

- Pipeline
- TfidfVectorizer + LogisticRegression


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labeled IMDB reviews from the tab-separated training file.
# quoting=3 is csv.QUOTE_NONE: double quotes inside the text are kept as ordinary characters.
df = pd.read_csv(
    'data/labeledTrainData.tsv',
    sep='\t',
    quoting=3,
)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


- 텍스트 전처리


In [4]:
# Replace <br /> tags with a space

In [5]:
# Swap each literal "<br />" tag in the review text for a single space.
df['review'] = df['review'].str.replace('<br />', ' ')

In [6]:
# Replace every character that is not an English letter with a space, then trim both ends.
# regex=True is passed explicitly: the pattern is a character class, and pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the text unchanged.
df.review = df.review.str.replace('[^A-Za-z]', ' ', regex=True).str.strip()

  df.review = df.review.str.replace('[^A-Za-z]', ' ').str.strip()


In [8]:
from sklearn.model_selection import train_test_split

# Split reviews and labels into train/test parts, stratified on the sentiment label.
# The inputs are Series, so X_train, X_test, y_train and y_test are expected to be Series too.
# test_size is not passed, so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    stratify=df['sentiment'],
    random_state=2022,
)


- Pipeline: TfidfVectorizer + LogisticRegression

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

- 하나의 명령어의 output이 다른 명령어의 input이 되는 경우가 많다. Pipeline은 이런 단계들을 순서대로 하나의 추정기로 묶어 준다.


In [None]:
# NOTE(review): this cell has no execution count and binds `pipeLine` (capital L), a name that
# nothing else in the visible notebook reads; the following cell builds the same pipeline
# under the name `pipeline`.
pipeLine = Pipeline(
    steps=[
        ('tvect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
        ('lr', LogisticRegression(random_state=2022)),
    ]
)

In [13]:
# Build the two steps separately, then chain them: TF-IDF features over unigrams and bigrams
# (English stop words removed) feeding a logistic-regression classifier.
tvect = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
lr = LogisticRegression(random_state=2022)
pipeline = Pipeline(steps=[  # list of (name, estimator) tuples
    ('tvect', tvect),
    ('lr', lr),
])


In [14]:
# Train the pipeline on the training split; %time reports how long the fit takes
%time pipeline.fit(X_train, y_train)

Wall time: 29.9 s


Pipeline(steps=[('tvect',
                 TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
                ('lr', LogisticRegression(random_state=2022))])

In [15]:
# Evaluate the fitted pipeline on the held-out test split
pipeline.score(X_test, y_test)

0.87472

- 최적 하이퍼파라미터 찾기


In [17]:
from sklearn.model_selection import GridSearchCV

# First search grid. Keys use the `<step name>__<parameter>` form to address the
# parameters of the named pipeline steps ('tvect' and 'lr').
params = dict(
    tvect__max_df=[100, 500],
    lr__C=[1, 10],
)

In [18]:
grid_pipe = GridSearchCV(
    pipeline, param_grid = params, scoring = 'accuracy', cv =3, n_jobs =-1 # 가용한 쓰레드를 모두 이상해서 돌려라.
)
%time grid_pipe.fit(X_train, y_train)

Wall time: 2min 38s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tvect',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('lr',
                                        LogisticRegression(random_state=2022))]),
             n_jobs=-1,
             param_grid={'lr__C': [1, 10], 'tvect__max_df': [100, 500]},
             scoring='accuracy')

In [19]:
# Show the parameter combination selected by the grid search
grid_pipe.best_params_

{'lr__C': 10, 'tvect__max_df': 500}

In [20]:
params = {
    'tvect__max_df' : [500,1000], 
    'lr__C' : [10, 20],
    
}
%time grid_pipe.fit(X_train, y_train)

Wall time: 2min 34s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tvect',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('lr',
                                        LogisticRegression(random_state=2022))]),
             n_jobs=-1,
             param_grid={'lr__C': [1, 10], 'tvect__max_df': [100, 500]},
             scoring='accuracy')

In [21]:
# Show the parameter combination selected by the most recent fit of grid_pipe
grid_pipe.best_params_

{'lr__C': 10, 'tvect__max_df': 500}

In [22]:
# Score the best estimator found by the search on the held-out test split
grid_pipe.best_estimator_.score(X_test, y_test)

0.87552

- 모델 저장

In [23]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure model/ exists
# before writing; exist_ok=True makes re-running the cell harmless.
os.makedirs('model', exist_ok=True)
# Persist the best pipeline (vectorizer + classifier) as a single file.
joblib.dump(grid_pipe.best_estimator_, 'model/imdb_pipe.pkl')


['model/imdb_pipe.pkl']

In [24]:
# Reload the saved pipeline from disk.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
best_pipe = joblib.load('model/imdb_pipe.pkl')