# IMDB 영화평 감성분석

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('../../../machine-learning/00.data/IMDB/labeledTrainData.tsv', header=0, sep='\t', quoting=3)
df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [5]:
# <br /> 태그는 공백으로 변환
df['review'] = df.review.str.replace('<br />', ' ')

In [6]:
# 영어 이외의 문자는 공백으로 변환(숫자 등)
# 앞에 들어가는 ^는 not의 의미 -> a-z, A-Z가 '아닌 놈들은' 공백으로 바꿔라
df['review'] = df.review.apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))

In [7]:
df.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [8]:
df[df.review == ''].count()

id           0
sentiment    0
review       0
dtype: int64

In [9]:
feature_df = df.drop(['id', 'sentiment'], axis=1, inplace=False)

X_train, X_test, y_train, y_test = train_test_split(
    feature_df, df.sentiment, test_size =0.25, random_state=2021
)

X_train.shape, X_test.shape

((18750, 1), (6250, 1))

In [10]:
X_train.head(2)

Unnamed: 0,review
14475,There s no shortage of bad dialogue in David ...
22605,This film takes what could have been a good i...


In [11]:
df_test = pd.DataFrame(X_test, columns=['review'])
df_test['sentiment'] = y_test
df_test.head(2)

Unnamed: 0,review,sentiment
13895,I was expecting a B Movie French musical Aft...,0
20903,Disappearance is about a couple who take thei...,0


In [12]:
df_test.to_csv('../static/data/imdb_test.csv', index=False)

### 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import joblib

- Case 1. Count Vectorizer + LogisticRegression

In [14]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression(C=10))
])

In [15]:
params = {
    'count_vect__max_df': [200, 300],
    'lr_clf__C': [0.5, 1, 5, 10]
}

In [16]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 10.4min finished
{'count_vect__max_df': 300, 'lr_clf__C': 0.5} 0.8654933333333332


In [17]:
best_count_lr = grid_pipe.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred_count_lr = best_count_lr.predict(X_test.review.values)
acc_count_lr = accuracy_score(y_test, pred_count_lr)
print(f'Count Vectorizer + LogisticRegression 평균 정확도 : {acc_count_lr:.4f}')

Count Vectorizer + LogisticRegression 평균 정확도 : 0.8715


In [21]:
joblib.dump(best_count_lr, '../static/model/imdb_count_lr.pkl')

['../static/model/imdb_count_lr.pkl']

- Case 2. Tf - idf vectorizer + LogisticRegression

In [22]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression())
])

In [27]:
params = {
    'tfidf_vect__max_df': [100, 300, 500],
    'lr_clf__C': [5, 10, 15]
}

In [28]:
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 11.0min finished
{'lr_clf__C': 15, 'tfidf_vect__max_df': 500} 0.8783466666666667


In [29]:
best_tfid_lr = grid_pipe.best_estimator_ # best_estimator_ : 최적의 파라미터로 모델 생성
pred_tfid_lr = best_tfid_lr.predict(X_test.review.values)
acc_tfid_lr = accuracy_score(y_test, pred_tfid_lr)
print(f'Tf - idf vectorizer + LogisticRegression 평균 정확도 : {acc_tfid_lr:.4f}')

Tf - idf vectorizer + LogisticRegression 평균 정확도 : 0.8814


In [30]:
joblib.dump(best_tfid_lr, '../static/model/imdb_tfid_lr.pkl')

['../static/model/imdb_tfid_lr.pkl']

### test

In [31]:
index = 500

In [32]:
df = pd.read_csv('../static/data/imdb_test.csv')
df.tail(3)

Unnamed: 0,review,sentiment
6247,To me movies and acting is all about telling ...,1
6248,Another episode from childhood that as an ad...,1
6249,Some people have made a point of dissing this...,0


In [33]:
label = df.sentiment[index]
label

1

In [34]:
test_data = []
test_data.append(df.review[index])
test_data

[' Absolutely wonderful drama and Ros is top notch   I highly recommend this movie  Her performance  in my opinion  was Academy Award material  The only real sad fact here is that Universal hasn t seen to it that this movie was ever available on any video format  whether it be tape or DVD  They are ignoring a VERY good movie  But Universal has little regard for its library on DVD  which is sad  If you get the chance to see this somewhere  not sure why it is rarely even run on cable   see it  I won t go into the story because I think most people would rather have an opinion on the film  and too many   reviewers   spend hours writing about the story  which is available anywhere   a     ']

In [20]:
imdb_count_lr = joblib.load('../static/model/imdb_count_lr.pkl')

In [35]:
pred = imdb_count_lr.predict(test_data)

In [22]:
pred[0], label

(1, 1)

In [23]:
result_dict = {'index':index, 'label':label, 
                        'pred':f'{pred[0]}'}
result_dict

{'index': 500, 'label': 1, 'pred': '1'}

## 리뷰입력

In [32]:
import re

In [24]:
review_string = """ Started promising and quickly went downhill only to end absolutely laughably. A childish fantasy about overcoming every inner and outer hurdle and conquering the world through the power of the mind. Repetitive, boring: a buglike looking girl in front of a chessboard countless times. And everything through a feminist perspective, of course. Or shall I say "misogynic"? Boys, babies, and nests, all naturally dear to the female heart, are portrayed as pitiful occupations, only the mathematical brain and winning the game count. Ah, and designer clothes! """

In [25]:
test_data = []
test_data.append(review_string)
test_data

[' Started promising and quickly went downhill only to end absolutely laughably. A childish fantasy about overcoming every inner and outer hurdle and conquering the world through the power of the mind. Repetitive, boring: a buglike looking girl in front of a chessboard countless times. And everything through a feminist perspective, of course. Or shall I say "misogynic"? Boys, babies, and nests, all naturally dear to the female heart, are portrayed as pitiful occupations, only the mathematical brain and winning the game count. Ah, and designer clothes! ']

In [26]:
pred = imdb_count_lr.predict(test_data)

In [29]:
pred[0]

0

In [30]:
result_dict = {'pred':f'{pred[0]}'}                      
result_dict

{'pred': '0'}