In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Okt
import re
import pandas as pd
import numpy as np

In [4]:
# Load the pre-tokenized review data; the first CSV column becomes the index.
df = pd.read_csv(
    "../../data/naver_shopping_tokenized.csv",
    index_col=0,
    encoding='utf-8',
)

In [5]:
# Preview the first rows of the frame that was just loaded.
df.head()

Unnamed: 0,rating,text,y,tokenized
0,5,배공빠르고 굿,1,배공
1,2,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고,0,택배 엉망
2,5,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...,1,아주 바지 정말 구매 가격 대박 바느질 조금 가성 최고
3,2,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...,0,선물 전달 상품 머그컵 당황 바로 누락 확인 바로 선물 큰일 다시 생각
4,5,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ,1,민트 색상 손잡이 도로 사용


In [6]:
# Load the Korean stop-word list and extend it with the domain word '배송'.
#
# header=None keeps the first line of the file as data; the read_csv default
# would consume it as a column header and silently drop that word.
# ravel() flattens the 2-D values so `stopwords` is a flat list of strings
# rather than a list of one-item rows; only a flat list works with membership
# tests such as `token in stopwords` and matches the plain string appended below.
# NOTE(review): assumes the remote file holds one stop word per line - confirm.
stopwords = (
    pd.read_csv(
        "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt",
        header=None,
    )
    .to_numpy()
    .ravel()
    .tolist()
)
stopwords.append('배송')

In [10]:
# Keep only the rows whose 'tokenized' value is present.
has_tokens = df['tokenized'].notna()
df = df.loc[has_tokens]

In [11]:
from sklearn.model_selection import train_test_split

# Labels as a plain Python list; features are the pre-tokenized strings.
# 20% of the rows are held out, with a fixed seed so the split is repeatable.
y = df['y'].to_numpy().tolist()
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df['tokenized'],
    y,
    random_state=0,
    test_size=0.2,
)

## DTM + GridSearchCV

In [27]:
# Document-term matrix over unigrams and bigrams; terms seen in fewer than
# 3 training documents are dropped. The vocabulary is learned on the training
# split only and then reused for the test split.
vect = CountVectorizer(
    ngram_range=(1, 2),
    min_df=3,
)
X_train_tf = vect.fit_transform(X_train_texts)
X_test_tf = vect.transform(X_test_texts)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# Search the candidate values of LogisticRegression's C with 3-fold
# cross-validation on the training DTM, ranking candidates by accuracy.
lr = LogisticRegression(solver='liblinear', random_state=0)
params = {'C': [1, 3.5, 4.5, 5.5, 10]}
grid_cv = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_cv.fit(X_train_tf, y_train)

# Show the winning parameter set and its mean CV score (4 decimals).
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'C': 1} 0.7869


In [29]:
# Take the model selected by the grid search (exposed as best_estimator_)
# and predict labels for the held-out document-term matrix.
best_estimator = grid_cv.best_estimator_
y_pred = best_estimator.predict(X_test_tf)

In [30]:
# Print the four hold-out metrics. Accuracy is shown with two decimals,
# the remaining metrics with three.
for report_fmt, metric_fn in (
    ("accuracy: %.2f", accuracy_score),
    ("Precision : %.3f", precision_score),
    ("Recall : %.3f", recall_score),
    ("F1 : %.3f", f1_score),
):
    print(report_fmt % metric_fn(y_test, y_pred))

accuracy: 0.79
Precision : 0.768
Recall : 0.828
F1 : 0.796


## TF-IDF + GridSearchCV

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams. Terms appearing in fewer than
# 3 training documents, or in more than 80% of them, are dropped.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df = 3, max_df = 0.8)

# fit_transform learns the vocabulary/IDF and vectorizes the training corpus
# in one pass; a separate fit() followed by transform() on the same texts
# would analyze every training document twice for the same matrix.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train_texts)

# The test split is only transformed, using the statistics learned above.
tfidf_matrix_test = tfidf_vectorizer.transform(X_test_texts)

In [35]:
# Base estimator for the search. It is intentionally NOT fitted here:
# GridSearchCV fits its own clones of the estimator for every candidate/fold
# and refits the winner, so a prior lr_tfidf.fit(...) would be discarded work.
lr_tfidf = LogisticRegression(random_state = 0, solver = 'liblinear')

# Search the candidate values of C with 3-fold cross-validation on the
# training TF-IDF matrix, ranking candidates by accuracy.
params = { 'C': [1 ,3.5, 4.5, 5.5, 10 ] }
grid_cv = GridSearchCV(lr_tfidf , param_grid=params , cv=3 ,scoring='accuracy', verbose=1 )
grid_cv.fit(tfidf_matrix_train , y_train )

# Show the winning parameter set and its mean CV score (4 decimals).
print(grid_cv.best_params_ , round(grid_cv.best_score_,4))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'C': 1} 0.7875


In [36]:
# Predict the held-out TF-IDF matrix with the model chosen by the grid search.
best_estimator = grid_cv.best_estimator_
y_pred = best_estimator.predict(tfidf_matrix_test)

# Print the four hold-out metrics. Accuracy is shown with two decimals,
# the remaining metrics with three.
for report_fmt, metric_fn in (
    ("accuracy: %.2f", accuracy_score),
    ("Precision : %.3f", precision_score),
    ("Recall : %.3f", recall_score),
    ("F1 : %.3f", f1_score),
):
    print(report_fmt % metric_fn(y_test, y_pred))

accuracy: 0.79
Precision : 0.777
Recall : 0.806
F1 : 0.791


## DTM + 전처리를 형태소 변환만 사용 + GridSearchCV

In [40]:
okt = Okt()


def okt_tokenizer(text):
    """Split ``text`` into morpheme tokens with the shared ``okt`` tagger.

    Returns whatever ``okt.morphs(text)`` produces (the morpheme tokens as a
    list), which makes this usable as a vectorizer ``tokenizer`` callable.
    """
    return okt.morphs(text)

In [38]:
# Show the first rows again; the raw 'text' column is what the next split uses.
df.head()

Unnamed: 0,rating,text,y,tokenized
0,5,배공빠르고 굿,1,배공
1,2,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고,0,택배 엉망
2,5,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...,1,아주 바지 정말 구매 가격 대박 바느질 조금 가성 최고
3,2,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...,0,선물 전달 상품 머그컵 당황 바로 누락 확인 바로 선물 큰일 다시 생각
4,5,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ,1,민트 색상 손잡이 도로 사용


In [39]:
# Re-split with the same seed and ratio, this time using the raw review
# text as features instead of the pre-tokenized column.
y = df['y'].to_numpy().tolist()
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df['text'],
    y,
    random_state=0,
    test_size=0.2,
)

In [41]:
# Document-term matrix over unigrams and bigrams of Okt morphemes; terms seen
# in fewer than 3 training documents are dropped.
# token_pattern=None: the default regex pattern is never used once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" UserWarning on fit.
vect = CountVectorizer(
    tokenizer=okt_tokenizer,
    token_pattern=None,
    min_df=3,
    ngram_range=(1, 2),
)
X_train_tf = vect.fit_transform(X_train_texts)
X_test_tf = vect.transform(X_test_texts)



In [42]:
# Search the candidate values of LogisticRegression's C with 3-fold
# cross-validation on the morpheme-based DTM, ranking candidates by accuracy.
lr = LogisticRegression(solver='liblinear', random_state=0)
params = {'C': [1, 3.5, 4.5, 5.5, 10]}
grid_cv = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_cv.fit(X_train_tf, y_train)

# Show the winning parameter set and its mean CV score (4 decimals).
print(grid_cv.best_params_, round(grid_cv.best_score_, 4))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'C': 1} 0.9157


In [43]:
# Take the model selected by the grid search (exposed as best_estimator_)
# and predict labels for the held-out morpheme-based document-term matrix.
best_estimator = grid_cv.best_estimator_
y_pred = best_estimator.predict(X_test_tf)

In [44]:
# Print the four hold-out metrics. Accuracy is shown with two decimals,
# the remaining metrics with three.
for report_fmt, metric_fn in (
    ("accuracy: %.2f", accuracy_score),
    ("Precision : %.3f", precision_score),
    ("Recall : %.3f", recall_score),
    ("F1 : %.3f", f1_score),
):
    print(report_fmt % metric_fn(y_test, y_pred))

accuracy: 0.92
Precision : 0.923
Recall : 0.918
F1 : 0.921


## TF-IDF + 전처리를 기본 형태소 변환만 사용 + GridSearchCV

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams of Okt morphemes. Terms appearing
# in fewer than 3 training documents, or in more than 80% of them, are dropped.
# token_pattern=None: the default regex pattern is never used once a custom
# tokenizer is supplied, and leaving it set triggers a UserWarning on fit.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    min_df = 3,
    max_df = 0.8,
    tokenizer = okt_tokenizer,
    token_pattern = None,
)

# fit_transform learns the vocabulary/IDF and vectorizes the training corpus
# in one pass; fit() followed by transform() on the same texts would run the
# tokenizer over every training document twice for the same matrix.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train_texts)

# The test split is only transformed, using the statistics learned above.
tfidf_matrix_test = tfidf_vectorizer.transform(X_test_texts)



In [47]:
# Base estimator for the search. It is intentionally NOT fitted here:
# GridSearchCV fits its own clones of the estimator for every candidate/fold
# and refits the winner, so a prior lr_tfidf.fit(...) would be discarded work.
lr_tfidf = LogisticRegression(random_state = 0, solver = 'liblinear')

# Search the candidate values of C with 3-fold cross-validation on the
# training TF-IDF matrix, ranking candidates by accuracy.
params = { 'C': [1 ,3.5, 4.5, 5.5, 10 ] }
grid_cv = GridSearchCV(lr_tfidf , param_grid=params , cv=3 ,scoring='accuracy', verbose=1 )
grid_cv.fit(tfidf_matrix_train , y_train )

# Show the winning parameter set and its mean CV score (4 decimals).
print(grid_cv.best_params_ , round(grid_cv.best_score_,4))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'C': 3.5} 0.9192


In [48]:
# Predict the held-out TF-IDF matrix with the model chosen by the grid search.
best_estimator = grid_cv.best_estimator_
y_pred = best_estimator.predict(tfidf_matrix_test)

# Print the four hold-out metrics. Accuracy is shown with two decimals,
# the remaining metrics with three.
for report_fmt, metric_fn in (
    ("accuracy: %.2f", accuracy_score),
    ("Precision : %.3f", precision_score),
    ("Recall : %.3f", recall_score),
    ("F1 : %.3f", f1_score),
):
    print(report_fmt % metric_fn(y_test, y_pred))

accuracy: 0.92
Precision : 0.929
Recall : 0.916
F1 : 0.923


## max_df를 0.9로 수정 후 다시 테스트

In [49]:
# Same TF-IDF setup as before, but the document-frequency ceiling is raised:
# terms are now dropped only if they appear in more than 90% of training
# documents (or in fewer than 3).
# token_pattern=None: the default regex pattern is never used once a custom
# tokenizer is supplied, and leaving it set triggers a UserWarning on fit.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    min_df = 3,
    max_df = 0.9,
    tokenizer = okt_tokenizer,
    token_pattern = None,
)

# fit_transform learns the vocabulary/IDF and vectorizes the training corpus
# in one pass; fit() followed by transform() on the same texts would run the
# tokenizer over every training document twice for the same matrix.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train_texts)

# The test split is only transformed, using the statistics learned above.
tfidf_matrix_test = tfidf_vectorizer.transform(X_test_texts)



In [50]:
# Base estimator for the search. It is intentionally NOT fitted here:
# GridSearchCV fits its own clones of the estimator for every candidate/fold
# and refits the winner, so a prior lr_tfidf.fit(...) would be discarded work.
lr_tfidf = LogisticRegression(random_state = 0, solver = 'liblinear')

# Search the candidate values of C with 3-fold cross-validation on the
# training TF-IDF matrix, ranking candidates by accuracy.
params = { 'C': [1 ,3.5, 4.5, 5.5, 10 ] }
grid_cv = GridSearchCV(lr_tfidf , param_grid=params , cv=3 ,scoring='accuracy', verbose=1 )
grid_cv.fit(tfidf_matrix_train , y_train )

# Show the winning parameter set and its mean CV score (4 decimals).
print(grid_cv.best_params_ , round(grid_cv.best_score_,4))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
{'C': 3.5} 0.9192


In [51]:
# Predict the held-out TF-IDF matrix with the model chosen by the grid search.
best_estimator = grid_cv.best_estimator_
y_pred = best_estimator.predict(tfidf_matrix_test)

# Print the four hold-out metrics. Accuracy is shown with two decimals,
# the remaining metrics with three.
for report_fmt, metric_fn in (
    ("accuracy: %.2f", accuracy_score),
    ("Precision : %.3f", precision_score),
    ("Recall : %.3f", recall_score),
    ("F1 : %.3f", f1_score),
):
    print(report_fmt % metric_fn(y_test, y_pred))

accuracy: 0.92
Precision : 0.929
Recall : 0.916
F1 : 0.923


## DTM + 하이퍼파라미터 튜닝 + 전처리 수정 후 성능 평가
* accuracy: 0.92
* Precision : 0.923
* Recall : 0.918
* F1 : 0.921


## TF-IDF + 하이퍼파라미터 튜닝 + 전처리 수정 후 성능평가
* accuracy: 0.92
* Precision : 0.929
* Recall : 0.916
* F1 : 0.923