In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
drive.mount('/content/drive')

In [None]:
import os

# Switch the working directory to the project folder on the mounted Drive so that
# the relative file names used later (the CSV input, the .pkl outputs) resolve there.
os.chdir('/content/drive/MyDrive/Colab Notebooks/캡스톤')

In [None]:
import pandas as pd
import numpy as np
import re

# Load the labelled sentences.
data = pd.read_csv("emotionDataFin.csv")

# Discard label 5, then fold the remaining labels into a binary target:
# 1-4 become 0 and 6-7 become 1 (any other label value is left untouched).
data = data[data["emotion"] != 5]
data.loc[data["emotion"].isin([1, 2, 3, 4]), "emotion"] = 0
data.loc[data["emotion"].isin([6, 7]), "emotion"] = 1

# Remove rows that have a missing value in any column and renumber the index from 0.
data = data.dropna(axis=0).reset_index(drop=True)

# Replace every run of digits with a single space (\d is the regex class for a digit).
data["sentence"] = data["sentence"].map(lambda sentence: re.sub(r"\d+", " ", sentence))

# Keep only the label and the text columns.
data = data[["emotion", "sentence"]]

In [None]:
# Show how many sentences carry each emotion label after the relabelling done above.
data["emotion"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the shuffled split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=777,
)

# Renumber both partitions from 0.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
!pip install konlpy

In [None]:
# Newer KoNLPy releases expose this tagger as Okt and only keep Twitter as a
# deprecated alias; prefer Okt and fall back to Twitter on older installations.
try:
    from konlpy.tag import Okt as _MorphTagger
except ImportError:
    from konlpy.tag import Twitter as _MorphTagger

# The global name is kept as `twitter` so existing references keep working.
twitter = _MorphTagger()


def tw_tokenizer(text):
    """Tokenize ``text`` into morphemes.

    Args:
        text: The sentence to split.

    Returns:
        The result of the tagger's ``morphs(text)`` call (the morpheme tokens).
    """
    return twitter.morphs(text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Build TF-IDF features using the morphs()-based tw_tokenizer, with unigrams and bigrams
# (ngram_range=(1,2)) and the document-frequency cut-offs min_df=3 / max_df=0.9.
tfidf_vect = TfidfVectorizer(tokenizer=tw_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)
# fit_transform learns the vocabulary and vectorizes the training sentences in one pass,
# so each sentence goes through the (slow) morphological tokenizer once instead of twice.
tfidf_matrix_train = tfidf_vect.fit_transform(train['sentence'])

In [None]:
import joblib

# Save the fitted TF-IDF vectorizer to disk.
joblib.dump(tfidf_vect, 'tfidf_vect.pkl')

# Save the tw_tokenizer function alongside it.
# NOTE(review): pickle-based serializers normally store a plain function by reference
# (module + name) rather than its code, so whoever loads these files presumably has to
# define/import tw_tokenizer under the same name first -- confirm against the loading code.
joblib.dump(tw_tokenizer, 'tw_tokenizer.pkl')

In [None]:
# Sentiment classifier: logistic regression with a fixed random seed.
lg_clf = LogisticRegression(random_state=0)

# Tune the C parameter with a 3-fold grid search scored by accuracy.
params = dict(C=[1, 3.5, 4.5, 5.5, 10])
grid_cv = GridSearchCV(
    lg_clf,
    param_grid=params,
    cv=3,
    scoring='accuracy',
    verbose=1,
)
grid_cv.fit(tfidf_matrix_train, train['emotion'])

In [None]:
# Report the best parameter setting and its cross-validation score rounded to 4 decimals.
best_cv_score = round(grid_cv.best_score_, 4)
print(grid_cv.best_params_, best_cv_score)

In [None]:
from sklearn.metrics import accuracy_score

# Take the classifier that the grid search selected and use it as-is.
best_estimator = grid_cv.best_estimator_

# Convert the test sentences to TF-IDF features with the vectorizer fitted on the training data.
tfidf_matrix_test = tfidf_vect.transform(test['sentence'])

# Predict a label for every test sentence.
preds = best_estimator.predict(tfidf_matrix_test)

In [None]:
# Accuracy of the predictions against the true test labels.
test_accuracy = accuracy_score(test['emotion'], preds)
print('Logistic Regression 정확도: ', test_accuracy)

In [None]:
# Example sentence to classify.
text = '원하는 대학에 합격했어'

# Apply the same digit-stripping that was applied to the training sentences, so the
# vectorizer receives input in the same form it was fitted on.
cleaned_text = re.sub(r"\d+", " ", text)
tfidf_matrix = tfidf_vect.transform([cleaned_text])

prediction = best_estimator.predict(tfidf_matrix)

# predict() was given exactly one row, so read that single label explicitly instead of
# relying on the truthiness of a one-element array comparison.
if prediction[0] == 1:
    print("Positive sentiment")
else:
    print("Negative sentiment")

In [None]:
import pickle

# Serialize the selected classifier to modelMyData.pkl in the working directory.
with open('modelMyData.pkl', 'wb') as model_file:
    pickle.dump(best_estimator, model_file)