## Downstream Task

### 문서 분류

* 데이터 : 네이버 평점 데이터 (train)
* 분석 목적 : 네이버 평점에 대한 긍정과 부정 (이진 분류)
* 분석 성능 기준 : 정확도

In [1]:
# 정규 표현식 패키지
import re

# 토크나이저 패키지 
from nltk.tokenize import word_tokenize
from ckonlpy.tag import Twitter
from konlpy.tag import Kkma

from tokenizers import Tokenizer
from nltk.tokenize import RegexpTokenizer

# 벡터화 패키지
from sklearn.feature_extraction.text import TfidfVectorizer

# 분류 모델 패키지
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier

# 분류 모델 평가
from sklearn.metrics import classification_report

# 데이터 핸들링 
import numpy as np
import pandas as pd

# 기타
from tqdm import tqdm 

In [2]:
# Load the rating CSV and discard every row that contains a missing value.
train_df = pd.read_csv('./data/ratings_train.csv').dropna()
train_df

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


데이터 분리

In [3]:
# Positional hold-out split: the first `split_at` rows are used for training,
# everything after them for evaluation.
split_at = 100000

X_train = train_df['document'].iloc[:split_at]
y_train = train_df['label'].iloc[:split_at]
X_test = train_df['document'].iloc[split_at:]
y_test = train_df['label'].iloc[split_at:]

In [4]:
# Show the number of documents in the training and test splits.
X_train.shape, X_test.shape

((100000,), (49995,))

## 사용자 토크나이저 만들기

In [5]:
class UserTokenizers :
    """Bundle of tokenizers that each turn one text string into a list of tokens.

    Tokenizers that need no loaded resources are static methods; the ones that
    rely on an analyzer or tokenizer created in ``__init__`` are instance methods.
    """

    def __init__(self) -> None :
        # ckonlpy tagger, used by tokenizingKorEng for morpheme splitting.
        self.okt = Twitter()
        # Kkma tagger, used by konlpyNounsTokenizer for noun extraction.
        self.kkma = Kkma()
        # BPE tokenizer restored from a JSON file on disk. The attribute name
        # (spelling included) is left untouched so existing references keep working.
        self.bpe_tokenier_pretrained = Tokenizer.from_file('./tokenizer_data/bpe_tokenizer.json')

    @staticmethod
    def whitespaceTokenizer(data : str) -> list :
        """Split ``data`` on single space characters.

        Consecutive spaces produce empty-string tokens because the split is on
        the literal ``' '`` separator.
        """
        return data.split(' ')

    @staticmethod
    def regexsplitToken(data : str, pat : str = r'[\,\.!?\n]') -> list :
        """Split ``data`` wherever ``pat`` matches and return the cleaned pieces.

        Each piece is stripped of surrounding whitespace; pieces that are one
        character or shorter after stripping are dropped.
        """
        # Raw-string default: same regex as before, without invalid-escape warnings.
        stripped_pieces = (piece.strip() for piece in re.split(pat, data))
        return [piece for piece in stripped_pieces if len(piece) > 1]

    @staticmethod
    def regexselectToken(data : str, pat : str = r'[\w]+') -> list :
        """Return every substring of ``data`` that matches ``pat``."""
        return RegexpTokenizer(pat).tokenize(data)

    def BPETokenizer(self, data : str) -> list :
        """Encode ``data`` with the loaded BPE tokenizer and return its token strings."""
        return self.bpe_tokenier_pretrained.encode(data).tokens

    # Handles Korean and English together.
    def tokenizingKorEng(self, data : str) -> list :
        """Tokenize text that mixes Korean and English.

        Runs of Hangul (jamo and syllables) are joined with spaces and split
        into morphemes; runs of ASCII letters are joined with spaces and passed
        to ``word_tokenize``. The result lists all Korean tokens first, then all
        English tokens, so the original interleaving is not preserved.
        """
        kor_str = ' '.join(re.findall('[ㄱ-ㅎㅏ-ㅣ가-힣]+', data))
        eng_str = ' '.join(re.findall('[a-zA-Z]+', data))

        # Fixed: the tagger is stored as `self.okt`; the previous `self.kot`
        # did not exist and raised AttributeError on every call.
        kor_rs = self.okt.morphs(kor_str)
        eng_rs = word_tokenize(eng_str)

        return kor_rs + eng_rs

    # Tokenizer that extracts nouns only.
    def konlpyNounsTokenizer(self, data : str) -> list :
        """Return the nouns that the Kkma tagger finds in ``data``."""
        return self.kkma.nouns(data)

In [6]:
# Create the shared tokenizer bundle; the constructor instantiates both
# taggers and reads the BPE tokenizer file.
ut_cls = UserTokenizers()
ut_cls

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


<__main__.UserTokenizers at 0x172e3d68610>

In [7]:
# Try the noun tokenizer on one sample review.
# NOTE(review): X_train[5] is a label lookup on the integer index, and dropna()
# did not reset that index, so label 5 is not guaranteed to be position 5.
ut_cls.konlpyNounsTokenizer(X_train[5])

['걸음마',
 '3',
 '3세',
 '세',
 '초등학교',
 '1',
 '1학년생인',
 '학년',
 '생인',
 '8',
 '8살용영화',
 '살',
 '용',
 '영화',
 '별',
 '별반개',
 '반개']

In [8]:
# Try the BPE tokenizer on the same sample review for comparison.
ut_cls.BPETokenizer(X_train[5])

['막</w>',
 '걸',
 '음',
 '마</w>',
 '<unk>',
 '3',
 '세',
 '부터</w>',
 '초등학교</w>',
 '1',
 '학년',
 '생',
 '인</w>',
 '8',
 '살',
 '용',
 '영화</w>',
 '.</w>',
 'ㅋㅋㅋ</w>',
 '.</w>',
 '.</w>',
 '.</w>',
 '별',
 '반개도</w>',
 '아까움</w>',
 '.</w>']

불용어 적용하기

In [9]:
# Stop-word list handed to the vectorizer: a handful of short Korean particles.
stopwords = ['의', '가', '은', '는', '이', '을', '를', '으로', '에', '과']

벡터화

In [10]:
# TF-IDF vectorizer that tokenizes with the BPE tokenizer from `ut_cls`.
# token_pattern=None: the default regex is never used once a custom tokenizer
# is supplied, and leaving it set makes scikit-learn warn about the unused
# parameter.
# NOTE(review): the BPE tokens shown above carry a '</w>' end-of-word suffix
# (e.g. '영화</w>'), so plain entries such as '의' presumably never match a
# word-final token — confirm that the stop list has the intended effect.
# Optional extension that was left disabled: ngram_range=(1,2)
tfidf_vect = TfidfVectorizer(tokenizer=ut_cls.BPETokenizer, stop_words=stopwords, token_pattern=None)

In [11]:
# Fit the vectorizer on the training documents only; the test documents are
# not passed to fit.
tfidf_vect.fit(X_train)



In [12]:
# Inspect the feature names produced by the fitted vectorizer.
tfidf_vect.get_feature_names_out()

array(['!</w>', '"</w>', '%</w>', ..., '힘들었다</w>', '힘을</w>', '힘이</w>'],
      dtype=object)

In [13]:
class TransformVect : 
    """Apply a fitted vectorizer to data chunk by chunk and return one dense array."""

    def __init__(self, vec_model) -> None :
        # If this were built as a service, this would load a saved vec_model instead.
        self.vec_model = vec_model
    
    def transVect_run(self, chunk_size : int=500, data : pd.DataFrame=None) -> np.ndarray :
        """Vectorize ``data`` in slices of ``chunk_size`` rows.

        Each slice is passed to ``vec_model.transform`` and converted with
        ``toarray()``; the per-slice arrays are stacked along axis 0.

        Args:
            chunk_size: Number of rows transformed per call; must be >= 1.
            data: Sliceable collection of documents that supports ``len()``.

        Returns:
            The row-wise concatenation of all transformed slices.

        Raises:
            TypeError: If ``data`` is None.
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if data is None :
            raise TypeError('data must be a sliceable collection of documents, not None')
        if chunk_size < 1 :
            raise ValueError('chunk_size must be a positive integer')

        data_len = len(data)

        if data_len == 0 :
            # No chunk would be produced, so hand the empty input to the
            # vectorizer directly and let it decide the result (or the error).
            return self.vec_model.transform(data).toarray()

        # Collect the chunks first and stack them once: concatenating inside
        # the loop re-copies the whole accumulated matrix on every iteration.
        chunk_arrays = [
            self.vec_model.transform(data[st_idx : st_idx + chunk_size]).toarray()
            for st_idx in tqdm(range(0, data_len, chunk_size))
        ]

        return np.concatenate(chunk_arrays, 0)

In [14]:
# Wrap the fitted TF-IDF vectorizer so it can be applied chunk by chunk.
tfv_cls = TransformVect(tfidf_vect)

In [15]:
# Disabled: presumably shrinks both splits for a quick trial run.
# X_train, y_train = X_train[:500], y_train[:500]
# X_test, y_test = X_test[:100], y_test[:100]

# Disabled alternative: vectorize each split with a single transform call.
# x_train = tfidf_vect.transform(X_train).toarray()
# x_test = tfidf_vect.transform(X_test).toarray()

# Vectorize both splits 100 documents at a time.
# NOTE(review): the results are dense arrays; at the shapes reported below
# (100000 x 9859) that is roughly 7.9 GB for the training matrix, assuming
# float64 — confirm the dtype.
x_train = tfv_cls.transVect_run(chunk_size=100, data=X_train)
x_test = tfv_cls.transVect_run(chunk_size=100, data=X_test)

100%|██████████| 1000/1000 [22:16<00:00,  1.34s/it]
100%|██████████| 500/500 [03:51<00:00,  2.16it/s]


In [16]:
# Show the (rows, features) shape of both vectorized splits.
x_train.shape, x_test.shape

((100000, 9859), (49995, 9859))

In [36]:
# Display the vectorized training matrix.
x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
import pickle

# Disabled snippets for caching the vectorized matrices on disk with pickle.
# NOTE(review): only unpickle files you created yourself; pickle.load can
# execute arbitrary code from an untrusted file.

## Save pickle
# with open("x_train.pickle","wb") as fw:
#    pickle.dump(x_train, fw)

# with open("x_test.pickle","wb") as fw:
#   pickle.dump(x_test, fw)
 
## Load pickle
# with open("x_test.pickle","rb") as fr:
#    data = pickle.load(fr)
# print(data)
# NOTE(review): the next line looks like leftover sample output from a
# generic pickle example, not output of this notebook.
#['a', 'b', 'c']

# 분류 모델 적용

In [17]:
# Fixed seed so that the model results stay the same between runs;
# it is passed as random_state to the seeded classifiers below.
random_seed_num = 42

In [18]:
# Decision tree: depth capped at 5, seeded with the shared random seed.
clf_decision = DecisionTreeClassifier(max_depth=5, random_state=random_seed_num)
clf_decision.fit(x_train, y_train)

# Predict on the training split and on the held-out split.
y_train_pred, y_test_pred = (clf_decision.predict(split) for split in (x_train, x_test))

# Training report first, then the separator, then the test report.
print(
    classification_report(y_train, y_train_pred),
    '💕',
    classification_report(y_test, y_test_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.54      0.95      0.69     50115
           1       0.80      0.19      0.31     49885

    accuracy                           0.57    100000
   macro avg       0.67      0.57      0.50    100000
weighted avg       0.67      0.57      0.50    100000

💕
              precision    recall  f1-score   support

           0       0.54      0.95      0.69     25055
           1       0.78      0.19      0.30     24940

    accuracy                           0.57     49995
   macro avg       0.66      0.57      0.50     49995
weighted avg       0.66      0.57      0.50     49995



In [19]:
# Random forest: each tree capped at depth 5, seeded with the shared random seed.
rfc_decision = RandomForestClassifier(max_depth=5, random_state=random_seed_num)
rfc_decision.fit(x_train, y_train)

# Predict on the training split and on the held-out split.
y_train_pred, y_test_pred = (rfc_decision.predict(split) for split in (x_train, x_test))

# Training report first, then the separator, then the test report.
print(
    classification_report(y_train, y_train_pred),
    '💕',
    classification_report(y_test, y_test_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.66      0.88      0.76     50115
           1       0.82      0.55      0.66     49885

    accuracy                           0.72    100000
   macro avg       0.74      0.72      0.71    100000
weighted avg       0.74      0.72      0.71    100000

💕
              precision    recall  f1-score   support

           0       0.66      0.87      0.75     25055
           1       0.80      0.55      0.65     24940

    accuracy                           0.71     49995
   macro avg       0.73      0.71      0.70     49995
weighted avg       0.73      0.71      0.70     49995



In [20]:
# k-nearest neighbours: 3 neighbours, votes weighted by distance.
clf_kneighbors = KNeighborsClassifier(leaf_size=50, n_neighbors=3, weights='distance')
clf_kneighbors.fit(x_train, y_train)

# Predict on the training split and on the held-out split.
y_train_pred, y_test_pred = (clf_kneighbors.predict(split) for split in (x_train, x_test))

# Training report first, then the separator, then the test report.
print(
    classification_report(y_train, y_train_pred),
    '💕',
    classification_report(y_test, y_test_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     50115
           1       1.00      1.00      1.00     49885

    accuracy                           1.00    100000
   macro avg       1.00      1.00      1.00    100000
weighted avg       1.00      1.00      1.00    100000

💕
              precision    recall  f1-score   support

           0       0.77      0.76      0.76     25055
           1       0.76      0.77      0.76     24940

    accuracy                           0.76     49995
   macro avg       0.76      0.76      0.76     49995
weighted avg       0.76      0.76      0.76     49995



In [21]:
# Logistic regression: up to 3000 solver iterations, seeded with the shared random seed.
clf_logistic = LogisticRegression(random_state=random_seed_num, max_iter=3000)
clf_logistic.fit(x_train, y_train)

# Predict on the training split and on the held-out split.
y_train_pred, y_test_pred = (clf_logistic.predict(split) for split in (x_train, x_test))

# Training report first, then the separator, then the test report.
print(
    classification_report(y_train, y_train_pred),
    '💕',
    classification_report(y_test, y_test_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.87      0.88      0.88     50115
           1       0.88      0.87      0.88     49885

    accuracy                           0.88    100000
   macro avg       0.88      0.88      0.88    100000
weighted avg       0.88      0.88      0.88    100000

💕
              precision    recall  f1-score   support

           0       0.85      0.85      0.85     25055
           1       0.85      0.84      0.85     24940

    accuracy                           0.85     49995
   macro avg       0.85      0.85      0.85     49995
weighted avg       0.85      0.85      0.85     49995



In [22]:
# XGBoost: trees capped at depth 5, seeded with the shared random seed.
clf_xgb = XGBClassifier(max_depth=5, random_state=random_seed_num)
clf_xgb.fit(x_train, y_train)

# Predict on the training split and on the held-out split.
y_train_pred, y_test_pred = (clf_xgb.predict(split) for split in (x_train, x_test))

# Training report first, then the separator, then the test report.
print(
    classification_report(y_train, y_train_pred),
    '💕',
    classification_report(y_test, y_test_pred),
    sep='\n',
)

  if is_sparse(data):


              precision    recall  f1-score   support

           0       0.73      0.86      0.79     50115
           1       0.83      0.68      0.75     49885

    accuracy                           0.77    100000
   macro avg       0.78      0.77      0.77    100000
weighted avg       0.78      0.77      0.77    100000

💕
              precision    recall  f1-score   support

           0       0.71      0.84      0.77     25055
           1       0.80      0.66      0.73     24940

    accuracy                           0.75     49995
   macro avg       0.76      0.75      0.75     49995
weighted avg       0.76      0.75      0.75     49995



In [23]:
# Ensemble

# Voting modes
## soft : derive the result from each model's predicted probabilities
## hard : derive the result from the majority among each model's final labels

# Combine the logistic, k-neighbours and XGBoost classifiers defined earlier.
voting_members = [
    ('logistic', clf_logistic),
    ('KNN', clf_kneighbors),
    ('XGB', clf_xgb),
]
clf_ensemble = VotingClassifier(estimators=voting_members, voting='soft')
clf_ensemble.fit(x_train, y_train)

# Predict on the training split and on the held-out split.
y_train_pred, y_test_pred = (clf_ensemble.predict(split) for split in (x_train, x_test))

# Training report first, then the separator, then the test report.
print(
    classification_report(y_train, y_train_pred),
    '💕',
    classification_report(y_test, y_test_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     50115
           1       0.99      0.99      0.99     49885

    accuracy                           0.99    100000
   macro avg       0.99      0.99      0.99    100000
weighted avg       0.99      0.99      0.99    100000

💕
              precision    recall  f1-score   support

           0       0.83      0.84      0.83     25055
           1       0.84      0.83      0.83     24940

    accuracy                           0.83     49995
   macro avg       0.83      0.83      0.83     49995
weighted avg       0.83      0.83      0.83     49995



## 메모리 에러가 나는 경우의 대처 방법

1. 하드웨어 변경
    - 메모리 up
    - 메모리가 넉넉한 서버로 이동 후 분석
2. 코드로 제어
    - chunk_size : 분석 분할
    - 모델 제어 : word2vec / countVect / tfidfVect(통계 기반으로 메모리 사용이 많을 수 있음)
    <br>=> vocab 사이즈 제어
    <br>=> 피처 추출 : 전체 토큰화된 텍스트 데이터 -> 명사만 분석 

- 에러가 나는 케이스 
    - numpy : index 값이 특정값 이상(int64)인 경우 sorting 등에서 에러 발생 