In [1]:
import sys
sys.version

import re
from konlpy.tag import Okt          
from konlpy.tag import Komoran
from collections import Counter
import pandas as pd 

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
from wordcloud import WordCloud

from hanspell import spell_checker
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.utils.fixes import loguniform
SEED = 42

### 형태소 분석 및 품사 태깅 라이브러리
* https://konlpy.org/ko/v0.4.3/morph/#

## 한글 폰트 설정

In [None]:
# Register a Korean-capable font so Hangul text renders in matplotlib figures.
# NOTE(review): this absolute path only exists on Windows; on other systems
# point it at an installed Korean font file before running this cell.
font_path ="C:/Windows/fonts/malgun.ttf"
# Resolve the font family name from the font file, then make it the default.
font_name = font_manager.FontProperties(fname=font_path).get_name()
matplotlib.rc('font',family=font_name)

In [83]:
# Load the training reviews; the first CSV column is used as the index.
train = pd.read_csv('_data/train.csv', index_col=0)
# Print column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 1 to 5000
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  5000 non-null   object
 1   label     5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 117.2+ KB


In [84]:
# Load the test reviews; the first CSV column is used as the index.
test = pd.read_csv('_data/test.csv', index_col=0)
# Print column dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 1 to 5000
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   document  5000 non-null   object
dtypes: object(1)
memory usage: 78.1+ KB


# 형태소 및 단어 분리
* 영어만으로 작성된 리뷰도 있기 때문에 문자(`\w`)를 제외한 나머지를 공백으로 치환: `re.sub(r'[^\w]+', ' ', x)` --> 단독 자음을 삭제하고 숫자와 일부 특수문자(`;`, `!`, `~`)는 남기기 위해 `[^;!~0-9a-zA-Zㅏ-ㅣ가-힣]`로 변경 (이 패턴에서는 단독 모음 `ㅏ-ㅣ`는 삭제되지 않고 유지됨)

In [4]:
okt = Okt()

# Everything that is NOT one of: ';', '!', '~', a digit, an ASCII letter,
# a standalone Hangul vowel (ㅏ-ㅣ) or a complete Hangul syllable (가-힣).
# Compiled once so it is not re-parsed for every row.
_DISALLOWED_CHARS = re.compile(r'[^;!~0-9a-zA-Zㅏ-ㅣ가-힣]+')


def preprocessing(df):
    """Clean, spell-check and tokenize the ``document`` column of ``df``.

    The input frame is left untouched; all work happens on a copy, so the
    cell calling this function can be re-run without the raw data having
    been overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``document``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which

        * ``document`` has every run of disallowed characters replaced by a
          single space and surrounding whitespace stripped,
        * ``spell_ck`` holds ``spell_checker.check(...).checked`` for the
          cleaned text,
        * ``tokenized_stem`` holds the space-joined result of
          ``okt.morphs(..., stem=True)`` applied to ``spell_ck``.
    """
    # Work on a copy so the caller's DataFrame is not mutated in place.
    df = df.copy()
    df['document'] = df['document'].apply(
        lambda text: _DISALLOWED_CHARS.sub(' ', text).strip()
    )
    df['spell_ck'] = df['document'].apply(
        lambda text: spell_checker.check(text).checked
    )
    # Tokenize the spell-checked text (not the raw text) and stem each morpheme.
    df['tokenized_stem'] = df['spell_ck'].apply(
        lambda text: ' '.join(okt.morphs(text, stem=True))
    )

    return df

In [136]:
# Clean, spell-check and tokenize the training reviews, then preview the result.
train_df = preprocessing(train)
train_df.head()

Unnamed: 0_level_0,document,label,spell_ck,tokenized_stem
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,영상이나 음악이 이쁘다 해도 미화시킨 불륜일뿐,0,영상이나 음악이 이쁘다 해도 미화시킨 불륜일 뿐,영상 이나 음악 이 이쁘다 해도 미화 시키다 불륜 일 뿐
2,히치콕이 이 영화를 봤다면 분명 박수를 쳤을듯,1,히치콕이 이 영화를 봤다면 분명 손뼉을 쳤을 듯,히치콕 이 이 영화 를 보다 분명 손뼉 을 치다 듯
3,괜찮은 음악영화가 또 나왔군요!!! 따뜻한 겨울이 될 것 같아요~,1,괜찮은 음악영화가 또 나왔군요!!! 따뜻한 겨울이 될 것 같아요~,괜찮다 음악 영화 가 또 나오다 !!! 따뜻하다 겨울 이 되다 것 같다 ~
4,아무래도 20년도지난작품이라 지금보기는너무유치하다,0,아무래도 20년도 지난 작품이라 지금 보기는 너무 유치하다,아무래도 20년 도 지난 작품 이라 지금 보기 는 너무 유치하다
5,지금까지의 영화들이 그랬듯 이 영화역시 일본에 대한 미화는 여전하다,0,지금까지의 영화들이 그랬듯 이 영화 역시 일본에 대한 미화는 여전하다,지금 까지의 영화 들 이 그렇다 이 영화 역시 일본 에 대한 미화 는 여전하다


In [9]:
# Apply the same preprocessing to the test reviews, then preview the result.
test_df = preprocessing(test)
test_df.head()

Unnamed: 0_level_0,document,spell_ck,tokenized_stem
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,시간 때우기 좋은 영화 지루함,시간 때우기 좋은 영화 지루함,시간 때우다 좋다 영화 지루함
2,훈훈한 정이 느껴지는 영화! 가족끼리 드라마 보듯이 보면 딱~!,훈훈한 정이 느껴지는 영화! 가족끼리 드라마 보듯이 보면 딱~!,훈훈하다 정이 느껴지다 영화 ! 가족 끼리 드라마 보다 보다 딱 ~!
3,Childhood fantasy,Childhood fantasy,Childhood fantasy
4,멋있는 영화입니다 잊을 수 없는!,멋있는 영화입니다 잊을 수 없는!,멋있다 영화 이다 잊다 수 없다 !
5,너무 감동적이네요 펑펑 울었습니다,너무 감동적이네요 펑펑 울었습니다,너무 감동 적다 펑펑 울다


In [10]:
# Load the sample submission file; its `label` column is overwritten with predictions later.
submission = pd.read_csv('./_data/sample_submission.csv')
submission

Unnamed: 0,id,label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
4995,4996,0
4996,4997,0
4997,4998,0
4998,4999,0


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from tqdm.auto import tqdm
import numpy as np

In [12]:
def return_kfold_accuarcy(model, k: int = 5) -> float:
    """Return the mean validation accuracy of ``model`` over stratified k-fold CV.

    The folds are drawn from the notebook-level ``train_df`` (text column
    ``tokenized_stem``, target column ``label``), shuffled with ``SEED``.
    ``model`` is re-fitted on each fold's training part, so after the call it
    is left fitted on the last fold.

    Parameters
    ----------
    model
        Estimator or pipeline exposing ``fit(X, y)`` and ``predict(X)`` that
        accepts the raw ``tokenized_stem`` strings as ``X``.
    k : int, default 5
        Number of folds.

    Returns
    -------
    float
        Average of the per-fold accuracy scores.
    """
    texts = train_df["tokenized_stem"]
    targets = train_df["label"]
    splitter = StratifiedKFold(k, shuffle=True, random_state=SEED)

    fold_scores = []
    for fit_idx, val_idx in splitter.split(texts, targets):
        fit_part = train_df.iloc[fit_idx]
        val_part = train_df.iloc[val_idx]

        model.fit(fit_part["tokenized_stem"], fit_part["label"])
        val_pred = model.predict(val_part["tokenized_stem"])
        fold_scores.append(accuracy_score(val_part["label"], val_pred))

    return np.mean(fold_scores)

In [13]:
# TF-IDF features over unigrams and bigrams, tokenized with Okt morphemes.
# min_df=3 ignores terms found in fewer than 3 documents; max_df=0.9 ignores
# terms found in more than 90% of documents.
# token_pattern=None because a custom tokenizer is supplied, so the default
# regex pattern would not be used anyway.
tfidf = TfidfVectorizer(ngram_range=(1,2),
                       min_df=3,
                       max_df=0.9,
                       tokenizer=okt.morphs,
                       token_pattern=None)

In [21]:
# Candidate classifiers to compare. Every estimator that accepts a
# random_state is seeded with SEED so cross-validation scores are reproducible.
models = [
    ("naive_bayes", BernoulliNB()),
    ("SGD", SGDClassifier(random_state=SEED, n_jobs=-1)),
    ("LR", LogisticRegression(class_weight='balanced', max_iter=1000)),
    ("rfc", RandomForestClassifier(random_state=SEED, n_jobs=-1)),
    ("SVC", SVC(random_state=SEED)),
    # Seeded like the other models; without random_state the gradient-boosting
    # score can change from run to run.
    ("gdb", GradientBoostingClassifier(n_estimators=1000, random_state=SEED)),
]

# One (name, pipeline) pair per classifier: TF-IDF vectorization followed by
# the classifier. All pipelines reference the same `tfidf` object.
model_pipes = [(name, Pipeline([('tfidf', tfidf), (name, model)])) for name, model in models]


In [22]:
# Cross-validate every pipeline and collect [name, accuracy] pairs.
scores =[]
for model_name, model in tqdm(model_pipes, leave=False):
    acc = return_kfold_accuarcy(model)
    # The accuracy is stored as a string rounded to three decimals.
    scores.append([model_name, f"{acc:0.3f}"])
print(scores)

  0%|          | 0/3 [00:00<?, ?it/s]

[['naive_bayes', '0.877'], ['LR', '0.866'], ['SVC', '0.877']]


In [18]:
# Keep the three best models, highest accuracy first.
# `scores` holds accuracies as formatted strings, so convert them to float to
# sort numerically rather than lexicographically.
top3 = sorted(scores, key=lambda entry: float(entry[1]), reverse=True)[:3]
top3

[['naive_bayes', '0.877'], ['SVC', '0.877'], ['LR', '0.866']]

# 1. 가장 점수가 높은 3개 모델 stacking

In [21]:
# Only the three classifiers used for stacking are kept here; SGD, random
# forest and gradient boosting are left out of this list.
models = [
    ("naive_bayes", BernoulliNB()),
    ("LR", LogisticRegression(class_weight='balanced', max_iter=1000)),
    ("SVC", SVC(random_state=SEED)),
]

# One (name, pipeline) pair per classifier: TF-IDF followed by the classifier.
model_pipes = [(name, Pipeline([('tfidf',tfidf),(name, model)])) for name, model in models]


In [22]:
# Cross-validate the selected pipelines and collect [name, accuracy] pairs.
scores =[]
for model_name, model in tqdm(model_pipes, leave=False):
    acc = return_kfold_accuarcy(model)
    # The accuracy is stored as a string rounded to three decimals.
    scores.append([model_name, f"{acc:0.3f}"])
print(scores)

  0%|          | 0/3 [00:00<?, ?it/s]

[['naive_bayes', '0.877'], ['LR', '0.866'], ['SVC', '0.877']]


In [19]:
from sklearn.ensemble import StackingClassifier


In [24]:
# Base estimators for stacking: one TF-IDF + classifier pipeline per entry in `models`.
stack_models = [(name, Pipeline([('tfidf',tfidf),(name, model)])) for name, model in models]

# Stack the base pipelines (default final estimator) and report mean k-fold accuracy.
stacking = StackingClassifier(stack_models)
acc = return_kfold_accuarcy(stacking)
print(acc)

0.8795999999999999


In [25]:
# Refit the stacking ensemble on the full training set and predict the test labels.
stacking.fit(train_df['tokenized_stem'], train_df['label'])
stacking_pred = stacking.predict(test_df["tokenized_stem"])

In [26]:
stacking_pred

array([1, 1, 1, ..., 1, 0, 1], dtype=int64)

In [27]:
# Overwrite the placeholder labels with the stacking predictions.
# NOTE(review): assumes the rows of `submission` are in the same order as `test_df` — confirm.
submission['label'] = stacking_pred

In [28]:
submission

Unnamed: 0,id,label
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
4995,4996,0
4996,4997,0
4997,4998,1
4998,4999,0


In [86]:
# Write the submission file without the DataFrame index.
submission.to_csv('submission4.csv', index=False)

# 2. 각각 predict 후 앙상블

In [35]:
# Fit the vectorizer on the training text and vectorize it in a single pass;
# fit_transform avoids running the tokenizer over the training documents twice.
X_train_okt = tfidf.fit_transform(train_df['tokenized_stem'])
# Vectorize the test documents with the vocabulary and IDF weights learned above.
X_test_okt = tfidf.transform(test_df['tokenized_stem'])

In [40]:
# Bernoulli naive Bayes on the TF-IDF matrix; predict the test labels.
nb = BernoulliNB()
nb.fit(X_train_okt, train_df['label'])
nb_pred = nb.predict(X_test_okt)
nb_pred

array([0, 1, 0, ..., 1, 0, 1], dtype=int64)

In [41]:
# Class-weight-balanced logistic regression on the TF-IDF matrix; predict the test labels.
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
lr.fit(X_train_okt, train_df['label'])
lr_pred = lr.predict(X_test_okt)
lr_pred

array([1, 1, 0, ..., 1, 0, 1], dtype=int64)

In [42]:
# Support vector classifier on the TF-IDF matrix; predict the test labels.
svc = SVC(random_state=SEED)
svc.fit(X_train_okt, train_df['label'])
svc_pred = svc.predict(X_test_okt)
svc_pred

array([1, 1, 1, ..., 1, 0, 1], dtype=int64)

In [81]:
# Put the three prediction vectors side by side: columns 0 (NB), 1 (LR), 2 (SVC).
concat_pred = pd.concat([pd.Series(nb_pred), pd.Series(lr_pred), pd.Series(svc_pred)], axis=1)

In [98]:
# Majority vote over the three model predictions (columns 0, 1, 2): the mean
# of the 0/1 votes rounds to 1 when at least two models predict 1.
# Only the prediction columns are summed, so re-running this cell after
# `label` has been added does not fold the previous result into the vote.
concat_pred['label'] = round(concat_pred[[0, 1, 2]].sum(axis=1) / 3)

In [99]:
# Convert the rounded vote (float) into an integer class label.
concat_pred['label'] = concat_pred['label'].astype('int64')

In [100]:
concat_pred

Unnamed: 0,0,1,2,label
0,0,1,1,1
1,1,1,1,1
2,0,0,1,0
3,1,1,1,1
4,1,1,1,1
...,...,...,...,...
4995,0,0,0,0
4996,0,0,0,0
4997,1,1,1,1
4998,0,0,0,0


# 3. 전처리 다르게 해보기

In [125]:
# Experiment: uni- to tri-grams, terms must appear in at least 2 documents
# and in at most 90% of documents.
tfidf = TfidfVectorizer(ngram_range=(1, 3),
                        min_df=2,
                        max_df=0.9,
                        tokenizer=okt.morphs,
                        token_pattern=None)

# Candidate classifiers; every estimator that accepts a random_state is seeded.
models = [
    ("naive_bayes", BernoulliNB()),
    ("SGD", SGDClassifier(random_state=SEED, n_jobs=-1)),
    ("LR", LogisticRegression(class_weight='balanced', max_iter=1000)),
    ("rfc", RandomForestClassifier(random_state=SEED, n_jobs=-1)),
    ("SVC", SVC(random_state=SEED)),
    # Seeded so the gradient-boosting score is reproducible between runs.
    ("gdb", GradientBoostingClassifier(n_estimators=1000, random_state=SEED)),
]

# TF-IDF followed by the classifier, one pipeline per candidate.
model_pipes = [(name, Pipeline([('tfidf', tfidf), (name, model)])) for name, model in models]

# Cross-validate every pipeline; accuracies are stored as 3-decimal strings.
scores = []
for model_name, model in tqdm(model_pipes, leave=False):
    acc = return_kfold_accuarcy(model)
    scores.append([model_name, f"{acc:0.3f}"])
print(scores)

  0%|          | 0/6 [00:00<?, ?it/s]

[['naive_bayes', '0.881'], ['SGD', '0.869'], ['LR', '0.870'], ['rfc', '0.838'], ['SVC', '0.875'], ['gdb', '0.848']]


In [126]:
# Experiment: uni- and bi-grams, no minimum document frequency filter
# (min_df=1), terms in more than 90% of documents are dropped.
tfidf = TfidfVectorizer(ngram_range=(1, 2),
                        min_df=1,
                        max_df=0.9,
                        tokenizer=okt.morphs,
                        token_pattern=None)

# Candidate classifiers; every estimator that accepts a random_state is seeded.
models = [
    ("naive_bayes", BernoulliNB()),
    ("SGD", SGDClassifier(random_state=SEED, n_jobs=-1)),
    ("LR", LogisticRegression(class_weight='balanced', max_iter=1000)),
    ("rfc", RandomForestClassifier(random_state=SEED, n_jobs=-1)),
    ("SVC", SVC(random_state=SEED)),
    # Seeded so the gradient-boosting score is reproducible between runs.
    ("gdb", GradientBoostingClassifier(n_estimators=1000, random_state=SEED)),
]

# TF-IDF followed by the classifier, one pipeline per candidate.
model_pipes = [(name, Pipeline([('tfidf', tfidf), (name, model)])) for name, model in models]

# Cross-validate every pipeline; accuracies are stored as 3-decimal strings.
scores = []
for model_name, model in tqdm(model_pipes, leave=False):
    acc = return_kfold_accuarcy(model)
    scores.append([model_name, f"{acc:0.3f}"])
print(scores)

  0%|          | 0/6 [00:00<?, ?it/s]

[['naive_bayes', '0.884'], ['SGD', '0.878'], ['LR', '0.865'], ['rfc', '0.840'], ['SVC', '0.872'], ['gdb', '0.846']]


In [127]:
# Experiment: uni- and bi-grams with the vectorizer's default min_df / max_df.
tfidf = TfidfVectorizer(ngram_range=(1, 2),
                        tokenizer=okt.morphs,
                        token_pattern=None)

# Candidate classifiers; every estimator that accepts a random_state is seeded.
models = [
    ("naive_bayes", BernoulliNB()),
    ("SGD", SGDClassifier(random_state=SEED, n_jobs=-1)),
    ("LR", LogisticRegression(class_weight='balanced', max_iter=1000)),
    ("rfc", RandomForestClassifier(random_state=SEED, n_jobs=-1)),
    ("SVC", SVC(random_state=SEED)),
    # Seeded so the gradient-boosting score is reproducible between runs.
    ("gdb", GradientBoostingClassifier(n_estimators=1000, random_state=SEED)),
]

# TF-IDF followed by the classifier, one pipeline per candidate.
model_pipes = [(name, Pipeline([('tfidf', tfidf), (name, model)])) for name, model in models]

# Cross-validate every pipeline; accuracies are stored as 3-decimal strings.
scores = []
for model_name, model in tqdm(model_pipes, leave=False):
    acc = return_kfold_accuarcy(model)
    scores.append([model_name, f"{acc:0.3f}"])
print(scores)

  0%|          | 0/6 [00:00<?, ?it/s]

[['naive_bayes', '0.884'], ['SGD', '0.878'], ['LR', '0.865'], ['rfc', '0.840'], ['SVC', '0.872'], ['gdb', '0.851']]


In [128]:
# Experiment: uni- to tri-grams with the vectorizer's default min_df / max_df.
tfidf = TfidfVectorizer(ngram_range=(1, 3),
                        tokenizer=okt.morphs,
                        token_pattern=None)

# Candidate classifiers; every estimator that accepts a random_state is seeded.
models = [
    ("naive_bayes", BernoulliNB()),
    ("SGD", SGDClassifier(random_state=SEED, n_jobs=-1)),
    ("LR", LogisticRegression(class_weight='balanced', max_iter=1000)),
    ("rfc", RandomForestClassifier(random_state=SEED, n_jobs=-1)),
    ("SVC", SVC(random_state=SEED)),
    # Seeded so the gradient-boosting score is reproducible between runs.
    ("gdb", GradientBoostingClassifier(n_estimators=1000, random_state=SEED)),
]

# TF-IDF followed by the classifier, one pipeline per candidate.
model_pipes = [(name, Pipeline([('tfidf', tfidf), (name, model)])) for name, model in models]

# Cross-validate every pipeline; accuracies are stored as 3-decimal strings.
scores = []
for model_name, model in tqdm(model_pipes, leave=False):
    acc = return_kfold_accuarcy(model)
    scores.append([model_name, f"{acc:0.3f}"])
print(scores)

  0%|          | 0/6 [00:00<?, ?it/s]

[['naive_bayes', '0.882'], ['SGD', '0.881'], ['LR', '0.861'], ['rfc', '0.834'], ['SVC', '0.869'], ['gdb', '0.845']]


# 최종

In [129]:
# Final configuration: uni- and bi-grams with default document-frequency limits.
tfidf = TfidfVectorizer(ngram_range=(1,2),
                       tokenizer=okt.morphs,
                       token_pattern=None)

# The three classifiers used as base estimators for the final stacking model.
models = [
    ("naive_bayes", BernoulliNB()),
    ("SGD", SGDClassifier(random_state=SEED, n_jobs=-1)),
    ("SVC", SVC(random_state=SEED)),
]

# TF-IDF followed by the classifier, one pipeline per candidate.
model_pipes = [(name, Pipeline([('tfidf',tfidf),(name, model)])) for name, model in models]

# Cross-validate every pipeline; accuracies are stored as 3-decimal strings.
scores =[]
for model_name, model in tqdm(model_pipes, leave=False):
    acc = return_kfold_accuarcy(model)
    scores.append([model_name, f"{acc:0.3f}"])
print(scores)

  0%|          | 0/3 [00:00<?, ?it/s]

[['naive_bayes', '0.884'], ['SGD', '0.878'], ['SVC', '0.872']]


In [130]:
# Base estimators for stacking: one TF-IDF + classifier pipeline per entry in `models`.
stack_models = [(name, Pipeline([('tfidf',tfidf),(name, model)])) for name, model in models]

# Stack the base pipelines (default final estimator) and report mean k-fold accuracy.
stacking = StackingClassifier(stack_models)
acc = return_kfold_accuarcy(stacking)
print(acc)

# Refit on the full training set and predict the test labels.
stacking.fit(train_df['tokenized_stem'], train_df['label'])
stacking_pred = stacking.predict(test_df["tokenized_stem"])

# Overwrite the submission labels with the predictions and display the frame.
# NOTE(review): assumes the rows of `submission` are in the same order as `test_df` — confirm.
submission['label'] = stacking_pred
submission

0.8874000000000001


Unnamed: 0,id,label
0,1,0
1,2,1
2,3,0
3,4,1
4,5,1
...,...,...
4995,4996,0
4996,4997,0
4997,4998,1
4998,4999,0


In [131]:
# Write the final submission file without the DataFrame index.
submission.to_csv('submission5.csv', index=False)

# public 0.882 (15위) private 0.88833(24위)

### 특수문자, 자음 모음 전처리 안 할 경우 어떻게 될지 궁금하다. 
### 뒤늦게 참여해서 아쉬운 점이 많다!