In [1]:
import pandas as pd
from konlpy.tag import Okt; t = Okt()
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train = pd.read_csv('./datas/train_ver1', index_col=[0])
train.tail()

Unnamed: 0,comments,bias_label,gender_label,hate_label,news_title,comment_pos,title_pos
7891,"힘내세요,응원합니다",none,False,none,"허지웅 허투루 넘길 말 없었다,솔직하게 드러냈던 속 사정","[('힘내세요', 'Verb'), (',', 'Punctuation'), ('응원'...","[('허지웅', 'Noun'), ('허투루', 'Noun'), ('넘길', 'Ver..."
7892,"힘내세요,삼가 고인의 명복을 빕니다",none,False,none,이혜경 오 캐롤 공연 중 남편 오정욱 부 고 오열 속 발인 종합,"[('힘내세요', 'Verb'), (',', 'Punctuation'), ('삼가'...","[('이혜경', 'Noun'), ('오', 'Noun'), ('캐롤', 'Noun'..."
7893,힘내세용 항상 응원합니닷,none,False,none,설경구 송윤아 아들과 즐거운 하루 전 엄마니까요,"[('힘내세용', 'Verb'), ('항상', 'Noun'), ('응원', 'Nou...","[('설경구', 'Noun'), ('송윤아', 'Noun'), ('아들', 'Nou..."
7894,"힘내 소연기로 답해요,나도 53살 인데 이런 일 저런 일 다 있더라구 요인격을 믿습...",none,False,none,SC 현장 연예인 인생 협박 유감 미소 잃은 최민수 보복운전 혐의 2차 공판 종합,"[('힘내', 'Verb'), ('소', 'Modifier'), ('연기', 'No...","[('SC', 'Alpha'), ('현장', 'Noun'), ('연예인', 'Nou..."
7895,힘들면 관뒀어야지 그게 현명한 거다,none,False,none,단독 스태프 사망 사고 서른 이지만 결국 오늘 촬영 취소,"[('힘들면', 'Adjective'), ('관', 'Noun'), ('뒀어야지',...","[('단독', 'Noun'), ('스태프', 'Noun'), ('사망', 'Noun..."


In [3]:
dev = pd.read_csv('./datas/dev.hate.csv')
dev.rename(columns={'label': 'hate_label'}, inplace=True)
dev.tail()

Unnamed: 0,comments,hate_label
466,지현우 범죄 저지르지 않았나요?,offensive
467,여자인생 망칠 일 있나 ㅋㅋ,hate
468,근데 전라도에서 사고가 났는데 굳이 서울까지 와서 병원에 가느 이유는?,offensive
469,"할매젖x, 뱃살x, 몸매 s라인, 유륜은 적당해야됨(너무크거나 너무 작아도 x), ...",hate
470,남자가 잘못한거라면... 반성도 없다면...나였다면 ... 여자처럼 아주 못되게 할...,none


In [4]:
def t_tokenizer(text):
    tokens_ko = t.morphs(text)
    return tokens_ko

In [5]:
tfidf_vect = TfidfVectorizer(tokenizer=t_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)
tfidf_vect.fit(train['comments'])
tfidf_matrix_train = tfidf_vect.transform(train['comments'])

## LGBMClassifier compare train and dev data

In [8]:
from lightgbm import LGBMClassifier
import time

start_time = time.time()
lgbm_clf = LGBMClassifier(n_estimators=400)
lgbm_clf.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 7.578863859176636


In [9]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

tfidf_matrix_dev = tfidf_vect.transform(dev['comments'])
preds = lgbm_clf.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5414012738853503, 0.5360114752270562)

## DecisionTreeClassifier compare train and dev data

In [10]:
from sklearn.tree import DecisionTreeClassifier

decision_clf = DecisionTreeClassifier(max_depth=2, random_state=13)
decision_clf.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 8.65710735321045


In [11]:
preds = decision_clf.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.4118895966029724, 0.33302547396349347)

## LogisticRegression compare train and dev data

In [12]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='liblinear', random_state=13)
lr.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 8.741127967834473


In [13]:
preds = lr.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5944798301486199, 0.5861227653087044)

## GradientBoostingClassifier compare train and dev data

In [53]:
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(random_state=13)
gb_clf.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 944.8058462142944


In [54]:
preds = gb_clf.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5095541401273885, 0.48810533169257236)

In [56]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.3.1-py3-none-win_amd64.whl (95.2 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.3.1


## XGBClassifier compare train and dev data

In [12]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_clf.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 153.06296706199646


In [13]:
preds = xgb_clf.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5222929936305732, 0.5038136262817716)

In [59]:
from sklearn.model_selection import train_test_split

msg_train, msg_test, class_train, class_test = train_test_split(tfidf_matrix_train, train['hate_label'], test_size=0.2, stratify=train['hate_label'])

In [60]:
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

models = []
models.append(('RandomForestClassifier', RandomForestClassifier()))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
models.append(('AdaBoostClassifier', AdaBoostClassifier()))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
models.append(('LogisticRegression', LogisticRegression()))
models.append(('LGBMClassifier', LGBMClassifier()))

In [61]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

for name, model in models:
    clf = model
    clf.fit(msg_train, class_train)
    
    y_pred_tr = clf.predict(msg_train)
    y_pred_test = clf.predict(msg_test)
    
    print('Train Acc : ', accuracy_score(class_train, y_pred_tr))
    print('Test Acc : ', accuracy_score(class_test, y_pred_test))

Train Acc :  0.9988917036098797
Test Acc :  0.5360759493670886
Train Acc :  0.9988917036098797
Test Acc :  0.46392405063291137
Train Acc :  0.5582647245091831
Test Acc :  0.510759493670886
Train Acc :  0.652786573780874
Test Acc :  0.5259493670886076
Train Acc :  0.8530715642811906
Test Acc :  0.5430379746835443
Train Acc :  0.8096896770107663
Test Acc :  0.5316455696202531


In [62]:
train_score = []
test_score = []
names = []
f1score = []

for name, model in models:
    clf = model
    clf.fit(msg_train, class_train)
    
    y_pred_tr = clf.predict(msg_train)
    y_pred_test = clf.predict(msg_test)
    
    names.append(name)
    test_score.append(accuracy_score(class_test, y_pred_test))
    train_score.append(accuracy_score(class_train, y_pred_tr))
    f1score.append(f1_score(class_test, y_pred_test, average='macro'))

In [63]:
result = pd.DataFrame({'model name':names, 
                       'train score':train_score, 
                       'test score':test_score,
                       'f1 score': f1score}) 
result

Unnamed: 0,model name,train score,test score,f1 score
0,RandomForestClassifier,0.998892,0.532911,0.474092
1,DecisionTreeClassifier,0.998892,0.473418,0.441096
2,AdaBoostClassifier,0.558265,0.510759,0.44448
3,GradientBoostingClassifier,0.652787,0.524684,0.451602
4,LogisticRegression,0.853072,0.543038,0.507301
5,LGBMClassifier,0.80969,0.531646,0.498601
