In [1]:
import pandas as pd
from konlpy.tag import Okt; t = Okt()
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train = pd.read_csv('./datas/train_ver1', index_col=[0])
train.tail()

Unnamed: 0,comments,bias_label,gender_label,hate_label,news_title,comment_pos,title_pos
7891,"힘내세요,응원합니다",none,False,none,"허지웅 허투루 넘길 말 없었다,솔직하게 드러냈던 속 사정","[('힘내세요', 'Verb'), (',', 'Punctuation'), ('응원'...","[('허지웅', 'Noun'), ('허투루', 'Noun'), ('넘길', 'Ver..."
7892,"힘내세요,삼가 고인의 명복을 빕니다",none,False,none,이혜경 오 캐롤 공연 중 남편 오정욱 부 고 오열 속 발인 종합,"[('힘내세요', 'Verb'), (',', 'Punctuation'), ('삼가'...","[('이혜경', 'Noun'), ('오', 'Noun'), ('캐롤', 'Noun'..."
7893,힘내세용 항상 응원합니닷,none,False,none,설경구 송윤아 아들과 즐거운 하루 전 엄마니까요,"[('힘내세용', 'Verb'), ('항상', 'Noun'), ('응원', 'Nou...","[('설경구', 'Noun'), ('송윤아', 'Noun'), ('아들', 'Nou..."
7894,"힘내 소연기로 답해요,나도 53살 인데 이런 일 저런 일 다 있더라구 요인격을 믿습...",none,False,none,SC 현장 연예인 인생 협박 유감 미소 잃은 최민수 보복운전 혐의 2차 공판 종합,"[('힘내', 'Verb'), ('소', 'Modifier'), ('연기', 'No...","[('SC', 'Alpha'), ('현장', 'Noun'), ('연예인', 'Nou..."
7895,힘들면 관뒀어야지 그게 현명한 거다,none,False,none,단독 스태프 사망 사고 서른 이지만 결국 오늘 촬영 취소,"[('힘들면', 'Adjective'), ('관', 'Noun'), ('뒀어야지',...","[('단독', 'Noun'), ('스태프', 'Noun'), ('사망', 'Noun..."


In [3]:
dev = pd.read_csv('./datas/dev.hate.csv')
dev.rename(columns={'label': 'hate_label'}, inplace=True)
dev.tail()

Unnamed: 0,comments,hate_label
466,지현우 범죄 저지르지 않았나요?,offensive
467,여자인생 망칠 일 있나 ㅋㅋ,hate
468,근데 전라도에서 사고가 났는데 굳이 서울까지 와서 병원에 가느 이유는?,offensive
469,"할매젖x, 뱃살x, 몸매 s라인, 유륜은 적당해야됨(너무크거나 너무 작아도 x), ...",hate
470,남자가 잘못한거라면... 반성도 없다면...나였다면 ... 여자처럼 아주 못되게 할...,none


In [4]:
def t_tokenizer(text):
    tokens_ko = t.morphs(text)
    return tokens_ko

In [12]:
tfidf_vect = TfidfVectorizer(tokenizer=t_tokenizer, ngram_range=(1,2), min_df=5, max_df=0.9, sublinear_tf=True, max_features=50000)
tfidf_vect.fit(train['comments'])
tfidf_matrix_train = tfidf_vect.transform(train['comments'])



## LGBMClassifier compare train and dev data

In [13]:
from lightgbm import LGBMClassifier
import time

start_time = time.time()
lgbm_clf = LGBMClassifier(n_estimators=400)
lgbm_clf.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 9.332458257675171


In [14]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

tfidf_matrix_dev = tfidf_vect.transform(dev['comments'])
preds = lgbm_clf.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5414012738853503, 0.5370598485758923)

## DecisionTreeClassifier compare train and dev data

In [15]:
from sklearn.tree import DecisionTreeClassifier

decision_clf = DecisionTreeClassifier(max_depth=2, random_state=13)
decision_clf.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 10.518273115158081


In [16]:
preds = decision_clf.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.4118895966029724, 0.33302547396349347)

## LogisticRegression compare train and dev data

In [17]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='sag', penalty='l2', random_state= 1000)
lr.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 10.634006261825562


In [18]:
preds = lr.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5944798301486199, 0.5866447728516694)

In [22]:
len(preds)

471

In [23]:
wrong_result = []

for n in range(0, len(dev['hate_label'])):
    if preds[n] != dev['hate_label'][n]:
        wrong_result.append(n)
len(wrong_result)

193

In [24]:
import random

samples = random.choices(population=wrong_result, k=300)
samples

[3,
 199,
 171,
 219,
 70,
 324,
 273,
 70,
 55,
 414,
 393,
 425,
 301,
 225,
 50,
 433,
 92,
 325,
 199,
 270,
 224,
 325,
 173,
 358,
 96,
 186,
 352,
 44,
 154,
 312,
 345,
 100,
 332,
 329,
 230,
 95,
 438,
 173,
 219,
 159,
 111,
 66,
 426,
 329,
 96,
 387,
 272,
 114,
 394,
 92,
 431,
 61,
 157,
 281,
 359,
 309,
 217,
 163,
 41,
 230,
 15,
 2,
 193,
 252,
 113,
 37,
 173,
 147,
 147,
 220,
 76,
 272,
 41,
 71,
 442,
 241,
 347,
 466,
 52,
 210,
 262,
 100,
 43,
 50,
 252,
 151,
 225,
 100,
 193,
 49,
 325,
 434,
 271,
 95,
 431,
 258,
 309,
 239,
 469,
 225,
 226,
 322,
 11,
 352,
 341,
 309,
 391,
 469,
 259,
 11,
 163,
 181,
 217,
 50,
 401,
 173,
 76,
 198,
 453,
 312,
 322,
 345,
 387,
 171,
 308,
 403,
 216,
 181,
 172,
 298,
 380,
 431,
 224,
 217,
 409,
 281,
 24,
 138,
 43,
 166,
 19,
 332,
 394,
 74,
 322,
 458,
 470,
 171,
 312,
 235,
 453,
 387,
 403,
 322,
 220,
 199,
 331,
 149,
 179,
 273,
 47,
 224,
 469,
 401,
 225,
 156,
 285,
 66,
 147,
 52,
 272,
 29,
 89,
 4

In [25]:
df = pd.DataFrame(preds[wrong_result])
df

Unnamed: 0,0
0,offensive
1,none
2,offensive
3,offensive
4,none
...,...
188,none
189,none
190,none
191,none


In [26]:
df1 = dev.iloc[wrong_result]
df1['error'] = preds[wrong_result]
df1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,comments,hate_label,error
2,알바쓰고많이만들면되지 돈욕심없으면골목식당왜나온겨 기댕기게나하고 산에가서팔어라,hate,offensive
3,설마 ㅈ 현정 작가 아니지,hate,none
10,이경규가 이런거보면 세련되긴함 저 나이에 차은우 누가 알꼬 아무리 잘생겼다해도 배워...,none,offensive
11,아c발 어쩌라고 뭔기사가계속나오냐,hate,offensive
15,둘이 화장실가서 싸우길,offensive,none
...,...,...,...
463,강용석도 찌질하네 과거들추기는 추하다,offensive,none
466,지현우 범죄 저지르지 않았나요,offensive,none
468,근데 전라도에서 사고가 났는데 굳이 서울까지 와서 병원에 가느 이유는,offensive,none
469,할매젖x 뱃살x 몸매 s라인 유륜은 적당해야됨너무크거나 너무 작아도 x 엉덩이가 빵...,hate,none


In [27]:
df1.to_csv('./datas/df1_wrong_result', index=False)

In [28]:
df1['hate_label'].value_counts().to_frame()

Unnamed: 0,hate_label
offensive,117
hate,62
none,14


In [29]:
df1['error'].value_counts().to_frame()

Unnamed: 0,error
none,131
offensive,46
hate,16


In [30]:
np.unique(preds, return_counts=True)

(array(['hate', 'none', 'offensive'], dtype=object),
 array([ 76, 277, 118], dtype=int64))

## GradientBoostingClassifier compare train and dev data

In [65]:
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(random_state=13)
gb_clf.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 1442.2418930530548


In [66]:
preds = gb_clf.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5010615711252654, 0.46740606218993697)

## XGBClassifier compare train and dev data

In [67]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_clf.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)





fit time: 1450.349557876587


In [68]:
preds = xgb_clf.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5307855626326964, 0.5075256284933704)

In [36]:
## RandomForestClassifier compare train and dev data

In [69]:
from sklearn.ensemble import RandomForestClassifier

r_clf = RandomForestClassifier(max_depth=4, random_state=13)
r_clf.fit(tfidf_matrix_train, train['hate_label'])

preds = r_clf.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.33970276008492567, 0.16904384574749076)

In [70]:
from sklearn.model_selection import train_test_split

msg_train, msg_test, class_train, class_test = train_test_split(tfidf_matrix_train, train['hate_label'], test_size=0.2, stratify=train['hate_label'])

In [39]:
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

models = []
models.append(('RandomForestClassifier', RandomForestClassifier()))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
models.append(('AdaBoostClassifier', AdaBoostClassifier()))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
models.append(('LogisticRegression', LogisticRegression()))
models.append(('LGBMClassifier', LGBMClassifier()))

In [40]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

for name, model in models:
    clf = model
    clf.fit(msg_train, class_train)
    
    y_pred_tr = clf.predict(msg_train)
    y_pred_test = clf.predict(msg_test)
    
    print('Train Acc : ', accuracy_score(class_train, y_pred_tr))
    print('Test Acc : ', accuracy_score(class_test, y_pred_test))

Train Acc :  0.9990500316656111
Test Acc :  0.5379746835443038
Train Acc :  0.9990500316656111
Test Acc :  0.47025316455696203
Train Acc :  0.5508233058898037
Test Acc :  0.5082278481012659
Train Acc :  0.6489867004433185
Test Acc :  0.5208860759493671
Train Acc :  0.8552881570614312
Test Acc :  0.5620253164556962
Train Acc :  0.8180810639645345
Test Acc :  0.5430379746835443


In [41]:
train_score = []
test_score = []
names = []
f1score = []

for name, model in models:
    clf = model
    clf.fit(msg_train, class_train)
    
    y_pred_tr = clf.predict(msg_train)
    y_pred_test = clf.predict(msg_test)
    
    names.append(name)
    test_score.append(accuracy_score(class_test, y_pred_test))
    train_score.append(accuracy_score(class_train, y_pred_tr))
    f1score.append(f1_score(class_test, y_pred_test, average='macro'))

In [42]:
result = pd.DataFrame({'model name':names, 
                       'train score':train_score, 
                       'test score':test_score,
                       'f1 score': f1score}) 
result

Unnamed: 0,model name,train score,test score,f1 score
0,RandomForestClassifier,0.99905,0.538608,0.49302
1,DecisionTreeClassifier,0.99905,0.478481,0.452573
2,AdaBoostClassifier,0.550823,0.508228,0.463536
3,GradientBoostingClassifier,0.64867,0.527848,0.466697
4,LogisticRegression,0.855288,0.562025,0.53279
5,LGBMClassifier,0.818081,0.543038,0.516057
