In [102]:
import pandas as pd
from konlpy.tag import Okt; t = Okt()
from sklearn.feature_extraction.text import TfidfVectorizer

In [103]:
train = pd.read_csv('./datas/train_ver1', index_col=[0])
dev = pd.read_csv('./datas/dev.hate.csv')
dev.rename(columns={'label': 'hate_label'}, inplace=True)

In [104]:
def t_tokenizer(text):
    tokens_ko = t.morphs(text)
    return tokens_ko

In [105]:
tfidf_vect = TfidfVectorizer(tokenizer=t_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)
tfidf_vect.fit(train['comments'])
tfidf_matrix_train = tfidf_vect.transform(train['comments'])
tfidf_matrix_dev = tfidf_vect.transform(dev['comments'])

In [106]:
from sklearn.linear_model import LogisticRegression
import time

start_time = time.time()
lr = LogisticRegression()
lr.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 0.701613187789917


In [107]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

preds = lr.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5923566878980892, 0.5862217149718904)

## using GridSearchCV with LogisticRegression

In [112]:
from sklearn.model_selection import GridSearchCV

params = {'C': [1, 3.5, 4.5, 5.5, 10]}
gcv_lr = GridSearchCV(lr, param_grid=params, cv=3, scoring='accuracy', verbose=1)
gcv_lr.fit(tfidf_matrix_train, train['hate_label'])
gcv_lr.best_params_

Fitting 3 folds for each of 5 candidates, totalling 15 fits


{'C': 1}

In [113]:
best_estimator = gcv_lr.best_estimator_
preds = best_estimator.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.5923566878980892, 0.5862217149718904)

## KMeans Clustering to LogisticRegression

In [114]:
from sklearn.cluster import KMeans

km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=13)
km_cluster.fit(tfidf_matrix_train)
cluster_label = km_cluster.labels_
train['cluster_label'] = cluster_label
train.sort_values(by='cluster_label')

Unnamed: 0,comments,bias_label,gender_label,hate_label,news_title,comment_pos,title_pos,cluster_label
0,현재 호텔 주인 심정 아18 난 마른 하늘에 날 벼락 맞고 호텔 망하게 생겼는데 누...,others,False,hate,밤새 조문 행렬 전 미선 동료들이 그리워하는 따뜻한 배우 종합,"[('현재', 'Noun'), ('호텔', 'Noun'), ('주인', 'Noun'...","[('밤새', 'Noun'), ('조문', 'Noun'), ('행렬', 'Noun'...",0
4568,"엄하게 꾸짖는 것 보다,따뜻한 말 한마디가 나를 타인을 세상을 바꾸는 진리 감동ㅜㅜ",none,False,none,TV는 사랑을 싣고 김범룡 45억 빚 청산 후 찾은 첫사랑 국사 선생님과 재회 어저...,"[('엄하', 'Noun'), ('게', 'Josa'), ('꾸짖는', 'Verb'...","[('TV', 'Alpha'), ('는', 'Verb'), ('사랑', 'Noun'...",0
4566,엄연히 따지면 승리는 돈 벌자 고 범죄를 묵인 한 거고 얘는 지가 재미로 저지른 거지,none,False,none,공식 입장 정준영과 친분 No 모델 허현 측 몰카 논란 법적 대응 전문,"[('엄연히', 'Adjective'), ('따지면', 'Verb'), ('승리',...","[('공식', 'Noun'), ('입장', 'Noun'), ('정준영', 'Noun...",0
4565,엄벌에 처해 주세요,none,False,none,무면허 음주 뺑소니 손승원 보석 신청 술에 의 지하는 삶 살지 않겠다,"[('엄벌', 'Noun'), ('에', 'Josa'), ('처', 'Noun'),...","[('무면허', 'Noun'), ('음주', 'Noun'), ('뺑소니', 'Nou...",0
4564,엄마한테 잘해야 해요,none,False,none,장윤정 도 경완 폭풍성장 아들과 캠핑 근황언젠간 엄마도 함께 SHOT,"[('엄마', 'Noun'), ('한테', 'Josa'), ('잘해야', 'Noun...","[('장윤정', 'Noun'), ('도', 'Noun'), ('경완', 'Noun'...",0
...,...,...,...,...,...,...,...,...
6004,이젠 뭐라 해도 진심이 1도 읍 다 달게 받기는 확인 안 하면 질책도 안 보고 못 ...,none,False,offensive,이종현 BJ 박민정 DM 논란 죄송 씨엔블루 탈퇴 결정 전문,"[('이', 'Determiner'), ('젠', 'Noun'), ('뭐라', 'V...","[('이종현', 'Noun'), ('BJ', 'Alpha'), ('박민정', 'No...",2
7034,집사부 일체에 사부로 애들 잠깐 양념으로 나오는 건 그렇다 쳐도 이 영애 양심이 있...,none,False,offensive,집사부 일체 이영애 이상윤에 자녀 교육 상담 수학 어려워,"[('집사부', 'Noun'), ('일체', 'Noun'), ('에', 'Josa'...","[('집사부', 'Noun'), ('일체', 'Noun'), ('이영애', 'Nou...",2
549,계집들 댓글 보소 누가 아깝니 그런 개소리는 여자들 왜 하는 겨 여자 종특인가,gender,True,hate,단독 장현승 신수지 4개월째 열애 볼링이 공통분모,"[('계집', 'Noun'), ('들', 'Suffix'), ('댓글', 'Noun...","[('단독', 'Noun'), ('장현승', 'Noun'), ('신수지', 'Nou...",2
6001,"이제는 이렇게 퍼 오는 거 신고해서 걸러내야 한다,그냥 기자들에게 저들의 일상은 돈...",none,False,offensive,차세찌 한 채 아 달콤한 키즈카페 데이트 커플 운동화 SHOT,"[('이제', 'Noun'), ('는', 'Josa'), ('이렇게', 'Adver...","[('차세', 'Verb'), ('찌', 'Noun'), ('한', 'Verb'),...",2


In [123]:
train['cluster_label'].value_counts()

0    4668
1    1962
2    1266
Name: cluster_label, dtype: int64

In [115]:
km_train = km_cluster.transform(tfidf_matrix_train)
km_dev = km_cluster.transform(tfidf_matrix_dev)

In [116]:
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=13)
km_cluster.fit(tfidf_matrix_dev)
cluster_label = km_cluster.labels_
dev['cluster_label'] = cluster_label

In [117]:
params = {'C': [0.01, 0.1, 1, 5, 10]}
gcv_lr = GridSearchCV(lr, param_grid=params, cv=5, scoring='accuracy', verbose=1)
gcv_lr.fit(km_train, train['cluster_label'])
gcv_lr.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


{'C': 10}

In [118]:
best_estimator = gcv_lr.best_estimator_
preds = best_estimator.predict(km_dev)
accuracy_score(dev['cluster_label'], preds), f1_score(preds, dev['cluster_label'], average='macro')

(0.6518046709129511, 0.5328976494010175)

In [66]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(tfidf_matrix_train, train['hate_label'])

KNeighborsClassifier(n_neighbors=3)

In [133]:
preds = knn.predict(tfidf_matrix_dev)
accuracy_score(dev['cluster_label'], preds)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 9974 while Y.shape[1] == 31704

In [125]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

In [126]:
train = pd.read_csv('./datas/train_ver1', index_col=[0])
dev = pd.read_csv('./datas/dev.hate.csv')
dev.rename(columns={'label': 'hate_label'}, inplace=True)

In [127]:
count_vec = CountVectorizer()
train_count = count_vec.fit_transform(train['comments'])
tfidf_trans = TfidfTransformer()
train_tfidf = tfidf_trans.fit_transform(train_count)

In [128]:
dev_count = count_vec.transform(dev['comments'])
dev_tfidf = tfidf_trans.transform(dev_count)

## countvect -> tfidf transformer -> knn

In [129]:
knn = KNeighborsClassifier()
clf = knn.fit(train_tfidf, train['hate_label'])

In [130]:
preds = clf.predict(dev_tfidf)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.37154989384288745, 0.26709237989354595)

In [131]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', knn),
])

text_clf.fit(train['comments'], train['hate_label'])
test = dev['comments']
pred = text_clf.predict(test)
np.mean(pred == dev['hate_label'])*100

37.15498938428875

## countvect -> tfidf transformer -> lr

In [100]:
from sklearn.linear_model import LogisticRegression
import time

start_time = time.time()
lr = LogisticRegression()
lr.fit(train_tfidf, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 1.3053138256072998


In [101]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

preds = lr.predict(dev_tfidf)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.46496815286624205, 0.4121162570736947)

In [None]:
n