In [1]:
import pandas as pd
from konlpy.tag import Okt; t = Okt()
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train = pd.read_csv('./datas/train_ver2', index_col=[0])
dev = pd.read_csv('./datas/dev.hate.csv')
dev.rename(columns={'label': 'hate_label'}, inplace=True)

In [3]:
def t_tokenizer(text):
    tokens_ko = t.morphs(text)
    return tokens_ko

In [4]:
tfidf_vect = TfidfVectorizer(tokenizer=t_tokenizer, sublinear_tf=True, ngram_range=(1,3), min_df=3, max_df=0.9, max_features=500000)
tfidf_vect.fit(train['comments'])
tfidf_matrix_train = tfidf_vect.transform(train['comments'])
tfidf_matrix_dev = tfidf_vect.transform(dev['comments'])

In [5]:
from sklearn.linear_model import LogisticRegression
import time

start_time = time.time()
lr = LogisticRegression(solver='sag', C=1, penalty='l2', random_state=350)
lr.fit(tfidf_matrix_train, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 0.09601950645446777


In [6]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

preds = lr.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.6050955414012739, 0.5950876222099244)

In [7]:
test = pd.read_csv('./datas/test.hate.no_label.csv')
test.tail()

Unnamed: 0,comments
969,대박 게스트... 꼭 봐야징~ 컨셉이 바뀌니깐 재미지넹
970,성형으로 다 뜯어고쳐놓고 예쁜척. 성형 전 니 얼굴 다 알고있다. 순자처럼 된장냄새...
971,분위기는 비슷하다만 전혀다른 전개던데 무슨ㅋㅋㄱ 우리나라사람들은 분위기만 비슷하면 ...
972,입에 손가릭이 10개 있으니 징그럽다
973,난 조보아 이뻐서 보는데 백종원 관심무


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

tfidf_matrix_test = tfidf_vect.transform(test['comments'])
lr = LogisticRegression(solver='sag', C=1, penalty='l2', random_state=350)
lr.fit(tfidf_matrix_train, train['hate_label'])

preds = lr.predict(tfidf_matrix_test)
len(preds)

974

In [9]:
preds = pd.DataFrame(preds)
preds[preds[0] == 'none'] = 0
preds[preds[0] == 'offensive'] = 1
preds[preds[0] == 'hate'] = 2

In [10]:
preds['comments'] = test['comments']

In [11]:
preds = preds[['comments', 0]]
preds.rename(columns={0: 'label'}, inplace=True)
preds.tail()

Unnamed: 0,comments,label
969,대박 게스트... 꼭 봐야징~ 컨셉이 바뀌니깐 재미지넹,0
970,성형으로 다 뜯어고쳐놓고 예쁜척. 성형 전 니 얼굴 다 알고있다. 순자처럼 된장냄새...,1
971,분위기는 비슷하다만 전혀다른 전개던데 무슨ㅋㅋㄱ 우리나라사람들은 분위기만 비슷하면 ...,0
972,입에 손가릭이 10개 있으니 징그럽다,2
973,난 조보아 이뻐서 보는데 백종원 관심무,0


In [14]:
preds.to_csv('./datas/NaiveBayeji_prediction3.csv', index=False)

## using GridSearchCV with LogisticRegression

In [26]:
train = pd.read_csv('./datas/train_ver1', index_col=[0])
dev = pd.read_csv('./datas/dev.hate.csv')
dev.rename(columns={'label': 'hate_label'}, inplace=True)

In [27]:
def t_tokenizer(text):
    tokens_ko = t.morphs(text)
    return tokens_ko

In [35]:
tfidf = TfidfVectorizer(tokenizer=t_tokenizer, sublinear_tf=True)

params = {'vect__ngram_range': [(1, 3)],
          'vect__min_df': [3],
          'vect__max_df': [0.9],
          'vect__max_features': [50000],
          'lr__C': [0, 1, 5, 10],
          'lr__solver': ['sag', 'saga', 'lbfgs', 'newton-cg'],
          'lr__penalty': ['l1', 'l2', 'elasticnet']}

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

lr_tfidf = Pipeline([('vect', tfidf),
                    ('lr', LogisticRegression(random_state=13))])

gcv_lr = GridSearchCV(lr_tfidf, param_grid=params, cv=5, scoring='accuracy', verbose=1)
gcv_lr.fit(train['comments'], train['hate_label'])
gcv_lr.best_params_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


{'lr__C': 1,
 'lr__penalty': 'l2',
 'lr__solver': 'sag',
 'vect__max_df': 0.9,
 'vect__max_features': 50000,
 'vect__min_df': 3,
 'vect__ngram_range': (1, 3)}

## trial with knn

In [53]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(tfidf_matrix_train, train['hate_label'])

KNeighborsClassifier(n_neighbors=3)

In [54]:
preds = knn.predict(tfidf_matrix_dev)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.3481953290870488, 0.19275026507161486)

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

In [56]:
train = pd.read_csv('./datas/train_ver1', index_col=[0])
dev = pd.read_csv('./datas/dev.hate.csv')
dev.rename(columns={'label': 'hate_label'}, inplace=True)

In [57]:
count_vec = CountVectorizer()
train_count = count_vec.fit_transform(train['comments'])
tfidf_trans = TfidfTransformer()
train_tfidf = tfidf_trans.fit_transform(train_count)

In [58]:
dev_count = count_vec.transform(dev['comments'])
dev_tfidf = tfidf_trans.transform(dev_count)

## countvect -> tfidf transformer -> knn

In [59]:
knn = KNeighborsClassifier()
clf = knn.fit(train_tfidf, train['hate_label'])

In [60]:
preds = clf.predict(dev_tfidf)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.37154989384288745, 0.26709237989354595)

In [61]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', knn),
])

text_clf.fit(train['comments'], train['hate_label'])
test = dev['comments']
pred = text_clf.predict(test)
np.mean(pred == dev['hate_label'])*100

37.15498938428875

## countvect -> tfidf transformer -> lr

In [62]:
from sklearn.linear_model import LogisticRegression
import time

start_time = time.time()
lr = LogisticRegression()
lr.fit(train_tfidf, train['hate_label'])
print('fit time:', time.time() - start_time)

fit time: 2.2475013732910156


In [63]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

preds = lr.predict(dev_tfidf)
accuracy_score(dev['hate_label'], preds), f1_score(preds, dev['hate_label'], average='macro')

(0.46496815286624205, 0.4121162570736947)