In [2]:
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

def kaggle_format(df):
    """Replace the string labels in ``df['label']`` with integer class ids.

    Mapping: 'none' -> 0, 'offensive' -> 1, 'hate' -> 2. Any other value is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column. The column is overwritten on ``df``
        itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``label`` converted.
    """
    label_ids = {'none': 0, 'offensive': 1, 'hate': 2}
    # Assign the whole column in one step rather than through chained indexing
    # (df['label'][mask] = ...), which can end up writing to a temporary copy
    # and leave the frame unchanged.
    df['label'] = df['label'].map(lambda value: label_ids.get(value, value))
    return df

In [3]:
import pandas as pd
# Training set: 7893 rows.
# 'none' / 'offensive' / 'hate' counts: 3486 / 2498 / 1909
train = (
    pd.read_csv('total_20210121.csv')[['comments', 'hate']]
    .rename(columns={'hate': 'label'})
)
train = kaggle_format(train).astype({'label': 'str'})

# Dev set: 471 rows.
# 'none' / 'offensive' / 'hate' counts: 160 / 189 / 122
dev = (
    pd.read_csv('./korean-hate-speech-master/labeled/dev.tsv', sep='\t')[['comments', 'hate']]
    .rename(columns={'hate': 'label'})
)
dev = kaggle_format(dev).astype({'label': 'str'})

# Test split; going by the file name it ships without labels.
test = pd.read_csv('./korean-hate-speech-master/test.no_label.tsv', sep='\t')

In [4]:
# Text and labels used for fitting (train) and for scoring (dev).
X = train['comments']
y = train['label']
X_test = dev['comments']
y_test = dev['label']

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF over 1- to 3-grams, fitted on the training comments.
# NOTE(review): the recorded output for this cell shows 146518 columns, which
# exceeds max_features=100000 -- it looks like it came from an earlier run
# with different settings; re-run to confirm.
vec = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    min_df=0.0,
    sublinear_tf=True,
    max_features=100000,
)
X_tf = vec.fit_transform(X)
print(X_tf.shape)

(7893, 146518)


In [14]:
from sklearn.linear_model import LogisticRegression
# Multinomial logistic regression with balanced class weights, fitted on the
# TF-IDF matrix of the training comments.
lgs = LogisticRegression(
    C=1,
    solver='lbfgs',
    multi_class='multinomial',
    class_weight='balanced',
    max_iter=6000,
    random_state=10,
)
lgs.fit(X_tf, y)

# Reuse the vocabulary fitted on train to vectorize the dev comments.
X_test_tf = vec.transform(X_test)
pred = lgs.predict(X_test_tf)

In [15]:
from sklearn.metrics import f1_score
# Macro-averaged F1 of the dev predictions.
f1_score(y_true=y_test, y_pred=pred, average='macro')

0.6190162674997876

#### TF-IDF Vectorizer
- `max_features` 값 변경 테스트 
    - 최대 어휘 크기(`max_features` 제한 없음) : 146518
    - 값 : 10000 ~ 150000, 단위 10000

In [43]:
def get_f1_score(pre_data, mf, ngram=(1, 3)):
    """Fit char TF-IDF + logistic regression and return macro-F1 on the dev set.

    Parameters
    ----------
    pre_data
        Training texts, passed to ``TfidfVectorizer.fit_transform``.
    mf
        Value for the vectorizer's ``max_features``.
    ngram : tuple, default (1, 3)
        Value for the vectorizer's ``ngram_range``. Optional so that both the
        ``get_f1_score(X, f)`` and ``get_f1_score(X, f, ng)`` call forms used
        in this notebook work, whichever definition of this function is live.

    Returns
    -------
    float
        ``f1_score(..., average='macro')`` of the predictions on ``X_test``.

    Notes
    -----
    Reads the notebook-level ``y`` (training labels), ``X_test`` and
    ``y_test``. Prints the shape of the fitted TF-IDF matrix.
    """
    vec = TfidfVectorizer(min_df=0.0, analyzer='char', ngram_range=ngram, sublinear_tf=True,
               max_features=mf)
    X_tf = vec.fit_transform(pre_data)
    print(X_tf.shape)
    lgs = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1, 
                         class_weight='balanced', 
                         max_iter=6000, random_state=10)
    lgs.fit(X_tf, y)
    # Vectorize the dev texts with the vocabulary fitted on the training texts.
    X_test_tf = vec.transform(X_test)
    pred =  lgs.predict(X_test_tf)
    score = f1_score(y_test, pred, average='macro')
    return score

In [44]:
# Empty results table plus the max_features grid: 10000 .. 150000, step 10000.
score_compare = pd.DataFrame()
mf_ls = [*range(10000, 150001, 10000)]

In [45]:
# One dev macro-F1 per max_features value, in the same order as mf_ls.
score_ls = [get_f1_score(X, f) for f in mf_ls]


(7893, 10000)
(7893, 20000)
(7893, 30000)
(7893, 40000)
(7893, 50000)
(7893, 60000)
(7893, 70000)
(7893, 80000)
(7893, 90000)
(7893, 100000)
(7893, 110000)
(7893, 120000)
(7893, 130000)
(7893, 140000)
(7893, 146518)


In [46]:
# Put each score next to the max_features value that produced it.
score_compare = score_compare.assign(score=score_ls, max_feature=mf_ls)

In [47]:
# Display the collected score / max_feature table.
score_compare

Unnamed: 0,score,max_feature
0,0.591066,10000
1,0.609017,20000
2,0.610105,30000
3,0.613206,40000
4,0.61378,50000
5,0.616511,60000
6,0.620896,70000
7,0.61715,80000
8,0.615056,90000
9,0.61715,100000


In [51]:
# Five best max_features settings by dev macro-F1.
score_compare.sort_values('score', ascending=False).head(5)

Unnamed: 0,score,max_feature
6,0.620896,70000
13,0.619016,140000
14,0.619016,150000
7,0.61715,80000
9,0.61715,100000


In [92]:
def get_test(df, X_train, mf=100000, ngram=(1,3), y_train=None):
    """Train on ``X_train`` and write predicted labels into ``df['label']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comments`` column. A ``label`` column is added to (or
        overwritten on) this same object.
    X_train
        Training texts, passed to ``TfidfVectorizer.fit_transform``.
    mf : default 100000
        Value for the vectorizer's ``max_features``.
    ngram : tuple, default (1, 3)
        Value for the vectorizer's ``ngram_range``.
    y_train : optional
        Labels aligned with ``X_train``. When omitted, the notebook-level
        ``y`` is used, as before; pass it explicitly whenever ``X_train`` is
        not the notebook-level ``X``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the predicted ``label`` column.

    Notes
    -----
    Prints the shape of the fitted TF-IDF matrix.
    """
    if y_train is None:
        # Backward-compatible default: labels of the notebook's training set.
        y_train = y
    vec = TfidfVectorizer(min_df=0.0, analyzer='char', ngram_range=ngram, sublinear_tf=True,
               max_features=mf)
    X_tf = vec.fit_transform(X_train)
    print(X_tf.shape)
    lgs = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1, 
                         class_weight='balanced', 
                         max_iter=6000, random_state=10).fit(X_tf, y_train)
    # Vectorize the target comments with the vocabulary fitted on X_train.
    X_test_tf = vec.transform(df['comments'])
    df['label'] = lgs.predict(X_test_tf)
    return df

In [54]:
# Submission for the best max_features value from the sweep (70000).
test = pd.read_csv('./korean-hate-speech-master/test.no_label.tsv', sep='\t')
# Pass mf explicitly: get_test defaults to mf=100000, which would not match
# the 70000 in the output file name.
test_df = get_test(test, X, mf=70000)
test_df.to_csv('mf_70000.csv', index=False)

(7893, 70000)


### ngram
    - (2, 3) : (7893, 144860)
    - (1, 4) : (7893, 337257)
    - (2, 4) : (7893, 335599)
    - (1, 5) : (7893, 572846)
    - (2, 5) : (7893, 571188)
    - (1, 6) : (7893, 824542)

In [88]:
def get_f1_score(pre_data, mf, ngram=(1, 3)):
    """Fit char TF-IDF + logistic regression and return macro-F1 on the dev set.

    Parameters
    ----------
    pre_data
        Training texts, passed to ``TfidfVectorizer.fit_transform``.
    mf
        Value for the vectorizer's ``max_features``.
    ngram : tuple, default (1, 3)
        Value for the vectorizer's ``ngram_range``. The default keeps the
        two-argument call ``get_f1_score(X, f)`` used by the max_features
        sweep working after this definition replaces the earlier one.

    Returns
    -------
    float
        ``f1_score(..., average='macro')`` of the predictions on ``X_test``.

    Notes
    -----
    Reads the notebook-level ``y`` (training labels), ``X_test`` and
    ``y_test``. Prints the shape of the fitted TF-IDF matrix.
    """
    vec = TfidfVectorizer(min_df=0.0, analyzer='char', ngram_range=ngram, sublinear_tf=True,
               max_features=mf)
    X_tf = vec.fit_transform(pre_data)
    print(X_tf.shape)
    lgs = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=1, 
                         class_weight='balanced', 
                         max_iter=6000, random_state=10)
    lgs.fit(X_tf, y)
    # Vectorize the dev texts with the vocabulary fitted on the training texts.
    X_test_tf = vec.transform(X_test)
    pred =  lgs.predict(X_test_tf)
    score = f1_score(y_test, pred, average='macro')
    return score

In [63]:
# Character n-gram ranges to sweep, as (min_n, max_n).
ngr = [
    (2, 3),
    (1, 4),
    (2, 4),
    (1, 5),
    (2, 5),
    (1, 6),
]


In [64]:
# max_features grids in steps of 10000; the suffix is the n-gram upper bound
# each grid is meant for. The stop value is excluded from every grid.
mf_3 = [*range(70000, 150000, 10000)]
mf_4 = [*range(170000, 340000, 10000)]
mf_5 = [*range(290000, 580000, 10000)]
mf_6 = [*range(420000, 850000, 10000)]

In [69]:
# max_features grid to use for each n-gram upper bound.
mf_by_max_n = {3: mf_3, 4: mf_4, 5: mf_5, 6: mf_6}

score_ls = []
ngram_ls = []
mf_ls = []
for ng in ngr:
    print(ng)
    # Fail loudly when a range has no grid, instead of silently reusing the
    # grid left over from the previous iteration.
    if ng[1] not in mf_by_max_n:
        raise ValueError(f'no max_features grid defined for ngram_range {ng}')
    mf = mf_by_max_n[ng[1]]
    for f in mf:
        score = get_f1_score(X, f, ng)
        # The three lists are kept index-aligned: one entry per (ngram, mf) run.
        ngram_ls.append(ng)
        mf_ls.append(f)
        score_ls.append(score)


(2, 3)
(7893, 70000)
(7893, 80000)
(7893, 90000)
(7893, 100000)
(7893, 110000)
(7893, 120000)
(7893, 130000)
(7893, 140000)
(1, 4)
(7893, 170000)
(7893, 180000)
(7893, 190000)
(7893, 200000)
(7893, 210000)
(7893, 220000)
(7893, 230000)
(7893, 240000)
(7893, 250000)
(7893, 260000)
(7893, 270000)
(7893, 280000)
(7893, 290000)
(7893, 300000)
(7893, 310000)
(7893, 320000)
(7893, 330000)
(2, 4)
(7893, 170000)
(7893, 180000)
(7893, 190000)
(7893, 200000)
(7893, 210000)
(7893, 220000)
(7893, 230000)
(7893, 240000)
(7893, 250000)
(7893, 260000)
(7893, 270000)
(7893, 280000)
(7893, 290000)
(7893, 300000)
(7893, 310000)
(7893, 320000)
(7893, 330000)
(1, 5)
(7893, 290000)
(7893, 300000)
(7893, 310000)
(7893, 320000)
(7893, 330000)
(7893, 340000)
(7893, 350000)
(7893, 360000)
(7893, 370000)
(7893, 380000)
(7893, 390000)
(7893, 400000)
(7893, 410000)
(7893, 420000)
(7893, 430000)
(7893, 440000)
(7893, 450000)
(7893, 460000)
(7893, 470000)
(7893, 480000)
(7893, 490000)
(7893, 500000)
(7893, 510000)


In [81]:
# One row per sweep run: dev macro-F1, max_features and n-gram range.
ngram_df = pd.DataFrame({
    'score': score_ls,
    'max_feature': mf_ls,
    'ngram_range': ngram_ls,
})

In [82]:
# Display the n-gram sweep results table.
ngram_df

Unnamed: 0,score,max_feature,ngram_range
0,0.596407,70000,"(2, 3)"
1,0.601048,80000,"(2, 3)"
2,0.603002,90000,"(2, 3)"
3,0.597915,100000,"(2, 3)"
4,0.600517,110000,"(2, 3)"
...,...,...,...
138,0.594967,800000,"(1, 6)"
139,0.594967,810000,"(1, 6)"
140,0.592731,820000,"(1, 6)"
141,0.588531,830000,"(1, 6)"


In [85]:
# Fifteen best (ngram_range, max_feature) settings by dev macro-F1.
ngram_df.sort_values('score', ascending=False).head(15)

Unnamed: 0,score,max_feature,ngram_range
15,0.619642,240000,"(1, 4)"
14,0.619401,230000,"(1, 4)"
10,0.619127,190000,"(1, 4)"
101,0.617765,430000,"(1, 6)"
9,0.617755,180000,"(1, 4)"
16,0.617579,250000,"(1, 4)"
8,0.617455,170000,"(1, 4)"
13,0.617263,220000,"(1, 4)"
11,0.617057,200000,"(1, 4)"
19,0.61701,280000,"(1, 4)"


In [94]:
# Submission with char n-grams (1, 4) and max_features=190000.
test = pd.read_csv('./korean-hate-speech-master/test.no_label.tsv', sep='\t')
test_df = get_test(test, X, mf=190000, ngram=(1, 4))
test_df.to_csv('ngram_14_mf_190000.csv', index=False)

(7893, 190000)
