In [11]:
from sklearn.ensemble                import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model            import LogisticRegression
from sklearn.model_selection         import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics                 import accuracy_score, recall_score, roc_auc_score, f1_score
from sklearn.pipeline                import Pipeline
from sklearn.svm                     import SVC
from lightgbm                        import LGBMClassifier
import warnings
warnings.filterwarnings(action='ignore')

In [17]:
raw_data = pd.read_csv('./new_train.csv')
test = pd.read_csv('../../data/dev.hate.csv')
train = raw_data.copy()

### 데이터셋 분리

In [18]:
X_train = train['comments']
X_test = test['comments']
y_train = train['label']
y_test = test['label']

### 자모 / ngram / 모델 GridSearch (전처리X)

In [14]:
from jamo import h2j, j2hcj

def jamo_tokenizer(text):
    return j2hcj(h2j(text))

jamo_tokenizer(train['comments'][184])

'ㅅㅂ ㅁㅗㄹㅡㄱㅗㅈㅓㄹㅓㄴㄱㅓㅈㅣ ㅇㅏㄹㄱㅗ ㅈㅓㄹㅐㅆㄱㅔㅆㄴㅑ ㄲㅗㄴㄷㅐ ㅅㅐㅋㅣㄷㅡㄹ ㄷㅡㄹㅓㅂㄱㅔ ㅁㅏㄹㅁㅏㄶㄴㅔ ㅋㅋㅋ'

### 모델 설정

In [5]:
models = [LogisticRegression(), RandomForestClassifier(), SVC(), LGBMClassifier()]

In [27]:
for model in models:
    for ngram in range(2, 13):
        vec_pipe = Pipeline([
                            ("vec", TfidfVectorizer(tokenizer=jamo_tokenizer)), 
                            ("model", model)
                            ])
        # Setting the VEC hyperparameters
        vec_pipe_params = {"vec__ngram_range" : [(1,ngram)], 
                            "vec__stop_words"  : [None],
                            "vec__min_df" : [3],
                            "vec__max_df" : [0.9]}  

        # Instantiating the grid search
        vec_gs = GridSearchCV(vec_pipe,
                                param_grid=vec_pipe_params,
                                cv=3)

        vec_gs.fit(X_train, y_train);
        train_pred = vec_gs.predict(X_train)
        test_pred = vec_gs.predict(X_test)
        result = ["train : {}".format(f1_score(train_pred, y_train, average='macro')),
                "test : {}".format(f1_score(test_pred, y_test, average='macro'))]
        print("model : {}".format(model), "ngram : (1, {})".format(ngram), '\n', result)

model : LogisticRegression() ngram : (1, 2) 
 ['train : 0.6289701548621682', 'test : 0.5407363740006657']
model : LogisticRegression() ngram : (1, 3) 
 ['train : 0.7530548922610246', 'test : 0.5598319803320194']
model : LogisticRegression() ngram : (1, 4) 
 ['train : 0.8285115749991395', 'test : 0.5637178344327568']
model : LogisticRegression() ngram : (1, 5) 
 ['train : 0.866858004759186', 'test : 0.558321573749179']
model : LogisticRegression() ngram : (1, 6) 
 ['train : 0.884342531760867', 'test : 0.5516118479375248']
model : LogisticRegression() ngram : (1, 7) 
 ['train : 0.896918039194181', 'test : 0.5640981120927043']
model : LogisticRegression() ngram : (1, 8) 
 ['train : 0.904529165835803', 'test : 0.5542973971848706']
model : LogisticRegression() ngram : (1, 9) 
 ['train : 0.9059971161877013', 'test : 0.5521487406139399']
model : LogisticRegression() ngram : (1, 10) 
 ['train : 0.9078608155387831', 'test : 0.549361132694466']
model : LogisticRegression() ngram : (1, 11) 
 ['tr

### 전처리

In [15]:
# 특수문자 제거
import re
def cleanse(text):
    pattern = re.compile(r'\s+')
    text = re.sub(pattern, ' ', text)
    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9]', ' ', text)
    return text
train['comments'] = train['comments'].apply(cleanse)
print(train['comments'][185])

# 띄어쓰기
from pykospacing import spacing
train['comments'] = train['comments'].apply(spacing)
print(train['comments'][185])

# 문장 분리
import kss
train['comments'] = train['comments'].apply(kss.split_sentences)
train['comments'] = [','.join(map(str, ls)) for ls in train['comments']]
print(train['comments'][185])

# 중복 제거
from soynlp.normalizer import *
train['comments'] = [repeat_normalize(comment, num_repeats=2) for comment in train['comments']]
print(train['comments'][185])

X_train = train['comments']
X_test = test['comments']
y_train = train['label']
y_test = test['label']

ㅅㅅ 한게 자랑  ㅈㅅ로 그것도
ㅅㅅ 한 게 자랑 ㅈㅅ로 그것도
ㅅㅅ 한 게 자랑 ㅈㅅ로 그것도
ㅅㅅ 한 게 자랑 ㅈㅅ로 그것도


### 자모, 로지스틱 파라미터 튜닝, ngram 조절(전처리 X)

In [19]:
for ngram in range(2, 11):
    vec_pipe = Pipeline([
                        ("vec", TfidfVectorizer(tokenizer=jamo_tokenizer)), 
                        ("model", LogisticRegression(multi_class='multinomial', class_weight='balanced'))
                        ])

    # Setting the VEC hyperparameters
    vec_pipe_params = {"vec__ngram_range" : [(1,ngram)], 
                        "vec__stop_words"  : [None],
                        "vec__min_df" : [3],
                        "vec__max_df" : [0.9]}    

    # Instantiating the grid search
    vec_gs = GridSearchCV(vec_pipe,
                            param_grid=vec_pipe_params,
                            cv=3)

    vec_gs.fit(X_train, y_train)
    train_pred = vec_gs.predict(X_train)
    test_pred = vec_gs.predict(X_test)
    result = ["train : {}".format(f1_score(train_pred, y_train, average='macro')),
              "test : {}".format(f1_score(test_pred, y_test, average='macro'))]
    print("ngram : (1, {})".format(ngram), result)

ngram : (1, 2) ['train : 0.6332185142154709', 'test : 0.5300778301980638']
ngram : (1, 3) ['train : 0.769700696503487', 'test : 0.5844183813990914']
ngram : (1, 4) ['train : 0.8507049741827865', 'test : 0.6130502867121838']
ngram : (1, 5) ['train : 0.8890713994695544', 'test : 0.6000741886614462']
ngram : (1, 6) ['train : 0.9104170772016915', 'test : 0.611143258597239']
ngram : (1, 7) ['train : 0.9222442600578363', 'test : 0.6082232272176408']
ngram : (1, 8) ['train : 0.9262657366998369', 'test : 0.6053783665243234']
ngram : (1, 9) ['train : 0.9291297037844443', 'test : 0.6056927598845108']
ngram : (1, 10) ['train : 0.9299383291922437', 'test : 0.6036794415557777']


### 자모, 로지스틱 파라미터 튜닝, ngram 조절(전처리 O)

In [16]:
for ngram in range(4, 11):
    vec_pipe = Pipeline([
                        ("vec", TfidfVectorizer(tokenizer=jamo_tokenizer)), 
                        ("model", LogisticRegression(multi_class='multinomial', class_weight='balanced'))
                        ])

    # Setting the VEC hyperparameters
    vec_pipe_params = {"vec__ngram_range" : [(1,ngram)], 
                        "vec__stop_words"  : [None],
                        "vec__min_df" : [3],
                        "vec__max_df" : [0.9]}    

    # Instantiating the grid search
    vec_gs = GridSearchCV(vec_pipe,
                            param_grid=vec_pipe_params,
                            cv=3)

    vec_gs.fit(X_train, y_train)
    train_pred = vec_gs.predict(X_train)
    test_pred = vec_gs.predict(X_test)
    result = ["train : {}".format(f1_score(train_pred, y_train, average='macro')),
              "test : {}".format(f1_score(test_pred, y_test, average='macro'))]
    print("ngram : (1, {})".format(ngram), result)

ngram : (1, 4) ['train : 0.8373754539260437', 'test : 0.5809322350063493']
ngram : (1, 5) ['train : 0.8834284132587712', 'test : 0.59977834464169']
ngram : (1, 6) ['train : 0.908548067048668', 'test : 0.6178219515229056']
ngram : (1, 7) ['train : 0.9216788971015873', 'test : 0.6170750919932341']
ngram : (1, 8) ['train : 0.9273456873822958', 'test : 0.6084726096755447']
ngram : (1, 9) ['train : 0.9313901774887348', 'test : 0.6098143611817685']
ngram : (1, 10) ['train : 0.9332830210833913', 'test : 0.6081627365867254']


### 자모, 로지스틱/TFIDF 파라미터 튜닝, ngram 조절(전처리 O)

In [9]:
for ngram in range(4, 11):
    vec_pipe = Pipeline([
                        ("vec", TfidfVectorizer(tokenizer=jamo_tokenizer)), 
                        ("model", LogisticRegression(multi_class='multinomial', class_weight='balanced'))
                        ])

    # Setting the VEC hyperparameters
    vec_pipe_params = {"vec__ngram_range" : [(1,ngram)], 
                        "vec__stop_words"  : [None],
#                         "vec__min_df" : [3],
#                         "vec__max_df" : [0.9]
                      }    

    # Instantiating the grid search
    vec_gs = GridSearchCV(vec_pipe,
                            param_grid=vec_pipe_params,
                            cv=3)

    vec_gs.fit(X_train, y_train)
    train_pred = vec_gs.predict(X_train)
    test_pred = vec_gs.predict(X_test)
    result = ["train : {}".format(f1_score(train_pred, y_train, average='macro')),
              "test : {}".format(f1_score(test_pred, y_test, average='macro'))]
    print("ngram : (1, {})".format(ngram), result)

ngram : (1, 4) ['train : 0.8480346912201067', 'test : 0.5775456651925654']
ngram : (1, 5) ['train : 0.905544460453064', 'test : 0.5897255419745057']
ngram : (1, 6) ['train : 0.9425901957347599', 'test : 0.5908834564185872']
ngram : (1, 7) ['train : 0.9591581153454053', 'test : 0.5913136021800683']
ngram : (1, 8) ['train : 0.9718227601156647', 'test : 0.5923780631678296']
ngram : (1, 9) ['train : 0.9794762408269505', 'test : 0.5830090456980482']
ngram : (1, 10) ['train : 0.983741479262577', 'test : 0.5739632782219531']
