In [1]:
from sklearn.naive_bayes             import MultinomialNB
from sklearn.ensemble                import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model            import LogisticRegression
from sklearn.model_selection         import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics                 import accuracy_score, recall_score, roc_auc_score, f1_score
from sklearn.pipeline                import Pipeline
from sklearn.svm                     import SVC
from sklearn.tree                    import DecisionTreeClassifier
from lightgbm                        import LGBMClassifier
import warnings
warnings.filterwarnings(action='ignore')

In [6]:
raw_data = pd.read_csv('./new_train.csv')
test = pd.read_csv('../../data/dev.hate.csv')
train = raw_data.copy()

### 기본 전처리

In [7]:
# 특수문자 제거
import re
def cleanse(text):
    pattern = re.compile(r'\s+')
    text = re.sub(pattern, ' ', text)
    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9]', ' ', text)
    return text
train['comments'] = train['comments'].apply(cleanse)
print(train['comments'][184])

# 띄어쓰기
from pykospacing import spacing
train['comments'] = train['comments'].apply(spacing)
print(train['comments'][184])

# 문장 분리
import kss
train['comments'] = train['comments'].apply(kss.split_sentences)
train['comments'] = [','.join(map(str, ls)) for ls in train['comments']]
print(train['comments'][184])

# 중복 제거
from soynlp.normalizer import *
train['comments'] = [repeat_normalize(comment, num_repeats=2) for comment in train['comments']]
print(train['comments'][184])

ㅅㅂ 모르고저런거지 알고 저랬겠냐 꼰대 새키들 드럽게 말많네 ㅋㅋㅋ
ㅅㅂ 모르고 저런 거지 알고 저랬겠냐 꼰대 새키들 드럽게 말 많네 ㅋㅋㅋ
ㅅㅂ 모르고 저런 거지 알고 저랬겠냐 꼰대 새키들 드럽게 말 많네 ㅋㅋㅋ
ㅅㅂ 모르고 저런 거지 알고 저랬겠냐 꼰대 새키들 드럽게 말 많네 ㅋㅋㅋ


### 형태소 분석기별 토크나이저 함수 생성

In [8]:
from khaiii import KhaiiiApi; k = KhaiiiApi()
from konlpy.tag import Okt; t = Okt()
from konlpy.tag import Mecab; m = Mecab()

# khai
def k_tokenizer(text):
    k.analyze(text)
    return [word.lex for word in k.analyze(text)]

# Okt
def t_tokenizer(text):
    tokens_ko = t.morphs(text)
    return tokens_ko

# Mecab
def m_tokenizer(text):
    tokens_ko = m.morphs(text)
    return tokens_ko

### 모델, 토크나이저 설정

In [9]:
models = [LogisticRegression(), RandomForestClassifier(), SVC(), LGBMClassifier()]
tokenizers = [None, k_tokenizer, t_tokenizer, m_tokenizer]

### 데이터셋 분리

In [10]:
X_train = train['comments']
X_test = test['comments']
y_train = train['label']
y_test = test['label']

### 모델별/토큰별 함수 선언

In [13]:
def get_score(model, tokenizer):
    # 훈련: train 전체 / 테스트: dev 전체
    X_train = train['comments']
    X_test = test['comments']
    y_train = train['label']
    y_test = test['label']
    
    # Setting up the pipeline
    vec_pipe = Pipeline([
                        ("vec", TfidfVectorizer(tokenizer=tokenizer)), 
                        ("model", model)
                        ])
    
    # Setting the VEC hyperparameters
    vec_pipe_params = {"vec__ngram_range" : [(1,2)], 
                       "vec__stop_words"  : [None],
                       "vec__min_df" : [3],
                       "vec__max_df" : [0.9]}    
    
    # Instantiating the grid search
    vec_gs = GridSearchCV(vec_pipe,
                          param_grid=vec_pipe_params,
                          cv=3)

    # Fitting the model to the training data
    vec_gs.fit(X_train, y_train);
    
    # Predicting
    train_pred = vec_gs.predict(X_train)
    test_pred = vec_gs.predict(X_test)

    # Score
    result = ["train : ", f1_score(train_pred, y_train, average='macro'),
              "test : ", f1_score(test_pred, y_test, average='macro')]    

    return result

In [14]:
results = []
for model in models:
    for tokenizer in tokenizers:
        results.append(get_score(model, tokenizer))
        
        if tokenizer == None:
            print("Model : {}".format(model),
                  "Tokenizer : None",
                  get_score(model, tokenizer),
                  sep='\n')
        else:
            print("Model : {}".format(model),
                  'Tokenizer : {}'.format(tokenizer.__name__),
                  get_score(model, tokenizer),
                  sep='\n')

Model : LogisticRegression()
Tokenizer : None
['train : ', 0.7561002272617734, 'test : ', 0.4199712324941378]
Model : LogisticRegression()
Tokenizer : k_tokenizer
['train : ', 0.7798562593904027, 'test : ', 0.41844679583362815]
Model : LogisticRegression()
Tokenizer : t_tokenizer
['train : ', 0.8322829113342763, 'test : ', 0.579778334351531]
Model : LogisticRegression()
Tokenizer : m_tokenizer
['train : ', 0.8413548351609329, 'test : ', 0.5403916839167854]
Model : RandomForestClassifier()
Tokenizer : None
['train : ', 0.9664996096631214, 'test : ', 0.36422576727970696]
Model : RandomForestClassifier()
Tokenizer : k_tokenizer
['train : ', 0.9844063126867181, 'test : ', 0.36234109656514923]
Model : RandomForestClassifier()
Tokenizer : t_tokenizer
['train : ', 0.9986766860902726, 'test : ', 0.5051114757421531]
Model : RandomForestClassifier()
Tokenizer : m_tokenizer
['train : ', 0.999328139864184, 'test : ', 0.5008706184025419]
Model : SVC()
Tokenizer : None
['train : ', 0.928083172971726

### 토크나이저 미적용

### 토크나이저 None / 음절 단위 / ngram(1, 2)

In [15]:
vec_pipe = Pipeline([
                    ("vec", TfidfVectorizer(tokenizer=None, analyzer='char')), 
                    ("model", LogisticRegression())
                    ])
    
    # Setting the VEC hyperparameters
vec_pipe_params = {"vec__ngram_range" : [(1,2)], 
                    "vec__stop_words"  : [None],
                    "vec__min_df" : [3],
                    "vec__max_df" : [0.9]}    
    
    # Instantiating the grid search
vec_gs = GridSearchCV(vec_pipe,
                        param_grid=vec_pipe_params,
                        cv=3)

vec_gs.fit(X_train, y_train);
train_pred = vec_gs.predict(X_train)
test_pred = vec_gs.predict(X_test)
result = ["train : ", f1_score(train_pred, y_train, average='macro'),
        "test : ", f1_score(test_pred, y_test, average='macro')]  

In [16]:
result

['train : ', 0.79848958223854, 'test : ', 0.5648313726340545]

### 토크나이저 None / 음절 단위 / ngram(1, 3)

In [17]:
vec_pipe = Pipeline([
                    ("vec", TfidfVectorizer(analyzer='char')), 
                    ("model", LogisticRegression())
                    ])
    
    # Setting the VEC hyperparameters
vec_pipe_params = {"vec__ngram_range" : [(1,3)], 
                    "vec__stop_words"  : [None],
                    "vec__min_df" : [3],
                    "vec__max_df" : [0.9]}    
    
    # Instantiating the grid search
vec_gs = GridSearchCV(vec_pipe,
                        param_grid=vec_pipe_params,
                        cv=3)

vec_gs.fit(X_train, y_train);
train_pred = vec_gs.predict(X_train)
test_pred = vec_gs.predict(X_test)
result = ["train : ", f1_score(train_pred, y_train, average='macro'),
        "test : ", f1_score(test_pred, y_test, average='macro')]  

In [18]:
result

['train : ', 0.858892743897349, 'test : ', 0.5551886385219719]