### KR-BERT Tokenizer로 특징 단어를 추출하여 나이브 베이즈와 로지스틱 회귀 분류기를 구축함.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [None]:
from transformers import AutoTokenizer

# Model identifier handed to AutoTokenizer.from_pretrained below.
model_name = "snunlp/KR-BERT-char16424"
# Module-level tokenizer shared by tokenize_function and the feature-inspection cells.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
def tokenize_function(example):
    """Encode ``example`` with the module-level ``tokenizer``.

    Padding and truncation are both switched off for the call.

    Returns:
        Whatever ``tokenizer(...)`` returns; callers in this notebook read
        its ``'input_ids'`` entry.
    """
    encoding = tokenizer(example, padding=False, truncation=False)
    return encoding

In [None]:
def _documents_to_id_strings(frame):
    """Return one space-joined string of token ids per row of ``frame["document"]``."""
    # regex=False is required here. Interpreted as a regular expression,
    # "[[문단]] " is the character class [[문단] followed by "] ", which never
    # matches the literal paragraph marker, so the marker was left in the text.
    docs = frame["document"].str.replace("[[문단]] ", "", regex=False)
    return [
        ' '.join(str(token_id) for token_id in tokenize_function(doc)['input_ids'])
        for doc in docs
    ]


def vectorize(data_train, data_val, data_test):
    """Convert the ``document`` column of each split into token-id count features.

    Every document has the literal paragraph marker ``"[[문단]] "`` removed, is
    encoded with ``tokenize_function``, and its ``input_ids`` are written out as
    a whitespace-separated string so that ``CountVectorizer`` can count them.

    Args:
        data_train: DataFrame with a ``document`` column; the vocabulary is
            fitted on this split only.
        data_val: DataFrame with a ``document`` column.
        data_test: DataFrame with a ``document`` column.

    Returns:
        Tuple ``(X_train, X_val, X_test)`` of count matrices that share the
        vocabulary learned from ``data_train``.
    """
    result_train = _documents_to_id_strings(data_train)
    result_val = _documents_to_id_strings(data_val)
    result_test = _documents_to_id_strings(data_test)

    # lowercase=False: the "words" are numeric ids, there is nothing to case-fold.
    # NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
    # two or more characters, so single-digit ids (0-9) are not counted. Left
    # unchanged to keep the feature space comparable; confirm this is intended.
    vect = CountVectorizer(lowercase=False)
    X_train = vect.fit_transform(result_train)
    X_val = vect.transform(result_val)
    X_test = vect.transform(result_test)

    return X_train, X_val, X_test

### 실험 데이터는 아래의 URL에서 다운로드 받을 수 있음

http://aihumanities.org/ko/archive/data/?vid=1

바로 밑의 셀을 실행하여 다운로드한 데이터의 압축을 풀고, 폴더명을 'data'로 변경함.

이때 폴더 구조와 파일의 위치(예시)는 다음과 같음.

`data/job/train_0.txt`


In [None]:
#! unzip ./korean_essay_score_range_prediction_dataset.zip -d ./

In [None]:
# Collect naive Bayes or logistic regression results for every experiment.
# Select exactly one classifier for ``clf`` below.

#clf = MultinomialNB()
clf = LogisticRegression(random_state=0, max_iter=5000)

experiments = ["job", "job_econ", "job_succ", "job_happ", "job_econ_succ_happ", 
               "happiness", "happiness_econ", "happiness_succ", "happiness_job", "happiness_econ_succ_job", 
               "all"]

# Number of train/val/test folds stored per folder (files *_0.txt .. *_6.txt).
N_FOLDS = 7

for exp in experiments:
    print("======================")
    print("result_{}".format(exp))
    print("======================")

    # The base folder depends only on the experiment name, so resolve it once
    # per experiment instead of once per fold.
    if exp.startswith("job"):
        folder = 'job'
    elif exp.startswith("happiness"):
        folder = 'happiness'
    else:
        folder = 'all'

    # Auxiliary training corpora requested by the experiment name, in the same
    # order in which they are appended to the training split.
    extra_files = []
    if 'econ' in exp:
        extra_files.append("data/economic.txt")
    if 'succ' in exp:
        extra_files.append("data/success.txt")
    if folder == 'job' and 'happ' in exp:
        extra_files.append("data/happiness.txt")
    if folder == 'happiness' and 'job' in exp:
        extra_files.append("data/job.txt")
    # Read each auxiliary file once; the content is identical for every fold.
    extra_train = [pd.read_csv(path, sep='\t') for path in extra_files]

    avg_acc_train = []
    avg_acc_val = []
    avg_acc_test = []
    for i in range(N_FOLDS):
        data_train = pd.read_csv("data/{}/train_{}.txt".format(folder, i), sep='\t')
        data_val = pd.read_csv("data/{}/val_{}.txt".format(folder, i), sep='\t')
        data_test = pd.read_csv("data/{}/test_{}.txt".format(folder, i), sep='\t')

        # Only the training split is augmented; val/test stay untouched.
        if extra_train:
            data_train = pd.concat([data_train, *extra_train], ignore_index=True)

        train_label = data_train["label"]
        val_label = data_val["label"]
        test_label = data_test["label"]

        X_train, X_val, X_test = vectorize(data_train, data_val, data_test)

        print("X_train", X_train.shape)

        clf.fit(X_train, train_label)
        pred_train = clf.predict(X_train)
        pred_val = clf.predict(X_val)
        pred_test = clf.predict(X_test)

        # accuracy_score expects (y_true, y_pred).
        acc_train = accuracy_score(train_label, pred_train)
        avg_acc_train.append(acc_train)

        acc_val = accuracy_score(val_label, pred_val)
        avg_acc_val.append(acc_val)

        acc_test = accuracy_score(test_label, pred_test)
        avg_acc_test.append(acc_test)

        print("acc_train:", round(acc_train, 5))
        print("acc_val:", round(acc_val, 5))
        print("acc_test:", round(acc_test, 5))
        print("-------------------")

    # Mean accuracy over all folds of this experiment.
    avg_train = sum(avg_acc_train) / len(avg_acc_train)
    avg_val = sum(avg_acc_val) / len(avg_acc_val)
    avg_test = sum(avg_acc_test) / len(avg_acc_test)

    print("AVG_TRAIN:", round(avg_train, 5))
    print("AVG_VAL:", round(avg_val, 5))
    print("AVG_TEST:", round(avg_test, 5))

### 로지스틱 회귀와 나이브 베이즈 모델 구축 시 사용된 특징 단어 확인 및 로지스틱 회귀에서의 각 클래스 별 특징 단어 상위 10위 표시

In [None]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d

train_data_file = "data/all/train_6.txt"
data_train = pd.read_csv(train_data_file, sep='\t')
# regex=False is required: read as a regular expression, "[[문단]] " is the
# character class [[문단] followed by "] " and never matches the literal
# paragraph marker, so the marker would stay in the documents.
train_doc = data_train["document"].str.replace("[[문단]] ", "", regex=False)
train_label = data_train["label"]

# NOTE(review): ``Komoran`` is not imported anywhere in the visible notebook;
# presumably it is konlpy.tag.Komoran - confirm and import it before running.
parser = Komoran()

# One whitespace-joined string of morphemes per document, ready for CountVectorizer.
result_train = [' '.join(parser.morphs(doc)) for doc in train_doc]

vect = CountVectorizer()
X_train = vect.fit_transform(result_train)

In [None]:
# Sum the count matrix over axis 0 (documents), leaving one total per vocabulary column.
sum_words = X_train.sum(axis=0)

In [None]:
# Display the per-column totals as the cell output.
sum_words

In [None]:
# Display the shape of the totals; they are indexed as sum_words[0, idx] further down.
sum_words.shape

In [None]:
# Pair every vocabulary term with its total count, ordered from most to least frequent.
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in vect.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Display the (term, count) pairs in descending count order.
words_freq

In [None]:
# Number of (term, count) pairs, i.e. the number of entries in vect.vocabulary_.
len(words_freq)

In [None]:
# Fresh logistic regression with the same settings as the experiment loop's classifier.
clf = LogisticRegression(random_state=0, max_iter=5000)

In [None]:
# Fit on the count features and labels of the training file loaded above.
clf.fit(X_train, train_label)

In [None]:
# Display the class labels the fitted classifier knows about.
clf.classes_

In [None]:
# Keep the fitted coefficient matrix for the per-class ranking below and display it.
weight = clf.coef_
weight

In [None]:
# Display the shape of the coefficient matrix.
weight.shape

In [None]:
import numpy as np

In [None]:
# https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array
# Invert vect.vocabulary_ (term -> column) into column -> term for lookups by column index.
vocab_idx = {column: term for term, column in vect.vocabulary_.items()}

In [None]:
# Print, for every row of the coefficient matrix, the terms with the largest weights.
TOP_K = 10
# Iterate over the rows that actually exist instead of assuming exactly four.
# NOTE(review): ``i`` is the row index of clf.coef_; presumably it lines up with
# clf.classes_[i] - confirm if the labels are not 0..n-1.
for i, class_weights in enumerate(weight):
    print("\nLabel:", str(i))
    # argsort of the negated weights yields column indices in descending weight order.
    sel_weights = np.argsort(-class_weights)[:TOP_K]
    for w in sel_weights:
        print(vocab_idx[w])

In [None]:
# deep learning tokenizer: rebuild the features from tokenize_function's input_ids
encodings_train = train_doc.map(tokenize_function)
result_train = [' '.join(str(x) for x in each['input_ids']) for each in encodings_train]

# lowercase=False: the "words" are numeric token ids.
vect = CountVectorizer(lowercase=False)
X_train = vect.fit_transform(result_train)

# max_iter matches the classifier used in the experiment loop (5000), so the
# inspected model is trained with the same settings as the evaluated one.
clf = LogisticRegression(random_state=0, max_iter=5000)
clf.fit(X_train, train_label)

# Keep the coefficient matrix for the ranking below and display it.
weight = clf.coef_
weight

In [None]:
# Display the shape of the coefficient matrix fitted on the token-id features.
weight.shape

In [None]:
# Invert vect.vocabulary_ (token-id string -> column) into column -> token-id string.
vocab_idx = {column: term for term, column in vect.vocabulary_.items()}
TOP_K = 10
# Iterate over the rows that actually exist instead of assuming exactly four.
# NOTE(review): ``i`` is the row index of clf.coef_; presumably it lines up with
# clf.classes_[i] - confirm if the labels are not 0..n-1.
for i, class_weights in enumerate(weight):
    print("\nLabel:", str(i))
    # argsort of the negated weights yields column indices in descending weight order.
    sel_weights = np.argsort(-class_weights)[:TOP_K]
    for w in sel_weights:
        # The vocabulary entries are token ids stored as strings; turn each
        # back into text with the tokenizer.
        print(tokenizer.decode(int(vocab_idx[w])))