In [28]:
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

In [29]:
# 범용 라이브러리
import pandas as pd     # DataFrame, Series 및 데이터 분석
import numpy as np      # Array 및 
from tqdm import tqdm   # 진행상황 Progress Bar를 위한 tqdm library
import re               # Regular Expression 사용
import pickle           # 토큰화된 단어목록의 인덱스를 저장, 불러오기 위해 사용
from sklearn.model_selection import train_test_split

# 토크나이징, 인코딩 관련 라이브러리
from eunjeon import Mecab           # Mecab 형태소 분석기의 한국어+윈도우용 버전인 은전한닢 프로젝트
from collections import Counter     
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 그래프 관련 라이브러리
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import platform

# 한글 폰트 설정
if (platform.system() == 'Windows'):
    plt.rc('font', family='Malgun Gothic')
else:
    plt.rc('font', family='AppleGothic')

# 음수(-)가 깨지는 현상 방지
plt.rcParams['axes.unicode_minus'] = False 

In [47]:
from sklearn import preprocessing

In [30]:
# 한국표준산업분류 딕셔너리 불러오기

with open('./data/dictionary/digit_1_dict.pickle', 'rb') as handle:
    digit_1_dict = pickle.load(handle)

with open('./data/dictionary/digit_2_dict.pickle', 'rb') as handle:
    digit_2_dict = pickle.load(handle)

with open('./data/dictionary/digit_3_dict.pickle', 'rb') as handle:
    digit_3_dict = pickle.load(handle)

# 레이블 인코딩을 위한 산업분류 리스트, 데이터 프레임 만들기

digit_1_list = list(digit_1_dict.keys())
digit_1_df = pd.DataFrame([], columns=['digit_1'], index=[0])
for i in range(0, len(digit_1_dict)):
    digit_1_df.loc[i, 'digit_1'] = digit_1_list[i]

digit_2_list = list(digit_2_dict.keys())
digit_2_df = pd.DataFrame([], columns=['digit_2'], index=[0])
for i in range(0, len(digit_2_dict)):
    digit_2_df.loc[i, 'digit_2'] = digit_2_list[i]

digit_3_list = list(digit_3_dict.keys())
digit_3_df = pd.DataFrame([], columns=['digit_3'], index=[0])
for i in range(0, len(digit_3_dict)):
    digit_3_df.loc[i, 'digit_3'] = digit_3_list[i]

In [31]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [32]:
# 구두점, 오타(ㅋ, ㅡ 등 한글자) 제거

for col in tqdm(train[['text_obj', 'text_mthd', 'text_deal']].columns):
    train[col] = train[col].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-z ]',' ', regex=True).str.replace('[* , .]', ' ', regex=True)
    test[col] = test[col].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-z ]','', regex=True).str.replace('[* , .]', ' ', regex=True)

100%|██████████| 3/3 [00:42<00:00, 14.32s/it]


In [33]:
# 결측치를 ""로 대치 

train.fillna("",inplace=True)
test.fillna("", inplace=True)

In [34]:
# 작성해둔 한국어 불용어 사전 불러오기

stopwords = pd.read_csv("./data/stopwords/stopwords.csv", encoding='CP949')
stopwords = stopwords['stopwords']
print(stopwords, len(stopwords))

0        고
1        업
2        아
3        휴
4      아이구
      ... 
661      원
662      잘
663     통하
664     소리
665      놓
Name: stopwords, Length: 666, dtype: object 666


In [36]:
# train 데이터
train['text'] = train['text_obj'] + train['text_mthd'] + train['text_deal']
# test 데이터
test['text']  = test['text_obj'] + test['text_mthd'] + test['text_deal']

In [37]:
train['text'] = "[CLS] " + train['text'] + " [SEP]"

test['text'] = "[CLS] " + test['text'] + " [SEP]"

In [38]:
tokenizer = BertTokenizer.from_pretrained('klue/bert-base')

Downloading: 100%|██████████| 243k/243k [00:00<00:00, 290kB/s]  
Downloading: 100%|██████████| 125/125 [00:00<00:00, 41.6kB/s]
Downloading: 100%|██████████| 289/289 [00:00<00:00, 57.8kB/s]
Downloading: 100%|██████████| 425/425 [00:00<00:00, 106kB/s]


In [42]:
def convert_examples_to_features(examples, labels, max_seq_len, tokenizer):
    
    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []
    
    for example, label in tqdm(zip(examples, labels), total=len(examples)):
        input_id = tokenizer.encode(example, max_length=max_seq_len, pad_to_max_length=True)
        padding_count = input_id.count(tokenizer.pad_token_id)
        attention_mask = [1] * (max_seq_len - padding_count) + [0] * padding_count
        token_type_id = [0] * max_seq_len

        assert len(input_id) == max_seq_len, "Error with input length {} vs {}".format(len(input_id), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        data_labels.append(label)

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

In [44]:
max_seq_len = 128

In [52]:
y_train_1 = pd.DataFrame(train['digit_1'])
y_train_2 = pd.DataFrame(train['digit_2'])
y_train_3 = pd.DataFrame(train['digit_3'])

In [53]:
# 대분류 원-핫 인코딩

ohe1 = preprocessing.OneHotEncoder(sparse=False, dtype=int)
ohe1.fit(digit_1_df)

y_train_1 = ohe1.transform(y_train_1)

# ohe1 객체에 담긴 인코딩 정보가 ohe1.pickle에 저장
with open('./data/mecab/ohe1.pickle', 'wb') as handle:
    pickle.dump(ohe1, handle)


# 중분류 원-핫 인코딩

ohe2 = preprocessing.OneHotEncoder(sparse=False, dtype=int)
ohe2.fit(digit_2_df)

y_train_2 = ohe2.transform(y_train_2)

# ohe2 객체에 담긴 인코딩 정보가 ohe2.pickle에 저장
with open('./data/mecab/ohe2.pickle', 'wb') as handle:
    pickle.dump(ohe2, handle)


# 소분류 원-핫 인코딩

ohe3 = preprocessing.OneHotEncoder(sparse=False, dtype=int)
ohe3.fit(digit_3_df)

y_train_3 = ohe3.transform(y_train_3)

# ohe3 객체에 담긴 인코딩 정보가 ohe3.pickle에 저장
with open('./data/mecab/le3.pickle', 'wb') as handle:
    pickle.dump(ohe3, handle)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [49]:
train_X, train_y_1 = convert_examples_to_features(train['text'], y_train_1, max_seq_len=max_seq_len, tokenizer=tokenizer)

100%|██████████| 1000000/1000000 [06:05<00:00, 2735.62it/s]


ValueError: invalid literal for int() with base 10: 'S'

In [None]:
test_X, test_y = convert_examples_to_features(test['text'], test['digit_1'], max_seq_len=max_seq_len, tokenizer=tokenizer)