In [2]:
from kobert import get_pytorch_kobert_model
from kobert_tokenizer import KoBERTTokenizer

tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
model, vocab  = get_pytorch_kobert_model()

using cached model. /home/bigdata/Desktop/Research/hk/competition/SDC/.cache/kobert_v1.zip
using cached model. /home/bigdata/Desktop/Research/hk/competition/SDC/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [3]:
import re
import gluonnlp as nlp
import numpy as np
import pandas as pd
import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# torch
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import BertModel

#GPU 사용 시
device = torch.device("cuda:0")

In [4]:
df = pd.read_table('data/1. 실습용자료.txt', sep='|', encoding='cp949')
df.fillna('', inplace=True)
df['text'] = df['text_obj'] + ' ' + df['text_mthd'] + ' ' + df['text_deal']

clean = re.compile("[^ㄱ-힣 ]")
df['text'] = df['text'].apply(lambda x: clean.sub(' ', str(x)))

In [5]:
test = df.copy()
test['target'] = df[['digit_1', 'digit_2', 'digit_3']].apply(lambda x: ' '.join(x.values.astype(str)), axis=1)

In [6]:
label_df = pd.read_excel('data/한국표준산업분류(10차)_국문.xlsx', header=2)
label_df = label_df.iloc[:, range(0,5,2)]
label_df = label_df.fillna(method='ffill')
label_df.drop_duplicates(inplace=True)
label_df.reset_index(drop=True, inplace=True)
label_df[['코드.1', '코드.2']] = label_df[['코드.1', '코드.2']].astype(int)
label_df['target'] = label_df[['코드', '코드.1', '코드.2']].apply(lambda x: ' '.join(x.values.astype(str)), axis=1)

In [7]:
label_dict = {value: str(idx) for idx, value in enumerate(label_df['target'])}

In [8]:
data_list = [[text, label_dict[label]] for text, label in zip(test['text'], test['target'])]

In [9]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair):
        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)
        
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
         
    def __len__(self):
        return (len(self.labels))

In [10]:
# Setting parameters
seed = 3413
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 2500
learning_rate =  5e-5

In [11]:
dataset_train, dataset_test = train_test_split(data_list, test_size=0.2, shuffle=True, random_state=seed)

In [12]:
dataset_train

[['학원에서 중고생을 대상으로 수학 교습', '207'],
 ['노래방 노래연습시설갖추고 노래연습장', '219'],
 ['교회 기독교계통종교활동 종교서비스', '222'],
 ['사무실 고객상대로 생명보험 텔레마케팅 ', '193'],
 ['개별화물자동차로 고객의 요청으로 화물운송서비스', '139'],
 ['주차장에서 일반인대상으로 주차서비스', '147'],
 ['기업가치평의회 기업가치 평가를 위한 여론조사', '178'],
 ['목재 절단 조립하여 씽크대제작판매', '93'],
 ['피부과에서 왜래환자대상으로 피부진료', '211'],
 ['옥수수 찹쌀 엿기름 가열 농축 엿 조청', '20'],
 ['사업장에서 임대통하여 사무실임대', '171'],
 ['매장에서 소매업자를 대상으로 의류 잡화', '123'],
 ['중학교 중학생에게 중등교육서비스', '204'],
 ['음식점에서 접객시설을 갖추고 곰탕', '150'],
 ['매장에서 일반소비자에게소매 남성의류', '131'],
 ['개인택시로 일반인을 대상으로 승객운송서비스', '138'],
 ['사업장에서 일반소비자에게 소매 가전제품 소매', '130'],
 ['축협에서 일반 고객을 대상으로 대출및이자지급', '163'],
 ['매장에서 산업사용자에게 도매 자동차내장용품 네비게이션', '118'],
 ['접객시설을 갖추고 음식점에서 국수', '150'],
 ['개인택시로 일반인을 대상으로 승객운송서비스', '138'],
 ['중학교에서 중학생을 대상으로 중등 교육서비스', '204'],
 ['미장원 두발미용 서비스 제공 파마  컷트  염색', '226'],
 ['미장원에서 두발미용서비스 제공 컷트 파마', '226'],
 ['주점에서 접객요원을두고 맥주 양주', '151'],
 ['학원에서 학생대상으로 영어교육', '207'],
 ['피자집에서 접객시설을 갖추고 피자', '150'],
 ['상가에서 소매상인에게 의류잡화', '128'],
 ['음식점에서 접객시설갖추고 백숙', '150'],
 ['

In [None]:
tok = tokenizer.tokenize
data_train = BERTDataset(dataset_train, 0, 1, tok, vocab, max_len, True, False)
data_test = BERTDataset(dataset_test,0, 1, tok, vocab,  max_len, True, False)

In [None]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

In [None]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=len(label_dict),   ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict=False)
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [None]:
# BERT 모델 불러오기
model = BERTClassifier(model, dr_rate=0.5).to(device)
 
# optimizer와 schedule 설정
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# 정확도 측정을 위한 함수 정의
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc
    
train_dataloader

In [None]:
train_history=[]
test_history=[]
loss_history=[]

for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm.notebook.tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
         
        #print(label.shape,out.shape)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
            train_history.append(train_acc / (batch_id+1))
            loss_history.append(loss.data.cpu().numpy())
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    #train_history.append(train_acc / (batch_id+1))
    
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm.notebook.tqdm(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    test_history.append(test_acc / (batch_id+1))

In [None]:
def predict(predict_sentence):

    data = [predict_sentence, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=5)
    
    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)

        valid_length = valid_length
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids).detach().cpu().numpy()
        out = np.argmax(out)
                    
    return out

In [None]:
test_df = pd.read_table('data/2. 모델개발용자료.txt', sep='|', encoding='cp949')
test_df.fillna('', inplace=True)
test_df['text'] = test_df['text_obj'] + ' ' + test_df['text_mthd'] + ' ' + test_df['text_deal']

clean = re.compile("[^ㄱ-힣 ]")
test_df['text'] = test_df['text'].apply(lambda x: clean.sub(' ', str(x)))

In [None]:
test_text = test_df['text'].tolist()
test_df['predict'] = test_df['text'].apply(lambda x: predict(x))

In [None]:
# test_df['predict'] = test_df['predict'].astype(str)

In [None]:
test_df['result'] = test_df['predict'].apply(lambda x: predict_to_result(x))

In [None]:
label_dict_reverse= dict(map(reversed, label_dict.items()))

In [None]:
[label_dict_reverse[i] for i in test_df['predict'].tolist()[:3]]

In [None]:
test_df['result'] = test_df['predict'].apply(lambda x: label_dict_reverse[x])

In [None]:
test_df.to_csv('predict.csv', index=False)

In [None]:
df = pd.read_csv('predict.csv')
df['digit_1'] = df['result'].apply(lambda x: x.split()[0])
df['digit_2'] = df['result'].apply(lambda x: x.split()[1])
df['digit_3'] = df['result'].apply(lambda x: x.split()[2])
df = df.iloc[:, :-3]
df.to_csv('submission.csv', index=False)