In [None]:
# Install the third-party packages this notebook imports.
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
# NOTE(review): transformers is pinned to 3.0.2. BERTClassifier.forward in this
# notebook unpacks the encoder output as a 2-tuple, which presumably relies on
# this version's return type -- confirm before upgrading.
!pip install transformers==3.0.2
!pip install torch

# KoBERT is installed from the master branch of its GitHub repository.
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
plt.rc('font', family='malgun gothic')
plt.rc('axes', unicode_minus=False)
import seaborn as sns
import os
import re
import missingno as msno
import pickle
from glob import glob
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

import random
import torch.backends.cudnn as cudnn

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# Directories used throughout the notebook:
#   data_path - raw input files and the answer-sheet template are read from here
#   sub_path  - the final submission CSV is written here
#   pre_path  - saved models and intermediate prediction CSVs are written here
# NOTE(review): these are absolute paths (presumably a mounted Google Drive);
# adjust them when running elsewhere.
data_path = "/content/drive/MyDrive/통계청/data/"
sub_path = "/content/drive/MyDrive/통계청/sub/"
pre_path = "/content/drive/MyDrive/통계청/kobert_v1/"

# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# so this cell no longer crashes on a machine without a GPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print(use_cuda)
if use_cuda:
    print(torch.cuda.get_device_name(0))

# Load the KoBERT model and its vocabulary
bertmodel, vocab = get_pytorch_kobert_model()

In [None]:
# Seed every random number generator used in this notebook with the same value
for seed_fn in (random.seed, np.random.seed, torch.manual_seed,
                torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(4346)

# Disable cuDNN benchmarking and request deterministic cuDNN algorithms
cudnn.deterministic = True
cudnn.benchmark = False

In [None]:
# Both text dumps are pipe-delimited and cp949-encoded
train = pd.read_csv(data_path + "1. 실습용자료.txt", sep='|', encoding='cp949')
test = pd.read_csv(data_path + "2. 모델개발용자료.txt", sep='|', encoding='cp949')
# Classification table, read without a header row so its columns are numbered 0, 1, 2, ...
code = pd.read_excel(data_path + "한국표준산업분류.xlsx", header = None)

# Preprocessing

In [None]:
# Missing values become empty strings because the text fields are concatenated below
train = train.fillna("")
test = test.fillna("")

# Join the three text fields into one sentence and trim the outer whitespace
train["sen"] = (train["text_obj"] + " " + train["text_mthd"] + " " + train["text_deal"]).str.strip()
test["sen"] = (test["text_obj"] + " " + test["text_mthd"] + " " + test["text_deal"]).str.strip()

# Integer label for digit_1: 'A' -> 0, 'B' -> 1, ...
train["digit_1_label"] = train["digit_1"].apply(lambda x : ord(x) - ord("A"))

# (code, name) lookup tables for digit_1 / digit_2 / digit_3.
# Rows before index label 3 are skipped (presumably spreadsheet header rows).
# Each table takes its column pair from the sliced frame and drops rows whose
# code cell is empty, so the row filter is computed on the same rows it is
# applied to instead of relying on implicit reindexing of a mask built from
# the full frame.
code_rows = code.loc[3:]
digit1_df = code_rows[[0, 1]].dropna(subset=[0]).reset_index(drop = True).rename(columns = {0 : "digit_1", 1 : "digit_1_text"})
digit2_df = code_rows[[2, 3]].dropna(subset=[2]).reset_index(drop = True).rename(columns = {2 : "digit_2", 3 : "digit_2_text"})
digit3_df = code_rows[[4, 5]].dropna(subset=[4]).reset_index(drop = True).rename(columns = {4 : "digit_3", 5 : "digit_3_text"})

# Keep only the part of the digit_1 name before the first "("
digit1_df["digit_1_text"] = digit1_df["digit_1_text"].apply(lambda x : x.split("(")[0])

# digit_1 예측

In [None]:
# Convert the data into [sentence, label] pairs (the label is stored as a string)
data_list = [
    [sentence, str(label_id)]
    for sentence, label_id in zip(train['sen'], train['digit_1_label'])
]

from sklearn.model_selection import train_test_split
# 70 / 30 train / validation split with a fixed seed
train_set, val_set = train_test_split(data_list, random_state=0, test_size=0.3)
print(len(train_set))
print(len(val_set))

In [None]:
# Dataset class that prepares the samples fed to BERT
class BERTDataset(Dataset):
    """Pre-tokenised dataset of (sentence features..., label) tuples.

    Args:
        dataset: Sequence of records; each record is indexable.
        sent_idx: Position of the sentence inside a record.
        label_idx: Position of the label inside a record.
        bert_tokenizer: Tokenizer handed to ``nlp.data.BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length`` of the transform.
        pad: Forwarded as ``pad`` of the transform.
        pair: Forwarded as ``pair`` of the transform.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # Every sentence is transformed eagerly, one at a time.
        self.sentences = []
        for record in dataset:
            self.sentences.append(to_features([record[sent_idx]]))

        # Labels are converted to np.int32 (string labels such as "3" are accepted).
        self.labels = []
        for record in dataset:
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        # The transform output is a tuple; the label is appended as its last item.
        features = self.sentences[i]
        return features + (self.labels[i],)

    def __len__(self):
        return len(self.labels)

In [None]:
# Setting parameters
max_len = 64  # passed to BERTDataset, which forwards it as max_seq_length of the sentence transform
batch_size = 16  # batch size of every DataLoader in this notebook
warmup_ratio = 0.1  # fraction of the total training steps used as scheduler warm-up
num_epochs = 5  # number of passes over the training loader
max_grad_norm = 1  # max norm handed to clip_grad_norm_ in the training loops
log_interval = 400  # loss / running accuracy are printed every this many batches
learning_rate =  5e-5  # learning rate given to AdamW

In [None]:
# Tokenisation and dataset construction
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Records are [sentence, label]; inputs are padded single sentences
data_train = BERTDataset(train_set, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         max_len=max_len, pad=True, pair=False)
# data_test holds the validation split
data_test = BERTDataset(val_set, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                        max_len=max_len, pad=True, pair=False)

print(data_train[0])

In [None]:
# Batch the training and validation datasets (default order, no shuffling)
train_dataloader = DataLoader(dataset=data_train, batch_size=batch_size)
val_dataloader = DataLoader(dataset=data_test, batch_size=batch_size)

In [None]:
# Model used for classification

class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder module. It is called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keyword arguments and must return a 2-tuple
            whose second item is the pooled output fed to the classifier.
        hidden_size: Width of the pooled output (input size of the linear head).
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output. Dropout is
            skipped when this is ``None`` or ``0``.
        params: Unused; kept so existing call sites keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=19,   ## adjust the number of classes ##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        elsewhere. The mask lives on the device of ``token_ids``.
        """
        # One broadcast comparison replaces the per-row Python loop.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch of token ids."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask.float().to(token_ids.device))
        # Start from the pooled output so the head also works when dropout is
        # disabled; previously `out` was unbound in that case.
        out = pooler
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [None]:
# Build the classifier on top of the loaded BERT model
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

# Optimizer and schedule setup.
# Parameters whose name contains one of the `no_decay` fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
decayed_params, undecayed_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)
optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per batch: total steps = batches per epoch * epochs
t_total = num_epochs * len(train_dataloader)
warmup_step = int(warmup_ratio * t_total)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total,
)

In [None]:
# Helper used to measure accuracy
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max along dim 1 equals ``Y``."""
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

In [None]:
# Fine-tune for `num_epochs` epochs. After every epoch the mean of the
# per-batch accuracies is printed for the training and the validation loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    # Validation only reads the outputs, so no autograd graph is recorded.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Persist the whole fine-tuned digit_1 model object (not only a state_dict) under pre_path
torch.save(model, pre_path +'for_digit_1_model_epoch5.pt')


In [None]:
# BERTDataset expects [sentence, label] records, so every test sentence gets the dummy label '0'
test_list = [[sentence, '0'] for sentence in test['sen']]


# Tokenisation and data loading
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

test_set = BERTDataset(test_list, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                       max_len=max_len, pad=True, pair=False)
test_dataloader = DataLoader(dataset=test_set, batch_size=batch_size, num_workers=5)

In [None]:
# Predicted digit_1 class index for every test row, in loader order
digit_1_pred = []
model.eval()

# Inference only: no autograd graph is needed. The dummy label coming from the
# loader is ignored.
with torch.no_grad():
    for token_ids, valid_length, segment_ids, _ in tqdm_notebook(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)

        # Arg-max over the class axis for the whole batch with a single
        # device-to-host copy instead of one copy per row.
        digit_1_pred.extend(np.argmax(out.detach().cpu().numpy(), axis=1))

In [None]:
# Load the answer-sheet template, fill in the predicted letters (0 -> 'A', 1 -> 'B', ...)
# and store the intermediate result under pre_path
result = pd.read_csv(data_path + "답안 작성용 파일.csv", encoding = "cp949")
result["digit_1"] = [chr(class_index + 65) for class_index in digit_1_pred]
result.to_csv(pre_path + "digit_1_pred_epoch5.csv", index = False)

# digit_2 예측

In [None]:
# Reload the digit_1 predictions written at the end of the previous section and display them
result = pd.read_csv(pre_path + "digit_1_pred_epoch5.csv")
result

In [None]:
# Test rows use the predicted digit_1 code; training rows use the true one
test["digit_1"] = result["digit_1"]

# digit_1 code -> item name lookup taken from the classification table
digit_1_names = dict(zip(digit1_df["digit_1"], digit1_df["digit_1_text"]))
train["digit_1_name"] = train["digit_1"].map(digit_1_names)
test["digit_1_name"] = test["digit_1"].map(digit_1_names)

# Input text for the digit_2 model: sentence followed by the digit_1 item name
for frame in (train, test):
    frame["for_digit_2"] = frame["sen"] + " " + frame["digit_1_name"]

In [None]:
# Preview the first five training rows
train.head(5)

In [None]:
# Preview the first five test rows
test.head(5)

In [None]:
# Map every distinct digit_2 code to a class index (its position in sorted order).
# The sorted list is built once instead of being re-sorted and searched for every key.
digit_2_label = {code_value: class_index
                 for class_index, code_value in enumerate(sorted(train.digit_2.unique()))}

# [text, label] pairs; the label is the class index stored as a string
data_list = [
    [q, str(digit_2_label[label])]
    for q, label in zip(train['for_digit_2'], train['digit_2'])
]

from sklearn.model_selection import train_test_split
# 70 / 30 train / validation split with a fixed seed
train_set, val_set = train_test_split(data_list, test_size=0.3, random_state=0)
print(len(train_set))
print(len(val_set))

In [None]:
# Tokenisation and dataset construction
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Records are [text, label]; inputs are padded single sentences
data_train = BERTDataset(train_set, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         max_len=max_len, pad=True, pair=False)
# data_test holds the validation split
data_test = BERTDataset(val_set, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                        max_len=max_len, pad=True, pair=False)

print(data_train[0])

In [None]:
# Batch the training and validation datasets (default order, no shuffling)
train_dataloader = DataLoader(dataset=data_train, batch_size=batch_size)
val_dataloader = DataLoader(dataset=data_test, batch_size=batch_size)

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder module. It is called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keyword arguments and must return a 2-tuple
            whose second item is the pooled output fed to the classifier.
        hidden_size: Width of the pooled output (input size of the linear head).
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output. Dropout is
            skipped when this is ``None`` or ``0``.
        params: Unused; kept so existing call sites keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=74,   ## adjust the number of classes ##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        elsewhere. The mask lives on the device of ``token_ids``.
        """
        # One broadcast comparison replaces the per-row Python loop.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch of token ids."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask.float().to(token_ids.device))
        # Start from the pooled output so the head also works when dropout is
        # disabled; previously `out` was unbound in that case.
        out = pooler
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [None]:
# Build the digit_2 classifier. The output layer is sized from the label map so
# it always matches the number of distinct digit_2 codes found in the training
# data instead of depending on the class-level default.
# NOTE(review): `bertmodel` is the same encoder object that the digit_1 stage
# already fine-tuned, so this stage starts from those weights rather than from
# the freshly loaded ones -- confirm this is intended.
model = BERTClassifier(bertmodel, num_classes=len(digit_2_label), dr_rate=0.5).to(device)

# Optimizer and schedule setup.
# Parameters whose name contains one of the `no_decay` fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per batch: total steps = batches per epoch * epochs
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
# Fine-tune for `num_epochs` epochs. After every epoch the mean of the
# per-batch accuracies is printed for the training and the validation loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    # Validation only reads the outputs, so no autograd graph is recorded.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Persist the whole fine-tuned digit_2 model object (not only a state_dict) under pre_path
torch.save(model, pre_path + "for_digit_2_model_epoch5.pt")

In [None]:
# BERTDataset expects [text, label] records, so every test text gets the dummy label '0'
test_list = [[text, '0'] for text in test['for_digit_2']]


# Tokenisation and data loading
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

test_set = BERTDataset(test_list, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                       max_len=max_len, pad=True, pair=False)
test_dataloader = DataLoader(dataset=test_set, batch_size=batch_size, num_workers=5)

In [None]:
# Predicted digit_2 class index for every test row, in loader order
digit_2_pred = []
model.eval()

# Inference only: no autograd graph is needed. The dummy label coming from the
# loader is ignored.
with torch.no_grad():
    for token_ids, valid_length, segment_ids, _ in tqdm_notebook(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)

        # Arg-max over the class axis for the whole batch with a single
        # device-to-host copy instead of one copy per row.
        digit_2_pred.extend(np.argmax(out.detach().cpu().numpy(), axis=1))

In [None]:
# Invert the label map (class index -> original digit_2 code) and decode the predictions
digit_2_to_org = {class_index: original for original, class_index in digit_2_label.items()}
result["digit_2"] = pd.Series(digit_2_pred, index=result.index).map(digit_2_to_org)
# NOTE(review): the file name ends in ".csv_epoch5"; it is kept unchanged
# because the digit_3 section reads exactly this name.
result.to_csv(pre_path + "digit_2_pred.csv_epoch5", index = False)

# digit_3 예측

In [None]:
# Reload the digit_2 predictions written at the end of the previous section
# and attach them to the test rows
result = pd.read_csv(pre_path + "digit_2_pred.csv_epoch5")
test["digit_2"] = result["digit_2"]

In [None]:
# Converts a digit_2 code from digit2_df to an int; single-digit codes are
# stored as zero-padded strings ("01", "02", "03", ...)
def to_int(x):
  """Return ``x`` as an ``int``.

  ``int()`` already ignores leading zeros in a decimal string, so "01" -> 1 and
  "10" -> 10. Unlike the earlier character-indexing version this also handles
  "0", strings with several leading zeros, and values that are already
  integers (which makes re-running the cell that applies it safe).
  """
  return int(x)

# Apply the conversion to the lookup table
digit2_df["digit_2"] = digit2_df["digit_2"].map(to_int)

# digit_2 code -> item name lookup, used to label both frames
digit_2_names = dict(zip(digit2_df["digit_2"], digit2_df["digit_2_text"]))
train["digit_2_name"] = train["digit_2"].map(digit_2_names)
test["digit_2_name"] = test["digit_2"].map(digit_2_names)

# Remove ";" from the digit_2 item names
train["digit_2_name"] = train["digit_2_name"].map(lambda name: name.replace(";", ""))
test["digit_2_name"] = test["digit_2_name"].map(lambda name: name.replace(";", ""))

# Input text for the digit_3 model: sentence + digit_1 item name + digit_2 item name
train["for_digit_3"] = train["for_digit_2"] + " " + train["digit_2_name"]
test["for_digit_3"] = test["for_digit_2"] + " " + test["digit_2_name"]

In [None]:
# Map every distinct digit_3 code to a class index (its position in sorted order).
# The sorted list is built once instead of being re-sorted and searched for every key.
digit_3_label = {code_value: class_index
                 for class_index, code_value in enumerate(sorted(train.digit_3.unique()))}

# [text, label] pairs; the label is the class index stored as a string
data_list = [
    [q, str(digit_3_label[label])]
    for q, label in zip(train['for_digit_3'], train['digit_3'])
]

from sklearn.model_selection import train_test_split
# 70 / 30 train / validation split with a fixed seed
train_set, val_set = train_test_split(data_list, test_size=0.3, random_state=0)
print(len(train_set))
print(len(val_set))

In [None]:
# Tokenisation and dataset construction
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Records are [text, label]; inputs are padded single sentences
data_train = BERTDataset(train_set, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                         max_len=max_len, pad=True, pair=False)
# data_test holds the validation split
data_test = BERTDataset(val_set, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                        max_len=max_len, pad=True, pair=False)

print(data_train[0])

In [None]:
# Batch the training and validation datasets (default order, no shuffling)
train_dataloader = DataLoader(dataset=data_train, batch_size=batch_size)
val_dataloader = DataLoader(dataset=data_test, batch_size=batch_size)

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder module. It is called with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` keyword arguments and must return a 2-tuple
            whose second item is the pooled output fed to the classifier.
        hidden_size: Width of the pooled output (input size of the linear head).
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output. Dropout is
            skipped when this is ``None`` or ``0``.
        params: Unused; kept so existing call sites keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=225,   ## adjust the number of classes ##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        elsewhere. The mask lives on the device of ``token_ids``.
        """
        # One broadcast comparison replaces the per-row Python loop.
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device).unsqueeze(0)
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the class logits for a batch of token ids."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids,
                              token_type_ids = segment_ids.long(),
                              attention_mask = attention_mask.float().to(token_ids.device))
        # Start from the pooled output so the head also works when dropout is
        # disabled; previously `out` was unbound in that case.
        out = pooler
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [None]:
# Build the digit_3 classifier. The output layer is sized from the label map so
# it always matches the number of distinct digit_3 codes found in the training
# data instead of depending on the class-level default.
# NOTE(review): `bertmodel` is the same encoder object that the earlier stages
# already fine-tuned, so this stage starts from those weights rather than from
# the freshly loaded ones -- confirm this is intended.
model = BERTClassifier(bertmodel, num_classes=len(digit_3_label), dr_rate=0.5).to(device)

# Optimizer and schedule setup.
# Parameters whose name contains one of the `no_decay` fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step per batch: total steps = batches per epoch * epochs
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
# Fine-tune for `num_epochs` epochs. After every epoch the mean of the
# per-batch accuracies is printed for the training and the validation loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    # Validation only reads the outputs, so no autograd graph is recorded.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(val_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Persist the whole fine-tuned digit_3 model object (not only a state_dict) under pre_path
torch.save(model, pre_path + "for_digit_3_model_epoch5.pt") 

In [None]:
# Reload the digit_3 model from the file written by the preceding save cell.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
model = torch.load(pre_path + "for_digit_3_model_epoch5.pt") 

In [None]:
# BERTDataset expects [text, label] records, so every test text gets the dummy label '0'
test_list = [[text, '0'] for text in test['for_digit_3']]


# Tokenisation and data loading
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

test_set = BERTDataset(test_list, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                       max_len=max_len, pad=True, pair=False)
test_dataloader = DataLoader(dataset=test_set, batch_size=batch_size, num_workers=5)

In [None]:
# Predicted digit_3 class index for every test row, in loader order
digit_3_pred = []
model.eval()

# Inference only: no autograd graph is needed. The dummy label coming from the
# loader is ignored.
with torch.no_grad():
    for token_ids, valid_length, segment_ids, _ in tqdm_notebook(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)

        # Arg-max over the class axis for the whole batch with a single
        # device-to-host copy instead of one copy per row.
        digit_3_pred.extend(np.argmax(out.detach().cpu().numpy(), axis=1))

In [None]:
# Invert the label map: class index -> original digit_3 code
digit_3_to_org = {class_index: original for original, class_index in digit_3_label.items()}

# The test frame keeps the raw class indices; the answer sheet gets the decoded codes
test["digit_3"] = digit_3_pred
result["digit_3"] = pd.Series(digit_3_pred, index=result.index).map(digit_3_to_org)
# Final submission file
result.to_csv(sub_path + "DL_kobert_v1_epoch5_220401.csv", index = False)