In [None]:
import datetime
import glob
import itertools
import os
import pickle
import re

import numpy as np
import pandas as pd
import pytz
import torch
import transformers
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaConfig

import utils

In [None]:
from torch import cuda
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# Root folder that holds the patent data
DATA_DIR = os.path.join('/media', 'eunbinpark', 'eunbin')
# One sub-folder per data type, joined onto the data root
# e.g. '/media/eunbinpark/eunbin/ipc_files'
dir_names = [os.path.join(DATA_DIR, sub_dir) for sub_dir in ('ipc_files', 'title_files', 'claim_files')]

# Bare file names found in the IPC, title and claim folders (in that order)
# e.g. ['129749732B1.txt', '2490179012B2.txt']
ipc_list, title_list, claim_list = (os.listdir(folder) for folder in dir_names)
# Show the three folder sizes side by side
len(ipc_list), len(title_list), len(claim_list)

# 길이가 다르기 때문에 Claim 파일 기준으로 진행

(21974753, 21974751, 4204709)

In [None]:
# Keep only the patents that have all three files (claims, title, IPC), matched by file name.
# basename() keeps this cell re-runnable: after the first run claim_list holds full paths,
# and intersecting those directly with the bare names in title_list / ipc_list would be empty.
intersection_of_files = (
    {os.path.basename(path) for path in claim_list} & set(title_list) & set(ipc_list)
)

# Turn every shared file name into its full claim-file path.
# sorted() gives a stable order; iterating a set of strings can differ between interpreter runs,
# which would make any later split of this list non-reproducible.
claim_list = [os.path.join(DATA_DIR, 'claim_files', filename) for filename in sorted(intersection_of_files)]
# e.g. ['/media/eunbinpark/eunbin/claim_files/29847179123B2.txt', ...]

In [None]:
# Fixed random_state so the 80/20 split is the same on every run
train_file_list, test_file_list = train_test_split(claim_list, test_size=0.2, random_state=42)

In [None]:
# Number of files in the train and test splits
tuple(len(split) for split in (train_file_list, test_file_list))

(340036, 85009)

In [None]:
# Load the pre-built dict that maps each text label to its integer index
# e.g. {"G38F": 234, "A34Q": 394}
with open('./dataset/label_encoding_target.pkl', mode='rb') as f:
    target_dict = pickle.load(f)

# Number of distinct labels
len(target_dict)

634

In [None]:
class DataTransform(Dataset):
    """Dataset that turns one patent (title + claims + IPC codes) into model inputs.

    Each item is looked up from a claim-file path. The title and IPC files with the
    same file name are read from ``<data_dir>/title_files`` and ``<data_dir>/ipc_files``.

    Args:
        file_list: sequence of claim-file paths; one item per path.
        target_dict: mapping from the 4-character IPC prefix to an integer label index.
            The multi-hot target has ``len(target_dict)`` entries.
        tokenizer: object exposing ``encode_plus`` (e.g. an XLM-RoBERTa tokenizer).
        max_len: sequence length the text is padded / truncated to.
        data_dir: root folder containing ``title_files`` and ``ipc_files``.
            Defaults to ``DEFAULT_DATA_DIR`` (the location originally hard-coded here).
    """

    # Root folder used when ``data_dir`` is not given.
    DEFAULT_DATA_DIR = os.path.join('/media', 'eunbinpark', 'eunbin')
    # Only the first MAX_LABELS IPC entries of a patent are used as targets.
    MAX_LABELS = 5

    def __init__(self, file_list, target_dict, tokenizer, max_len, data_dir=None):
        self.file_list = file_list
        self.target_dict = target_dict
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data_dir = self.DEFAULT_DATA_DIR if data_dir is None else data_dir

    def __len__(self):
        """Return the number of patents (claim files) in the dataset."""
        return len(self.file_list)

    @staticmethod
    def _read_text(filepath):
        """Return the whole file content as a single string."""
        with open(filepath, 'r') as f:
            return f.read()

    @staticmethod
    def _read_lines(filepath):
        """Return the file content as a list of whitespace-stripped lines."""
        with open(filepath, 'r') as f:
            return [line.strip() for line in f]

    def _read_ipc_labels(self, filepath):
        """Return the label indices for the first ``MAX_LABELS`` IPC lines of a file.

        Every line looks like ``"A39V 01/03"``; spaces are removed and the first four
        characters are looked up in ``self.target_dict``.

        Raises:
            KeyError: if a non-blank line yields a prefix missing from ``target_dict``.
        """
        labels = list()
        with open(filepath, 'r') as f:
            for line in f:
                code = line.strip().replace(" ", "")[:4]
                if not code:
                    # Blank lines (e.g. a trailing newline) carry no label
                    continue
                labels.append(self.target_dict[code])
        return labels[:self.MAX_LABELS]

    def __getitem__(self, idx):
        """Build the tokenized input and multi-hot target for item ``idx``.

        Returns:
            dict with long tensors ``ids``, ``mask`` and ``token_type_ids`` (taken from
            the tokenizer output) and ``one_hot`` (length ``len(target_dict)``).
        """
        # The claim-file path identifies the patent; its file name is shared by
        # the matching title and IPC files.
        claim_filepath = self.file_list[idx]
        filename = os.path.basename(claim_filepath)

        title_filepath = os.path.join(self.data_dir, 'title_files', filename)
        ipc_filepath = os.path.join(self.data_dir, 'ipc_files', filename)

        title_data = self._read_text(title_filepath)
        claim_data = self._read_lines(claim_filepath)
        targets = self._read_ipc_labels(ipc_filepath)

        # Multi-hot target sized by the dataset's own label dict (not a notebook global),
        # e.g. [0, 0, 1, 0, 1]
        one_hot = np.zeros(len(self.target_dict))
        for label_idx in targets:
            one_hot[label_idx] = 1

        # Title followed by all claim lines, separated by single spaces
        all_text = title_data + ' ' + ' '.join(claim_data)

        # Tokenize: add special tokens, truncate to max_len and pad up to max_len.
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        inputs = self.tokenizer.encode_plus(
            all_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            truncation=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'one_hot': torch.tensor(one_hot, dtype=torch.long),
        }



In [None]:
# Training configuration
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 3e-05
MODEL_NAME = 'xlm-roberta-base'

# Tokenizer that belongs to the xlm-roberta model being fine-tuned
xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)


train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation does not need shuffling; a fixed order keeps the progress logs comparable between runs
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

In [None]:
# Wrap the train / test file lists in datasets and feed each one to a DataLoader
training = DataTransform(
    file_list=train_file_list,
    target_dict=target_dict,
    tokenizer=xlmroberta_tokenizer,
    max_len=MAX_LEN,
)
loaded_train_data = DataLoader(dataset=training, **train_params)

testing = DataTransform(
    file_list=test_file_list,
    target_dict=target_dict,
    tokenizer=xlmroberta_tokenizer,
    max_len=MAX_LEN,
)
loaded_test_data = DataLoader(dataset=testing, **test_params)

In [None]:
# Instantiate the pretrained model with a classification head
model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(target_dict),  # num_labels -> number of target classes (y)
)
# model.load_state_dict(torch.load('./model/20211215-xlm-roberta-base.pt.pt'))
model.to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, averaged over all elements.

    Args:
        outputs: logits tensor.
        targets: float tensor of the same shape as ``outputs`` with the target values.

    Returns:
        Scalar loss tensor.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)


# Adam optimizer over all model parameters, learning rate LEARNING_RATE (3e-05)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:

def train(epoch, num=1000):
    """Run one training epoch over ``loaded_train_data`` and print running metrics.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn``, ``device`` and
    ``loaded_train_data``.

    The reported "accuracy" is the share of true labels that appear among the
    sample's own top-5 logits (hits / number of true labels, in percent). Hits are
    counted per sample; a label predicted for one patent never matches the true
    label of another patent in the same batch.

    Args:
        epoch: epoch number, used only in the printed summary.
        num: print the running loss / accuracy every ``num`` steps.

    Returns:
        None. Results are printed.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0  # running count of true labels seen so far
    model.train()
    for step, data in enumerate(loaded_train_data):
        # Move the batch to the training device
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        one_hot = data['one_hot'].to(device, dtype=torch.float32)

        # Forward pass and loss
        outputs = model(ids, mask)
        logits = outputs.logits
        loss = loss_fn(logits, one_hot)
        tr_loss += loss.item()

        # Top-k hit counting; metrics need no gradient
        with torch.no_grad():
            k = min(5, logits.size(1))  # topk fails when k exceeds the label count
            _, top_idx = logits.topk(k, dim=1)
            # gather picks the target value (0/1) at each predicted index of the same row
            n_correct += int(one_hot.gather(1, top_idx).sum().item())
            nb_tr_examples += int(one_hot.sum().item())

        nb_tr_steps += 1

        if step % num == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples if nb_tr_examples else 0.0
            print(f"Training Loss per {num} steps: {loss_step}")
            print(f"Training Accuracy per {num} steps: {accu_step}")
            print('-'*50)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Guard against an empty loader / a run without any positive label
    epoch_loss = tr_loss/nb_tr_steps if nb_tr_steps else 0.0
    epoch_accu = (n_correct*100)/nb_tr_examples if nb_tr_examples else 0.0
    print('='*50)
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Precision Top 5 Epoch: {epoch_accu}")
    print('='*50)

    return


# Seoul timezone for the wall-clock timestamps printed before and after training.
# Defined here so this cell does not depend on a cell further down having been run first.
timezone = pytz.timezone('Asia/Seoul')

start_time = datetime.datetime.now(timezone)
print(start_time)
for epoch in range(EPOCHS):
    train(epoch)

end_time = datetime.datetime.now(timezone)
print(end_time)

2021-12-15 21:15:55.173606+09:00




Training Loss per 1000 steps: 0.6988092064857483
Training Accuracy per 1000 steps: 4.166666666666667
--------------------------------------------------
Training Loss per 1000 steps: 0.060733708457185794
Training Accuracy per 1000 steps: 14.71382798266005
--------------------------------------------------
Training Loss per 1000 steps: 0.03796321067629547
Training Accuracy per 1000 steps: 14.392259414225942
--------------------------------------------------
Training Loss per 1000 steps: 0.03019393668334679
Training Accuracy per 1000 steps: 14.20822555793378
--------------------------------------------------
Training Loss per 1000 steps: 0.026291462737565056
Training Accuracy per 1000 steps: 14.311557263482603
--------------------------------------------------
Training Loss per 1000 steps: 0.023943164860176055
Training Accuracy per 1000 steps: 14.307591885920868
--------------------------------------------------
Training Loss per 1000 steps: 0.02237139902096394
Training Accuracy per 1000 

Training Loss per 1000 steps: 0.004651098640903121
Training Accuracy per 1000 steps: 78.31434444792865
--------------------------------------------------
Training Loss per 1000 steps: 0.004650458617853287
Training Accuracy per 1000 steps: 78.29668174567895
--------------------------------------------------
Training Loss per 1000 steps: 0.00464749129232443
Training Accuracy per 1000 steps: 78.27251534923431
--------------------------------------------------
Training Loss per 1000 steps: 0.00464513408559595
Training Accuracy per 1000 steps: 78.2749709670332
--------------------------------------------------
Training Loss per 1000 steps: 0.004643450948178136
Training Accuracy per 1000 steps: 78.24891349183102
--------------------------------------------------
Training Loss per 1000 steps: 0.004645193183625604
Training Accuracy per 1000 steps: 78.2758088683296
--------------------------------------------------
Training Loss per 1000 steps: 0.004645291018351389
Training Accuracy per 1000 st

Training Loss per 1000 steps: 0.003205224865447054
Training Accuracy per 1000 steps: 82.5541432165275
--------------------------------------------------
Training Loss per 1000 steps: 0.003215984350997955
Training Accuracy per 1000 steps: 82.56046548481524
--------------------------------------------------
Training Loss per 1000 steps: 0.003224915080782249
Training Accuracy per 1000 steps: 82.51042851725445
--------------------------------------------------
Training Loss per 1000 steps: 0.003232867877909502
Training Accuracy per 1000 steps: 82.45996615024086
--------------------------------------------------
Training Loss per 1000 steps: 0.0032420531870468927
Training Accuracy per 1000 steps: 82.4335151169042
--------------------------------------------------
The Total Accuracy for Epoch 8: 82.41404025566403
Training Loss Epoch: 0.003246459864969735
Training Precision Top 5 Epoch: 82.41404025566403
Training Loss per 1000 steps: 0.003156636841595173
Training Accuracy per 1000 steps: 81.6

In [None]:
# 최종 결과값

# ==================================================
# The Total Accuracy for Epoch 9: 83.00673028074998
# Training Loss Epoch: 0.0029771517266332613
# Training Precision Top 5 Epoch: 83.00673028074998
# ==================================================

In [None]:
# Clock helpers: `timezone` (Asia/Seoul) is passed to datetime.datetime.now()
# wherever the notebook takes a timestamp
import datetime
import pytz
timezone = pytz.timezone('Asia/Seoul')

In [None]:
# File-name stem for the saved model: ./model/<YYYYMMDD in Seoul time>-<model name>
FILENAME = f"./model/{datetime.datetime.now(tz=timezone):%Y%m%d}-{MODEL_NAME}"
FILENAME

'./model/20211216-xlm-roberta-base'

In [None]:
# Save the trained weights.
# Create the target folder first so a missing ./model directory cannot make the save fail
os.makedirs(os.path.dirname(FILENAME) or '.', exist_ok=True)
torch.save(model.state_dict(), f"{FILENAME}.pt")

In [None]:
# Measure the performance of the trained model

def _prf_percent(hits, n_true, n_pred):
    """Return (recall, precision, f1) in percent; a ratio with a zero denominator is 0.0."""
    recall = (hits*100)/n_true if n_true else 0.0
    precision = (hits*100)/n_pred if n_pred else 0.0
    total = recall + precision
    f1 = (2*recall*precision)/total if total else 0.0
    return recall, precision, f1


def validation(epoch, num=50):
    """Evaluate ``model`` on ``loaded_test_data`` with top-5 recall / precision / F1.

    Uses the notebook-level ``model``, ``device`` and ``loaded_test_data``.

    For every sample the 5 highest logits are the predictions. A hit is a predicted
    index that is a true label of that same sample. Recall divides the hits by the
    number of true labels, precision by the number of predictions made.

    Args:
        epoch: accepted for symmetry with ``train``; not used.
        num: print the running scores every ``num`` steps.

    Returns:
        Tuple ``(recall, precision, f1)`` over the whole loader, in percent.
    """
    model.eval()
    n_correct = 0
    nb_t_recall = 0     # number of true labels seen so far
    nb_t_precision = 0  # number of predictions made so far

    # No gradients are needed for evaluation; no_grad() saves memory
    with torch.no_grad():
        for step, data in enumerate(loaded_test_data):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            one_hot = data['one_hot'].to(device, dtype=torch.float)  # ground truth

            outputs = model(ids, mask)  # predictions
            k = min(5, outputs.logits.size(1))  # topk fails when k exceeds the label count
            _, big_idx = outputs.logits.topk(k, dim=1)

            # gather picks the target value (0/1) at each predicted index of the same row,
            # so hits are matched per sample and not across the batch
            n_correct += int(one_hot.gather(1, big_idx).sum().item())
            nb_t_recall += int(one_hot.sum().item())
            nb_t_precision += big_idx.numel()

            if step % num == 0:
                recall, precision, f1 = _prf_percent(n_correct, nb_t_recall, nb_t_precision)

                print(f"Validation recall per {num} steps: {recall}")
                print(f"Validation precision per {num} steps: {precision}")
                print(f"Validation f1 per {num} steps: {f1}")
                print("-" * 30)

    epoch_recall, epoch_precision, epoch_f1 = _prf_percent(n_correct, nb_t_recall, nb_t_precision)

    return epoch_recall, epoch_precision, epoch_f1


In [None]:
# The weights do not change between evaluation passes, so one pass over the
# test set is enough; repeating it EPOCHS times only recomputes the same scores.
recall, precision, f1 = validation(EPOCHS - 1)
print("=" * 30)
print(f"Total Recall Score = {recall}")
print(f"Total Precision = {precision}")
print(f"Total F1 Score = {f1}")
print("=" * 30)

Validation recall per 500 steps: 85.41666666666667
Validation precision per 500 steps: 25.625
Validation f1 per 500 steps: 39.42307692307693
------------------------------
Validation recall per 500 steps: 78.9132197891322
Validation precision per 500 steps: 23.848039215686274
Validation f1 per 500 steps: 36.62714097496706
------------------------------
Validation recall per 500 steps: 78.73575129533678
Validation precision per 500 steps: 23.508663366336634
Validation f1 per 500 steps: 36.20681439123183
------------------------------
Validation recall per 500 steps: 78.49492279872027
Validation precision per 500 steps: 23.3567880794702
Validation f1 per 500 steps: 36.00114836198922
------------------------------
Validation recall per 500 steps: 78.68354960234407
Validation precision per 500 steps: 23.37997512437811
Validation f1 per 500 steps: 36.04851855403203
------------------------------
Validation recall per 500 steps: 78.61276524364673
Validation precision per 500 steps: 23.339143

Validation recall per 500 steps: 78.18561428691267
Validation precision per 500 steps: 23.290482007996445
Validation f1 per 500 steps: 35.88984419624043
------------------------------
Validation recall per 500 steps: 78.20715060773783
Validation precision per 500 steps: 23.29666449369839
Validation f1 per 500 steps: 35.89945357021026
------------------------------
Validation recall per 500 steps: 78.21172734328304
Validation precision per 500 steps: 23.30736920459379
Validation f1 per 500 steps: 35.91264436556977
------------------------------
Validation recall per 500 steps: 78.19560034582435
Validation precision per 500 steps: 23.30825697625989
Validation f1 per 500 steps: 35.9119977379785
------------------------------
Validation recall per 500 steps: 78.21484802847755
Validation precision per 500 steps: 23.308088535291716
Validation f1 per 500 steps: 35.9138272458666
------------------------------
Validation recall per 500 steps: 78.18602584204658
Validation precision per 500 steps

Validation recall per 500 steps: 78.093859533693
Validation precision per 500 steps: 23.30132781267847
Validation f1 per 500 steps: 35.89303730828337
------------------------------
Validation recall per 500 steps: 78.09278950184974
Validation precision per 500 steps: 23.295044419766796
Validation f1 per 500 steps: 35.885469290437776
------------------------------
Validation recall per 500 steps: 78.12793712985086
Validation precision per 500 steps: 23.29619124797407
Validation f1 per 500 steps: 35.890539939500975
------------------------------


In [None]:
# 최종 결과값 

# ==============================
# Total Recall Score = 78.15474968819564
# Total Precision = 23.29353362585138
# Total F1 Score = 35.890213819515814
# ==============================

In [None]:
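# Why recall is high while precision is low
#
# validation() always counts 5 predictions per patent in the precision denominator,
# while the recall denominator is the number of true labels.
# Both scores share the same numerator, so precision / recall = true labels / predictions:
#   23.29 / 78.15 ≈ 0.298  ->  about 0.298 * 5 ≈ 1.49 true labels per patent.
# With 5 predictions for roughly 1.5 true labels, precision cannot exceed ≈ 29.8 %
# even if every true label were found. The low precision therefore comes mostly from
# the fixed top-5 cut-off, not from poor ranking.
# Predicting a variable number of labels (e.g. thresholding the sigmoid output) would lift this ceiling.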