In [1]:
import sys, os
import torchvision.models as models
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel
import re

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def preprocessing(df):
    """Clean a crawled news DataFrame and return the cleaned copy.

    Steps, in order:
      0. drop rows that contain any missing value;
      1. drop rows whose ``url`` repeats an earlier row (the first is kept);
      2. remove the literal ``'동영상 뉴스'`` from ``text``;
      3. remove ``[...]`` tags from ``text``;
      4. remove ``(<place>=연합뉴스)`` bylines from ``text``;
      5. remove the literal ``'기사입력'`` from ``post_date``.

    Args:
        df: DataFrame with at least the columns ``url``, ``text`` and
            ``post_date``.

    Returns:
        A new DataFrame; the caller's frame is not modified. ``text`` and
        ``post_date`` come back as string columns.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # "[...]" tags. The character class is kept exactly as before: it refuses
    # tags that contain any of ( [ | ] ) ; between the brackets.
    bracket_tag = r'(\[[^(\[|\]);]*\])+'
    # "(서울=연합뉴스)" style byline. `[^()]*` keeps the match inside a single
    # pair of parentheses; a greedy `.*` would also swallow everything from an
    # earlier "(" on the same line up to the byline. `\s*` tolerates "= 연합뉴스".
    byline = r'\([^()]*=\s*연합뉴스\)+'

    # .copy() so the column assignments below write to a frame we own instead
    # of a filtered view of the caller's data.
    df = df.dropna().drop_duplicates('url', keep='first').copy()

    # Series.replace() only swaps cells that are *equal* to the search string;
    # substring removal needs the .str accessor.
    text = df['text'].astype(str)
    text = text.str.replace('동영상 뉴스', '', regex=False)
    text = text.str.replace(bracket_tag, '', regex=True)
    text = text.str.replace(byline, '', regex=True)
    df['text'] = text

    # Only the '기사입력' marker is removed; any text that follows it
    # (e.g. a '최종수정' part) is left untouched.
    df['post_date'] = (
        df['post_date'].astype(str).str.replace('기사입력', '', regex=False)
    )

    return df
    

In [3]:
class NewsDataset(Dataset):
    """Token-id dataset built from crawled news CSV files.

    Each item is ``(sample, label)`` where ``sample`` is one padded row of
    token ids and ``label`` is the same row shifted one position to the left
    (the next-token target for every position), padded at the end with the
    tokenizer's pad id.

    Attributes:
        x_train: ``input_ids`` tensor returned by the tokenizer, one row per
            article, padded to the longest article.
        y_train: next-token targets, same shape as ``x_train``.

    NOTE(review): ``y_train`` is already shifted. Hugging Face causal-LM heads
    are documented to shift ``labels`` internally, so pass ``x_train`` (not
    ``y_train``) as ``labels=`` to ``GPT2LMHeadModel`` - confirm against the
    transformers docs for the installed version.
    """

    def __init__(self, data_files, rows_per_file=5):
        """Read the CSVs, dump their raw text, clean and tokenize them.

        Args:
            data_files: iterable of CSV paths (cp949-encoded) that contain the
                columns required by ``preprocessing``.
            rows_per_file: how many leading rows of each CSV go into the
                training frame (default 5, the previous hard-coded value).
                ``None`` uses every row.

        Raises:
            ValueError: if ``data_files`` is empty.

        Side effects:
            Writes ``<csv path without extension>_text.txt`` next to every CSV,
            containing the *whole* ``text`` column (not just the rows used for
            training).
        """
        data_files = list(data_files)
        if not data_files:
            raise ValueError('data_files must contain at least one CSV path')

        tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "skt/kogpt2-base-v2",
            bos_token='</s>',
            eos_token='</s>',
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>'
        )

        frames = []
        for file in data_files:
            data = pd.read_csv(file, encoding = 'cp949')
            frames.append(data[:rows_per_file])

            # Dump the raw text column. splitext() strips only the extension,
            # so paths such as "./data/x.csv" keep their directory part.
            dump_path = '{}_text.txt'.format(os.path.splitext(file)[0])
            with open(dump_path, mode = 'wt', encoding = 'utf-8') as f:
                f.write(data['text'].to_string())

        # Concatenate once and clean once; the url de-duplication inside
        # preprocessing() therefore spans all files.
        df_train = preprocessing(pd.concat(frames))

        encoded = tokenizer.batch_encode_plus(
            df_train['text'].to_list(), padding = True, return_tensors = "pt"
        )
        self.x_train = encoded['input_ids']
        self.y_train = self._shift_labels(self.x_train, tokenizer.pad_token_id)

    @staticmethod
    def _shift_labels(input_ids, pad_token_id):
        """Return next-token targets for a 2-D ``(rows, tokens)`` id tensor.

        Row ``i`` of the result is row ``i`` of ``input_ids`` moved one step
        left along the token axis; the freed last column holds ``pad_token_id``.
        The shift is along tokens, not rows, so the result has exactly as many
        rows as the input.
        """
        labels = torch.full_like(input_ids, pad_token_id)
        labels[:, :-1] = input_ids[:, 1:]
        return labels

    def __len__(self):
        """Number of tokenized articles."""
        return len(self.x_train)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for article ``idx``."""
        sample = self.x_train[idx]
        label = self.y_train[idx]
        return sample, label

In [27]:
def accuracy(y_pred, y):
    """Fraction of positions where ``y_pred`` equals ``y``.

    Args:
        y_pred: predicted ids (tensor or anything ``torch.as_tensor`` accepts).
        y: target ids, broadcastable against ``y_pred``.

    Returns:
        A 0-dim float tensor in ``[0, 1]``; ``0.0`` when there is nothing to
        compare.
    """
    y_pred = torch.as_tensor(y_pred)
    y = torch.as_tensor(y)
    matches = (y_pred == y)
    if matches.numel() == 0:
        return torch.tensor(0.0)
    # Mean over every compared element. Dividing the match count by
    # `len(y) - 1` divides by zero for one element and, for 2-D input,
    # divides by (rows - 1) instead of the number of tokens.
    return matches.float().mean()

def training(inputs, epochs, batch_size, model=None, lr=0.1,
             swa_start=10, swa_freq=5, swa_lr=0.05):
    """Fine-tune a causal language model with SGD plus stochastic weight averaging.

    Args:
        inputs: 2-D ``(rows, tokens)`` tensor of token ids. Every row must fit
            the model's context window; truncate before calling if needed.
        epochs: number of passes over ``inputs``.
        batch_size: rows per optimisation step.
        model: module called as ``model(ids, labels=ids)`` whose result exposes
            ``.loss`` and ``.logits``. Defaults to the pretrained
            ``skt/kogpt2-base-v2`` ``GPT2LMHeadModel``.
        lr: SGD learning rate used before averaging starts.
        swa_start: optimisation step from which the learning rate is switched
            to ``swa_lr``; weights are averaged on later steps.
        swa_freq: average the weights every ``swa_freq`` steps.
        swa_lr: learning rate used once ``swa_start`` steps have run.

    Returns:
        ``(model, losses, acces)``: the model (holding the averaged weights
        when at least one average was taken) and the per-step loss and
        next-token accuracy as Python floats.

    Raises:
        ValueError: if ``inputs`` is empty or ``batch_size`` is not positive.
    """
    # Local import: the averaging helper ships with torch itself, so no extra
    # package is needed.
    from torch.optim.swa_utils import AveragedModel

    if len(inputs) == 0:
        raise ValueError('inputs must contain at least one row')
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    if model is None:
        model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
    model.train()

    optimizer = torch.optim.SGD(model.parameters(), lr = lr)
    swa_model = AveragedModel(model)

    losses, acces = [], []
    step = 0
    for epoch in range(epochs):
        m_losses, m_acces = [], []
        # Fresh row order every epoch, cut into mini-batches of `batch_size`.
        order = torch.randperm(len(inputs))
        for rows in order.split(batch_size):
            batch = inputs[rows]

            optimizer.zero_grad()
            output = model(batch, labels = batch)
            loss = output.loss  # cross entropy loss
            loss.backward()
            optimizer.step()
            step += 1

            # Position t of the logits predicts token t + 1, so compare
            # logits[:, :-1] with batch[:, 1:].
            with torch.no_grad():
                preds = output.logits[:, :-1].argmax(dim = -1)
                acc = (preds == batch[:, 1:]).float().mean().item()

            # Store plain floats; keeping `loss` itself would keep every
            # step's autograd graph alive.
            loss_value = loss.item()
            losses.append(loss_value)
            acces.append(acc)
            m_losses.append(loss_value)
            m_acces.append(acc)

            if step >= swa_start:
                for group in optimizer.param_groups:
                    group['lr'] = swa_lr
            if step > swa_start and step % swa_freq == 0:
                swa_model.update_parameters(model)

        print('epoch : {:3d} / {} ... train loss : {:.4f}\tacc : {:.4f}'.format(
            epoch + 1, epochs,
            sum(m_losses) / len(m_losses),
            sum(m_acces) / len(m_acces)))

    # Hand the averaged weights back to the caller's model, if any were taken.
    if int(swa_model.n_averaged) > 0:
        model.load_state_dict(swa_model.module.state_dict())

    return model, losses, acces
        
        

In [4]:
# CSV files named after disaster-related search keywords (missing person,
# evacuation, blackout, drought, earthquake, COVID, terror, flood, fire).
# They are looked up relative to the current working directory.
data_files = [
    "실종.csv",
    "대피.csv",
    "정전.csv",
    "가뭄.csv",
    "지진.csv",
    "코로나.csv",
    "테러.csv",
    "홍수.csv",
    "화재.csv",
]

# Building the dataset reads every CSV, writes the *_text.txt dumps and tokenizes.
dataset = NewsDataset(data_files)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
# Training hyper-parameters: number of passes over the data, rows per step.
epochs, batch_size = 10, 32

In [8]:
# Sanity check: one forward pass of the pretrained model over the tokenized articles.
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
inputs = dataset.x_train
print(inputs)

# Only the first 512 token positions of every row are fed to the model;
# the same slice serves as input and as labels.
clipped_inputs = inputs[:, :512]
outputs = model(clipped_inputs, labels = clipped_inputs)

In [9]:
# Bare expression: renders the loss stored on the forward-pass result as the cell output.
outputs.loss

tensor(5.2250, grad_fn=<NllLossBackward>)

### 연습 (Practice: scratch experiments for the cleaning regexes and the tokenizer)

In [None]:
# Practice: delete one "<city> = 연합뉴스)" fragment, then split the sentence on '입니다'.
# Note the opening "(" of the removed byline is not part of the fragment and stays in the text.
txt = '안녕하세요 (서울 = 연합뉴스)입니다 (전주 = 연합뉴스) 이기도 하고요'
city = '전주'
byline_tail = '%s = 연합뉴스)' % city
txt = txt.replace(byline_tail, '').split('입니다')
txt

In [None]:
# Practice: drop_duplicates keeps the first row for every repeated 'c1' value.
a = pd.DataFrame(
    {
        'c1': ['안녕', '하세요', '안녕'],
        'c2': ['hi', 'hello', 'my'],
    }
).drop_duplicates('c1', keep = 'first')
a

In [None]:
import re

In [None]:
# Practice: strip "[...]" tags from a string.
# Raw string literal: "\[" inside a normal string is an invalid escape sequence
# that newer Python versions warn about; the pattern itself is unchanged.
regex_search_term = r'(\[[^(\[|\]);]*\])+'
regex_replacement = ''
text_before = ['[연합뉴스] 안녕하세요[] [승희] 하이[dlfkdl]fdskf;k;s', '어디가세요']
for text in text_before:
    text_after = re.sub(regex_search_term, regex_replacement, text)
    print(text_after)

In [None]:
# Practice: remove a "(<city>=연합뉴스)" wire-service byline.
# `[^()]*` keeps the match inside one pair of parentheses; a greedy `.*` would
# also delete everything between an earlier "(" and the byline.
# `\s*` additionally accepts the spaced form "(전주 = 연합뉴스)".
regex_term = r'\([^()]*=\s*연합뉴스\)+'
text = '(서울=연합뉴스) 한정애 환경부 장관'
text_after = re.sub(regex_term, '', text)
text_after

In [11]:
# Load the KoGPT2 tokenizer with explicitly declared special tokens.
special_tokens = dict(
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2", **special_tokens)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
# Practice: generate a continuation for a short Korean prompt.
text = '근육이 커지기 위해서는'
input_ids = tokenizer.encode(text)

# Generation settings, collected in one place so they are easy to tweak.
generation_options = dict(
    max_length=128,
    repetition_penalty=2.0,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    use_cache=True,
)
# The prompt is wrapped in a list so the model receives a batch of one sequence.
en_ids = model.generate(torch.tensor([input_ids]), **generation_options)

In [None]:
# Show the prompt's token ids wrapped as a 1 x N tensor (the same value that is passed to model.generate).
print(torch.tensor([input_ids]))

In [22]:
# Practice: register a [CLS] token and find where it lands in each encoded choice.
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})

encoded_choices = []
for sentence in choices:
    encoded_choices.append(tokenizer.encode(sentence))
print(encoded_choices)

# Position of the [CLS] id inside every encoded choice.
cls_token_location = []
for tokens in encoded_choices:
    cls_token_location.append(tokens.index(tokenizer.cls_token_id))
print(cls_token_location)

choice_tensor = torch.tensor(encoded_choices)
input_ids = choice_tensor.unsqueeze(0)  # Batch size: 1, number of choices: 2
print(input_ids)

mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
print(mc_token_ids)

[[10553, 14308, 48893, 11849, 463, 10477, 13038, 739, 9969, 11925, 459, 12895, 739, 51200], [10553, 14308, 48893, 11849, 463, 11925, 10272, 739, 9969, 11925, 459, 12895, 739, 51200]]
[13, 13]
tensor([[[10553, 14308, 48893, 11849,   463, 10477, 13038,   739,  9969, 11925,
            459, 12895,   739, 51200],
         [10553, 14308, 48893, 11849,   463, 11925, 10272,   739,  9969, 11925,
            459, 12895,   739, 51200]]])
tensor([[13, 13]])
