In [1]:
!pip install konlpy

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
import random
import math
import copy
import gc
from tqdm import tqdm
from glob import glob

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
%cd ../

In [4]:
!mkdir tmp

In [5]:
!git clone https://github.com/seujung/KoBART-summarization.git

In [6]:
!ls

In [7]:
%cd KoBART-summarization

In [8]:
!pip install gdown

In [9]:
!python download_binary.py

In [10]:
!bash install_kobart.sh

In [11]:
!pip install -qr requirements.txt

In [12]:
!pip install git+https://github.com/SKT-AI/KoBART

In [13]:
# List the repository root.
%ls

In [14]:
# Enter the repository's data directory; the archive below is unpacked here and
# the train/test TSV files are later written here as well.
%cd data

In [15]:
%ls

In [16]:
# Unpack the gzip-compressed archive (-z gunzip, -x extract, -v verbose, -f file).
!tar -zxvf train.tar.gz

In [17]:
train = pd.read_csv('train.tsv', delimiter = '\t')

In [18]:
train

In [19]:
train_source = os.path.join('/kaggle/input/ai1234/train.json')
test_source = os.path.join('/kaggle/input/ai1234/test.json')

In [20]:
with open(train_source) as f:
    train_data = json.loads(f.read())
    
with open(test_source) as f:
    test_data = json.loads(f.read())

In [21]:
train = pd.DataFrame(columns=['uid', 'title', 'region', 'context', 'summary'])
uid = 1000
for data in tqdm(train_data):
    for agenda in data['context'].keys():
        context = ''
        for line in data['context'][agenda]:
            context += data['context'][agenda][line]
            context += ' '
        train.loc[uid, 'uid'] = uid
        train.loc[uid, 'title'] = data['title']
        train.loc[uid, 'region'] = data['region']
        train.loc[uid, 'context'] = context[:-1]
        train.loc[uid, 'summary'] = data['label'][agenda]['summary']
        uid += 1

In [22]:
test = pd.DataFrame(columns=['uid', 'title', 'region', 'context'])
uid = 2000
for data in tqdm(test_data):
    for agenda in data['context'].keys():
        context = ''
        for line in data['context'][agenda]:
            context += data['context'][agenda][line]
            context += ' '
        test.loc[uid, 'uid'] = uid
        test.loc[uid, 'title'] = data['title']
        test.loc[uid, 'region'] = data['region']
        test.loc[uid, 'context'] = context[:-1]
        uid += 1

In [23]:
# Add a 'total' column to both frames: title, region and context joined by
# single spaces.
for frame in (train, test):
    frame['total'] = frame['title'] + ' ' + frame['region'] + ' ' + frame['context']

In [24]:
train1 = train.drop(columns = ['uid','title','region','context'])
train2 = train1[['total', 'summary']]

In [25]:
train2.rename(columns = {'total': 'news'}, inplace = True)

In [26]:
test1 = test.drop(columns = ['uid','title','region','context'])
test2 = test1.rename(columns = {'total': 'news'})
test2.head()

In [27]:
train2.head()

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
train4, test4 = train_test_split(train2, random_state = 42, test_size = 0.25)

In [30]:
train4.to_csv('train.tsv', sep = '\t')
test4.to_csv('test.tsv', sep = '\t')

In [31]:
# Leave the data directory and return to its parent.
%cd ../

In [32]:
%ls

In [33]:
%ls

In [None]:
# Run the training script in the current directory with the flags below; output
# goes under `logs` (--default_root_dir), which is where later cells look for
# checkpoints and hparams.
# NOTE(review): --max_len 8192 is much larger than the max_length=512 used for
# generation later in this notebook; confirm the model's position embeddings
# support sequences that long — TODO verify against train.py.
!python train.py --gradient_clip_val 1.0 --max_epochs 1 --default_root_dir logs --gpus 1 --batch_size 4 --num_workers 4 --max_len 8192

In [35]:
# Inspect the checkpoint directory.
%cd /kaggle/KoBART-summarization/logs/model_chp

In [36]:
%ls

In [37]:
# Back to the repository root, where get_model_binary.py is invoked.
%cd /kaggle/KoBART-summarization

In [38]:
%ls

In [39]:
# Run get_model_binary.py on a specific checkpoint and its hparams file.
# NOTE(review): the checkpoint file name embeds an epoch and a val_loss value
# and the hparams path embeds `version_0`; both are hard-coded, so this cell
# presumably needs editing after every new training run — confirm with the
# listing above.
!python get_model_binary.py --hparams /kaggle/KoBART-summarization/logs/tb_logs/default/version_0/hparams.yaml --model_binary /kaggle/KoBART-summarization/logs/model_chp/epoch=00-val_loss=0.850.ckpt

In [40]:
import torch
from kobart import get_kobart_tokenizer
from transformers.models.bart import BartForConditionalGeneration

from tensorflow.keras.models import *

In [41]:
def load_model(model_dir='/kaggle/KoBART-summarization/kobart_summary'):
    """Load a BART conditional-generation model from a directory.

    Args:
        model_dir: Path handed to ``from_pretrained``. Defaults to the
            directory this notebook has always loaded from, so existing
            ``load_model()`` calls behave as before.

    Returns:
        The loaded ``BartForConditionalGeneration`` instance.
    """
    return BartForConditionalGeneration.from_pretrained(model_dir)


model = load_model()
tokenizer = get_kobart_tokenizer()

In [42]:
test2

In [43]:
test2list = np.array(test2['news'].tolist())

In [44]:
text = test2list[5]
text1 = text.replace('\n', '')
input_ids = tokenizer.encode(text1)
input_ids = torch.tensor(input_ids)
input_ids = input_ids.unsqueeze(0)
input_ids.shape

In [45]:
output = model.generate(
    input_ids,
    eos_token_id=1, 
    min_length=30, 
    max_length=20000, 
    num_beams=5)

In [None]:
# Same single-document example (test document 5), now with a repetition
# penalty and two returned sequences.
text = test2list[5]
text1 = text.replace('\n', '')
input_ids = torch.tensor(tokenizer.encode(text1)).unsqueeze(0)

# Beam search with 5 beams. The sampling-related options (do_sample,
# early_stopping, length_penalty, no_repeat_ngram_size, temperature, top_p)
# are left disabled.
output = model.generate(
    input_ids,
    num_beams=5,
    num_return_sequences=2,
    min_length=30,
    max_length=512,
    eos_token_id=1,
    repetition_penalty=10.0,
)

# Decode only the first returned sequence; from here on `output` holds text,
# not token ids.
output = tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Longest input the model's learned position embeddings can address. Hoisted
# out of the loop because it does not change between documents.
max_input_tokens = model.config.max_position_embeddings

# One decoded summary per test document, in the same order as `test2list`.
test_list = []
for i in tqdm(range(len(test2list))):
    # Truncate over-long documents instead of letting the encoder index past
    # its position table; shorter inputs are unaffected.
    input_test = tokenizer.encode(test2list[i])[:max_input_tokens]
    # Add the leading batch dimension expected by generate().
    input_test = torch.tensor(input_test).unsqueeze(0)
    output = model.generate(
        input_test,
        eos_token_id=1,
        min_length=30,
        max_length=512,
        num_beams=5,
        num_return_sequences=2,
        repetition_penalty=10.0)

    # Decode the first returned sequence and keep it. The original loop
    # computed this value but never stored it, so `test_list` stayed empty.
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    test_list.append(output)

In [None]:
%cd /kaggle/KoBART-summarization/kobart_summary

In [None]:
%ls

In [None]:
test2.to_csv('test2.tsv', sep = '\t')

In [None]:
%cd /kaggle/KoBART-summarization/kobart_summary/

In [None]:
%ls

In [None]:
# Checkpoint and input file handed to the command in the next cell.
# NOTE(review): this names an `epoch=02` checkpoint, while the training command
# earlier in the notebook ran with --max_epochs 1 and the export step used
# `epoch=00-val_loss=0.850.ckpt` — confirm this file actually exists.
model_path = '/kaggle/KoBART-summarization/logs/model_chp/epoch=02-val_loss=0.832.ckpt'
test_path = '/kaggle/test2.tsv'

In [None]:
# {model_path} and {test_path} are interpolated from the Python variables above.
# NOTE(review): the working directory at this point is .../kobart_summary,
# whereas train.py was earlier run from the repository root — verify the script
# is reachable from here and that it accepts --weights / --source.
!python train.py --weights {model_path} --source {test_path}

In [None]:
# NOTE(review): rouge_metric.py is not referenced anywhere else in this
# notebook; presumably it scores the generated summaries — confirm where it
# lives and what inputs it expects.
!python rouge_metric.py

In [None]:
# Keep this cell running indefinitely, as the original `while True: pass` did,
# but sleep between iterations so the loop does not pin a CPU core at 100%.
# Interrupt the kernel to stop it.
import time

while True:
    time.sleep(60)