# **Pre-processing**

In [None]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers.optimization import  Adafactor 
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

In [None]:
# Choose the compute device once for the whole notebook:
# the first CUDA GPU when one is visible, otherwise the CPU.
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Running on the GPU" if dev.type == "cuda" else "Running on the CPU")

# **Create mT5 dataset**

In [None]:
def load_data(path, num = 300):
    """Return the first ``num`` input/target text pairs as two Python lists.

    Args:
        path: Either anything ``pd.read_csv`` accepts (file path, URL,
            file-like object) or an already-loaded ``pd.DataFrame``.
            The frame must contain the columns ``input_text`` and
            ``target_text``.
        num: Maximum number of leading rows to keep (default 300).

    Returns:
        A tuple ``(datas, labels)`` of equally long lists taken from the
        ``input_text`` and ``target_text`` columns respectively.

    Raises:
        KeyError: if either required column is missing.
    """
    # Accept an in-memory frame as-is; pd.read_csv() rejects DataFrame input.
    df = path if isinstance(path, pd.DataFrame) else pd.read_csv(path)
    # .iloc makes the slice explicitly positional, so a shuffled or
    # non-default integer index cannot change which rows are selected.
    datas = df['input_text'].iloc[:num].tolist()
    labels = df['target_text'].iloc[:num].tolist()
    return datas, labels

In [None]:
class text_dataset(Dataset):
    """Pre-tokenised (input, target) pairs for seq2seq fine-tuning.

    Every input string is prefixed with ``"WebNLG:"`` before tokenisation.
    All inputs are padded to one common length and all targets to another,
    so the default ``DataLoader`` collate function can stack them.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels`` (1-D tensors).
    """

    def __init__(self, datas, labels, tokenizer):
        """Tokenise all pairs up front.

        Args:
            datas: sequence of input strings.
            labels: sequence of target strings; pairs are formed position by
                position and surplus entries of the longer sequence are ignored.
            tokenizer: callable following the Hugging Face tokenizer
                ``__call__`` convention: takes a list of strings plus
                ``padding`` / ``return_tensors`` and returns a mapping with
                ``input_ids`` and ``attention_mask``.
        """
        self.model_input = []
        datas, labels = list(datas), list(labels)
        pair_count = min(len(datas), len(labels))
        if pair_count == 0:
            # Nothing to tokenise; leave the dataset empty instead of crashing.
            return

        sources = ["WebNLG:" + text for text in datas[:pair_count]]
        targets = labels[:pair_count]

        # Pad to the longest *tokenised* sequence. The previous character
        # length of max(datas) was the lexicographic maximum, not the longest
        # entry, and ignored both the prefix and the tokenizer's segmentation.
        encoded_inputs = tokenizer(sources, padding='longest', return_tensors='pt')
        encoded_targets = tokenizer(targets, padding='longest', return_tensors='pt')

        label_ids = encoded_targets['input_ids'].clone()
        pad_id = getattr(tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            # -100 is the ignore index of the cross-entropy loss, so padded
            # target positions no longer contribute to the training loss.
            label_ids[label_ids == pad_id] = -100

        # Index rows instead of squeeze(): a one-token row stays 1-D.
        for row in range(pair_count):
            self.model_input.append([
                encoded_inputs['input_ids'][row],
                encoded_inputs['attention_mask'][row],
                label_ids[row],
            ])

    def __getitem__(self, index):
        """Return the tensors of one example keyed by model argument name."""
        input_ids, attention_mask, label_ids = self.model_input[index]
        return {'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': label_ids
               }

    def __len__(self):
        """Number of tokenised pairs."""
        return len(self.model_input)

# **Define fine-tuning loop**

In [None]:
def model_fine_tune(model, optimizer, dataloder, max_epoch = 30):
    """Fine-tune ``model`` in place and checkpoint the late epochs.

    Args:
        model: module whose forward pass accepts the batch keys as keyword
            arguments plus ``return_dict=True`` and returns an object with a
            ``.loss`` tensor; it must also provide ``save_pretrained(path)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        dataloder: iterable yielding dicts of tensors (one dict per batch).
        max_epoch: number of passes over ``dataloder`` (default 30).

    Side effects:
        * Leaves ``model`` in training mode; call ``model.eval()`` before
          running inference.
        * For every epoch index greater than 20, writes a checkpoint to the
          directory ``model_<epoch>`` via ``model.save_pretrained``.
    """
    # Follow the device the model already lives on rather than forcing CUDA,
    # so the loop also runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Models loaded with from_pretrained() start in eval mode (dropout off);
    # switch to training mode explicitly.
    model.train()

    for epoch in range(max_epoch):
        total_loss = 0
        t = tqdm(dataloder)
        for cnt, data in enumerate(t, 1):
            batch = {key: value.to(device) for key, value in data.items()}
            outputs = model(**batch, return_dict = True)

            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            t.set_description(f'Epoch {epoch}')
            # Running mean of the loss over the batches seen this epoch.
            t.set_postfix({'Loss': total_loss/cnt})
        if epoch > 20:
            model.save_pretrained(f'model_{epoch}')

# **Example**

In [None]:
# Load the training CSV, keep its first 3000 rows, then shuffle their order.
train_df = (
    pd.read_csv('/your_data.csv')
    .iloc[:3000, :]
    .sample(frac = 1)
)
# Batch size depends on what your device can handle.
batch_size = 1
num_of_batches = len(train_df) // batch_size

In [None]:
# Display the shuffled training frame to eyeball the input_text / target_text pairs.
train_df

Unnamed: 0,input_text,target_text
1459,很貴 & 沒,很貴東西少沒有很好吃
9770,牛肉 & 少 & 為,牛肉特別少，為什麼啊
5854,送過 & 晚 & 一個 & 小時 & 飯 & 菜,送過來晚了一個小時飯菜都涼了
7969,一個 & 半小時,一個半小時
4284,硬硬 & 塊 & 肘子 & 肉 & 混為 & 謝謝 & 侮辱 & 肘子 & 肉,"請不要把硬硬的肥肉塊跟嫩嫩的肘子肉混為一談。謝謝。,簡直侮辱了肘子肉。"
...,...,...
10106,提前 & 2 & 遲 & 一個 & 小時 & 才,提前2小時定，遲了一個小時才送來
1660,用餐 & 兩個 & 狀況 & 個 & 服務員 & 我們 & 火鍋 & 加湯 & 桌面 & 飲...,今天去用餐遇到兩個狀況，第一個是服務員在幫我們火鍋加湯的時候，碰撞到桌面的飲料灑了一些出來，...
8720,味道 & 為 & 列表 & 沒有 & 寫 & 打電話 & 說 & 換別,味道一般，海鮮蟹肉的從來就沒定成功，為什麼還要在列表裡面顯示，沒有就不要寫出來，最後還要打電...
2362,一進 & 肉 & 上燒 & 滿 & 整間 & 覺得 & 服務人員 & 與 & 打翻 & 潑灑...,一進去肉在石板上燒的煙味佈滿整間餐廳。起初覺得佈置跟氣氛及服務人員都很熱情與貼心。但在上飲料...


In [None]:
# Load the pretrained tokenizer and model ('t5-base' is the alternative
# checkpoint name) and place the model on the device selected earlier
# instead of assuming CUDA is present.
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")#'t5-base'
model = T5ForConditionalGeneration.from_pretrained("google/mt5-base").to(dev)

# train_df is already an in-memory DataFrame, so read the two text columns
# from it directly (pd.read_csv cannot take a DataFrame as its source).
# At most the first 10000 rows are used.
datas = train_df['input_text'].iloc[:10000].tolist()
labels = train_df['target_text'].iloc[:10000].tolist()
dataset = text_dataset(datas, labels, tokenizer)
# NOTE(review): batch_size here is 8, independent of the `batch_size`
# variable defined with the training frame — confirm which one is intended.
dataloder = DataLoader(dataset=dataset, batch_size=8, shuffle=True, num_workers=0)

# Adafactor with a fixed external learning rate: relative_step and
# scale_parameter are both disabled, so `lr` is used as given.
optimizer = Adafactor(model.parameters(),lr=1e-6,
                      eps=(1e-30, 1e-3),
                      clip_threshold=1.0,
                      decay_rate=-0.8,
                      beta1=None,
                      weight_decay=0.0,
                      relative_step=False,
                      scale_parameter=False,
                      warmup_init=False)

model_fine_tune(model, optimizer, dataloder, 30)
# Switch to inference mode; the trailing semicolon keeps the notebook from
# printing the full module tree as the cell output.
model.eval();

In [None]:
# Quick smoke test: generate from one hand-written prompt.
input_ids = tokenizer.encode("WebNLG: 難吃 & 環境 & 髒亂", return_tensors="pt")  # Batch size 1
# Put the prompt on whatever device the model lives on (works on CPU and GPU).
input_ids = input_ids.to(model.device)
outputs = model.generate(input_ids)
tokenizer.decode(outputs[0])

# **Generate text data**

In [None]:
from IPython.display import HTML, display
def progress(loss,value, max=100):
    """Build an HTML progress bar labelled with the current batch loss.

    Args:
        loss: value shown after the "Batch loss :" label.
        value: current position of the bar (also used as fallback text).
        max: value at which the bar is full (default 100). The name shadows
            the builtin but is kept so keyword callers keep working.

    Returns:
        An ``IPython.display.HTML`` object wrapping a ``<progress>`` element.
    """
    # Attributes must be whitespace-separated; the former "'{value}'max=" and
    # the stray comma before "style" produced malformed markup in which the
    # width style was never applied.
    return HTML(""" Batch loss :{loss}
        <progress
        value='{value}' max='{max}' style='width: 100%'>{value}
        </progress>
        """.format(loss=loss,value=value, max=max))

In [None]:
# Read the prompts to generate from.
df = pd.read_csv(r'C:your_test_data.csv', encoding='utf-8-sig')
a = df.input_text.tolist()

# Reload the tokenizer and a saved fine-tuned checkpoint, placing the model on
# the device selected earlier instead of assuming CUDA is present.
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
model = T5ForConditionalGeneration.from_pretrained(r"C:\MT5\model_29").to(dev)
# Inference only: set eval mode once, not on every loop iteration.
model.eval()

# NOTE(review): this optimizer is not used by the generation loop below; it is
# kept only so the `optimizer` name stays bound — confirm whether it can go.
optimizer = Adafactor(model.parameters(),lr=1e-6,
                      eps=(1e-30, 1e-3),
                      clip_threshold=1.0,
                      decay_rate=-0.8,
                      beta1=None,
                      weight_decay=0.0,
                      relative_step=False,
                      scale_parameter=False,
                      warmup_init=False)

# Generate one text per prompt for at most the first 1000 prompts.
final = []
for i in a[:1000]:
    input_ids = tokenizer.encode(f'WebNLG:{i}', return_tensors="pt")
    input_ids = input_ids.to(model.device)
    outputs = model.generate(input_ids)
    f_text = tokenizer.decode(outputs[0])
    # Strip the pad / end-of-sequence markers left in the decoded string.
    f_text = f_text.replace('<pad>','').replace('</s>','')
    final.append(f_text)

# Every generated sample gets the constant value 0 in column `y`.
y = [0] * len(final)

df_nf=pd.DataFrame({'text':final, 'y':y})


df_nf.to_csv(r'\MT5\mT5_samples.csv', encoding='utf-8-sig', index= False)