# Set Environment

In [None]:
# Install PyTorch with GPU acceleration.
# (see https://pytorch.org/get-started/locally/ )
# NOTE(review): confirm that a `cu114` wheel index really exists on download.pytorch.org.
# Because it is passed as an *extra* index, pip would presumably fall back to the
# default PyPI build without raising an error if it does not.
%pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu114

# Install sentencepiece (needed for multilingual tokenization) together with
# omegaconf, hydra-core and fairseq.
%pip install omegaconf hydra-core fairseq sentencepiece

# Install the Hugging Face libraries.
# NOTE(review): none of the packages in this cell are version-pinned, so a fresh run
# may resolve to releases that no longer match the code below (for example, verify that
# `datasets.load_metric`, imported further down, still exists in the installed version).
%pip install transformers datasets evaluate

In [None]:
%pip install pythainlp
%pip install python-crfsuite

# Import libraries

In [None]:
import html
import json
import os
import re
import urllib
import urllib.parse  # `import urllib` alone does not guarantee the `parse` submodule is loaded

import numpy as np
import pandas as pd
from pythainlp.util import normalize

In [None]:
from datasets import (
    load_dataset, load_metric,
    Dataset,
    DatasetDict,
    Features, Sequence, ClassLabel, Value)
import datasets
from transformers import DataCollatorForSeq2Seq

from torch.utils.data import DataLoader
import evaluate

from nltk.tokenize import RegexpTokenizer

import torch
from transformers import AutoConfig, AutoModelForSeq2SeqLM

# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a machine without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
%pip install rouge_score
from rouge_score import rouge_scorer

# Download dataset

In [None]:
# Train / dev / test splits used in this task, downloaded from the author's Google Drive.
# NOTE(review): `gdown` is not installed by the setup cells above; presumably it is
# preinstalled in the runtime — confirm.

!gdown --id 14OqZcC6WFR1r0ga6qpYh7F6iXdOf-ZCT #dev
!gdown --id 18Sbqgqb7d7xClta8msnKivrDTXOCOTGB #test
!gdown --id 1OvrCdl3uCpTPh7TMXEx_Dcyhi5sZKfO7 #train

In [None]:
# Read the three splits in train / dev / test order, all with the 'utf-8-sig' codec.
df_train, df_dev, df_test = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in ('th_sum_train.csv', 'th_sum_dev.csv', 'th_sum_test.csv')
)

In [None]:
# Print the (rows, columns) shape of each split: train, dev, test.
for split_frame in (df_train, df_dev, df_test):
    print(split_frame.shape)

# Pre-processing

## Functions

In [None]:
def clean_text (text):
  """
  Clean one string of Thai news text.

  Steps, in order: pythainlp normalization, nikhahit + sara-aa -> sara-am,
  removal of boilerplate messages, URL/HTML unescaping, removal of URLs,
  collapsing of elongated characters, removal of quotes / '?' / '!',
  whitespace shrinking, and removal of '\\r' / zero-width characters.

  Args:
      text (str): The input string to clean.

  Returns:
      str: The cleaned string.

  Example:
      text = "ลูกหนี้  จะะได้รับข้อเเสนอปรับปรุงโครงสร้างหนี้ที่ผ่อนปรนเป็นพิเศษ http://www.thairath.com"
      clean_text(text)
      "ลูกหนี้ จะได้รับข้อเสนอปรับปรุงโครงสร้างหนี้ที่ผ่อนปรนเป็นพิเศษ"
  """
  text = normalize(text).strip() # normalize Thai vowels using the pythainlp library

  ## clean text ##
  text = re.sub(r'ํา', 'ำ', text) # nikhahit + sara-aa -> sara-am
  # Remove boilerplate such as '(ภาพจาก....)' or '>> คลิกอ่านเรื่องย่อมงกุฎดอกหญ้า <<'.
  # The quantifiers are lazy so that only one message is removed per match; a greedy
  # '.*' would also swallow all real text between the first opener and the last closer.
  text = re.sub(r'\(ภาพจาก.*?\)|>>.*?<<', '', text)
  text = html.unescape(urllib.parse.unquote(text)) # unquote: %XX URL escapes, unescape: HTML entities
  text = re.sub(r'https?.+?(?:\s|$)', '', text) # remove URL links (up to the next whitespace or end of text)
  # Collapse a character repeated 3+ times into a single one (elongated words, '....').
  # Digits are excluded: collapsing them would silently corrupt numbers (1000 -> 10).
  # Newlines are excluded as well, because the previous '.'-based pattern never matched them.
  text = re.sub(r'([^\d\n])\1{2,}', r'\1', text)
  text = re.sub(r'[\"\'\?\!]', '', text) # drop quotes, question marks and exclamation marks
  text = re.sub(r'[ \u00a0\u3000\u2002-\u200a]+', ' ', text) # shrink whitespaces e.g. good  boy -> good boy
  text = re.sub(r'[\r\u200b\ufeff]+', '', text) # remove carriage returns, zero-width spaces and BOM characters

  return text

In [None]:
# Build `contents`: a list of [abbreviation_pattern, expansion] pairs, longest
# abbreviation first, e.g. [['กทม\\.', 'กรุงเทพมหานคร'], ...].
# Expected file layout: one header row, then comma-separated rows such as
#   กทม.,กรุงเทพมหานคร,จังหวัด
# Only the first two fields of each row are used.
with open('data/abbrev.txt', 'r', encoding='utf-8') as file:  # explicit encoding: the file holds Thai text
    next(file, None)  # skip the column header
    abbrev_pairs = []
    for line in file:
        fields = line.strip().split(',')
        if len(fields) < 2 or not fields[0]:
            continue  # blank or malformed row: nothing to expand
        abbrev_pairs.append((fields[0], fields[1]))

# Sort by the length of the raw abbreviation (longest first) before escaping, so the
# order does not depend on how many characters needed a backslash.
abbrev_pairs.sort(key=lambda pair: len(pair[0]), reverse=True)

# The abbreviation is used as a regex pattern, so escape *all* metacharacters (not just '.').
# The expansion is used as a re.sub replacement template: only backslashes are special
# there, so dots must stay unescaped (an escaped dot would be emitted as a literal '\.').
contents = [
    [re.escape(abbreviation), expansion.replace('\\', '\\\\')]
    for abbreviation, expansion in abbrev_pairs
]

In [None]:
def solve_abbrev (text):
  """
  Expand abbreviations in `text` using the module-level `contents` table.

  Every entry of `contents` is applied in list order: entry[0] is used as a
  regular-expression pattern and entry[1] as its replacement.

  Args:
      text (str): The input string.

  Returns:
      str: The stripped string with every matching abbreviation replaced.
  """
  text = text.strip()
  for entry in contents:
    # re.sub returns the string unchanged when the pattern does not occur,
    # so a separate re.search beforehand would only scan the text twice.
    text = re.sub(entry[0], entry[1], text)

  return text

In [None]:
# Within each split, keep only the first row for every distinct title,
# then renumber the index from 0.
df_train, df_dev, df_test = (
    split_frame.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)
    for split_frame in (df_train, df_dev, df_test)
)

In [None]:
# Clean every text column of every split, then expand abbreviations.

def _clean_and_expand(raw_text):
    """Run clean_text followed by solve_abbrev on one cell value."""
    return solve_abbrev(clean_text(raw_text))

for split_frame in (df_train, df_dev, df_test):
    for column in ('title', 'body', 'summary'):
        split_frame[column] = split_frame[column].apply(_clean_and_expand)

In [None]:
## Model input: title followed by body, separated by a single space.

df_train['text'] = df_train['title'] + ' ' + df_train['body']
df_dev['text'] = df_dev['title'] + ' ' + df_dev['body']
df_test['text'] = df_test['title'] + ' ' + df_test['body']

# Keep only the columns used downstream. `.copy()` makes each result an independent
# frame, so adding columns to it later (e.g. df_test['prediction']) does not raise
# pandas' SettingWithCopyWarning.
df_train = df_train[['text', 'summary', 'type']].copy()
df_dev = df_dev[['text', 'summary', 'type']].copy()
df_test = df_test[['text', 'summary', 'type']].copy()

In [None]:
df_dev.sample(7)

## To dict (test)

In [None]:
# Convert the pandas DataFrames into Hugging Face Datasets and bundle the splits.
# `from_pandas` is the constructor meant for DataFrames (`from_dict` expects a plain
# mapping of column name -> list); `preserve_index=False` keeps the pandas index
# from being stored as an extra column.

train = Dataset.from_pandas(df_train, preserve_index=False)
dev = Dataset.from_pandas(df_dev, preserve_index=False)
test = Dataset.from_pandas(df_test, preserve_index=False)
my_dataset = datasets.DatasetDict({"train": train,"dev": dev, "test": test})

In [None]:
my_dataset

In [None]:
my_dataset['train'][10]

## Tokenizer

In [None]:
from transformers import AutoTokenizer
# Tokenizer loaded from the "google/mt5-small" checkpoint (the same checkpoint name
# the model is loaded from later in this notebook).
t5_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

In [None]:
def tokenize_sample_data(data):
  """
  Tokenize a batch of examples for seq2seq training.

  Args:
      data: a batch exposing the "text" (model input) and "summary" (target) fields.

  Returns:
      dict: "input_ids" and "attention_mask" of the tokenized text, plus "labels"
      holding the token ids of the tokenized summary.
  """
  # Author's note: the longest input / label in the data is 14536 / 215 tokens,
  # so both sides are truncated (to 1024 and 128 tokens respectively).
  encoded_text = t5_tokenizer(data["text"], max_length=1024, truncation=True)
  encoded_summary = t5_tokenizer(data["summary"], max_length=128, truncation=True)

  features = {key: encoded_text[key] for key in ("input_ids", "attention_mask")}
  features["labels"] = encoded_summary["input_ids"]
  return features

In [None]:
# Tokenize every split in batches of 128 examples and drop the raw string
# columns that were just tokenized.
tokenized_ds = my_dataset.map(
  function=tokenize_sample_data,
  batched=True,
  batch_size=128,
  remove_columns=["summary", "text"],
)

tokenized_ds

# Evaluation Metric

In [None]:
# Configuration reference:
# https://huggingface.co/docs/transformers/main_classes/configuration

# Generation settings stored on the model config.
generation_settings = dict(
  max_length=128,
  length_penalty=0.6,
  no_repeat_ngram_size=2,
  num_beams=15,
)
mt5_config = AutoConfig.from_pretrained("google/mt5-small", **generation_settings)

# Load the pretrained checkpoint with that config, then move it to the chosen device.
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small", config=mt5_config)
model = model.to(device)

# Collator that batches the tokenized examples as PyTorch tensors; ROUGE metric from `evaluate`.
data_collator = DataCollatorForSeq2Seq(t5_tokenizer, model=model, return_tensors="pt")
rouge_metric = evaluate.load("rouge")

In [None]:
# function for custom tokenization
def tokenize_sentence(arg):
  """
  Split a sentence into the tokenizer's subword tokens (used as the ROUGE tokenizer).

  Args:
      arg (str): The sentence to tokenize.

  Returns:
      list[str]: The subword tokens of `arg`, without special tokens.
  """
  # add_special_tokens=False: otherwise the end-of-sequence token is appended to
  # every prediction and every reference, giving each pair one guaranteed matching
  # token and inflating the ROUGE scores.
  encoded_arg = t5_tokenizer(arg, add_special_tokens=False)
  return t5_tokenizer.convert_ids_to_tokens(encoded_arg.input_ids)

# function to get ROUGE scores with custom tokenization
def metrics_func(eval_arg):
  """
  Compute ROUGE for one evaluation pass (passed to the trainer as `compute_metrics`).

  Args:
      eval_arg: a (predictions, labels) pair of token-id arrays.

  Returns:
      dict: the result of `rouge_metric.compute` on the decoded texts, tokenized
      with `tokenize_sentence`.
  """
  preds, labels = eval_arg
  pad_id = t5_tokenizer.pad_token_id
  # -100 is the "ignore" padding value; it is not a real token id and cannot be
  # decoded. Labels always carry it; predictions can carry it too when batches of
  # different lengths are concatenated, so both arrays are sanitized.
  preds = np.where(preds != -100, preds, pad_id)
  labels = np.where(labels != -100, labels, pad_id)
  # Convert id tokens to text
  text_preds = t5_tokenizer.batch_decode(preds, skip_special_tokens=True)
  text_labels = t5_tokenizer.batch_decode(labels, skip_special_tokens=True)
  return rouge_metric.compute(
    predictions=text_preds,
    references=text_labels,
    tokenizer=tokenize_sentence
  )

# Fine-tuning

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
  # --- output, logging and checkpoints ---
  output_dir="mt5-summarize-th",
  log_level="error",
  logging_steps=10,
  save_steps=512,
  push_to_hub=False,
  # --- optimization ---
  num_train_epochs=12,
  optim="adafactor",
  learning_rate=5e-4,
  weight_decay=0.01,
  lr_scheduler_type="linear",       # type of learning-rate scheduler
  warmup_steps=20,                  # warmup steps for the scheduler
  per_device_train_batch_size=2,
  gradient_accumulation_steps=16,
  # --- evaluation ---
  evaluation_strategy="steps",      # one of "no", "steps", "epoch"
  eval_steps=100,
  per_device_eval_batch_size=1,
  predict_with_generate=True,       # evaluate on generated sequences
  generation_max_length=128,        # maximum length of the generated output
)

In [None]:
%pip install ipython-autotime
%load_ext autotime

In [None]:
from transformers import Seq2SeqTrainer
# Wire the model, training arguments, collator, metric function and data into the trainer.
trainer = Seq2SeqTrainer(
  model = model,
  args = training_args,
  data_collator = data_collator,
  compute_metrics = metrics_func,
  train_dataset = tokenized_ds["train"],
  # Only the first 20 dev examples are evaluated during training.
  eval_dataset = tokenized_ds["dev"].select(range(20)),
  tokenizer = t5_tokenizer
)

# Start fine-tuning.
trainer.train()

In [None]:
trainer.evaluate()

# Predict Test Set

In [None]:
# Predict on test set
test_dataset = tokenized_ds["test"]
predictions = trainer.predict(test_dataset)

# -100 is a padding marker rather than a token id; map it to the pad token before
# decoding so that batch_decode never receives a negative id.
predicted_ids = np.where(
    predictions.predictions != -100,
    predictions.predictions,
    t5_tokenizer.pad_token_id,
)

# Convert the predictions to text
predicted_text = t5_tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

# Print the predicted text.
# (`trainer.predict` always returns a result object, so no None check is needed.)
for text in predicted_text:
    print(text)

In [None]:
# Replace the -100 padding marker with the pad token id: the label ids returned by
# `trainer.predict` are padded with -100, which is not a decodable token id.
pad_id = t5_tokenizer.pad_token_id
pred_ids = np.where(predictions.predictions != -100, predictions.predictions, pad_id)
label_ids = np.where(predictions.label_ids != -100, predictions.label_ids, pad_id)

# convert id tokens to text
text_preds = t5_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
text_labels = t5_tokenizer.batch_decode(label_ids, skip_special_tokens=True)

# evaluate the predictions using ROUGE score
rouge_scores = rouge_metric.compute(predictions=text_preds, references=text_labels, tokenizer=tokenize_sentence)
print(rouge_scores)

In [None]:
df_test['prediction'] = text_preds

In [None]:
class _SubwordTokenizer:
    """Adapter exposing `tokenize_sentence` through the `.tokenize(text)` method RougeScorer calls."""

    def tokenize(self, text):
        return tokenize_sentence(text)


# The default rouge_score tokenizer lower-cases the text and keeps only [a-z0-9],
# which throws away every Thai character. Use the same subword tokenization as the
# corpus-level ROUGE instead. (`use_stemmer` only affects the default tokenizer,
# so it is not passed here.)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=_SubwordTokenizer())

def compute_rouge(row):
    """
    Score one row's prediction against its gold summary.

    Args:
        row: a DataFrame row with 'summary' (reference) and 'prediction' fields.

    Returns:
        The result of `scorer.score(reference, prediction)` for rouge1, rouge2 and rougeL.
    """
    gold_summary = row['summary']
    predicted_summary = row['prediction']
    return scorer.score(gold_summary, predicted_summary)

df_test['rouge_scores'] = df_test.apply(compute_rouge, axis=1)

In [None]:
# save prediction
# os.path.join inserts the path separator when MAIN_PATH does not end with one
# (plain string concatenation would glue the file name onto the directory name).
# NOTE(review): MAIN_PATH is not defined in any cell visible here — make sure it is
# set before this cell runs.
df_test.to_csv(os.path.join(MAIN_PATH, 'prediction_df_test'), encoding='utf-8-sig', index=False)

# Save fine-tuned model

In [None]:
import os
from transformers import AutoModelForSeq2SeqLM

# Save the fine-tuned model locally. If the trainer's model exposes a `.module`
# attribute (i.e. it is wrapped), save the inner module instead of the wrapper.
model_to_save = trainer.model.module if hasattr(trainer.model, "module") else trainer.model
model_to_save.save_pretrained(MAIN_PATH + "/trained_for_summarization_th")

# Explore Prediction

In [None]:
df_test.sample(3)

In [None]:
df_test.iloc[[355]]

In [None]:
df_test.loc[df_test['type'] == "ข่าว,ทั่วไทย"]