<a href="https://colab.research.google.com/github/joshIsac/LargeLanguageModel/blob/main/2348523_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install transformers[torch]
! pip install accelerate -U  #library to help users easily train a Transformers model on any type of distributed setup, whether it is multiple GPU’s on one machine or multiple GPU’s across several machines.


In [3]:
import torch
import numpy as np
import datasets
from transformers import (
   AutoModelForSeq2SeqLM,
   AutoTokenizer,
   Seq2SeqTrainingArguments,
   Seq2SeqTrainer,
   DataCollatorForSeq2Seq,
)
from tabulate import tabulate
import nltk
from datetime import datetime


In [4]:
#model_name to indicate the specific pretrained model used
pretrained_model_name = "sshleifer/distilbart-xsum-12-3"
# Load the pretrained model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
# Set model parameters or use the default
# print(model.config)
# Tokenization parameters
encoder_max_len = 256
decoder_max_len = 64



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:

# Load dataset from the CS
import pandas as pd
geographical_data_df = pd.read_csv('/content/drive/MyDrive/Geographicaldata (1).csv', encoding='latin-1', sep=',',on_bad_lines='skip') #The encoding='latin-1' argument is used to handle invalid characters appropriately.
# Reduce the dataset to 200 rows
geographical_data_df = geographical_data_df.head(200)


In [10]:
print(geographical_data_df.columns)

Index(['City Description', 'Feature Extraction'], dtype='object')


In [11]:
import pandas as pd
from datasets import Dataset
# Convert DataFrame to Dataset
geographical_data = Dataset.from_pandas(geographical_data_df)
def flatten_example(example):
   return {
       "CityDescription": example["City Description"],
       "FeatureExtraction": example["Feature Extraction"],
   }
def filter_samples(example):
   CityDescription = []
   FeatureExtraction = []
   for desc, feat in zip(example["City Description"], example["Feature Extraction"]):
       if len(desc) > 0:
           CityDescription.append(desc)
           FeatureExtraction.append(feat)
   return {"CityDescription": CityDescription, "FeatureExtraction": FeatureExtraction}

# Apply transformations to the dataset
geographical_data = geographical_data.map(flatten_example)
geographical_data = geographical_data.map(filter_samples, batched=True)
# Split the dataset into train and validation sets
train_data_txt, validation_data_txt = geographical_data.train_test_split(test_size=0.1).values()


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [12]:
def preprocess_batch(batch, tokenizer, max_source_length, max_target_length):
   src, tgt = batch["CityDescription"], batch["FeatureExtraction"]
   src_tokenized = tokenizer(
       src, padding="max_length", truncation=True, max_length=max_source_length
   )
   tgt_tokenized = tokenizer(
       tgt, padding="max_length", truncation=True, max_length=max_target_length
   )
   batch = {k: v for k, v in src_tokenized.items()}
   # Ignore padding in the loss
   batch["labels"] = [
       [-100 if token == tokenizer.pad_token_id else token for token in l]
       for l in tgt_tokenized["input_ids"]
   ]
   return batch


In [13]:
train_data = train_data_txt.map(
   lambda batch: preprocess_batch(
       batch, tokenizer, encoder_max_len, decoder_max_len
   ),
   batched=True,
   remove_columns=train_data_txt.column_names,
)


Map:   0%|          | 0/180 [00:00<?, ? examples/s]

In [14]:
validation_data = validation_data_txt.map(
   lambda batch: preprocess_batch(
       batch, tokenizer, encoder_max_len, decoder_max_len
   ),
   batched=True,
   remove_columns=validation_data_txt.column_names,
)


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [16]:
import numpy as np
from datasets import load_metric
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer
nltk.download("punkt", quiet=True)
metric = load_metric("rouge") #Recall-Oriented Understudy for Gisting Evaluation
def postprocess_text(preds, labels):
   preds = [pred.strip() for pred in preds]
   labels = [label.strip() for label in labels]
   # rougeLSum expects newline after each sentence
   preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
   labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
   return preds, labels


The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


#Define function to evaluate the performance of the model on generated summaries compared to the reference summaries using the ROUGE metric.

In [17]:
def calculate_metrics(eval_preds):
   preds, labels = eval_preds
   if isinstance(preds, tuple):
       preds = preds[0]
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   # Replace -100 in the labels as we can't decode them.
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
   # Some simple post-processing
   decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
   result = metric.compute(
       predictions=decoded_preds, references=decoded_labels, use_stemmer=True
   )
   # Extract a few results from ROUGE
   result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

   prediction_lens = [
       np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
   ]
   result["gen_len"] = np.mean(prediction_lens)
   result = {k: round(v, 4) for k, v in result.items()}
   return result


#Train the model using the provided configuration and datasets.
#Define a Seq2SeqTrainingArguments object that encapsulates the training arguments and configuration.

#Create a DataCollatorForSeq2Seq object that is responsible for collating and processing the training data.

#Build  Seq2SeqTrainer object, to handle training loop, optimization, logging, and evaluation, and manages the training process.

In [18]:
training_args = Seq2SeqTrainingArguments(
   output_dir="results",
   num_train_epochs=1,  # demo
   do_train=True,
   do_eval=True,
   per_device_train_batch_size=4,  # demo
   per_device_eval_batch_size=4,
   # learning_rate=3e-05,
   warmup_steps=500,
   weight_decay=0.1,
   label_smoothing_factor=0.1,
   predict_with_generate=True,
   logging_dir="logs",
   logging_steps=50,
   save_total_limit=3,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   data_collator=data_collator,
   train_dataset=train_data,
   eval_dataset=validation_data,
   tokenizer=tokenizer,
   compute_metrics=calculate_metrics,
)


#Evaluate the model after training

In [24]:
def generate_summary(test_data, model):
   inputs = tokenizer(
       test_data["CityDescription"],
       padding="max_length",
       truncation=True,
       max_length=encoder_max_len,
       return_tensors="pt",
   )
   input_ids = inputs.input_ids.to(model.device)
   attention_mask = inputs.attention_mask.to(model.device)
   outputs = model.generate(input_ids, attention_mask=attention_mask)
   output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
   return outputs, output_str
model_before_fine_tuning = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)
test_samples = validation_data_txt.select(range(10))
features_before_fine_tuning = generate_summary(test_samples, model_before_fine_tuning)[1]
features_after_fine_tuning = generate_summary(test_samples, model)[1]


In [25]:
print("\nTarget Feature:\n")
print(
   tabulate(list(enumerate(test_samples["FeatureExtraction"])), headers=["Id", "Target Feature"])
)
print("\nCity Description:\n")
print(tabulate(list(enumerate(test_samples["CityDescription"])), headers=["Id", "City Description"]))



Target Feature:

  Id  Target Feature
----  --------------------------------------------------------------------------------------------------------------------
   0  Historical and cultural significance, lock manufacturing industry and educational institutions.
   1  Known for its paper industry, offers employment opportunities and economic growth through its industrial activities.
   2  Historical significance, architectural marvels
   3  Historical significance, Architectural marvels
   4  maritime trade and historical attractions.
   5  Proximity to Cochin International Airport, Famous St. Joseph's Church.
   6  Located on the banks of the Hooghly River, known for its jute mills.
   7  Historical significance and architectural marvels, rich cultural heritage and hub for classical music.
   8  Located in the foothills of the Himalayas, known for its cultural diversity.
   9  Industrial significance, cultural heritage.

City Description:

  Id  City Description
----  ---------------