# Tokenizer and Model

In [None]:
import torch
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer, T5Tokenizer, T5ForConditionalGeneration

# Base checkpoint shared by the tokenizer and the model.
MODEL_NAME = "t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)


# Preprocess

In [None]:

def preprocess_function(examples):
    """Tokenize a batch of conversation/summary pairs for seq2seq training.

    Args:
        examples: A batch mapping with the columns ``"input_text"`` and
            ``"target_summary"``.

    Returns:
        The tokenizer encoding of the inputs (padded/truncated to 512 tokens)
        with an added ``"labels"`` entry holding the target token ids
        (padded/truncated to 128 tokens). Padding positions in the labels are
        replaced by -100 so they are excluded from the loss.
    """
    inputs = list(examples["input_text"])
    targets = list(examples["target_summary"])

    # Plain Python lists are enough here: the result is stored by Dataset.map,
    # so there is no need to build torch tensors in the callback.
    model_inputs = tokenizer(
        inputs,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    labels = tokenizer(
        targets,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # Targets are padded to a fixed length; without masking, the loss would be
    # computed on the padding tokens as well. -100 is the label value the model
    # ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs



# Dataset Load

In [None]:
# Load the CSV, hold out 20% for validation, and tokenize both splits.
dataset_path = r"/content/dataset.csv"
try:
    data = pd.read_csv(dataset_path)
except FileNotFoundError as e:
    # Same exception type as before, but with an actionable message and the
    # original error chained explicitly.
    raise FileNotFoundError(
        f"Dataset CSV not found at {dataset_path!r}; upload it before running this cell."
    ) from e

# Fixed random_state keeps the train/validation split reproducible.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# preserve_index=False stops the shuffled pandas index from being carried
# into the datasets as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False).map(
    preprocess_function, batched=True
)
val_dataset = Dataset.from_pandas(val_data, preserve_index=False).map(
    preprocess_function, batched=True
)


Map:   0%|          | 0/236 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

# Finetune

In [None]:

# Fine-tuning configuration: evaluate and checkpoint once per epoch and reload
# the best checkpoint when training finishes.
# NOTE(review): the recorded run prompted for a W&B API key; set `report_to`
# explicitly if that interactive login is not wanted.
training_args = TrainingArguments(
    # where artifacts go
    output_dir='./output',
    logging_dir='./logs',
    logging_steps=10,
    # optimisation schedule
    num_train_epochs=20,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluation / checkpointing
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    #use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
)

trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msolaimanhs26[0m ([33msolaimanhs26-bangabandhu-sheikh-mujibur-rahman-science-a[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.8382,0.274945
2,0.1867,0.132694
3,0.1052,0.066765
4,0.0531,0.017547
5,0.0185,0.014634
6,0.0159,0.014463
7,0.0125,0.013865
8,0.0126,0.013666
9,0.013,0.013554
10,0.006,0.013638


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=600, training_loss=0.23318454765404265, metrics={'train_runtime': 576.58, 'train_samples_per_second': 8.186, 'train_steps_per_second': 1.041, 'total_flos': 638813302947840.0, 'train_loss': 0.23318454765404265, 'epoch': 20.0})

# Evaluation

In [None]:

# Summarize a single-turn conversation about blockchain.
model.eval()  # inference mode (disables dropout)
sample_input = """
User: Can you explain what blockchain technology is and how it works in simple terms? I keep hearing about it with cryptocurrencies, but I don’t get the details.
AI: Blockchain is a decentralized digital ledger that records transactions across many computers securely. Think of it as a chain of blocks, where each block contains a list of transactions. These blocks are linked using cryptography, ensuring they can’t be altered without changing all subsequent blocks. For cryptocurrencies like Bitcoin, when you send money, the transaction is verified by a network of computers (nodes) through consensus, added to a block, and then added to the blockchain. It’s transparent, secure, and doesn’t rely on a central authority like a bank.
"""
# A single sample needs no padding; move the whole encoding to the model's
# device so the attention mask is passed to generate() together with the ids.
sample_encoding = tokenizer(
    sample_input, max_length=512, truncation=True, return_tensors='pt'
).to(model.device)
with torch.no_grad():
    output = model.generate(**sample_encoding, max_length=128)
print("Sample Prediction:", tokenizer.decode(output[0], skip_special_tokens=True))

Sample Prediction: The user asked about blockchain technology and how it works.


In [None]:

# Summarize a multi-turn conversation about online safety.
model.eval()  # inference mode (disables dropout)
sample_input = """
User: Hey AI, how can I stay safe online?
AI: Great question! Start with strong passwords, enable two-factor authentication (2FA), and avoid clicking on suspicious links.
User: What’s 2FA?
AI: Two-Factor Authentication adds a second layer of security by requiring a one-time code along with your password—usually sent to your phone or email.
User: What about phishing?
AI: Phishing is a scam where attackers trick you into revealing sensitive information. Be cautious of unexpected emails or messages asking for credentials.
User: Should I use antivirus software?
AI: Yes, especially on Windows. Keep your operating system and software updated, and use a reliable antivirus tool.
User: Is public Wi-Fi dangerous?
AI: It can be. Avoid accessing sensitive accounts on public networks, or use a VPN to encrypt your data.
User: Noted. Thanks for the tips!
AI: You’re welcome! Stay safe and feel free to ask more.
"""
# A single sample needs no padding; move the whole encoding to the model's
# device so the attention mask is passed to generate() together with the ids.
sample_encoding = tokenizer(
    sample_input, max_length=512, truncation=True, return_tensors='pt'
).to(model.device)
with torch.no_grad():
    output = model.generate(**sample_encoding, max_length=128)
print("Sample Prediction:", tokenizer.decode(output[0], skip_special_tokens=True))

Sample Prediction: The user asked how to stay safe online.


In [None]:

# Summarize a short two-turn conversation about Python.
model.eval()  # inference mode (disables dropout)
sample_input = """
User: Hi, can you tell me about Python?
AI: Sure! Python is a popular programming language known for its readability.
User: What can I use it for?
AI: You can use Python for web development, data analysis, AI, and more.
"""
# A single sample needs no padding; move the whole encoding to the model's
# device so the attention mask is passed to generate() together with the ids.
sample_encoding = tokenizer(
    sample_input, max_length=512, truncation=True, return_tensors='pt'
).to(model.device)
with torch.no_grad():
    output = model.generate(**sample_encoding, max_length=128)
print("Sample Prediction:", tokenizer.decode(output[0], skip_special_tokens=True))

Sample Prediction: The user asked about Python and its use.


In [None]:

# Summarize a conversation that opens with a greeting before the real question.
model.eval()  # inference mode (disables dropout)
sample_input = """
User: Hello!
AI: Hi! How can I assist you today?
User: Can you explain what machine learning is?
AI: Certainly! Machine learning is a field of AI that allows systems to learn from data.
"""
# A single sample needs no padding; move the whole encoding to the model's
# device so the attention mask is passed to generate() together with the ids.
sample_encoding = tokenizer(
    sample_input, max_length=512, truncation=True, return_tensors='pt'
).to(model.device)
with torch.no_grad():
    output = model.generate(**sample_encoding, max_length=128)
print("Sample Prediction:", tokenizer.decode(output[0], skip_special_tokens=True))

Sample Prediction: The user asked about machine learning.


In [None]:

# Summarize a longer multi-turn conversation about SQL.
model.eval()  # inference mode (disables dropout)
sample_input = """
User: Hi! Can you help me understand SQL?
AI: Hi! SQL (Structured Query Language) is used to interact with databases. You can use it to query, insert, update, and delete data.
User: What’s a basic example?
AI: A basic query is:
SELECT * FROM customers WHERE country = 'USA';
It retrieves all customers from the USA.
User: What are tables in SQL?
AI: Tables are like spreadsheets. Each table has rows (records) and columns (fields).
User: Do I need to learn normalization?
AI: Yes, it's important for designing efficient databases. It helps reduce redundancy and ensures data integrity.
User: Where can I practice SQL?
AI: Try platforms like SQLZoo, LeetCode (SQL section), or Mode Analytics' SQL tutorials.
User: Thanks! That helps a lot.
AI: You're welcome! Practice regularly and you’ll improve quickly.
"""
# A single sample needs no padding; move the whole encoding to the model's
# device so the attention mask is passed to generate() together with the ids.
sample_encoding = tokenizer(
    sample_input, max_length=512, truncation=True, return_tensors='pt'
).to(model.device)
with torch.no_grad():
    output = model.generate(**sample_encoding, max_length=128)
print("Sample Prediction:", tokenizer.decode(output[0], skip_special_tokens=True))

Sample Prediction: The user asked about SQL.


In [None]:
# Scratch arithmetic: `/` binds tighter than `+`, so the division happens first.
5 + (55 / 5) + 5

21.0

# Zip the Checkpoint

In [None]:
# Recursively archive the checkpoint directory into a single zip file.
# NOTE(review): the checkpoint number is hard-coded — confirm it matches the
# checkpoint produced by the current training run.
!zip -r t5_finetune_chat-observer.zip ./output/checkpoint-510

  adding: output/checkpoint-510/ (stored 0%)
  adding: output/checkpoint-510/model.safetensors (deflated 11%)
  adding: output/checkpoint-510/spiece.model (deflated 48%)
  adding: output/checkpoint-510/rng_state.pth (deflated 25%)
  adding: output/checkpoint-510/generation_config.json (deflated 29%)
  adding: output/checkpoint-510/optimizer.pt (deflated 7%)
  adding: output/checkpoint-510/added_tokens.json (deflated 83%)
  adding: output/checkpoint-510/tokenizer_config.json (deflated 94%)
  adding: output/checkpoint-510/config.json (deflated 63%)
  adding: output/checkpoint-510/scheduler.pt (deflated 56%)
  adding: output/checkpoint-510/trainer_state.json (deflated 82%)
  adding: output/checkpoint-510/special_tokens_map.json (deflated 85%)
  adding: output/checkpoint-510/training_args.bin (deflated 52%)


# Download Checkpoint

In [None]:
from google.colab import files

# Send the zipped checkpoint to the browser as a download.
archive_path = '/content/t5_finetune_chat-observer.zip'
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Use saved checkpoint

In [None]:
# Extract a saved checkpoint archive into './output/Untitled Folder'.
# NOTE(review): the archive zipped earlier in this notebook is named
# t5_finetune_chat-observer.zip, not best_checkpoint.zip — confirm this file
# is provided separately.
!unzip best_checkpoint.zip -d './output/Untitled Folder'

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Reload the tokenizer and fine-tuned weights from the extracted checkpoint.
# NOTE(review): this points at checkpoint-150, while the checkpoint zipped
# earlier in the notebook is checkpoint-510 — confirm which one is intended.
model_path = '/content/output/Untitled Folder/output/checkpoint-150'

tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

In [None]:
# Smoke-test the reloaded checkpoint on a conversation about black holes.
model.eval()  # inference mode (disables dropout)
sample_input = """
User: What are black holes, and how do scientists detect them? They sound fascinating but confusing.

AI: A black hole is a region in space where gravity is so strong that nothing, not even light, can escape. They form when massive stars collapse or through other cosmic events. Scientists detect black holes indirectly, as they’re invisible. They observe effects like the bending of light (gravitational lensing), X-ray emissions from matter falling in, or gravitational waves from black hole mergers, as detected by observatories like LIGO. The Event Horizon Telescope also captured the first image of a black hole’s shadow in 2019.
"""

inputs = tokenizer(
    sample_input, max_length=512, truncation=True, return_tensors="pt"
).to(model.device)

# A bare forward pass on T5ForConditionalGeneration needs decoder inputs (or
# labels) and raises without them; for inference, decode with generate().
outputs = model.generate(**inputs, max_length=128)
print("Sample Prediction:", tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Copy the checkpoint so it ends up at /content/model/checkpoint-510.
# Creating the destination first matters: if /content/model does not exist,
# `cp -r SRC /content/model` copies the checkpoint *as* /content/model instead
# of placing it inside that directory.
!mkdir -p /content/model && cp -r /content/output/checkpoint-510 /content/model/


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the copied checkpoint under separate names (`model_`, `tokenizer_`) so
# the objects already held in `model` / `tokenizer` are left untouched.
model_path = '/content/model/checkpoint-510'

tokenizer_ = T5Tokenizer.from_pretrained(model_path)
model_ = T5ForConditionalGeneration.from_pretrained(model_path)