In [1]:
import torch
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import evaluate
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq, TrainerCallback, T5Config

from datasets import Dataset
from sklearn.model_selection import train_test_split

from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.utils.data import TensorDataset
from torchsummary import summary

from collections import defaultdict
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset from a CSV file
df = pd.read_csv('../data/medquad.csv')

# Display a sample of the data to understand its structure
print("Data Sample:")
print(df.head())

# Check for null values in the dataset
print("Null Value Data:")
print(df.isnull().sum())

# Define a list of common question words to filter relevant questions
question_words = ['what', 'who', 'why', 'when', 'where', 'how', 'is', 'are', 'does', 'do', 'can', 'will', 'shall']

# Convert all questions to lowercase for consistent filtering
df['question'] = df['question'].str.lower()

# Filter rows where the question starts with one of the question words
df = df[df['question'].str.split().str[0].isin(question_words)]

# Reset the index after filtering
df = df.reset_index(drop=True)

# Check for duplicate rows in the dataset
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

# Remove duplicate rows to ensure data uniqueness
df = df.drop_duplicates()

# Reset the index after removing duplicates
df.reset_index(drop=True, inplace=True)

# Drop unused columns ('source' and 'focus_area') to simplify the dataset
df = df.drop(columns=['source', 'focus_area'])

# Display dataset information (columns, data types, and non-null counts)
print("Table Info:")
print(df.info())

# Remove duplicate rows based on the 'question' and 'answer' columns
df = df.drop_duplicates(subset='question', keep='first').reset_index(drop=True)
df = df.drop_duplicates(subset='answer', keep='first').reset_index(drop=True)

# Drop rows with null values in the 'question' or 'answer' columns
df = df.dropna(subset=['question', 'answer']).reset_index(drop=True)

# Fill any remaining null values with empty strings and convert to string type
df['question'] = df['question'].fillna('').astype(str)
df['answer'] = df['answer'].fillna('').astype(str)

# Define a function to clean text by removing parentheses and extra spaces
def clean_text(text):
    text = re.sub(r"\(.*?\)", "", text)  # Remove text within parentheses
    text = re.sub(r'\s+', ' ', text.strip().lower())  # Normalize spaces and convert to lowercase
    return text

# Apply the clean_text function to the 'question' and 'answer' columns
df['question'] = df['question'].apply(clean_text)
df['answer'] = df['answer'].apply(clean_text)

# Further clean the text by ensuring lowercase, stripping whitespace, and normalizing spaces
df['question'] = df['question'].str.lower().str.strip().apply(lambda x: re.sub(r'\s+', ' ', x))
df['answer'] = df['answer'].str.lower().str.strip().apply(lambda x: re.sub(r'\s+', ' ', x))

# Check for null values again after cleaning
print("Null Value Data After Cleaning:")
print(df.isnull().sum())

# Check the number of unique questions and answers in the dataset
print(f"Unique questions: {df['question'].nunique()}")
print(f"Unique answers: {df['answer'].nunique()}")

# Display dataset information and a sample of the cleaned data
print("Final Dataset Info:")
df.info()
print("Final Data Sample:")
df.head()

Data Sample:
                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
2   Glaucoma  
3   Glaucoma  
4   Glaucoma  
Null Value Data:
question       0
answer         5
source         0
focus_area    14
dtype: int64
Number of duplicate rows: 48
Table Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex

Unnamed: 0,question,answer
0,what is glaucoma ?,glaucoma is a group of diseases that can damag...
1,what causes glaucoma ?,"nearly 2.7 million people have glaucoma, a lea..."
2,what are the symptoms of glaucoma ?,symptoms of glaucoma glaucoma can develop in o...
3,what are the treatments for glaucoma ?,"although open-angle glaucoma cannot be cured, ..."
4,who is at risk for glaucoma? ?,anyone can develop glaucoma. some people are a...


In [3]:
# Define the model name and load the T5 configuration
model_name = "t5-base"
config = T5Config.from_pretrained(model_name)

# Customize the configuration
config.dropout_rate = 0.1  # Set dropout rate to 0.1 for regularization
config.feed_forward_proj = "gelu"  # Use GELU activation for the feed-forward layers

# Load the pre-trained T5 model with the customized configuration
model = T5ForConditionalGeneration.from_pretrained(model_name, config=config)

# Load the tokenizer for the T5 model
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Explicitly resize the token embeddings to match the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

# Print a detailed summary of the model architecture
print("\nDetailed Model Summary:")
print("=" * 50)

def summarize_model_by_type(model):
    """
    Summarizes the model by counting the number of layers and parameters for each layer type.
    """
    layer_summary = defaultdict(int)  # Counts the number of layers by type
    param_summary = defaultdict(int)  # Counts the number of parameters by layer type

    for name, module in model.named_modules():
        layer_type = type(module).__name__  # Get the type of the current module
        layer_summary[layer_type] += 1  # Increment the count for this layer type
        param_summary[layer_type] += sum(p.numel() for p in module.parameters())  # Sum parameters

    # Print the summary table
    print(f"{'Layer Type':<30}{'Count':<10}{'Parameters':<15}")
    print("=" * 55)
    for layer_type, count in layer_summary.items():
        print(f"{layer_type:<30}{count:<10}{param_summary[layer_type]:<15,}")

summarize_model_by_type(model)

# Define a preprocessing function for the seq2seq task (optimized for speed)
def preprocess_function(batch):
    """
    Preprocesses the dataset by tokenizing the inputs and targets.
    Optimized for speed by using fast tokenization and avoiding unnecessary conversions.
    """
    # Format the inputs and targets
    inputs = [f"answer the following question: {q}" for q in batch['question']]
    targets = [str(a) for a in batch['answer']]

    # Tokenize the inputs and targets in one call each, return as lists (not tensors)
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=64,
            truncation=True,
            padding="max_length",
        )

    # Replace padding token IDs with -100 for the loss function to ignore them
    labels_ids = [
        [(lid if lid != tokenizer.pad_token_id else -100) for lid in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = labels_ids
    return model_inputs

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

# Convert the pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Preprocess the training and validation datasets
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=16,  # Process in batches of 16
    remove_columns=train_dataset.column_names,  # Remove original columns
    num_proc=4,  # Use 4 processes for parallel processing
)

val_dataset = val_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=16,  # Process in batches of 16
    remove_columns=val_dataset.column_names,  # Remove original columns
    num_proc=4,  # Use 4 processes for parallel processing
)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    do_eval=True,
    save_total_limit=1,  # Guarda solo el último checkpoint para ahorrar espacio
    learning_rate=5e-4,  # Puedes subirlo a 1e-3 para convergencia más rápida, pero cuidado con la estabilidad
    num_train_epochs=1,  # Solo 1 época para pruebas rápidas
    per_device_train_batch_size=16,  # Aumenta el batch size si tu RAM lo permite
    per_device_eval_batch_size=16,
    warmup_ratio=0.05,  # Menor warmup para acelerar el inicio
    weight_decay=0.01,  # Menor regularización para acelerar el aprendizaje
    predict_with_generate=True,
    fp16=False,
    no_cuda=True,
    logging_dir="./logs",
    logging_steps=100,  # Menos logs para menos overhead
    gradient_accumulation_steps=1,  # Sin acumulación para pasos más rápidos
    max_grad_norm=1.0,
    dataloader_num_workers=2,
    group_by_length=True,
    remove_unused_columns=True,
    label_smoothing_factor=0.05,
)

# Initialize the data collator for seq2seq tasks
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='longest',  # Pad sequences to the longest in the batch
    return_tensors="pt",  # Return PyTorch tensors
)

# Define a function to compute evaluation metrics
def compute_metrics(eval_pred, tokenizer):
    """
    Computes exact match, BLEU, and ROUGE-L metrics for evaluation.
    """
    predictions, labels = eval_pred

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Normalize text for comparison
    decoded_preds = [text.strip().lower() for text in decoded_preds]
    decoded_labels = [text.strip().lower() for text in decoded_labels]

    # Compute exact match
    exact_match = np.mean([p == l for p, l in zip(decoded_preds, decoded_labels)])

    # Load BLEU and ROUGE metrics
    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")

    # Compute BLEU score
    bleu_score = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels]
    )["bleu"]

    # Compute ROUGE-L score
    rouge_score = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )["rougeL"]

    return {
        "exact_match": exact_match,
        "BLEU": bleu_score,
        "ROUGE-L": rouge_score,
    }

# Initialize the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, tokenizer),
)

# Train the model
trainer.train()

# Save the trained model and tokenizer
trainer.save_model("./t5_chatbot_model")
tokenizer.save_pretrained("./t5_chatbot_tokenizer")

# Save the model's state dictionary
model_path = "./t5_chatbot_model.h5"
torch.save(model.state_dict(), model_path)

# Save the training log history
log_history = trainer.state.log_history

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



Detailed Model Summary:
Layer Type                    Count     Parameters     
T5ForConditionalGeneration    1         222,882,048    
Embedding                     3         24,653,568     
T5Stack                       2         247,534,848    
ModuleList                    26        396,455,424    
T5Block                       24        198,227,712    
T5LayerSelfAttention          24        56,642,304     
T5Attention                   36        84,935,424     
Linear                        193       222,833,664    
T5LayerNorm                   62        47,616         
Dropout                       86        0              
T5LayerFF                     24        113,264,640    
T5DenseActDense               24        113,246,208    
ReLU                          24        0              
T5LayerCrossAttention         12        28,320,768     


Map (num_proc=4):   0%|          | 0/11778 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2079 [00:00<?, ? examples/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,2.9223
200,2.5729
300,2.4269
400,2.4396
500,2.3767
600,2.3075
700,2.3281


In [None]:
# Initialize lists to store training and evaluation metrics
train_loss = []  # To store training loss values
eval_loss = []  # To store evaluation loss values
eval_bleu = []  # To store BLEU scores during evaluation
eval_exact_match = []  # To store exact match scores during evaluation
eval_rogue = []  # To store ROUGE-L scores during evaluation
steps = []  # To store training step numbers
eval_steps = []  # To store evaluation step numbers

# Extract metrics from the training log history
for log in log_history:
    if "loss" in log:  # Check if training loss is in the log
        train_loss.append(log["loss"])  # Append training loss
        steps.append(log["step"])  # Append the corresponding step number
    if "eval_loss" in log:  # Check if evaluation loss is in the log
        eval_loss.append(log["eval_loss"])  # Append evaluation loss
        eval_steps.append(log["step"])  # Append the corresponding step number
    if "eval_BLEU" in log:  # Check if BLEU score is in the log
        eval_bleu.append(log["eval_BLEU"])  # Append BLEU score
    if "eval_ROUGE-L" in log:  # Check if ROUGE-L score is in the log
        eval_rogue.append(log["eval_ROUGE-L"])  # Append ROUGE-L score
    if "eval_exact_match" in log:  # Check if exact match score is in the log
        eval_exact_match.append(log["eval_exact_match"])  # Append exact match score

# Plot the training and evaluation loss
plt.figure(figsize=(10, 6))
plt.plot(steps, train_loss, label="Training Loss", color="blue", marker="o")  # Plot training loss
plt.plot(steps[:len(eval_loss)], eval_loss, label="Evaluation Loss", color="orange", marker="o")  # Plot evaluation loss
plt.xlabel("Training Steps")  # X-axis label
plt.ylabel("Loss")  # Y-axis label
plt.title("Training vs Evaluation Loss")  # Plot title
plt.legend()  # Show legend
plt.grid(True)  # Add grid for better readability
plt.show()  # Display the plot

# Plot the BLEU score over training steps
plt.figure(figsize=(10, 6))
plt.plot(eval_steps, eval_bleu, label="BLEU", marker="o", linestyle="-", color="green")  # Plot BLEU score
plt.xlabel("Training Steps")  # X-axis label
plt.ylabel("Metric Score")  # Y-axis label
plt.title("BLEU Score Over Training Steps")  # Plot title
plt.legend()  # Show legend
plt.grid(True)  # Add grid for better readability
plt.tight_layout()  # Adjust layout for better spacing
plt.show()  # Display the plot

# Plot the ROUGE-L score over training steps
plt.figure(figsize=(10, 6))
plt.plot(eval_steps, eval_rogue, label="ROUGE-L", marker="o", linestyle="-", color="red")  # Plot ROUGE-L score
plt.xlabel("Training Steps")  # X-axis label
plt.ylabel("Metric Score")  # Y-axis label
plt.title("ROUGE-L Score Over Training Steps")  # Plot title
plt.legend()  # Show legend
plt.grid(True)  # Add grid for better readability
plt.tight_layout()  # Adjust layout for better spacing
plt.show()  # Display the plot

# Plot the exact match score over training steps
plt.figure(figsize=(10, 6))
plt.plot(eval_steps, eval_exact_match, label="Exact Match", marker="o", linestyle="-", color="black")  # Plot exact match score
plt.xlabel("Training Steps")  # X-axis label
plt.ylabel("Metric Score")  # Y-axis label
plt.title("Exact Match Over Training Steps")  # Plot title
plt.legend()  # Show legend
plt.grid(True)  # Add grid for better readability
plt.tight_layout()  # Adjust layout for better spacing
plt.show()  # Display the plot