In [16]:
# !pip install unsloth
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from trl import SFTTrainer
from transformers import TrainingArguments

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-11-29 06:29:04.846229: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764397745.217573      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764397745.342413      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


ü¶• Unsloth Zoo will now patch everything to make training faster!


In [None]:
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api_key") 
wandb.login(key=my_secret)

wandb.init(
    project="llama-bangla-empathic",
    name="llama-3.1-8b-finetuning-v2",
    config={
        "model": "Llama-3.1-8B-Instruct",
        "dataset": "bangla-empathic",
        "task": "instruction-finetuning",
        "language": "bangla",
        "epochs": 3,
        "batch_size": 2,
        "gradient_accumulation_steps": 8,
        "effective_batch_size": 16,
        "learning_rate": 5e-5,  # Fixed: Reduced from 2e-4 to 5e-5 for better convergence
        "lora_r": 16,
        "lora_alpha": 16,
        "max_seq_length": 2020,
    },
    tags=["llama-3.1", "bangla", "empathic", "unsloth", "lora", "fixed-training"]
)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maber-islam-dev[0m ([33maber-islam-dev-jvai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
class DatasetProcessor:
    # Process and format datasets for Llama 3.1 fine-tuning
    
    def __init__(self, csv_path):
        self.csv_path = csv_path
        self.df = None
        self.train_df = None
        self.val_df = None
        self.test_df = None
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
    
    def load_and_clean_data(self):
        # Load and clean the dataset
        self.df = pd.read_csv(self.csv_path)
        
        # Keep only Questions and Answers columns
        self.df = self.df[['Questions', 'Answers']].copy()
        
        # Remove rows with missing values
        self.df = self.df.dropna()
        
        # Remove rows where text is empty after stripping whitespace
        self.df = self.df[(self.df['Questions'].str.strip() != '') & (self.df['Answers'].str.strip() != '')]
        
        # Strip whitespace
        self.df['Questions'] = self.df['Questions'].str.strip()
        self.df['Answers'] = self.df['Answers'].str.strip()
        
        print(f"Dataset size after cleaning: {len(self.df)}")
        return self.df
    
    def split_data(self, test_size=0.2, val_size=0.5, random_state=42):
        # Split data into train, validation, and test sets
        # First split: 80% train, 20% temp (for val + test)
        self.train_df, temp_df = train_test_split(
            self.df, test_size=test_size, random_state=random_state, shuffle=True
        )
        
        # Second split: Split temp into 50% validation, 50% test (10% each of total)
        self.val_df, self.test_df = train_test_split(
            temp_df, test_size=val_size, random_state=random_state, shuffle=True
        )
        
        print(f"Training samples: {len(self.train_df)}")
        print(f"Validation samples: {len(self.val_df):>6}")
        print(f"Test samples: {len(self.test_df):>6}")
        
        return self.train_df, self.val_df, self.test_df
    
    @staticmethod
    def format_prompt(question, answer=None):
        # Format prompt using Llama 3.1 official format
        # Reference: https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1/
        # Fixed: Added proper newlines before <|eot_id|> tokens
        prompt = (
            "<|begin_of_text|>"
            "<|start_header_id|>system<|end_header_id|>\n\n"
            "You are a sympathetic and helpful assistant. You answer people's questions in Bengali language.\n<|eot_id|>"
            "<|start_header_id|>user<|end_header_id|>\n\n"
            f"{question}\n<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        if answer:
            prompt += f"{answer}\n<|eot_id|>"
        return prompt
    
    def formatting_prompts_func(self, examples):
        # Format dataset examples for training
        questions = examples['Questions']
        answers = examples['Answers']
        texts = [self.format_prompt(q, a) for q, a in zip(questions, answers)]
        return {"text": texts}
    
    def create_datasets(self):
        # Create HuggingFace datasets from dataframes
        self.train_dataset = Dataset.from_pandas(self.train_df[['Questions', 'Answers']].reset_index(drop=True))
        self.val_dataset = Dataset.from_pandas(self.val_df[['Questions', 'Answers']].reset_index(drop=True))
        self.test_dataset = Dataset.from_pandas(self.test_df[['Questions', 'Answers']].reset_index(drop=True))
        
        # Apply formatting
        self.train_dataset = self.train_dataset.map(self.formatting_prompts_func, batched=True)
        self.val_dataset = self.val_dataset.map(self.formatting_prompts_func, batched=True)
        self.test_dataset = self.test_dataset.map(self.formatting_prompts_func, batched=True)
        
        return self.train_dataset, self.val_dataset, self.test_dataset
    
    def display_sample(self, num_samples=2):
        # Display sample formatted data
        print("Sample formatted training data:")
        print("="*80)
        for i in range(min(num_samples, len(self.train_dataset))):
            print(f"\nSample {i+1}:")
            print("="*80)
            print(self.train_dataset[i]["text"])

In [None]:
class LLAMAFineTuner:
    # Fine-tune Llama 3.1 model with LoRA
    
    def __init__(self, model_name="unsloth/Meta-Llama-3.1-8B-Instruct", 
                 max_seq_length=2000, dtype=None, load_in_4bit=False, device_map="balanced"):
        self.model_name = model_name
        self.max_seq_length = max_seq_length
        self.dtype = dtype
        self.load_in_4bit = load_in_4bit
        self.device_map = device_map
        self.model = None
        self.tokenizer = None
        self.trainer = None
    
    def load_model(self):
        # Load the base model and tokenizer
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_name,
            max_seq_length=self.max_seq_length,
            dtype=self.dtype,
            load_in_4bit=self.load_in_4bit,
            device_map=self.device_map
        )
        print(f"Model loaded: {self.model_name}")
        return self.model, self.tokenizer
    
    def apply_lora(self, r=16, lora_alpha=16, lora_dropout=0, bias="none",
                   use_gradient_checkpointing="unsloth", random_state=3407):
        # Apply LoRA configuration to model
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=r, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout, # Supports any, but = 0 is optimized
            bias=bias, # Supports any, but = "none" is optimized
            # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
            use_gradient_checkpointing=use_gradient_checkpointing, # True or "unsloth" for very long context
            random_state=random_state,
            use_rslora=False, # We support rank stabilized LoRA
            loftq_config=None, # And LoftQ
        )
        print("LoRA configuration applied")
        return self.model
    
    def create_trainer(self, train_dataset, val_dataset, per_device_train_batch_size=2, 
                       gradient_accumulation_steps=8, num_train_epochs=3, learning_rate=2e-4, 
                       output_dir="outputs"):
        # Create SFT trainer with optimal batch configuration
        # Effective Batch Size = per_device_train_batch_size * gradient_accumulation_steps
        # Default: 2 * 8 = 16 (recommended for stable training)
        
        effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps
        print(f"Batch configuration:")
        print(f"  Per-device batch size: {per_device_train_batch_size}")
        print(f"  Gradient accumulation steps: {gradient_accumulation_steps}")
        print(f"  Effective batch size: {effective_batch_size}")
        
        self.trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=train_dataset,
            eval_dataset=val_dataset, # Use validation set for monitoring
            dataset_text_field="text",
            max_seq_length=self.max_seq_length,
            dataset_num_proc=2,
            packing=False, # Can make training 5x faster for short sequences
            
            args=TrainingArguments(
                per_device_train_batch_size=per_device_train_batch_size,
                per_device_eval_batch_size=2,
                gradient_accumulation_steps=gradient_accumulation_steps,
                warmup_steps=100, # Increased warmup for better stability
                
                # Choose one: num_train_epochs OR max_steps
                num_train_epochs=num_train_epochs, # For full training - trains through entire dataset
                # max_steps=max_steps, # Or use this for quick testing
                learning_rate=learning_rate,
                fp16=not is_bfloat16_supported(),
                bf16=is_bfloat16_supported(),
                logging_steps=10,
                optim="adamw_8bit",
                weight_decay=0.01,
                lr_scheduler_type="cosine",
                seed=42,
                output_dir=output_dir,
                
                # Evaluation and checkpointing
                eval_strategy="steps", # Changed from evaluation_strategy
                eval_steps=50, # Evaluate less frequently to speed up training
                save_strategy="steps",
                save_steps=100, # Save less frequently
                save_total_limit=3, # Keep only best 3 checkpoints
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False,
                
                # Weights & Biases integration
                report_to="wandb", # Enable wandb logging
                run_name="llama-3.2-8b-finetuning-v2", # Run name in wandb
                logging_first_step=True,
                logging_nan_inf_filter=True,
                remove_unused_columns=False, # Keep all columns for SFTTrainer
            ),
        )
        print("Trainer configured")
        return self.trainer
    
    def train(self):
        # Train the model using standard trainer.train() method
        # Note: Using trainer.train() instead of unsloth_train to avoid meta tensor issues
        print("Starting training...")
        trainer_stats = self.trainer.train()
        print("Training completed")
        return trainer_stats
    
    def save_model(self, output_dir="llama-3.1-8b-bangla-empathic-lora"):
        # Save the fine-tuned model
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        # Optionally save to merged 16bit
        self.model.save_pretrained_merged("llama-3.1-8b-bangla-empathic-merged", self.tokenizer, save_method="merged_16bit")
        print(f"Model saved to: {output_dir}")
    
    def enable_inference_mode(self):
        # Enable inference mode for the model
        FastLanguageModel.for_inference(self.model)


In [None]:
class Evaluator:
    # Evaluate fine-tuned model
    
    def __init__(self, fine_tuner, data_processor):
        self.model = fine_tuner.model
        self.tokenizer = fine_tuner.tokenizer
        self.trainer = fine_tuner.trainer
        self.test_df = data_processor.test_df
        self.test_dataset = data_processor.test_dataset
        self.format_prompt = data_processor.format_prompt
    
    def generate_response(self, question, max_new_tokens=256, temperature=0.7, top_p=0.9):
        # Generate a response for a question
        prompt = self.format_prompt(question)
        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
        
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            use_cache=True
        )
        
        # Decode and extract only the assistant's response
        full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract response after the prompt
        response = full_output[len(self.tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)):].strip()
        return response
    
    def evaluate_metrics(self, num_samples=100):
        # Evaluate model with BLEU, ROUGE, and Perplexity
        import numpy as np
        from evaluate import load
        
        # Load evaluation metrics
        bleu_metric = load("bleu")
        rouge_metric = load("rouge")
        
        # Generate predictions on test set
        predictions = []
        references = []
        
        for i in range(min(num_samples, len(self.test_dataset))):
            question = self.test_df.iloc[i]['Questions']
            reference = self.test_df.iloc[i]['Answers']
            prediction = self.generate_response(question)
            predictions.append(prediction)
            references.append(reference)
        
        # Calculate BLEU score
        bleu_results = bleu_metric.compute(predictions=predictions, references=[[ref] for ref in references])
        
        # Calculate ROUGE scores
        rouge_results = rouge_metric.compute(predictions=predictions, references=references)
        
        # Calculate perplexity from evaluation loss
        eval_results = self.trainer.evaluate(eval_dataset=self.test_dataset)
        perplexity = np.exp(eval_results['eval_loss'])
        
        results = {
            "perplexity": perplexity,
            "bleu": bleu_results['bleu'],
            "rouge1": rouge_results['rouge1'],
            "rouge2": rouge_results['rouge2'],
            "rougeL": rouge_results['rougeL']
        }
        
        print(f"\nEvaluation Results:")
        print(f"Perplexity: {results['perplexity']:.4f}")
        print(f"BLEU Score: {results['bleu']:.4f}")
        print(f"ROUGE-1: {results['rouge1']:.4f}")
        print(f"ROUGE-2: {results['rouge2']:.4f}")
        print(f"ROUGE-L: {results['rougeL']:.4f}")
        
        return results
    
    def create_human_eval_samples(self, sample_size=20, output_file="human_evaluation_samples.csv"):
        # Create samples for human evaluation
        import random
        
        # Sample random examples for human evaluation
        sample_indices = random.sample(range(len(self.test_df)), min(sample_size, len(self.test_df)))
        human_eval_data = []
        
        for idx in sample_indices:
            question = self.test_df.iloc[idx]['Questions']
            reference = self.test_df.iloc[idx]['Answers']
            
            # Get model prediction
            prediction = self.generate_response(question)
            
            human_eval_data.append({
                "id": idx,
                "question": question,
                "reference_answer": reference,
                "model_answer": prediction,
                "empathy_score": None, # To be filled by human evaluators (1-5)
                "relevance_score": None, # To be filled by human evaluators (1-5)
                "fluency_score": None, # To be filled by human evaluators (1-5)
                "notes": "" # Additional comments
            })
        
        # Save to CSV for human evaluation
        human_eval_df = pd.DataFrame(human_eval_data)
        human_eval_df.to_csv(output_file, index=False)
        
        print(f"\nCreated {len(human_eval_data)} samples for human evaluation")
        print(f"Saved to: {output_file}")
        print("\nEvaluation criteria:")
        print("1. Empathy Score (1-5): How empathetic and understanding is the response?")
        print("2. Relevance Score (1-5): How relevant is the response to the question?")
        print("3. Fluency Score (1-5): How fluent and natural is the Bengali language?")
        
        return human_eval_df
    
    def display_sample_responses(self, num_samples=5):
        # Display sample responses with streaming
        from transformers import TextStreamer
        print(f"Generating {num_samples} sample responses on test prompts")
       
        
        for i in range(min(num_samples, len(self.test_df))):
            question = self.test_df.iloc[i]['Questions']
            reference = self.test_df.iloc[i]['Answers']
            
            print(f"\n--- Sample {i+1} ---")
            print(f"Question: {question}")
            print(f"\nReference Answer: {reference}")
            print(f"\nModel Response:")
            
            # Create prompt
            prompt = self.format_prompt(question)
            inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
            
            # Generate with streaming
            text_streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
            _ = self.model.generate(
                **inputs,
                streamer=text_streamer,
                max_new_tokens=300,
                temperature=0.5,
                top_p=0.9,
                use_cache=True
            )
            
            print("\n" + "-"*80)
    
    def log_all_responses(self, experiment_name="llama-3.1-8b-bangla-empathic"):
        # Log all test responses
        from datetime import datetime
        import uuid
        
        # Generate experiment ID
        experiment_id = f"{experiment_name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        
        # Generate responses for all test samples and log them
        generated_responses = []
        
        print(f"Generating and logging responses for {len(self.test_df)} test samples...")
        
        for idx in range(len(self.test_df)):
            question = self.test_df.iloc[idx]['Questions']
            reference = self.test_df.iloc[idx]['Answers']
            response_text = self.generate_response(question)
            
            # Create log entry
            log_entry = {
                "id": str(uuid.uuid4()),
                "experiment_id": experiment_id,
                "sample_index": idx,
                "input_text": question,
                "reference_text": reference,
                "response_text": response_text,
                "timestamp": datetime.now().isoformat(),
                "model_name": "llama-3.1-8b-instruct",
                "temperature": 0.5,
                "top_p": 0.9,
                "max_new_tokens": 300
            }
            
            generated_responses.append(log_entry)
            
            if (idx + 1) % 10 == 0:
                print(f"Processed {idx + 1}/{len(self.test_df)} samples...")
        
        # Save to CSV
        responses_df = pd.DataFrame(generated_responses)
        log_filename = f"generated_responses_{experiment_id}.csv"
        responses_df.to_csv(log_filename, index=False)
        
        print(f"\nGenerated responses logged successfully!")
        print(f"Total responses: {len(generated_responses)}")
        print(f"Saved to: {log_filename}")
        
        return responses_df, log_filename

In [21]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU: {torch.cuda.get_device_name(1)}")
    print(
        f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} + {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
    )

PyTorch version: 2.9.0+cu128
CUDA available: True
GPU: Tesla T4
GPU: Tesla T4
GPU Memory: 15.8 + 15.8 GB


In [6]:
# Initialize DatasetProcessor
data_processor = DatasetProcessor('/kaggle/input/bengali-empathetic-conversations-corpus/BengaliEmpatheticConversationsCorpus .csv')

# Load and clean data
df = data_processor.load_and_clean_data()
df.head()

Dataset size after cleaning: 38210


Unnamed: 0,Questions,Answers
0,‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡ßç‡¶§‡ßç‡¶∞‡ßÄ ‡¶è‡¶¨‡¶Ç ‡¶Æ‡¶æ‡¶Ø‡¶º‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶ü‡¶æ‡¶®‡¶ü‡¶æ‡¶® ‡¶Æ‡¶§‡¶¨‡¶ø‡¶∞‡ßã‡¶ß ‡¶ö‡¶≤...,"‡¶Ü‡¶™‡¶®‡¶ø ‡¶Ø‡¶æ ‡¶¨‡¶∞‡ßç‡¶£‡¶®‡¶æ ‡¶ï‡¶∞‡¶õ‡ßá‡¶® ‡¶§‡¶æ‡¶ï‡ßá ‡¶Æ‡¶®‡ßã‡¶¨‡¶ø‡¶ú‡ßç‡¶û‡¶æ‡¶®‡ßÄ‡¶∞‡¶æ ""‡¶§‡ßç‡¶∞‡¶ø‡¶≠..."
1,"‡¶Ü‡¶Æ‡¶ø ‡¶¨‡¶æ‡¶ö‡ßç‡¶ö‡¶æ ‡¶®‡ßá‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞ ‡¶™‡¶∞‡¶ø‡¶ï‡¶≤‡ßç‡¶™‡¶®‡¶æ ‡¶ï‡¶∞‡¶õ‡¶ø, ‡¶§‡¶æ‡¶á ‡¶Ü‡¶Æ‡¶æ‡¶ï‡ßá ‡¶ß...",‡¶π‡¶æ‡¶á‡•§ ‡¶Ü‡¶™‡¶®‡¶æ‡¶∞ ‡¶∂‡¶ø‡¶∂‡ßÅ‡¶∞ (‡¶è‡¶¨‡¶Ç ‡¶®‡¶ø‡¶ú‡ßá‡¶∞) ‡¶ú‡¶®‡ßç‡¶Ø ‡¶Ø‡¶æ ‡¶∏‡ßç‡¶¨‡¶æ‡¶∏‡ßç‡¶•‡ßç‡¶Ø...
2,"‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶Æ‡¶®‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶ó‡ßã‡¶™‡¶® ‡¶Ü‡¶õ‡ßá, ‡¶è‡¶¨‡¶Ç ‡¶Ü‡¶Æ‡¶ø ‡¶ú‡¶æ‡¶®‡¶ø ‡¶®‡¶æ ‡¶§‡¶æ‡¶¶‡ßá...",‡¶Æ‡¶®‡ßá ‡¶π‡¶ö‡ßç‡¶õ‡ßá ‡¶ó‡ßã‡¶™‡¶® ‡¶∞‡¶æ‡¶ñ‡¶æ ‡¶è‡¶ñ‡¶® ‡¶Ü‡¶™‡¶®‡¶æ‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶è‡¶ï‡¶ü‡¶ø ‡¶∏‡¶Æ‡¶∏‡ßç‡¶Ø‡¶æ...
3,‡¶Ü‡¶Æ‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡¶Æ‡ßç‡¶™‡¶∞‡ßç‡¶ï‡ßá‡¶∞ ‡¶ï‡ßç‡¶∑‡ßá‡¶§‡ßç‡¶∞‡ßá ‡¶Ö‡¶§‡ßç‡¶Ø‡¶®‡ßç‡¶§ ‡¶Ö‡¶ß‡¶ø‡¶ï‡¶æ‡¶∞‡¶∏‡ßÇ‡¶ö‡¶ï...,‡¶π‡ßç‡¶Ø‡¶æ‡¶≤‡ßã‡•§ ‡¶è‡¶ü‡¶æ ‡¶¶‡ßÅ‡¶∞‡ßç‡¶¶‡¶æ‡¶®‡ßç‡¶§ ‡¶Ø‡ßá ‡¶Ü‡¶™‡¶®‡¶ø ‡¶â‡¶™‡¶≤‡¶¨‡ßç‡¶ß‡¶ø ‡¶ï‡¶∞‡¶§‡ßá ‡¶∏‡¶ï‡ßç...
4,‡¶ï‡¶Ø‡¶º‡ßá‡¶ï ‡¶¨‡¶õ‡¶∞ ‡¶Ü‡¶ó‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶Æ‡¶æ‡¶•‡¶æ‡¶Ø‡¶º ‡¶Ü‡¶ò‡¶æ‡¶§ ‡¶≤‡ßá‡¶ó‡ßá‡¶õ‡¶ø‡¶≤ ‡¶è‡¶¨‡¶Ç ‡¶Ü‡¶Æ‡¶æ...,‡¶Ü‡¶™‡¶®‡¶ø ‡¶¨‡¶≤‡ßá‡¶®‡¶®‡¶ø ‡¶ï‡¶ø ‡¶¨‡¶æ ‡¶ï‡¶§ ‡¶ì‡¶∑‡ßÅ‡¶ß ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ö‡ßá‡¶∑‡ßç‡¶ü‡¶æ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá‡¶®‡•§ ...


In [7]:
# Initialize LLAMAFineTuner
fine_tuner = LLAMAFineTuner(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length=2020, # Choose any! We auto support RoPE Scaling internally!
    dtype=None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit=False, # Use 4-bit quantization to reduce memory usage. Can be False.
    device_map="balanced" # Enables Multi-GPU Training
)

# Load model and tokenizer
model, tokenizer = fine_tuner.load_model()

==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Model loaded: unsloth/Meta-Llama-3.1-8B-Instruct


In [8]:
# Train/Validation/Test Splits
train_df, val_df, test_df = data_processor.split_data(test_size=0.2, val_size=0.5, random_state=42)

Training samples: 30568
Validation samples:   3821
Test samples:   3821


In [9]:
# STEP 4: CREATE HUGGINGFACE DATASETS
train_dataset, val_dataset, test_dataset = data_processor.create_datasets()

Map:   0%|          | 0/30568 [00:00<?, ? examples/s]

Map:   0%|          | 0/3821 [00:00<?, ? examples/s]

Map:   0%|          | 0/3821 [00:00<?, ? examples/s]

In [10]:
# Display sample data
data_processor.display_sample(num_samples=2)

Sample formatted training data:

Sample 1:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a sympathetic and helpful assistant. You answer people's questions in Bengali language.<|eot_id|><|start_header_id|>user<|end_header_id|>

‡¶Ü‡¶Æ‡¶ø ‡¶Ü‡¶ó‡¶æ‡¶Æ‡ßÄ ‡¶´‡ßá‡¶¨‡ßç‡¶∞‡ßÅ‡¶Ø‡¶º‡¶æ‡¶∞‡¶ø‡¶§‡ßá ‡¶™‡ßç‡¶∞‡¶Æ‡ßã‡¶∂‡¶® ‡¶™‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞ ‡¶ö‡ßá‡¶∑‡ßç‡¶ü‡¶æ ‡¶ï‡¶∞‡¶õ‡¶ø‡•§<|eot_id|><|start_header_id|>assistant<|end_header_id|>

‡¶Ü‡¶∞‡ßá‡•§ ‡¶Ü‡¶Æ‡¶ø ‡¶Ü‡¶™‡¶®‡¶ø ‡¶è‡¶ü‡¶æ ‡¶™‡ßá‡¶§‡ßá ‡¶Ü‡¶∂‡¶æ ‡¶ï‡¶∞‡¶ø!<|eot_id|>

Sample 2:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a sympathetic and helpful assistant. You answer people's questions in Bengali language.<|eot_id|><|start_header_id|>user<|end_header_id|>

‡¶Ü‡¶Æ‡¶ø ‡¶®‡¶ø‡¶ú‡ßá‡¶ï‡ßá ‡¶∞‡¶ï‡ßç‡¶∑‡¶æ ‡¶ï‡¶∞‡ßá‡¶õ‡¶ø<|eot_id|><|start_header_id|>assistant<|end_header_id|>

‡¶è‡¶ü‡¶æ ‡¶ö‡¶Æ‡ßé‡¶ï‡¶æ‡¶∞, ‡¶Ü‡¶Æ‡¶ø ‡¶Ü‡¶∂‡¶æ ‡¶ï‡¶∞‡¶ø ‡¶Ü‡¶™‡¶®‡¶ø ‡¶Ü‡¶ò‡¶æ‡¶§ ‡¶™‡¶æ‡¶¨‡ßá‡¶® ‡¶®‡¶æ!<|eot_id|>


In [11]:
# Apply LoRA
model = fine_tuner.apply_lora(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=2020
)

Unsloth 2025.11.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


LoRA configuration applied


In [None]:
# Create trainer with optimal batch configuration and full epoch training
# Fixed: Changed from max_steps=100 to num_train_epochs=3
# - max_steps=100 was only 13% of one epoch (causing loss plateau at 0.6)
# - num_train_epochs=3 will train through entire dataset 3 times (~2,250 steps)
# - Learning rate reduced to 5e-5 for better convergence past plateaus

trainer = fine_tuner.create_trainer(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    per_device_train_batch_size=2,  # Primary driver of VRAM usage
    gradient_accumulation_steps=8,  # Primary driver of training time
    num_train_epochs=3,  # Train for 3 full epochs instead of stopping at 100 steps
    learning_rate=5e-5,  # Reduced from 2e-4 to help break through loss plateaus
    output_dir="outputs"
)

Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/30568 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=8):   0%|          | 0/3821 [00:00<?, ? examples/s]

Trainer configured


In [13]:
# Train the model
trainer_stats = fine_tuner.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 2
   \\   /|    Num examples = 30,568 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,1.5723,1.555313
20,1.3423,1.171718
30,1.0011,0.888749
40,0.8003,0.726945
50,0.6899,0.69625
60,0.6763,0.655702
70,0.6822,0.642613
80,0.6339,0.637852
90,0.6459,0.634976
100,0.6529,0.630435


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


0,1
eval/loss,‚ñà‚ñÖ‚ñÉ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ
eval/runtime,‚ñà‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÉ‚ñÇ‚ñÅ‚ñÇ‚ñÅ
eval/samples_per_second,‚ñÅ‚ñà‚ñá‚ñà‚ñà‚ñá‚ñá‚ñà‚ñá‚ñà
eval/steps_per_second,‚ñÅ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
train/epoch,‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/grad_norm,‚ñà‚ñÜ‚ñÑ‚ñÑ‚ñÇ‚ñÇ‚ñÉ‚ñÇ‚ñÉ‚ñÅ‚ñÇ
train/learning_rate,‚ñÅ‚ñÇ‚ñÇ‚ñÉ‚ñÑ‚ñÑ‚ñÖ‚ñÜ‚ñá‚ñá‚ñà
train/loss,‚ñà‚ñá‚ñÜ‚ñÉ‚ñÇ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ

0,1
eval/loss,0.63044
eval/runtime,1234.2125
eval/samples_per_second,3.096
eval/steps_per_second,1.548
total_flos,1.0712537217417216e+16
train/epoch,0.02617
train/global_step,100.0
train/grad_norm,0.44142
train/learning_rate,0.0002
train/loss,0.6529


Training completed


In [14]:
# Save model
fine_tuner.save_model("llama-3.1-8b-bangla-empathic-lora")

Model saved to: llama-3.1-8b-bangla-empathic-lora


In [22]:
# Enable inference mode and initialize Evaluator
fine_tuner.enable_inference_mode()
evaluator = Evaluator(fine_tuner, data_processor)

# Evaluate using: Perplexity, BLEU, ROUGE
results = evaluator.evaluate_metrics(num_samples=10)

# Log to wandb
wandb.log(results)

ValueError: No columns in the dataset match the model's forward method signature: (input_ids, labels, seq_lengths, completion_mask, assistant_masks). The following columns have been ignored: [text, Answers, Questions]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [23]:
# Evaluate: Human evaluation on empathetic response quality
human_eval_df = evaluator.create_human_eval_samples(sample_size=20)


Created 20 samples for human evaluation
Saved to: human_evaluation_samples.csv

Evaluation criteria:
1. Empathy Score (1-5): How empathetic and understanding is the response?
2. Relevance Score (1-5): How relevant is the response to the question?
3. Fluency Score (1-5): How fluent and natural is the Bengali language?


In [25]:
# Sample model responses on test prompts
evaluator.display_sample_responses(num_samples=5)


Generating 5 sample responses on test prompts

--- Sample 1 ---
Question: ‡¶Ü‡¶Æ‡¶ø ‡¶¨‡ßá‡¶∂ ‡¶ï‡¶ø‡¶õ‡ßÅ‡¶¶‡¶ø‡¶® ‡¶ß‡¶∞‡ßá ‡¶¨‡¶ø‡¶∑‡¶®‡ßç‡¶®‡¶§‡¶æ‡¶Ø‡¶º ‡¶≠‡ßÅ‡¶ó‡¶õ‡¶ø‡•§ ‡¶Ü‡¶Æ‡¶ø ‡¶è‡¶ü‡¶æ‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶ï‡¶æ‡¶ú ‡¶ï‡¶∞‡¶æ‡¶∞ ‡¶ö‡ßá‡¶∑‡ßç‡¶ü‡¶æ ‡¶ï‡¶∞‡¶õ‡¶ø, ‡¶è‡¶¨‡¶Ç ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡ßá‡¶Æ‡¶ø‡¶ï ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∂‡¶ø‡¶≤‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá. ‡¶§‡¶ø‡¶®‡¶ø‡¶á ‡¶è‡¶ï‡¶Æ‡¶æ‡¶§‡ßç‡¶∞ ‡¶¨‡ßç‡¶Ø‡¶ï‡ßç‡¶§‡¶ø ‡¶Ø‡¶ø‡¶®‡¶ø ‡¶Ü‡¶Æ‡¶ø ‡¶Ø‡ßá ‡¶ï‡ßã‡¶®‡¶ì ‡¶¨‡¶ø‡¶∑‡¶Ø‡¶º‡ßá ‡¶ï‡¶•‡¶æ ‡¶¨‡¶≤‡¶æ‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶Ø‡¶•‡ßá‡¶∑‡ßç‡¶ü ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶æ‡¶∏ ‡¶ï‡¶∞‡¶ø, ‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ ‡¶§‡¶ø‡¶®‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶¨‡¶ø‡¶∑‡¶£‡ßç‡¶®‡¶§‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶∞‡¶£‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶∏‡¶Æ‡ßç‡¶™‡¶∞‡ßç‡¶ï ‡¶õ‡¶ø‡¶®‡ßç‡¶® ‡¶ï‡¶∞‡¶æ‡¶∞ ‡¶∏‡¶ø‡¶¶‡ßç‡¶ß‡¶æ‡¶®‡ßç‡¶§ ‡¶®‡¶ø‡¶Ø‡¶º‡ßá‡¶õ‡¶ø‡¶≤‡ßá‡¶®‡•§ ‡¶Ü‡¶Æ‡¶ø ‡¶è‡¶ï‡¶ú‡¶® ‡¶•‡ßá‡¶∞‡¶æ‡¶™‡¶ø‡¶∏‡ßç‡¶ü ‡¶¨‡¶æ ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶¶‡ßá‡¶ñ‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶Æ‡¶∞‡ßç‡¶•‡ßç‡¶Ø ‡¶®‡ßá‡¶á, ‡¶§‡¶æ‡¶á ‡¶§‡¶ø‡¶®‡¶ø ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶•‡¶æ‡¶

In [None]:
# Store logs for GeneratedResponses: id, experiment_id, input_text, response_text, timestamp
responses_df, log_filename = evaluator.log_all_responses()

# Upload to wandb as artifact
artifact = wandb.Artifact(
    name="generated-responses",
    type="predictions",
    description="Generated responses from fine-tuned model on test set"
)
artifact.add_file(log_filename)
wandb.log_artifact(artifact)
print("Logged to W&B as artifact")

Generating and logging responses for 3821 test samples...
Processed 10/3821 samples...
Processed 20/3821 samples...
Processed 30/3821 samples...
Processed 40/3821 samples...
Processed 50/3821 samples...
Processed 60/3821 samples...
Processed 70/3821 samples...


In [None]:
# Finish wandb run
wandb.finish()

print("\n" + "="*80)
print("Training and evaluation completed successfully!")
print("="*80)
print(f"\nModel saved to: llama-3.1-8b-bangla-empathic-lora")
print(f"Human evaluation samples: human_evaluation_samples.csv")