In [49]:
# !pip install unsloth
# !pip install evaluate

In [50]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from trl import SFTTrainer
from transformers import TrainingArguments

In [51]:
import wandb
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api_key") 
wandb.login(key=my_secret)

wandb.init(
    project="llama-bangla-empathic",
    name="llama-3.1-8b-finetuning-v2",
    config={
        "model": "Llama-3.1-8B-Instruct",
        "dataset": "bangla-empathic",
        "task": "instruction-finetuning",
        "language": "bangla",
        "epochs": 3,
        "batch_size": 2,
        "gradient_accumulation_steps": 8,
        "effective_batch_size": 16,
        "learning_rate": 5e-5,  # Fixed: Reduced from 2e-4 to 5e-5 for better convergence
        "lora_r": 16,
        "lora_alpha": 16,
        "max_seq_length": 2020,
    },
    tags=["llama-3.1", "bangla", "empathic", "unsloth", "lora", "fixed-training"]
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [52]:
class DatasetProcessor:
    # Process and format datasets for Llama 3.1 fine-tuning
    
    def __init__(self, csv_path):
        self.csv_path = csv_path
        self.df = None
        self.train_df = None
        self.val_df = None
        self.test_df = None
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
    
    def load_and_clean_data(self):
        # Load and clean the dataset
        self.df = pd.read_csv(self.csv_path)
        
        # Keep only Questions and Answers columns
        self.df = self.df[['Questions', 'Answers']].copy()
        
        # Remove rows with missing values
        self.df = self.df.dropna()
        
        # Remove rows where text is empty after stripping whitespace
        self.df = self.df[(self.df['Questions'].str.strip() != '') & (self.df['Answers'].str.strip() != '')]
        
        # Strip whitespace
        self.df['Questions'] = self.df['Questions'].str.strip()
        self.df['Answers'] = self.df['Answers'].str.strip()
        
        print(f"Dataset size after cleaning: {len(self.df)}")
        return self.df
    
    def split_data(self, test_size=0.2, val_size=0.5, random_state=42):
        # Split data into train, validation, and test sets
        # First split: 80% train, 20% temp (for val + test)
        self.train_df, temp_df = train_test_split(
            self.df, test_size=test_size, random_state=random_state, shuffle=True
        )
        
        # Second split: Split temp into 50% validation, 50% test (10% each of total)
        self.val_df, self.test_df = train_test_split(
            temp_df, test_size=val_size, random_state=random_state, shuffle=True
        )
        
        print(f"Training samples: {len(self.train_df)}")
        print(f"Validation samples: {len(self.val_df):>6}")
        print(f"Test samples: {len(self.test_df):>6}")
        
        return self.train_df, self.val_df, self.test_df
    
    @staticmethod
    def format_prompt(question, answer=None):
        # Format prompt using Llama 3.1 official format
        # Reference: https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1/
        # Fixed: Added proper newlines before <|eot_id|> tokens
        prompt = (
            "<|begin_of_text|>"
            "<|start_header_id|>system<|end_header_id|>\n\n"
            "You are a sympathetic and helpful assistant. You answer people's questions in Bengali language.\n<|eot_id|>"
            "<|start_header_id|>user<|end_header_id|>\n\n"
            f"{question}\n<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        if answer:
            prompt += f"{answer}\n<|eot_id|>"
        return prompt
    
    def formatting_prompts_func(self, examples):
        # Format dataset examples for training
        questions = examples['Questions']
        answers = examples['Answers']
        texts = [self.format_prompt(q, a) for q, a in zip(questions, answers)]
        return {"text": texts}
    
    def create_datasets(self):
        # Create HuggingFace datasets from dataframes
        self.train_dataset = Dataset.from_pandas(self.train_df[['Questions', 'Answers']].reset_index(drop=True))
        self.val_dataset = Dataset.from_pandas(self.val_df[['Questions', 'Answers']].reset_index(drop=True))
        self.test_dataset = Dataset.from_pandas(self.test_df[['Questions', 'Answers']].reset_index(drop=True))
        
        # Apply formatting
        self.train_dataset = self.train_dataset.map(self.formatting_prompts_func, batched=True)
        self.val_dataset = self.val_dataset.map(self.formatting_prompts_func, batched=True)
        self.test_dataset = self.test_dataset.map(self.formatting_prompts_func, batched=True)
        
        return self.train_dataset, self.val_dataset, self.test_dataset
    
    def display_sample(self, num_samples=2):
        # Display sample formatted data
        print("Sample formatted training data:")
        print("="*80)
        for i in range(min(num_samples, len(self.train_dataset))):
            print(f"\nSample {i+1}:")
            print("="*80)
            print(self.train_dataset[i]["text"])

In [None]:
class LLAMAFineTuner:
    # Fine-tune Llama 3.1 model with LoRA
    
    def __init__(self, model_name="unsloth/Meta-Llama-3.1-8B-Instruct", 
                 max_seq_length=2000, dtype=None, load_in_4bit=True, device_map="auto"):
        self.model_name = model_name
        self.max_seq_length = max_seq_length
        self.dtype = dtype
        self.load_in_4bit = load_in_4bit
        self.device_map = device_map
        self.model = None
        self.tokenizer = None
        self.trainer = None
    
    def load_model(self):
        # Load the base model and tokenizer
        # Fixed: Enable 4bit loading by default to avoid meta tensor issues
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_name,
            max_seq_length=self.max_seq_length,
            dtype=self.dtype,
            load_in_4bit=self.load_in_4bit,
            device_map=self.device_map,
            # Fix: Add these parameters to prevent meta tensor issues
            trust_remote_code=True,
            use_cache=False,
        )
        print(f"Model loaded: {self.model_name}")
        return self.model, self.tokenizer
    
    def apply_lora(self, r=16, lora_alpha=16, lora_dropout=0, bias="none",
                   use_gradient_checkpointing="unsloth", random_state=3407):
        # Apply LoRA configuration to model
        self.model = FastLanguageModel.get_peft_model(
            self.model,
            r=r, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout, # Supports any, but = 0 is optimized
            bias=bias, # Supports any, but = "none" is optimized
            # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
            use_gradient_checkpointing=use_gradient_checkpointing, # True or "unsloth" for very long context
            random_state=random_state,
            use_rslora=False, # We support rank stabilized LoRA
            loftq_config=None, # And LoftQ
        )
        print("LoRA configuration applied")
        return self.model
    
    def create_trainer(self, train_dataset, val_dataset, per_device_train_batch_size=2, 
                       gradient_accumulation_steps=8, num_train_epochs=3, learning_rate=2e-4, 
                       output_dir="outputs"):
        # Create SFT trainer with optimal batch configuration
        # Effective Batch Size = per_device_train_batch_size * gradient_accumulation_steps
        # Default: 2 * 8 = 16 (recommended for stable training)
        
        effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps
        print(f"Batch configuration:")
        print(f"  Per-device batch size: {per_device_train_batch_size}")
        print(f"  Gradient accumulation steps: {gradient_accumulation_steps}")
        print(f"  Effective batch size: {effective_batch_size}")
        
        self.trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=train_dataset,
            eval_dataset=val_dataset, # Use validation set for monitoring
            dataset_text_field="text",
            max_seq_length=self.max_seq_length,
            dataset_num_proc=2,
            packing=False, # Can make training 5x faster for short sequences
            
            args=TrainingArguments(
                per_device_train_batch_size=per_device_train_batch_size,
                per_device_eval_batch_size=2,
                gradient_accumulation_steps=gradient_accumulation_steps,
                warmup_steps=100, # Increased warmup for better stability
                
                # Choose one: num_train_epochs OR max_steps
                num_train_epochs=num_train_epochs, # For full training - trains through entire dataset
                # max_steps=max_steps, # Or use this for quick testing
                learning_rate=learning_rate,
                fp16=not is_bfloat16_supported(),
                bf16=is_bfloat16_supported(),
                logging_steps=10,
                optim="adamw_8bit",
                weight_decay=0.01,
                lr_scheduler_type="cosine",
                seed=42,
                output_dir=output_dir,
                
                # Evaluation and checkpointing
                eval_strategy="steps",
                eval_steps=50,
                save_strategy="steps",
                save_steps=100,
                save_total_limit=3,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False,
                
                # Weights & Biases integration
                report_to="wandb",
                run_name="llama-3.1-8b-finetuning-v2",
                logging_first_step=True,
                logging_nan_inf_filter=True,
                remove_unused_columns=False,
                
                # Fix: Add these to prevent meta tensor issues
                dataloader_pin_memory=False,
                gradient_checkpointing=True,
                # Fix: Use safer data loading options
                dataloader_num_workers=0,
            ),
        )
        print("Trainer configured")
        return self.trainer
    
    def train(self):
        # Train the model using unsloth's optimized training method
        # Fixed: Use unsloth_train instead of standard trainer.train() to avoid meta tensor issues
        print("Starting training...")
        from unsloth import unsloth_train
        trainer_stats = unsloth_train(
            self.trainer,
            resume_from_checkpoint=None,
            # Add safety parameters
            max_steps=None,  # Let epochs control training duration
        )
        print("Training completed")
        return trainer_stats
    
    def save_model(self, output_dir="llama-3.1-8b-bangla-empathic-lora"):
        # Save the fine-tuned model
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        # Optionally save to merged 16bit
        try:
            self.model.save_pretrained_merged("llama-3.1-8b-bangla-empathic-merged", self.tokenizer, save_method="merged_16bit")
        except Exception as e:
            print(f"Warning: Could not save merged model: {e}")
            print("LoRA adapter saved successfully though.")
        print(f"Model saved to: {output_dir}")
    
    def enable_inference_mode(self):
        # Enable inference mode for the model
        FastLanguageModel.for_inference(self.model)

In [None]:
class Evaluator:
    # Evaluate fine-tuned model
    
    def __init__(self, fine_tuner, data_processor):
        self.model = fine_tuner.model
        self.tokenizer = fine_tuner.tokenizer
        self.trainer = fine_tuner.trainer
        self.test_df = data_processor.test_df
        self.test_dataset = data_processor.test_dataset
        self.format_prompt = data_processor.format_prompt
    
    def generate_response(self, question, max_new_tokens=256, temperature=0.7, top_p=0.9):
        # Generate a response for a question
        prompt = self.format_prompt(question)
        inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
        
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            use_cache=True
        )
        
        # Decode and extract only the assistant's response
        full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract response after the prompt
        response = full_output[len(self.tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)):].strip()
        return response
    
    def evaluate_metrics(self, num_samples=100):
        # Evaluate model with BLEU, ROUGE, and Perplexity
        import numpy as np
        from evaluate import load
        
        # Load evaluation metrics 
        bleu_metric = load("bleu")
        rouge_metric = load("rouge")
        
        # Generate predictions on test set
        predictions = []
        references = []
        
        for i in range(min(num_samples, len(self.test_dataset))):
            question = self.test_df.iloc[i]['Questions']
            reference = self.test_df.iloc[i]['Answers']
            prediction = self.generate_response(question)
            predictions.append(prediction)
            references.append(reference)
        
        # Calculate BLEU score
        bleu_results = bleu_metric.compute(predictions=predictions, references=[[ref] for ref in references])
        
        # Calculate ROUGE scores
        rouge_results = rouge_metric.compute(predictions=predictions, references=references)
        
        # Calculate perplexity from evaluation loss
        eval_results = self.trainer.evaluate(eval_dataset=self.test_dataset)
        perplexity = np.exp(eval_results['eval_loss'])
        
        results = {
            "perplexity": perplexity,
            "bleu": bleu_results['bleu'],
            "rouge1": rouge_results['rouge1'],
            "rouge2": rouge_results['rouge2'],
            "rougeL": rouge_results['rougeL']
        }
        
        print(f"\nEvaluation Results:")
        print(f"Perplexity: {results['perplexity']:.4f}")
        print(f"BLEU Score: {results['bleu']:.4f}")
        print(f"ROUGE-1: {results['rouge1']:.4f}")
        print(f"ROUGE-2: {results['rouge2']:.4f}")
        print(f"ROUGE-L: {results['rougeL']:.4f}")
        
        return results
    
    def create_human_eval_samples(self, sample_size=20, output_file="human_evaluation_samples.csv"):
        # Create samples for human evaluation
        import random
        
        # Sample random examples for human evaluation
        sample_indices = random.sample(range(len(self.test_df)), min(sample_size, len(self.test_df)))
        human_eval_data = []
        
        for idx in sample_indices:
            question = self.test_df.iloc[idx]['Questions']
            reference = self.test_df.iloc[idx]['Answers']
            
            # Get model prediction
            prediction = self.generate_response(question)
            
            human_eval_data.append({
                "id": idx,
                "question": question,
                "reference_answer": reference,
                "model_answer": prediction,
                "empathy_score": None, # To be filled by human evaluators (1-5)
                "relevance_score": None, # To be filled by human evaluators (1-5)
                "fluency_score": None, # To be filled by human evaluators (1-5)
                "notes": "" # Additional comments
            })
        
        # Save to CSV for human evaluation
        human_eval_df = pd.DataFrame(human_eval_data)
        human_eval_df.to_csv(output_file, index=False)
        
        print(f"\nCreated {len(human_eval_data)} samples for human evaluation")
        print(f"Saved to: {output_file}")
        print("\nEvaluation criteria:")
        print("1. Empathy Score (1-5): How empathetic and understanding is the response?")
        print("2. Relevance Score (1-5): How relevant is the response to the question?")
        print("3. Fluency Score (1-5): How fluent and natural is the Bengali language?")
        
        return human_eval_df
    
    def display_sample_responses(self, num_samples=5):
        # Display sample responses with streaming
        from transformers import TextStreamer
        print(f"Generating {num_samples} sample responses on test prompts")
       
        
        for i in range(min(num_samples, len(self.test_df))):
            question = self.test_df.iloc[i]['Questions']
            reference = self.test_df.iloc[i]['Answers']
            
            print(f"\n--- Sample {i+1} ---")
            print(f"Question: {question}")
            print(f"\nReference Answer: {reference}")
            print(f"\nModel Response:")
            
            # Create prompt
            prompt = self.format_prompt(question)
            inputs = self.tokenizer(prompt, return_tensors="pt").to("cuda")
            
            # Generate with streaming
            text_streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
            _ = self.model.generate(
                **inputs,
                streamer=text_streamer,
                max_new_tokens=300,
                temperature=0.5,
                top_p=0.9,
                use_cache=True
            )
            
            print("\n" + "-"*80)
    
    def log_all_responses(self, experiment_name="llama-3.1-8b-bangla-empathic"):
        # Log all test responses
        from datetime import datetime
        import uuid
        
        # Generate experiment ID
        experiment_id = f"{experiment_name}-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        
        # Generate responses for all test samples and log them
        generated_responses = []
        
        print(f"Generating and logging responses for {len(self.test_df)} test samples...")
        
        for idx in range(len(self.test_df)):
            question = self.test_df.iloc[idx]['Questions']
            reference = self.test_df.iloc[idx]['Answers']
            response_text = self.generate_response(question)
            
            # Create log entry
            log_entry = {
                "id": str(uuid.uuid4()),
                "experiment_id": experiment_id,
                "sample_index": idx,
                "input_text": question,
                "reference_text": reference,
                "response_text": response_text,
                "timestamp": datetime.now().isoformat(),
                "model_name": "llama-3.1-8b-instruct",
                "temperature": 0.5,
                "top_p": 0.9,
                "max_new_tokens": 300
            }
            
            generated_responses.append(log_entry)
            
            if (idx + 1) % 10 == 0:
                print(f"Processed {idx + 1}/{len(self.test_df)} samples...")
        
        # Save to CSV
        responses_df = pd.DataFrame(generated_responses)
        log_filename = f"generated_responses_{experiment_id}.csv"
        responses_df.to_csv(log_filename, index=False)
        
        print(f"\nGenerated responses logged successfully!")
        print(f"Total responses: {len(generated_responses)}")
        print(f"Saved to: {log_filename}")
        
        return responses_df, log_filename

In [55]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU: {torch.cuda.get_device_name(1)}")
    print(
        f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} + {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
    )

PyTorch version: 2.9.0+cu128
CUDA available: True
GPU: Tesla T4
GPU: Tesla T4
GPU Memory: 15.8 + 15.8 GB


In [56]:
# Initialize DatasetProcessor
data_processor = DatasetProcessor('/kaggle/input/bengali-empathetic-conversations-corpus/BengaliEmpatheticConversationsCorpus .csv')

# Load and clean data
df = data_processor.load_and_clean_data()
df.head()

Dataset size after cleaning: 38210


Unnamed: 0,Questions,Answers
0,আমার স্ত্রী এবং মায়ের মধ্যে টানটান মতবিরোধ চল...,"আপনি যা বর্ণনা করছেন তাকে মনোবিজ্ঞানীরা ""ত্রিভ..."
1,"আমি বাচ্চা নেওয়ার পরিকল্পনা করছি, তাই আমাকে ধ...",হাই। আপনার শিশুর (এবং নিজের) জন্য যা স্বাস্থ্য...
2,"আমার মনের মধ্যে গোপন আছে, এবং আমি জানি না তাদে...",মনে হচ্ছে গোপন রাখা এখন আপনার জন্য একটি সমস্যা...
3,আমি আমার সম্পর্কের ক্ষেত্রে অত্যন্ত অধিকারসূচক...,হ্যালো। এটা দুর্দান্ত যে আপনি উপলব্ধি করতে সক্...
4,কয়েক বছর আগে আমার মাথায় আঘাত লেগেছিল এবং আমা...,আপনি বলেননি কি বা কত ওষুধ আপনি চেষ্টা করেছেন। ...


In [None]:
# Initialize LLAMAFineTuner
# Fixed: Enable load_in_4bit=True to prevent meta tensor issues
fine_tuner = LLAMAFineTuner(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct",
    max_seq_length=2020, # Choose any! We auto support RoPE Scaling internally!
    dtype=None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit=True, # Fixed: Enable 4-bit quantization to prevent meta tensor issues
    device_map="auto" # Auto device allocation
)

# Load model and tokenizer
model, tokenizer = fine_tuner.load_model()

==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Model loaded: unsloth/Meta-Llama-3.1-8B-Instruct


In [58]:
# Train/Validation/Test Splits
train_df, val_df, test_df = data_processor.split_data(test_size=0.2, val_size=0.5, random_state=42)

Training samples: 30568
Validation samples:   3821
Test samples:   3821


In [59]:
# STEP 4: CREATE HUGGINGFACE DATASETS
train_dataset, val_dataset, test_dataset = data_processor.create_datasets()

Map:   0%|          | 0/30568 [00:00<?, ? examples/s]

Map:   0%|          | 0/3821 [00:00<?, ? examples/s]

Map:   0%|          | 0/3821 [00:00<?, ? examples/s]

In [60]:
# Display sample data
data_processor.display_sample(num_samples=2)

Sample formatted training data:

Sample 1:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a sympathetic and helpful assistant. You answer people's questions in Bengali language.
<|eot_id|><|start_header_id|>user<|end_header_id|>

আমি আগামী ফেব্রুয়ারিতে প্রমোশন পাওয়ার চেষ্টা করছি।
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

আরে। আমি আপনি এটা পেতে আশা করি!
<|eot_id|>

Sample 2:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a sympathetic and helpful assistant. You answer people's questions in Bengali language.
<|eot_id|><|start_header_id|>user<|end_header_id|>

আমি নিজেকে রক্ষা করেছি
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

এটা চমৎকার, আমি আশা করি আপনি আঘাত পাবেন না!
<|eot_id|>


In [61]:
# Apply LoRA
model = fine_tuner.apply_lora(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=2020
)

LoRA configuration applied


In [65]:
# Create trainer with optimal batch configuration and full epoch training
# Fixed: Changed from max_steps=100 to num_train_epochs=3
# - max_steps=100 was only 13% of one epoch (causing loss plateau at 0.6)
# - num_train_epochs=3 will train through entire dataset 3 times (~2,250 steps)
# - Learning rate reduced to 5e-5 for better convergence past plateaus

trainer = fine_tuner.create_trainer(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    per_device_train_batch_size=2,  # Primary driver of VRAM usage
    gradient_accumulation_steps=8,  # Primary driver of training time
    num_train_epochs=3,  # Train for 3 full epochs instead of stopping at 100 steps
    learning_rate=5e-5,  # Reduced from 2e-4 to help break through loss plateaus
    output_dir="outputs"
)

Batch configuration:
  Per-device batch size: 2
  Gradient accumulation steps: 8
  Effective batch size: 16


NotImplementedError: Cannot copy out of meta tensor; no data!

In [64]:
# Train the model
trainer_stats = fine_tuner.train()

Starting training...


AttributeError: 'NoneType' object has no attribute 'train'

In [None]:
# Save model
fine_tuner.save_model("llama-3.1-8b-bangla-empathic-lora")

In [None]:
# Enable inference mode and initialize Evaluator
fine_tuner.enable_inference_mode()
evaluator = Evaluator(fine_tuner, data_processor)

# Evaluate using: Perplexity, BLEU, ROUGE
results = evaluator.evaluate_metrics(num_samples=10)

# Log to wandb
wandb.log(results)

In [None]:
# Evaluate: Human evaluation on empathetic response quality
human_eval_df = evaluator.create_human_eval_samples(sample_size=20)

In [None]:
# Sample model responses on test prompts
evaluator.display_sample_responses(num_samples=5)

In [None]:
# Store logs for GeneratedResponses: id, experiment_id, input_text, response_text, timestamp
responses_df, log_filename = evaluator.log_all_responses()

# Upload to wandb as artifact
artifact = wandb.Artifact(
    name="generated-responses",
    type="predictions",
    description="Generated responses from fine-tuned model on test set"
)
artifact.add_file(log_filename)
wandb.log_artifact(artifact)
print("Logged to W&B as artifact")

In [None]:
# Finish wandb run
wandb.finish()

print("\n" + "="*80)
print("Training and evaluation completed successfully!")
print("="*80)
print(f"\nModel saved to: llama-3.1-8b-bangla-empathic-lora")
print(f"Human evaluation samples: human_evaluation_samples.csv")