In [1]:

import pandas as pd
# Generate full question and answer form the PDF
file_path = r'makemytrip_qa_dataset_mini.json'
try:
    df = pd.read_json(file_path)
    display(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")
except Exception as e:
    print(f"An error occurred: {e}")

Unnamed: 0,question,answer
0,What was the Total revenue in 2023?,"The Total revenue in 2023 was USD 593,036."
1,What was the Total revenue in 2023?,"In 2023, the company reported Total revenue of..."
2,What was the Total revenue in 2023?,"Total revenue stood at USD 593,036 in 2023."
3,What was the Total revenue in 2023?,"The company recorded USD 593,036 as Total reve..."
4,How much was the Total revenue in 2023?,"The Total revenue in 2023 was USD 593,036."


## Prepare data for fine-tuning

### Subtask:
Format the questions and answers from the JSON data into a suitable format for fine-tuning the model.

**Reasoning**:
Convert the dataframe into a list of dictionaries, where each dictionary represents a training example with the question and answer formatted as a single text string.

In [2]:
# Prepare data for fine-tuning
training_data = []

# Define a system prompt for your domain
system_prompt = "You are a helpful assistant that provides financial data from MakeMyTrip reports."

# Correctly format each training example with the chat template
for index, row in df.iterrows():
    question = row['question']
    answer = row['answer']

    # Format the data using the TinyLlama chat template
    training_data.append({
        "text": f"<|system|>\n{system_prompt}</s>\n<|user|>\n{question}</s>\n<|assistant|>\n{answer}</s>"
    })

In [3]:
# Display the first few training examples
display(training_data[:5])

[{'text': '<|system|>\nYou are a helpful assistant that provides financial data from MakeMyTrip reports.</s>\n<|user|>\nWhat was the Total revenue in 2023?</s>\n<|assistant|>\nThe Total revenue in 2023 was USD 593,036.</s>'},
 {'text': '<|system|>\nYou are a helpful assistant that provides financial data from MakeMyTrip reports.</s>\n<|user|>\nWhat was the Total revenue in 2023?</s>\n<|assistant|>\nIn 2023, the company reported Total revenue of USD 593,036.</s>'},
 {'text': '<|system|>\nYou are a helpful assistant that provides financial data from MakeMyTrip reports.</s>\n<|user|>\nWhat was the Total revenue in 2023?</s>\n<|assistant|>\nTotal revenue stood at USD 593,036 in 2023.</s>'},
 {'text': '<|system|>\nYou are a helpful assistant that provides financial data from MakeMyTrip reports.</s>\n<|user|>\nWhat was the Total revenue in 2023?</s>\n<|assistant|>\nThe company recorded USD 593,036 as Total revenue in 2023.</s>'},
 {'text': '<|system|>\nYou are a helpful assistant that provid

In [4]:
from datasets import Dataset
from transformers import AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess(example):
    # Tokenize full text
    tokens = tokenizer(example['text'], truncation=True, padding=False, max_length=512)
    
    # Find indices to mask
    # Everything before <|assistant|> is ignored in loss
    text = example['text']
    assistant_index = text.find("<|assistant|>")
    
    if assistant_index == -1:
        # If no assistant token found, mask everything except the last few tokens
        labels = tokens['input_ids'].copy()
        labels[:-10] = [-100] * (len(labels) - 10)
    else:
        # Convert character index to token index
        prefix_ids = tokenizer(text[:assistant_index], add_special_tokens=False)['input_ids']
        prefix_len = len(prefix_ids)
        
        # Prepare labels: -100 for question/system tokens
        labels = tokens['input_ids'].copy()
        if prefix_len < len(labels):
            labels[:prefix_len] = [-100] * prefix_len
    
    tokens['labels'] = labels
    return tokens

dataset = Dataset.from_list(training_data)
tokenized_dataset = dataset.map(preprocess, remove_columns=["text"])

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 96/96 [00:00<00:00, 7409.75 examples/s]
Map: 100%|██████████| 96/96 [00:00<00:00, 7409.75 examples/s]


In [5]:
import os
import torch
import math
from torch import nn
import torch.nn.functional as F  # Add this missing import
from transformers import (
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

class LoraLinear(nn.Module):
    def __init__(self, in_features, out_features, r=8, lora_alpha=16, lora_dropout=0.05, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.r = r
        self.scaling = lora_alpha / r if r > 0 else 1.0

        # Base frozen linear layer
        self.weight = nn.Parameter(torch.empty(out_features, in_features), requires_grad=False)
        self.bias = nn.Parameter(torch.zeros(out_features), requires_grad=False) if bias else None

        if r > 0:
            self.lora_A = nn.Parameter(torch.zeros((r, in_features)))
            self.lora_B = nn.Parameter(torch.zeros((out_features, r)))
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)
            self.lora_dropout = nn.Dropout(p=lora_dropout)
        else:
            self.lora_A, self.lora_B, self.lora_dropout = None, None, None

    def forward(self, x):
        # Base forward
        result = F.linear(x, self.weight, self.bias)

        # LoRA adaptation
        if self.r > 0:
            lora_out = self.lora_dropout(x) @ self.lora_A.T @ self.lora_B.T
            result = result + self.scaling * lora_out

        return result

In [6]:
# ----------------------------
# 0. Disable WandB
# ----------------------------
os.environ["WANDB_MODE"] = "disabled"

# ----------------------------
# 1. Device Detection and Setup
# ----------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

def get_device_config():
    """Detect the best available device and configure accordingly"""
    if torch.cuda.is_available():
        device = "cuda:0"
        use_quantization = True
        torch_dtype = torch.bfloat16
        print("Using CUDA device with quantization")
    elif torch.backends.mps.is_available():
        device = "mps"
        use_quantization = False  # BitsAndBytes doesn't support MPS
        torch_dtype = torch.float32  # Use float32 for MPS stability
        print("Using MPS device without quantization (float32 for stability)")
    else:
        device = "cpu"
        use_quantization = False
        torch_dtype = torch.float32
        print("Using CPU device without quantization")
    
    return device, use_quantization, torch_dtype

device, use_quantization, torch_dtype = get_device_config()

# ----------------------------
# 2. Define an EFFICIENT MPS-Compatible MoE Layer
# ----------------------------

class MoELoRALinear(nn.Module):
    def __init__(self, base_linear, r, num_experts=2, k=1, lora_alpha=16, lora_dropout=0.05):
        super().__init__()
        self.base_linear = base_linear  # frozen pretrained weight
        self.num_experts = num_experts
        self.k = k
        self.device = next(base_linear.parameters()).device
        self.dtype = next(base_linear.parameters()).dtype

        # Simplified expert implementation for MPS compatibility
        self.experts = nn.ModuleList([
            LoraLinear(
                in_features=base_linear.in_features,
                out_features=base_linear.out_features,
                r=r,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout
            )
            for _ in range(num_experts)
        ])

        # Simple gating mechanism
        self.gate = nn.Linear(base_linear.in_features, num_experts)
        
        # Move to correct device and dtype
        self.to(self.device)
        if self.dtype == torch.float32:
            self.float()

    def forward(self, x):
        # Ensure input is on correct device and dtype
        x = x.to(device=self.device, dtype=self.dtype)
        
        # Base output from frozen layer
        with torch.no_grad():
            base_out = self.base_linear(x)
        
        # Gating - use simpler approach for MPS
        gate_logits = self.gate(x)
        gate_weights = F.softmax(gate_logits, dim=-1)
        
        # Expert outputs - sequential processing for MPS stability
        expert_outputs = []
        for expert in self.experts:
            expert_out = expert(x)
            expert_outputs.append(expert_out)
        
        # Combine expert outputs
        combined_expert_out = torch.zeros_like(base_out)
        for i, expert_out in enumerate(expert_outputs):
            weight = gate_weights[..., i:i+1]
            combined_expert_out = combined_expert_out + weight * expert_out
        
        return base_out + combined_expert_out

def replace_proj_with_moe_lora(model, r=8, num_experts=2, k=1, lora_alpha=16, lora_dropout=0.05):
    """
    Replace only up_proj, down_proj in each MLP with MoE(LoRA) versions.
    MPS-compatible version.
    """
    print(f"Replacing projections with MoE-LoRA (r={r}, experts={num_experts})")
    
    for layer_idx, layer in enumerate(model.model.layers):
        print(f"Processing layer {layer_idx}")
        
        for proj_name in ["up_proj", "down_proj"]:
            old_proj = getattr(layer.mlp, proj_name)
            
            # Create MoE replacement
            moe_proj = MoELoRALinear(
                base_linear=old_proj,
                r=r,
                num_experts=num_experts,
                k=k,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
            )
            
            # Set the new projection
            setattr(layer.mlp, proj_name, moe_proj)
            print(f"  Replaced {proj_name}")

    return model

# ----------------------------
# 3. Load base model with conditional quantization
# ----------------------------
print(f"Loading model for {device} with quantization={use_quantization}")

if use_quantization:
    # CUDA with quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch_dtype,
        bnb_4bit_use_double_quant=True,
    )
    
    base_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        torch_dtype=torch_dtype,
        device_map=device,
        trust_remote_code=True,
    )
else:
    # MPS/CPU - load without quantization
    base_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # Help with memory efficiency
    )
    
    # Move to device after loading
    print(f"Moving model to {device}")
    base_model = base_model.to(device)
    
    # Ensure correct dtype for MPS
    if device == "mps" and torch_dtype == torch.float32:
        base_model = base_model.float()

base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

print("Model loaded successfully!")

# ----------------------------
# 4. Apply MoE modifications
# ----------------------------
print("Applying MoE-LoRA modifications...")

model = replace_proj_with_moe_lora(
    base_model,
    r=8,
    num_experts=2,  # Keep simple for MPS
    k=1,
    lora_alpha=16,
    lora_dropout=0.05
)

print("MoE-LoRA applied successfully!")

Using MPS device without quantization (float32 for stability)
Loading model for mps with quantization=False
Moving model to mps
Moving model to mps
Model loaded successfully!
Applying MoE-LoRA modifications...
Replacing projections with MoE-LoRA (r=8, experts=2)
Processing layer 0
  Replaced up_proj
  Replaced down_proj
Processing layer 1
  Replaced up_proj
  Replaced down_proj
Processing layer 2
  Replaced up_proj
  Replaced down_proj
Processing layer 3
  Replaced up_proj
  Replaced down_proj
Processing layer 4
  Replaced up_proj
  Replaced down_proj
Processing layer 5
  Replaced up_proj
  Replaced down_proj
Processing layer 6
  Replaced up_proj
  Replaced down_proj
Processing layer 7
  Replaced up_proj
  Replaced down_proj
Processing layer 8
  Replaced up_proj
  Replaced down_proj
Processing layer 9
  Replaced up_proj
  Replaced down_proj
Processing layer 10
  Replaced up_proj
  Replaced down_proj
Processing layer 11
  Replaced up_proj
  Replaced down_proj
Processing layer 12
  Repla

In [7]:
# ----------------------------
# 5. Apply PEFT / LoRA to remaining layers (avoiding MoE-modified layers)
# ----------------------------
print("Applying PEFT configuration...")

# Configure LoRA for layers not modified by MoE
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Avoid up_proj, down_proj (they're MoE now)
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply PEFT to the model
try:
    model = get_peft_model(model, peft_config)
    print("PEFT applied successfully!")
except Exception as e:
    print(f"PEFT application warning: {e}")
    print("Continuing without additional PEFT (MoE-LoRA is already applied)")

# Ensure model is on correct device
model = model.to(device)
if device == "mps" and torch_dtype == torch.float32:
    model = model.float()

print(f"Model is on device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")

Applying PEFT configuration...
'NoneType' object has no attribute 'cadam32bit_grad_fp32'
PEFT applied successfully!
Model is on device: mps:0
Model dtype: torch.float32


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [8]:
# Test the fine-tuned model
baseline_responses = []

# Define the system prompt used during fine-tuning
system_prompt = "You are a helpful assistant that provides financial data from MakeMyTrip reports."

# Get device from model
model_device = next(model.parameters()).device

for index, row in df.head(3).iterrows():  # Reduced to 3 for faster testing
    question = row['question']

    # Create the message list for the chat template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]

    # Apply the chat template to format the input
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True # This adds the <|assistant|> token at the end
    )

    # Tokenize the formatted input
    inputs = tokenizer(input_text, return_tensors="pt").to(model_device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the entire generated output
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the generated answer part
    try:
        # The response will look like "<|system|>\n...</s>\n<|user|>\n...</s>\n<|assistant|>\n...answer...</s>"
        # We need to find the <|assistant|> token and get everything after it
        answer_start_token = '<|assistant|>'
        answer_start_index = decoded_output.rfind(answer_start_token)

        if answer_start_index != -1:
            generated_answer = decoded_output[answer_start_index + len(answer_start_token):].strip()
            # The model might generate a final </s> token, which we should remove
            if generated_answer.endswith('</s>'):
                generated_answer = generated_answer[:-len('</s>')].strip()
        else:
            generated_answer = "Could not extract answer from model output."

    except Exception as e:
        generated_answer = f"An error occurred: {e}"

    baseline_responses.append({"question": question, "generated_answer": generated_answer})

# Display the first few generated responses
display(baseline_responses[:3])

[{'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'is   to the 2 1_in the 2 to the  ! de  the the .att the her. the  at  the  finite. tofinite to  the. . . .'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'become last   for  one  of have. 1. ( to  over the 1. str  for. for  on the  last  one (  of just be over  the last  can  to the just'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': '. be  canin the 1 1  end  ( the 1  is  is to the  last the  to  is the  (  our own _  Nov   Can  good  last_ the'}]

In [9]:
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,d} || Total params: {total:,d} || "
          f"Trainable%: {100 * trainable / total:.4f}")

# ----------------------------
# 8. Gradient checkpointing and model config
# ----------------------------
model.config.use_cache = False
if hasattr(model, 'gradient_checkpointing_disable'):
    model.gradient_checkpointing_disable()

# ----------------------------
# 9. Prepare collator
# ----------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# ----------------------------
# 10. Training arguments (MPS-optimized)
# ----------------------------
# Determine optimal settings based on device
if device.startswith("cuda"):
    # CUDA settings
    use_fp16 = True
    use_bf16 = False
    per_device_batch_size = 1
    gradient_accumulation_steps = 4
    dataloader_pin_memory = True
elif device == "mps":
    # MPS settings - use float32 for stability
    use_fp16 = False  # Disable FP16 for MPS stability
    use_bf16 = False
    per_device_batch_size = 1
    gradient_accumulation_steps = 8  # Increase for MPS
    dataloader_pin_memory = False  # MPS doesn't support pinned memory
else:
    # CPU settings
    use_fp16 = False
    use_bf16 = False
    per_device_batch_size = 1
    gradient_accumulation_steps = 16  # Increase for CPU
    dataloader_pin_memory = False

training_args = TrainingArguments(
    learning_rate=5e-5,
    output_dir="./results",
    num_train_epochs=2,  # Reduced for faster training on Mac
    per_device_train_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_steps=1,
    save_steps=50,  # Increase save steps to reduce I/O
    save_total_limit=2,
    fp16=use_fp16,
    bf16=use_bf16,
    dataloader_pin_memory=dataloader_pin_memory,
    remove_unused_columns=False,  # Keep all columns
    report_to=None,  # Disable reporting
    optim="adamw_torch",  # Use PyTorch optimizer for MPS
)

print(f"Training configuration for {device}:")
print(f"  - FP16: {use_fp16}, BF16: {use_bf16}")
print(f"  - Batch size: {per_device_batch_size}")
print(f"  - Gradient accumulation: {gradient_accumulation_steps}")
print(f"  - Pin memory: {dataloader_pin_memory}")

# ----------------------------
# 11. Trainer
# ----------------------------

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

print("Trainer initialized successfully!")

# ----------------------------
# 12. Train
# ----------------------------
print("Starting training...")
try:
    trainer.train()
    print("Training completed successfully!")
except Exception as e:
    print(f"Training error: {e}")
    print("You may need to reduce batch size or check MPS compatibility")

Trainable params: 7,659,520 || Total params: 2,123,067,480 || Trainable%: 0.3608
Training configuration for mps:
  - FP16: False, BF16: False
  - Batch size: 1
  - Gradient accumulation: 8
  - Pin memory: False
Trainer initialized successfully!
Starting training...


Trainable params: 7,659,520 || Total params: 2,123,067,480 || Trainable%: 0.3608
Training configuration for mps:
  - FP16: False, BF16: False
  - Batch size: 1
  - Gradient accumulation: 8
  - Pin memory: False
Trainer initialized successfully!
Starting training...


Step,Training Loss
1,8.0294
2,7.9353
3,7.8251
4,7.6917
5,7.5785
6,7.4331
7,7.3162
8,7.2358
9,7.0451
10,6.9021


Trainable params: 7,659,520 || Total params: 2,123,067,480 || Trainable%: 0.3608
Training configuration for mps:
  - FP16: False, BF16: False
  - Batch size: 1
  - Gradient accumulation: 8
  - Pin memory: False
Trainer initialized successfully!
Starting training...


Step,Training Loss
1,8.0294
2,7.9353
3,7.8251
4,7.6917
5,7.5785
6,7.4331
7,7.3162
8,7.2358
9,7.0451
10,6.9021


Training completed successfully!


In [10]:
# Test the fine-tuned model
fine_tuned_responses = []

# Define the system prompt used during fine-tuning
system_prompt = "You are a helpful assistant that provides financial data from MakeMyTrip reports."

# Get device from model
model_device = next(model.parameters()).device

for index, row in df.head(3).iterrows():  # Reduced to 3 for faster testing
    question = row['question']

    # Create the message list for the chat template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]

    # Apply the chat template to format the input
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True # This adds the <|assistant|> token at the end
    )

    # Tokenize the formatted input
    inputs = tokenizer(input_text, return_tensors="pt").to(model_device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the entire generated output
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the generated answer part
    try:
        # The response will look like "<|system|>\n...</s>\n<|user|>\n...</s>\n<|assistant|>\n...answer...</s>"
        # We need to find the <|assistant|> token and get everything after it
        answer_start_token = '<|assistant|>'
        answer_start_index = decoded_output.rfind(answer_start_token)

        if answer_start_index != -1:
            generated_answer = decoded_output[answer_start_index + len(answer_start_token):].strip()
            # The model might generate a final </s> token, which we should remove
            if generated_answer.endswith('</s>'):
                generated_answer = generated_answer[:-len('</s>')].strip()
        else:
            generated_answer = "Could not extract answer from model output."

    except Exception as e:
        generated_answer = f"An error occurred: {e}"

    fine_tuned_responses.append({"question": question, "generated_answer": generated_answer})

# Display the first few generated responses
display(fine_tuned_responses[:3])

[{'question': 'What was the Total revenue in 2023?',
  'generated_answer': '.\n 1. |21 | on\n| on | can (Ex | Nov| | of|||0 2| | ^ data 24 2| (   ||  July  Aug.|'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': '414| |2711|4 14|| 14|4  data  data| 4 4 4824 14 a 2 14 4(448'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': '4 42|4 14 market03 of 2 201  | |1 |'}]

In [11]:
display(fine_tuned_responses)

[{'question': 'What was the Total revenue in 2023?',
  'generated_answer': '.\n 1. |21 | on\n| on | can (Ex | Nov| | of|||0 2| | ^ data 24 2| (   ||  July  Aug.|'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': '414| |2711|4 14|| 14|4  data  data| 4 4 4824 14 a 2 14 4(448'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': '4 42|4 14 market03 of 2 201  | |1 |'}]

In [12]:
base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=256, bias=False)
            (lora_dropout): ModuleDict(
              (default): D