# Importing Data

In [23]:
with open("amazon_data_train.json", "r") as read_file:
    train_data = json.load(read_file)
 
with open("amazon_data_test.json", "r") as read_file:
    test_data = json.load(read_file)

In [24]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [7]:
train[0]

{'context': 'Nokia C12 Android 12 (Go Edition) Smartphone, All-Day Battery, 4GB RAM (2GB RAM + 2GB Virtual RAM) + 64GB Capacity | Light Mint',
 'qas': [{'id': '00001',
   'is_impossible': False,
   'question': 'What is the operating system of the Nokia C12 smartphone?',
   'answers': [{'text': 'Android 12 (Go Edition)', 'answer_start': 10}]},
  {'id': '00002',
   'is_impossible': False,
   'question': 'How much RAM does the Nokia C12 have?',
   'answers': [{'text': '4GB', 'answer_start': 63}]},
  {'id': '00003',
   'is_impossible': False,
   'question': 'Does the Nokia C12 have virtual RAM?',
   'answers': [{'text': '(2GB RAM + 2GB Virtual RAM)', 'answer_start': 71}]},
  {'id': '00004',
   'is_impossible': False,
   'question': 'What is the total capacity of the Nokia C12?',
   'answers': [{'text': '64GB', 'answer_start': 101}]},
  {'id': '00005',
   'is_impossible': False,
   'question': 'What is the color option available for the Nokia C12?',
   'answers': [{'text': 'Light Mint', 'an

# Creating Training and Validation Data

In [8]:
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.examples = []
        
        for item in data:
            context = item['context']
            for qa in item['qas']:
                # Skip impossible questions during training
                if qa['is_impossible']:
                    continue
                    
                question = qa['question']
                answer_text = qa['answers'][0]['text']
                answer_start = qa['answers'][0]['answer_start']
                
                # Save the example
                self.examples.append({
                    'context': context,
                    'question': question,
                    'answer_text': answer_text,
                    'answer_start': answer_start,
                    'id': qa['id']
                })
        
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        example = self.examples[idx]
        
        # Tokenize
        encoding = self.tokenizer(
            example['question'],
            example['context'],
            max_length=self.max_length,
            padding='max_length',
            truncation='only_second',
            return_offsets_mapping=True,
            return_tensors='pt'
        )
        
        # Get offsets before removing from encoding
        offsets = encoding.offset_mapping[0].numpy()
        
        # Find start and end positions
        start_pos = example['answer_start']
        end_pos = start_pos + len(example['answer_text'])
        
        # Map character positions to token positions
        start_token = end_token = 0
        
        for i, (offset_start, offset_end) in enumerate(offsets):
            if offset_start <= start_pos < offset_end:
                start_token = i
            if offset_start < end_pos <= offset_end:
                end_token = i
                break
        
        # Store example ID for evaluation
        example_id = example['id']
        
        # Remove offset mapping as it's not needed for training
        encoding.pop('offset_mapping')
        
        # Add start and end positions
        encoding['start_positions'] = torch.tensor([start_token])
        encoding['end_positions'] = torch.tensor([end_token])
        
        # Convert to individual tensors rather than batched
        result = {k: v.squeeze(0) for k, v in encoding.items()}
        result['example_id'] = example_id
        
        return result

class QAInferenceDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.examples = []
        self.contexts = {}
        
        for item in data:
            context = item['context']
            for qa in item['qas']:
                question = qa['question']
                
                # Save the example
                self.examples.append({
                    'context': context,
                    'question': question,
                    'id': qa['id']
                })
                # Save context for reference
                self.contexts[qa['id']] = context
        
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        example = self.examples[idx]
        
        # Tokenize
        encoding = self.tokenizer(
            example['question'],
            example['context'],
            max_length=self.max_length,
            padding='max_length',
            truncation='only_second',
            return_offsets_mapping=True,
            return_tensors='pt'
        )
        
        # Store context, offset mapping and ID for generating answers
        offset_mapping = encoding.offset_mapping.squeeze(0).numpy()
        example_id = example['id']
        
        # Remove offset mapping from encoding
        encoding.pop('offset_mapping')
        
        # Convert to individual tensors rather than batched
        result = {k: v.squeeze(0) for k, v in encoding.items()}
        result['example_id'] = example_id
        result['offset_mapping'] = offset_mapping
        
        return result

# Importing the Bert model

bert-base-uncased model has approximately 110 million parameters. Here's the exact breakdown:

BERT-base Architecture Specs:
Layers: 12

Hidden Size: 768

Attention Heads: 12

Total Parameters: ~110M



In [27]:
# Initialize tokenizer and model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)
model.to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cpu


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

# Creatig the Datasets using the Dataloader Functions

In [28]:
from torch.utils.data import random_split

# Create datasets
train_dataset = QADataset(train_data, tokenizer)

# Split training data for validation
train_size = int(0.9 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

# Create test dataset for inference and evaluation
test_dataset = QAInferenceDataset(test_data, tokenizer)

# Create data loaders
batch_size = 8
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

print(f"Training with {len(train_subset)} examples")
print(f"Validating with {len(val_subset)} examples")
print(f"Testing with {len(test_dataset)} examples")

Training with 73 examples
Validating with 9 examples
Testing with 31 examples


# Model Definition 

In [13]:
from transformers import AdamW
from tqdm import tqdm

def train_model(model, train_loader, eval_loader, device, epochs=3, lr=3e-5):
    optimizer = AdamW(model.parameters(), lr=lr)
    
    best_eval_loss = float('inf')
    best_model_state = None
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Training]")
        for batch in progress_bar:
            # Remove example_id as it's not needed for training
            example_ids = batch.pop('example_id')
            
            # Move to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})
        
        avg_train_loss = train_loss / len(train_loader)
        
        # Evaluation
        model.eval()
        eval_loss = 0
        
        progress_bar = tqdm(eval_loader, desc=f"Epoch {epoch+1}/{epochs} [Evaluation]")
        for batch in progress_bar:
            example_ids = batch.pop('example_id')
            
            # Move to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.no_grad():
                outputs = model(**batch)
                loss = outputs.loss
            
            eval_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})
        
        avg_eval_loss = eval_loss / len(eval_loader)
        
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Eval Loss: {avg_eval_loss:.4f}")
        
        # Save best model
        if avg_eval_loss < best_eval_loss:
            best_eval_loss = avg_eval_loss
            best_model_state = model.state_dict().copy()
            print(f"New best model saved with eval loss: {avg_eval_loss:.4f}")
    
    # Load best model for return
    if best_model_state:
        model.load_state_dict(best_model_state)
    
    return model

# Training and Saving the Model

In [16]:
# Train the model
print("Starting training...")
model = train_model(model, train_loader, val_loader, device, epochs=5)

# Save the model
model_save_path = r'C:\Users\lovis\Desktop\Recommendation_system\Bert_FineTuned'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")



Starting training...


Epoch 1/5 [Training]: 100%|█████████████████████████████████████████████████| 10/10 [04:09<00:00, 24.98s/it, loss=3.77]
Epoch 1/5 [Evaluation]: 100%|█████████████████████████████████████████████████| 2/2 [00:12<00:00,  6.11s/it, loss=3.18]


Epoch 1/5, Train Loss: 4.6003, Eval Loss: 3.3538
New best model saved with eval loss: 3.3538


Epoch 2/5 [Training]: 100%|█████████████████████████████████████████████████| 10/10 [04:05<00:00, 24.52s/it, loss=2.83]
Epoch 2/5 [Evaluation]: 100%|█████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.08s/it, loss=2.65]


Epoch 2/5, Train Loss: 3.3533, Eval Loss: 2.8259
New best model saved with eval loss: 2.8259


Epoch 3/5 [Training]: 100%|█████████████████████████████████████████████████| 10/10 [04:10<00:00, 25.03s/it, loss=2.09]
Epoch 3/5 [Evaluation]: 100%|██████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.57s/it, loss=2.3]


Epoch 3/5, Train Loss: 2.7757, Eval Loss: 2.4090
New best model saved with eval loss: 2.4090


Epoch 4/5 [Training]: 100%|█████████████████████████████████████████████████| 10/10 [03:58<00:00, 23.85s/it, loss=1.79]
Epoch 4/5 [Evaluation]: 100%|█████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.55s/it, loss=1.76]


Epoch 4/5, Train Loss: 2.3293, Eval Loss: 1.9509
New best model saved with eval loss: 1.9509


Epoch 5/5 [Training]: 100%|█████████████████████████████████████████████████| 10/10 [03:59<00:00, 23.99s/it, loss=1.89]
Epoch 5/5 [Evaluation]: 100%|█████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.82s/it, loss=2.22]


Epoch 5/5, Train Loss: 1.8098, Eval Loss: 1.9901
Model saved to C:\Users\lovis\Desktop\Recommendation_system\Bert_FineTuned


# Evalutation Function for testing Data

In [18]:
import numpy as np

def evaluate_model(model, test_loader, tokenizer, device, test_data):
    model.eval()
    all_predictions = {}
    ground_truth = {}
    
    # Extract ground truth answers from test data
    for item in test_data:
        for qa in item['qas']:
            if not qa['is_impossible']:
                ground_truth[qa['id']] = qa['answers'][0]['text']
    
    progress_bar = tqdm(test_loader, desc="Evaluating")
    for batch in progress_bar:
        example_ids = batch.pop('example_id')
        offset_mappings = batch.pop('offset_mapping')
        
        # Move to device
        inputs = {k: v.to(device) for k, v in batch.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits
        
        # Get predictions
        for i, example_id in enumerate(example_ids):
            # Get the most likely start and end indices
            start_idx = torch.argmax(start_logits[i]).item()
            end_idx = torch.argmax(end_logits[i]).item()
            
            # Make sure end_idx >= start_idx
            if end_idx < start_idx:
                end_idx = start_idx
            
            # Get the mapped tokens
            offsets = offset_mappings[i]
            
            # Only consider tokens that are not special tokens
            # CLS token is at index 0, so start from 1
            if start_idx <= 0:
                start_idx = 1
            
            # Find the start and end character positions in the original context
            if start_idx < len(offsets) and end_idx < len(offsets):
                start_char = offsets[start_idx][0].item()
                end_char = offsets[end_idx][1].item()
                
                # Get the context and extract the predicted answer
                context = test_loader.dataset.contexts[example_id]
                if start_char < len(context) and end_char <= len(context):
                    predicted_answer = context[start_char:end_char]
                    all_predictions[example_id] = predicted_answer
    
    # Calculate metrics
    exact_match = 0
    f1_scores = []
    
    for qid, true_answer in ground_truth.items():
        if qid in all_predictions:
            prediction = all_predictions[qid]
            
            # Exact match
            if prediction.lower() == true_answer.lower():
                exact_match += 1
            
            # F1 score (token overlap)
            true_tokens = set(true_answer.lower().split())
            pred_tokens = set(prediction.lower().split())
            
            if not true_tokens and not pred_tokens:
                f1_scores.append(1.0)
            elif not true_tokens or not pred_tokens:
                f1_scores.append(0.0)
            else:
                common_tokens = true_tokens.intersection(pred_tokens)
                precision = len(common_tokens) / len(pred_tokens) if pred_tokens else 0
                recall = len(common_tokens) / len(true_tokens) if true_tokens else 0
                
                f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
                f1_scores.append(f1)
    
    # Calculate final metrics
    em_score = exact_match / len(ground_truth) if ground_truth else 0
    avg_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0
    
    return {
        'exact_match': em_score,
        'f1': avg_f1,
        'predictions': all_predictions
    }

# Evaluation on Testing Data

In [19]:
# Evaluate on test set
print("Evaluating on test set...")
eval_results = evaluate_model(model, test_loader, tokenizer, device, test_data)
print(f"Exact Match: {eval_results['exact_match']:.4f}")
print(f"F1 Score: {eval_results['f1']:.4f}")

# Save predictions to file
with open('qa_predictions.json', 'w') as f:
    json.dump(eval_results['predictions'], f, indent=2)
print("Predictions saved to qa_predictions.json")

Evaluating on test set...


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 4/4 [00:42<00:00, 10.63s/it]

Exact Match: 0.2222
F1 Score: 0.4937
Predictions saved to qa_predictions.json





# Inference Function

In [20]:
def run_inference(model, tokenizer, context, questions, device):
    model.eval()
    results = []
    
    for question in questions:
        # Tokenize
        inputs = tokenizer(
            question,
            context,
            max_length=384,
            padding='max_length',
            truncation='only_second',
            return_tensors='pt',
            return_offsets_mapping=True
        )
        
        offset_mapping = inputs.pop('offset_mapping')
        
        # Move to device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits
        
        # Get the most likely start and end indices
        start_idx = torch.argmax(start_logits[0]).item()
        end_idx = torch.argmax(end_logits[0]).item()
        
        # Make sure end_idx >= start_idx
        if end_idx < start_idx:
            end_idx = start_idx
        
        # Get the mapped tokens
        offsets = offset_mapping[0].cpu().numpy()
        
        # Only consider tokens that are not special tokens
        if start_idx <= 0:
            start_idx = 1
        
        # Find the start and end character positions in the original context
        if start_idx < len(offsets) and end_idx < len(offsets):
            start_char = offsets[start_idx][0].item()
            end_char = offsets[end_idx][1].item()
            
            # Extract the predicted answer
            if start_char < len(context) and end_char <= len(context):
                predicted_answer = context[start_char:end_char]
                results.append({
                    'question': question,
                    'predicted_answer': predicted_answer,
                    'confidence': float(torch.max(start_logits[0]).item() + torch.max(end_logits[0]).item()) / 2
                })
    
    return results

# Sample Inference

In [21]:
# Run inference on a sample
print("\nRunning sample inference...")
sample_context = test_data[0]['context']
sample_questions = [qa['question'] for qa in test_data[0]['qas']]

inference_results = run_inference(model, tokenizer, sample_context, sample_questions, device)

print(f"Sample context: {sample_context}")
for result in inference_results:
    print(f"Q: {result['question']}")
    print(f"A: {result['predicted_answer']} (Confidence: {result['confidence']:.4f})")
    print("-" * 50)


Running sample inference...
Sample context: Redmi Note 11 (Space Black, 4GB RAM, 64GB Storage)|90Hz FHD+ AMOLED Display | QualcommÂ® Snapdragonâ„¢ 680-6nm | 33W Charger Included
Q: What is the model name of the Redmi smartphone?
A: Redmi Note 11 (Sp (Confidence: 4.6656)
--------------------------------------------------
Q: What is the color option available for the Redmi Note 11?
A: ce Black, 4GB RA (Confidence: 5.5445)
--------------------------------------------------
Q: How much RAM does the Redmi Note 11 have?
A: Redmi Note 11 (Space Black, (Confidence: 3.3747)
--------------------------------------------------
Q: What is the storage capacity of the Redmi Note 11?
A: 64GB (Confidence: 3.4532)
--------------------------------------------------
Q: What is the display feature of the Redmi Note 11?
A: 90Hz FHD+ AMOLED Display (Confidence: 4.8998)
--------------------------------------------------
Q: What is included in the package of the Redmi Note 11?
A: Redmi Note 11 (S (Confidence: