<h1 align="center">Assignment 5</h1>

#Question-Answering by Fine-Tuning BERT

##Imports

In [9]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    
    BertTokenizer,
    BertForQuestionAnswering,
   
    AdamW
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import nltk
import gradio as gr
import logging

# Download NLTK data for BLEU score calculation
nltk.download('punkt')

# Ignore warnings
logging.disable(logging.WARNING)

[nltk_data] Downloading package punkt to C:\Users\Suyash
[nltk_data]     Tambe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##Load Dataset and Process Dataset

In [10]:
import json
from torch.utils.data import Dataset
import torch

# Load the policy dataset
def load_policy_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    policies = []
    for company in data.get("companies", []):  # Use .get() to avoid KeyError if "companies" is missing
        company_name = company.get("company_name", "Unknown Company")
        if "policies" in company:  # Check if "policies" exists in each company entry
            for policy in company["policies"]:
                policies.append({
                    "company_name": company_name,
                    "id": policy["id"],
                    "question": policy["question"],
                    "answer": policy["answer"],
                    "topic": policy["topic"],
                    "tags": policy["tags"]
                })
    return policies

# Custom dataset class for policy data
class PolicyDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = f"{item['company_name']} policy on {item['topic']}: {item['answer']}"
        question = item['question']
        answer = item['answer']

        # Tokenize input
        inputs = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Locate answer position
        input_ids = inputs['input_ids'][0]
        answer_tokens = self.tokenizer.encode(answer, add_special_tokens=False)
        start_position = None
        end_position = None

        for i in range(len(input_ids) - len(answer_tokens) + 1):
            if input_ids[i:i+len(answer_tokens)].tolist() == answer_tokens:
                start_position = i
                end_position = i + len(answer_tokens) - 1
                break

        # Default to the CLS token position if answer not found
        if start_position is None:
            start_position = 0
            end_position = 0

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'start_positions': torch.tensor(start_position),
            'end_positions': torch.tensor(end_position),
            'answer': answer
            
        }


### Train, Validation and Test Split

In [11]:
data = load_policy_data(r'C:\Users\Suyash Tambe\Desktop\Chatbot\dataset.json')
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

###Tokenization and DataLoader

In [12]:
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Prepare datasets and dataloaders
train_dataset = PolicyDataset(train_data, tokenizer)
val_dataset = PolicyDataset(val_data, tokenizer)
test_dataset = PolicyDataset(test_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

##Training Function

In [13]:
# Training function
def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(train_loader)

###Model Initialization

In [14]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Set device and move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



##Validation

In [15]:
# Validation function
def validate(model, val_loader, device):
    model.eval()
    total_loss = 0
    progress_bar = tqdm(val_loader, desc="Validating")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_loss += loss.item()

            progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(val_loader)

##Test

In [16]:
# Calculate BLEU score
def calculate_bleu(predictions, references):
    bleu_scores = []
    for pred, ref in zip(predictions, references):
        bleu_scores.append(sentence_bleu([ref.split()], pred.split()))
    return sum(bleu_scores) / len(bleu_scores)


In [17]:
# Test function
def test(model, test_loader, tokenizer, device):
    model.eval()
    all_predictions = []
    all_answers = []
    progress_bar = tqdm(test_loader, desc="Testing")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            answers = batch['answer']

            outputs = model(input_ids, attention_mask=attention_mask)
            start_scores = outputs.start_logits
            end_scores = outputs.end_logits

            for i in range(input_ids.shape[0]):
                start_index = torch.argmax(start_scores[i])
                end_index = torch.argmax(end_scores[i])
                prediction = tokenizer.decode(input_ids[i][start_index:end_index+1])
                all_predictions.append(prediction)
                all_answers.append(answers[i])

    bleu_score = calculate_bleu(all_predictions, all_answers)
    return bleu_score

In [19]:
# Training loop
num_epochs = 3
best_loss = float('inf')
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train(model, train_loader, optimizer, device)
    val_loss = validate(model, val_loader, device)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), 'bert_qa_model.pth')
        print("Model saved!")
    else:
        print("Validation Loss Increased. Model Not Saved.")
    print("*" * 50)

Epoch 1/3


Training: 100%|██████████| 6/6 [00:52<00:00,  8.68s/it, loss=3.99]
Validating: 100%|██████████| 2/2 [00:02<00:00,  1.15s/it, loss=2.57]


Train Loss: 5.1761, Validation Loss: 2.9365
Model saved!
**************************************************
Epoch 2/3


Training: 100%|██████████| 6/6 [00:46<00:00,  7.74s/it, loss=1.44]
Validating: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it, loss=0.566]


Train Loss: 2.0903, Validation Loss: 1.0809
Model saved!
**************************************************
Epoch 3/3


Training: 100%|██████████| 6/6 [00:46<00:00,  7.73s/it, loss=0.388]
Validating: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it, loss=0.132]


Train Loss: 0.7234, Validation Loss: 0.4699
Model saved!
**************************************************


In [12]:
torch.save(model.state_dict(), "bert_qa_model.h5")

##BLEU Score

In [21]:
# Test the model
bleu_score = test(model, test_loader, tokenizer, device)
print(f"BLEU Score: {bleu_score:.4f}")


Testing: 100%|██████████| 2/2 [00:03<00:00,  1.54s/it]

BLEU Score: 0.6839





##Simple QA Bot

In [22]:
# Create a simple QA bot
def qa_bot(context, question):
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    answer = tokenizer.decode(input_ids[0][start_index:end_index+1])
    return answer

In [30]:
test_data[2]['question']

'How does Salesforce interact with Indigenous communities?'

In [29]:
# Example usage of the QA bot
context = test_data[0]['question']
question = "How does Salesforce interact with Indigenous communities?'"
answer = qa_bot(context, question)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: How does Salesforce interact with Indigenous communities?'
Answer: ibm take to uphold its environmental policy?


In [18]:
def calculate_bleu(predictions, references):
    bleu_scores = []
    for pred, ref in zip(predictions, references):
        bleu_scores.append(sentence_bleu([ref.split()], pred.split()))
    return sum(bleu_scores) / len(bleu_scores)

In [19]:
!pip install gradio



[notice] A new release of pip is available: 24.1.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


