<a href="https://colab.research.google.com/github/kakeru91lt/Web-Technologies-/blob/main/QA_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install transformers
!pip install pandas
!pip install scikit-learn
!pip install torch
!pip install nltk
!pip install string


[31mERROR: Could not find a version that satisfies the requirement string (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for string[0m[31m
[0m

In [28]:
# Ensure the notebook runs in Google Colab environment
# If you're running this locally, install the required packages with:
# !pip install torch transformers sklearn nltk pandas

import pandas as pd
import re
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizerFast, BertForQuestionAnswering, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import string

# Download NLTK data
nltk.download('stopwords')

# Define function to remove links from text
def remove_links(text):
    pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(pattern, '', text)

# Define function to remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Define function to remove punctuations
def remove_punctuations(text):
    return ''.join(char for char in text if char not in string.punctuation)

# Define the Porter Stemmer
ps = PorterStemmer()

# Define function to stem text
def stem_text(text):
    return ' '.join(ps.stem(word) for word in text.split())

def tokenize_and_encode(questions, answers, tokenizer, max_len):
    input_ids = []
    attention_masks = []
    start_positions = []
    end_positions = []

    for question, answer in zip(questions, answers):
        encoded_dict = tokenizer.encode_plus(question,
                                             answer,
                                             add_special_tokens=True,
                                             max_length=max_len,
                                             truncation=True,
                                             padding='max_length',
                                             return_attention_mask=True,
                                             return_offsets_mapping=True)

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

        offsets = encoded_dict['offset_mapping']
        start_idx, end_idx = None, None

        for idx, (start, end) in enumerate(offsets):
            if start_idx is None and start != 0 and question[start:end] in answer:
                start_idx = idx
            if start_idx is not None and question[start:end] in answer and end == len(answer):
                end_idx = idx
                break

        if start_idx is None or end_idx is None:
            print("Failed to find start or end positions for:")
            print("Question:", question)
            print("Answer:", answer)
            print("Offsets:", offsets)
            # Set start and end positions to -1 to indicate that answer is not present in the question
            start_idx = -1
            end_idx = -1

        start_positions.append(start_idx)
        end_positions.append(end_idx)

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    start_positions = torch.tensor(start_positions)
    end_positions = torch.tensor(end_positions)

    assert len(input_ids) == len(attention_masks) == len(start_positions) == len(end_positions), "Length mismatch between tensors"

    return input_ids, attention_masks, start_positions, end_positions




# Load the dataset
data = pd.read_csv("/content/qa.csv") # Update with your dataset path

# Apply preprocessing steps
data['cleaned_question'] = data['question'].apply(remove_links)
data['cleaned_question'] = data['cleaned_question'].apply(remove_stop_words)
data['cleaned_question'] = data['cleaned_question'].apply(remove_punctuations)
data['cleaned_question'] = data['cleaned_question'].apply(stem_text)

data['cleaned_answer'] = data['answer'].apply(remove_links)
data['cleaned_answer'] = data['cleaned_answer'].apply(remove_stop_words)
data['cleaned_answer'] = data['cleaned_answer'].apply(remove_punctuations)
data['cleaned_answer'] = data['cleaned_answer'].apply(stem_text)

# Split data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)
max_len = 128  # Adjust based on your dataset

# Tokenize and encode data
train_input_ids, train_attention_masks, train_start_positions, train_end_positions = tokenize_and_encode(
    train_data['cleaned_question'].values, train_data['cleaned_answer'].values, tokenizer, max_len)
val_input_ids, val_attention_masks, val_start_positions, val_end_positions = tokenize_and_encode(
    val_data['cleaned_question'].values, val_data['cleaned_answer'].values, tokenizer, max_len)

print("Size of train_input_ids:", train_input_ids.size())
print("Size of train_attention_masks:", train_attention_masks.size())
print("Size of train_start_positions:", train_start_positions.size())
print("Size of train_end_positions:", train_end_positions.size())

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_start_positions, train_end_positions)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_start_positions, val_end_positions)


# Define batch size
batch_size = 8

# Create DataLoader for training and validation sets
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Initialize model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Define learning rate scheduler
num_epochs = 3
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training function
def train_model(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, start_positions, end_positions = batch

        model.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Evaluation function
def evaluate_model(model, dataloader, device):
    model.eval()
    all_start_preds = []
    all_end_preds = []
    all_start_labels = []
    all_end_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_masks, start_positions, end_positions = batch

            outputs = model(input_ids, attention_mask=attention_masks)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            start_preds = torch.argmax(start_logits, dim=1)
            end_preds = torch.argmax(end_logits, dim=1)

            all_start_preds.extend(start_preds.cpu().numpy())
            all_end_preds.extend(end_preds.cpu().numpy())
            all_start_labels.extend(start_positions.cpu().numpy())
            all_end_labels.extend(end_positions.cpu().numpy())

    start_accuracy = accuracy_score(all_start_labels, all_start_preds)
    end_accuracy = accuracy_score(all_end_labels, all_end_preds)

    start_precision = precision_score(all_start_labels, all_start_preds, average='weighted')
    end_precision = precision_score(all_end_labels, all_end_preds, average='weighted')

    start_recall = recall_score(all_start_labels, all_start_preds, average='weighted')
    end_recall = recall_score(all_end_labels, all_end_preds, average='weighted')

    start_f1 = f1_score(all_start_labels, all_start_preds, average='weighted')
    end_f1 = f1_score(all_end_labels, all_end_preds, average='weighted')

    return start_accuracy, end_accuracy, start_precision, end_precision, start_recall, end_recall, start_f1, end_f1

# Training and evaluation loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    train_loss = train_model(model, train_dataloader, optimizer, scheduler, device)
    start_acc, end_acc, start_prec, end_prec, start_rec, end_rec, start_f1, end_f1 = evaluate_model(model, val_dataloader, device)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Start Accuracy: {start_acc:.4f}, Precision: {start_prec:.4f}, Recall: {start_rec:.4f}, F1: {start_f1:.4f}")
    print(f"Validation End Accuracy: {end_acc:.4f}, Precision: {end_prec:.4f}, Recall: {end_rec:.4f}, F1: {end_f1:.4f}")

# Save model
model_save_path = './bert_qa_model.pth'
torch.save(model.state_dict(), model_save_path)
print("Model saved!")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Failed to find start or end positions for:
Question: favorit featur
Answer: person prefer
Offsets: [(0, 0), (0, 5), (5, 7), (8, 12), (12, 14), (0, 0), (0, 6), (7, 13), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), 

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/3
Training Loss: 0.8999
Validation Start Accuracy: 0.9752, Precision: 0.9521, Recall: 0.9752, F1: 0.9633
Validation End Accuracy: 0.9917, Precision: 0.9847, Recall: 0.9917, F1: 0.9879


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/3
Training Loss: 0.0452
Validation Start Accuracy: 0.9752, Precision: 0.9521, Recall: 0.9752, F1: 0.9633
Validation End Accuracy: 0.9917, Precision: 0.9847, Recall: 0.9917, F1: 0.9879


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/3
Training Loss: 0.0253
Validation Start Accuracy: 0.9752, Precision: 0.9521, Recall: 0.9752, F1: 0.9633
Validation End Accuracy: 0.9917, Precision: 0.9847, Recall: 0.9917, F1: 0.9879
Model saved!


In [29]:
from google.colab import files
files.download('bert_qa_model.pth')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>