In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

***# Step 1: Load Data***

In [None]:
# Both paths carry an .xlsx extension. pd.read_csv cannot parse Excel workbooks,
# so load them with pd.read_excel (pandas needs an Excel engine such as
# openpyxl installed for this).
# NOTE(review): these are absolute, machine-specific paths - consider a
# configurable data directory so the notebook runs elsewhere.
train_data = pd.read_excel("C:/Users/hp/Downloads/train.xlsx")
test_data = pd.read_excel("C:/Users/hp/Downloads/test.xlsx")

***# Step 2: Data Preprocessing***

In [None]:
# Load the pretrained uncased BERT tokenizer that is shared by every
# tokenization step later in the notebook.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

***# Step 3: Model Architecture***

In [None]:
class TweetSimilarityModel(nn.Module):
    """BERT encoder followed by a one-unit sigmoid head.

    Args:
        pretrained_model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``,
            the checkpoint that was previously hard-coded.
    """

    def __init__(self, pretrained_model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        # Size the head from the encoder's own config rather than a literal
        # 768, so checkpoints with a different hidden width work as well.
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Score the encoded inputs.

        Args:
            input_ids: Token-id tensor forwarded to the BERT encoder.
            attention_mask: Attention-mask tensor forwarded to the BERT encoder.

        Returns:
            The sigmoid of the linear head applied to BERT's pooled output;
            the last dimension has size 1 and every value lies in (0, 1).
        """
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = output['pooler_output']
        similarity_score = self.sigmoid(self.fc(pooled_output))
        return similarity_score

***# Step 4: Training and Evaluation***

In [None]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=5):
    """Run a plain supervised training loop over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches.
        criterion: Loss called as ``criterion(scores, float_labels)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are invoked once per
            batch.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    model.train()
    for epoch in range(num_epochs):
        for batch in train_loader:
            input_ids, attention_mask, labels = batch
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            # Flatten to one score per example. A bare ``squeeze()`` would also
            # drop the batch dimension when a batch holds a single example
            # (e.g. a trailing partial batch), producing a 0-d tensor whose
            # shape no longer matches ``labels`` and making the loss raise.
            scores = outputs.reshape(-1)
            targets = labels.float().reshape(-1)
            loss = criterion(scores, targets)
            loss.backward()
            optimizer.step()

In [None]:
def evaluate_model(model, test_loader):
    """Score ``model`` on ``test_loader`` and report classification metrics.

    The model is switched to eval mode and run without gradient tracking.
    Its scores are rounded to the nearest integer before being compared with
    the labels.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        test_loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        Tuple ``(precision, recall, f1)`` computed with the sklearn metric
        functions.
    """
    model.eval()
    collected_scores, collected_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            batch_scores = model(input_ids, attention_mask)
            collected_scores.extend(batch_scores.cpu().numpy().tolist())
            collected_labels.extend(labels.cpu().numpy().tolist())

    true_labels = np.array(collected_labels)
    # Round once and reuse the hard predictions for all three metrics.
    hard_predictions = np.array(collected_scores).flatten().round()
    return (
        precision_score(true_labels, hard_predictions),
        recall_score(true_labels, hard_predictions),
        f1_score(true_labels, hard_predictions),
    )

In [None]:
# Convert tweets to BERT tokens and create DataLoader
def tokenize_and_create_dataloader(data, tokenizer, max_length=128, batch_size=32, shuffle=False):
    """Encode the text pairs in ``data`` and wrap them in a ``DataLoader``.

    Args:
        data: DataFrame with ``text1``, ``text2`` and ``label`` columns.
        tokenizer: Callable tokenizer (e.g. a ``BertTokenizer``) that accepts
            two parallel lists of strings plus the padding/truncation keyword
            arguments used below, and returns a mapping containing
            ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every encoded pair is padded or truncated to.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles examples on each iteration.
            Defaults to ``False``, which keeps the original row order.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)``.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if len(data) == 0:
        # Fail early with a readable message instead of an opaque error from
        # the tokenizer or tensor construction further down.
        raise ValueError("Cannot build a DataLoader from an empty dataset")

    # Tokenize all pairs in one batched call rather than row by row; the
    # result already has one row per example, so no concatenation is needed.
    encoded = tokenizer(
        data['text1'].tolist(),
        data['text2'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    labels = torch.tensor(data['label'].tolist())
    dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Seed every RNG this notebook imports so that randomly initialised layers
# start from the same values on each "Restart Kernel -> Run All".
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Encode both splits with the shared tokenizer
train_dataloader = tokenize_and_create_dataloader(train_data, tokenizer)
test_dataloader = tokenize_and_create_dataloader(test_data, tokenizer)

# Initialize model, criterion, and optimizer
model = TweetSimilarityModel()
# BCELoss expects probabilities; the model's forward already applies a sigmoid
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Train the model
train_model(model, train_dataloader, criterion, optimizer)

# Evaluate the model
precision, recall, f1 = evaluate_model(model, test_dataloader)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")