In [None]:
!pip install sentencepiece
!pip install transformers

# Import Necessary Libraries


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

# Generate Summary

In [None]:
# Read the three organizer-provided splits from /content, in train / test / dev order.
train_df, test_df, dev_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/original_train.csv",  # original training set from organizers
        "/content/original_test.csv",   # original test set from organizers
        "/content/original_dev.csv",    # original dev set from organizers
    )
)

In [None]:
# Loaded (tokenizer, model, device) triples keyed by checkpoint name. Loading T5 is by far
# the most expensive step, so it is done once and reused by every later call.
_SUMMARIZER_CACHE = {}


def _load_summarizer(checkpoint="t5-base"):
    """Return a cached ``(tokenizer, model, device)`` triple for ``checkpoint``."""
    if checkpoint not in _SUMMARIZER_CACHE:
        # Use the GPU when one is visible, otherwise run (slowly) on CPU instead of crashing.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = T5Tokenizer.from_pretrained(checkpoint)
        model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)
        _SUMMARIZER_CACHE[checkpoint] = (tokenizer, model, device)
    return _SUMMARIZER_CACHE[checkpoint]


def generate_long_text_summary(long_text, max_length_per_section):
    """Summarize ``long_text`` section by section with T5 and join the partial summaries.

    The text is cut into consecutive slices of ``max_length_per_section`` *characters*
    (not tokens). Each slice is prefixed with ``"summarize: "``, encoded with truncation
    at 1000 tokens, and summarized with beam search (4 beams, at most 100 generated tokens).

    Args:
        long_text: The text to summarize.
        max_length_per_section: Number of characters per slice; must be positive.

    Returns:
        The per-section summaries joined with single spaces. An empty ``long_text``
        yields ``""``.

    Raises:
        TypeError: If ``long_text`` has no length (e.g. a NaN cell).
        ValueError: If ``max_length_per_section`` is zero.
    """
    # Nothing to summarize: return the same "" the section loop would produce, without
    # paying for a model load.
    if long_text == "":
        return ""

    tokenizer, model, device = _load_summarizer()

    # Split the text into smaller sections
    sections = [
        long_text[start:start + max_length_per_section]
        for start in range(0, len(long_text), max_length_per_section)
    ]

    summaries = []
    for section in sections:
        inputs = tokenizer.encode(
            "summarize: " + section,
            return_tensors="pt",
            max_length=1000,
            truncation=True,
            padding=True,
        )

        # Adjust max_length and length_penalty as needed
        summary_ids = model.generate(
            inputs.to(device),
            max_length=100,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Concatenate the summaries for each section
    return " ".join(summaries)

In [None]:
def generate_double_summary(df, summarizer=None):
    """Add a two-pass ``'summary'`` column to ``df`` built from its ``'explanation'`` column.

    Each explanation is summarized in 1000-character sections, and the result is
    summarized again in 300-character sections. A row whose summarization raises gets
    the placeholder ``" "`` and a warning naming the row, so one bad row does not abort
    the whole run.

    Args:
        df: DataFrame with an ``'explanation'`` column. It is modified in place.
        summarizer: Callable ``(text, max_length_per_section) -> str``. Defaults to
            ``generate_long_text_summary``.

    Returns:
        The same ``df`` object, now carrying the ``'summary'`` column.

    Raises:
        KeyError: If ``df`` has no ``'explanation'`` column.
    """
    import warnings

    if summarizer is None:
        summarizer = generate_long_text_summary

    summaries = []
    for index, input_text in df['explanation'].items():
        try:
            # First pass over the raw explanation, second pass over the first summary.
            first_pass = summarizer(input_text, 1000)
            summaries.append(summarizer(first_pass, 300))
        except Exception as error:  # deliberately best-effort, but never swallow Ctrl-C
            warnings.warn(
                f"Summarization failed for row {index!r} ({error!r}); storing a blank summary."
            )
            summaries.append(" ")

    # Assign positionally in one step; this is also correct when index labels repeat.
    df['summary'] = summaries
    return df

In [None]:
# Generate the two-pass summary for all 3 sets
df_train = generate_double_summary(train_df)
df_test = generate_double_summary(test_df)
df_dev = generate_double_summary(dev_df)

# Save all 3 dataframes. index=False: the row index would otherwise be written as an
# extra unnamed column and come back as "Unnamed: 0" when the files are read again.
df_train.to_csv("/content/summary_train.csv", index=False)
df_test.to_csv("/content/summary_test.csv", index=False)
df_dev.to_csv("/content/summary_dev.csv", index=False)

In [None]:
# Reload the summarized splits from disk, in dev / train / test order.
df_dev, df_train, df_test = map(
    pd.read_csv,
    (
        "/content/summary_dev.csv",    # Summarized Dev set
        "/content/summary_train.csv",  # Summarized Train set
        "/content/summary_test.csv",   # Summarized Test set
    ),
)

# Extract embeddings using transformer

In [None]:
model_name = 'LambdaX-AI/legal-deberta-v1'  # Any checkpoint loadable through AutoModel can be substituted
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the GPU when one is visible and fall back to CPU otherwise; the encoder is only
# used for feature extraction, so it is put in eval mode explicitly.
model = AutoModel.from_pretrained(model_name).to('cuda' if torch.cuda.is_available() else 'cpu').eval()

In [None]:
def get_embeddings(sentence):
    """Encode ``sentence`` with the module-level ``tokenizer``/``model`` and mean-pool it.

    The last hidden state is averaged over dimension 1, squeezed, and returned as a
    NumPy array on the CPU. Presumably this is one vector per sentence; the exact shape
    depends on the encoder, so verify against the loaded checkpoint.

    Args:
        sentence: Text to embed. ``None`` and NaN (missing DataFrame cells) are encoded
            as the empty string instead of raising inside the tokenizer.

    Returns:
        ``numpy.ndarray`` holding the pooled embedding.
    """
    if sentence is None or (isinstance(sentence, float) and np.isnan(sentence)):
        sentence = ""

    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    # Follow the model to whichever device it was loaded on instead of assuming CUDA.
    inputs = inputs.to(model.device)

    # Pure inference: skip autograd bookkeeping so no graph is kept alive per call.
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return embeddings

# Embed the question, answer and summary text of every training row.
for source_col, target_col in (
    ('question', 'question_embeddings'),
    ('answer', 'answer_embeddings'),
    ('summary', 'summary_embeddings'),
):
    df_train[target_col] = df_train[source_col].apply(get_embeddings)


In [None]:
# Embed the question, answer and summary text of every dev row.
for source_col, target_col in (
    ('question', 'question_embeddings'),
    ('answer', 'answer_embeddings'),
    ('summary', 'summary_embeddings'),
):
    df_dev[target_col] = df_dev[source_col].apply(get_embeddings)

In [None]:
# Embed the question, answer and summary text of every test row.
for source_col, target_col in (
    ('question', 'question_embeddings'),
    ('answer', 'answer_embeddings'),
    ('summary', 'summary_embeddings'),
):
    df_test[target_col] = df_test[source_col].apply(get_embeddings)

In [None]:
def _embedding_matrix(frame, column):
    """Convert a column of per-row NumPy embeddings into one float32 tensor.

    The rows are first combined into a single NumPy array: handing ``torch.tensor`` a
    Python list of ndarrays converts element by element, which is very slow and makes
    PyTorch emit a warning.
    """
    stacked = np.asarray(frame[column].tolist(), dtype=np.float32)
    return torch.tensor(stacked, dtype=torch.float32)


# Question / answer / summary embeddings for each split.
qe_train = _embedding_matrix(df_train, 'question_embeddings')
ae_train = _embedding_matrix(df_train, 'answer_embeddings')
se_train = _embedding_matrix(df_train, 'summary_embeddings')

qe_dev = _embedding_matrix(df_dev, 'question_embeddings')
ae_dev = _embedding_matrix(df_dev, 'answer_embeddings')
se_dev = _embedding_matrix(df_dev, 'summary_embeddings')

qe_test = _embedding_matrix(df_test, 'question_embeddings')
ae_test = _embedding_matrix(df_test, 'answer_embeddings')
se_test = _embedding_matrix(df_test, 'summary_embeddings')

In [None]:
# Labels as float32 (the dtype BCELoss expects for targets). torch.tensor on an explicit
# NumPy array replaces the legacy torch.FloatTensor(...) constructor.
labels_train = torch.tensor(df_train['label'].to_numpy(), dtype=torch.float32)
labels_dev = torch.tensor(df_dev['label'].to_numpy(), dtype=torch.float32)

# The embedding tensors are already float32, so the *_tensor names used by the
# DataLoader cell are plain aliases rather than a second conversion.
question_train_tensor = qe_train
answer_train_tensor = ae_train
summary_train_tensor = se_train
labels_train_tensor = labels_train

# The test split has no labels.
question_test_tensor = qe_test
answer_test_tensor = ae_test
summary_test_tensor = se_test

question_dev_tensor = qe_dev
answer_dev_tensor = ae_dev
summary_dev_tensor = se_dev
labels_dev_tensor = labels_dev

# Define and train the Siamese Network

In [None]:
# Define Siamese Network
class SiameseNetwork(nn.Module):
    """Three-branch classifier over question, answer and summary embeddings.

    Each input goes through its own two-layer MLP (``embedding_size`` -> 512 -> 256 with
    ReLU activations). The branches are separate modules and do not share weights. The
    three 256-dimensional outputs are concatenated and mapped through one linear layer
    and a sigmoid to a single score per row.
    """

    def __init__(self, embedding_size):
        super().__init__()

        # One branch per input, created in question -> answer -> summary order.
        self.branch_question = self._make_branch(embedding_size)
        self.branch_answer = self._make_branch(embedding_size)
        self.branch_summary = self._make_branch(embedding_size)

        # Scoring head over the concatenated branch outputs.
        self.fc = nn.Linear(256 * 3, 1)

    @staticmethod
    def _make_branch(embedding_size):
        """Build one ``embedding_size`` -> 512 -> 256 ReLU MLP."""
        return nn.Sequential(
            nn.Linear(embedding_size, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
        )

    def forward_one_branch(self, x, branch):
        """Apply ``branch`` to ``x`` and return the result."""
        return branch(x)

    def forward(self, question, answer, summary):
        """Score each (question, answer, summary) row; returns sigmoid outputs."""
        branch_outputs = [
            self.forward_one_branch(inputs, branch)
            for inputs, branch in (
                (question, self.branch_question),
                (answer, self.branch_answer),
                (summary, self.branch_summary),
            )
        ]

        # Join along the feature dimension, then squash the head's output with a sigmoid.
        fused = torch.cat(branch_outputs, dim=1)
        return torch.sigmoid(self.fc(fused))


# Training function
def train_siamese_network(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` on ``train_loader`` and report the mean batch loss per epoch.

    Args:
        model: Module called as ``model(question, answer, summary)``.
        train_loader: Iterable of ``(question, answer, summary, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, targets)``; targets are the labels
            reshaped to a column and cast to float.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with one float per epoch: the average of that epoch's per-batch losses,
        or NaN for an epoch in which the loader produced no batches.
    """
    model.train()
    epoch_losses = []

    for epoch in range(num_epochs):
        loss_sum = 0.0
        batch_count = 0

        for question, answer, summary, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(question, answer, summary)
            loss = criterion(outputs, labels.unsqueeze(1).float())
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            batch_count += 1

        # Log the epoch average: the last batch's loss alone is noisy, and it does not
        # exist at all when the loader is empty.
        epoch_loss = loss_sum / batch_count if batch_count else float('nan')
        epoch_losses.append(epoch_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    return epoch_losses



In [None]:
# Create a DataLoader for training and testing
train_dataset = TensorDataset(question_train_tensor, answer_train_tensor, summary_train_tensor, labels_train_tensor)
test_dataset = TensorDataset(question_test_tensor, answer_test_tensor, summary_test_tensor)
dev_dataset = TensorDataset(question_dev_tensor, answer_dev_tensor, summary_dev_tensor, labels_dev_tensor)

# Seed torch before anything random happens (batch shuffling, weight initialisation) so
# a re-run of the notebook reproduces the same training trajectory.
torch.manual_seed(42)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

# Create and initialize the Siamese network
# Read the input width from the data rather than hard-coding it, so swapping the encoder
# checkpoint cannot leave a stale size behind.
embedding_size = question_train_tensor.shape[1]
siamese_model = SiameseNetwork(embedding_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(siamese_model.parameters(), lr=0.001)

# Train the Siamese network
train_siamese_network(siamese_model, train_loader, criterion, optimizer, 20)


# Function to test Siamese Network


In [None]:
# Test function
def test_siamese_network(model, dev_loader, threshold=0.6):
    """Evaluate ``model`` on a labelled loader and print macro F1 and accuracy.

    Args:
        model: Module called as ``model(question, answer, summary)`` that returns one
            score per row.
        dev_loader: Iterable of ``(question, answer, summary, labels)`` batches.
        threshold: A score strictly greater than this is predicted as class 1.

    Returns:
        Tuple ``(macro_f1, accuracy)`` as computed by scikit-learn.
    """
    model.eval()
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for question, answer, summary, labels in dev_loader:
            outputs = model(question, answer, summary)
            # Flatten the (batch, 1) scores so predictions are collected as scalars,
            # matching the 1-D labels passed to the metric functions.
            predicted = (outputs > threshold).float().reshape(-1).cpu().numpy()
            all_labels.extend(labels.reshape(-1).cpu().numpy())
            all_predictions.extend(predicted)

    macro_f1 = f1_score(all_labels, all_predictions, average='macro')
    accuracy = accuracy_score(all_labels, all_predictions)

    print(f'Test Macro F1 Score: {macro_f1:.4f}')
    print(f'Test Accuracy: {accuracy:.4f}')

    return macro_f1, accuracy

# Function to predict using siamese network

In [None]:
def predict_siamese_network(model, test_loader, threshold=0.6):
    """Return thresholded predictions of ``model`` for every row in ``test_loader``.

    Args:
        model: Module called as ``model(question, answer, summary)``.
        test_loader: Iterable of unlabelled ``(question, answer, summary)`` batches.
        threshold: A score strictly greater than this is predicted as 1.0, otherwise 0.0.

    Returns:
        List with one NumPy array per row, in loader order. Each array is that row of
        the thresholded model output.
    """
    model.eval()
    all_predictions = []

    with torch.no_grad():
        for question, answer, summary in test_loader:
            outputs = model(question, answer, summary)
            predicted = (outputs > threshold).float().cpu().numpy()
            # extend() iterates the first axis, so one entry is stored per row.
            all_predictions.extend(predicted)

    return all_predictions

In [None]:
# Report macro F1 and accuracy of the trained network on the dev split.
test_siamese_network(model=siamese_model, dev_loader=dev_loader)

In [None]:
# Get predictions of Siamese network on test set
predictions = predict_siamese_network(siamese_model, test_loader)
predictions = np.array(predictions)
# reshape(-1) always yields a 1-D column. np.squeeze would collapse a single prediction
# to a 0-d scalar, which pd.DataFrame rejects without an explicit index.
y = pd.DataFrame({'predictions': predictions.reshape(-1)})

In [None]:
# Write the test predictions to disk, keeping the row index as the first CSV column.
y.to_csv(path_or_buf="siamese.csv", index=True)

In [None]:
# Bundle the trained weights, the optimizer state and the input width the network was
# built with into one checkpoint file.
checkpoint = {
    'model_state_dict': siamese_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'embedding_size': embedding_size,
}
torch.save(checkpoint, 'siamese_model.pth')