# Import necessary libraries

In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import pandas as pd
import transformers
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5Tokenizer, T5ForConditionalGeneration
import nltk
nltk.download('punkt')

# Generate Summary

In [None]:
# Load the original train, test and dev sets provided by the organizers.
train_df, test_df, dev_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/original_train.csv",
        "/content/original_test.csv",
        "/content/original_dev.csv",
    )
)

In [None]:
# Cache of loaded (tokenizer, model, device) triples keyed by checkpoint name, so the
# T5 weights are read and moved to the accelerator once instead of on every call.
_T5_SUMMARIZER_CACHE = {}


def _load_t5_summarizer(checkpoint="t5-base"):
    """Return a cached ``(tokenizer, model, device)`` triple for ``checkpoint``."""
    if checkpoint not in _T5_SUMMARIZER_CACHE:
        # Prefer the GPU, but fall back to the CPU instead of crashing without CUDA.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        tokenizer = T5Tokenizer.from_pretrained(checkpoint)
        model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)
        _T5_SUMMARIZER_CACHE[checkpoint] = (tokenizer, model, device)
    return _T5_SUMMARIZER_CACHE[checkpoint]


def generate_long_text_summary(long_text, max_length_per_section):
    """Summarize ``long_text`` chunk by chunk with T5 and join the chunk summaries.

    The text is cut into consecutive slices of ``max_length_per_section``
    *characters*; each slice is prefixed with ``"summarize: "``, summarized with
    the ``t5-base`` checkpoint (beam search, up to 100 generated tokens), and
    the per-slice summaries are joined with single spaces.

    Args:
        long_text: The text to summarize. Must be a ``str``.
        max_length_per_section: Slice length in characters. Must be positive.

    Returns:
        The concatenated summary string; ``""`` when ``long_text`` is empty.

    Raises:
        TypeError: If ``long_text`` is not a string (e.g. a NaN cell).
        ValueError: If ``max_length_per_section`` is not positive.
    """
    if not isinstance(long_text, str):
        raise TypeError(f"long_text must be a str, got {type(long_text).__name__}")
    if max_length_per_section <= 0:
        raise ValueError("max_length_per_section must be a positive integer")
    if not long_text:
        # Nothing to summarize: avoid loading the model at all.
        return ""

    tokenizer, model, device = _load_t5_summarizer()

    # Split the text into smaller sections
    sections = [long_text[i:i + max_length_per_section] for i in range(0, len(long_text), max_length_per_section)]

    summaries = []
    for section in sections:
        input_text = "summarize: " + section
        inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=1000, truncation=True, padding=True)

        # Adjust max_length and length_penalty as needed
        summary_ids = model.generate(inputs.to(device), max_length=100, length_penalty=2.0, num_beams=4, early_stopping=True)

        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    # Concatenate the summaries for each section
    return " ".join(summaries)

In [None]:
def generate_double_summary(df):
    """Fill ``df['summary']`` with a two-pass summary of each row's ``explanation``.

    For every row, ``explanation`` is summarized with 1000-character sections,
    and that summary is summarized again with 300-character sections. If either
    pass (or storing the result) fails, the row's summary is set to ``" "``.

    Args:
        df: DataFrame with an ``explanation`` column. It is modified in place.

    Returns:
        The same ``df`` object, with the ``summary`` column populated.
    """
    failed_rows = []

    for index, row in df.iterrows():
        try:
            # First pass over the raw explanation, second pass over that summary.
            first_pass = generate_long_text_summary(row['explanation'], 1000)
            second_pass = generate_long_text_summary(first_pass, 300)

            # Store the final summary in the 'summary' column
            df.at[index, 'summary'] = second_pass
        except Exception as error:
            # Best-effort: keep going, but do not swallow KeyboardInterrupt /
            # SystemExit, and remember what failed so it can be reported.
            failed_rows.append((index, error))
            df.at[index, 'summary'] = " "

    if failed_rows:
        first_index, first_error = failed_rows[0]
        print(
            f"generate_double_summary: {len(failed_rows)} row(s) fell back to a blank summary; "
            f"first failure at index {first_index!r}: {first_error!r}"
        )
    return df

In [None]:
# Run the two-pass summarizer on the train, test and dev sets.
df_train, df_test, df_dev = (
    generate_double_summary(split_df) for split_df in (train_df, test_df, dev_df)
)

# Write each summarized frame to its CSV file.
for summarized_df, csv_path in (
    (df_train, "/content/summary_train.csv"),
    (df_test, "/content/summary_test.csv"),
    (df_dev, "/content/summary_dev.csv"),
):
    summarized_df.to_csv(csv_path)

In [None]:
# Reload the summarized dev, train and test sets from disk (in that order).
df_dev, df_train, df_test = map(
    pd.read_csv,
    (
        "/content/summary_dev.csv",
        "/content/summary_train.csv",
        "/content/summary_test.csv",
    ),
)

# Transformer embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and encoder of the same checkpoint; the encoder is moved to the GPU.
_DEBERTA_CHECKPOINT = 'LambdaX-AI/legal-deberta-v1'
tokenizer = AutoTokenizer.from_pretrained(_DEBERTA_CHECKPOINT)
deberta_model = AutoModel.from_pretrained(_DEBERTA_CHECKPOINT)
deberta_model = deberta_model.to('cuda')

In [None]:
# Function to get sentence embeddings using Transformer
def get_embeddings(sentence):
    """Return a mean-pooled embedding of ``sentence`` as a NumPy array.

    The sentence is tokenized with the module-level ``tokenizer`` (truncated to
    the tokenizer's limit), run through ``deberta_model``, and the last hidden
    state is averaged over the token axis.

    Args:
        sentence: Text to embed.

    Returns:
        ``numpy.ndarray`` holding the averaged last hidden state, with
        singleton dimensions squeezed out.
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    # Send the inputs to whichever device the model lives on instead of assuming CUDA.
    inputs = inputs.to(deberta_model.device)
    # Inference only: skip autograd bookkeeping so no graph is kept per call.
    with torch.no_grad():
        outputs = deberta_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().detach().cpu().numpy()
    return embeddings


In [None]:
# Transformer embeddings for the question, answer and summary text of the training set.
for text_col, embedding_col in (
    ('question', 'question_embeddings'),
    ('answer', 'answer_embeddings'),
    ('summary', 'summary_embeddings'),
):
    df_train[embedding_col] = df_train[text_col].apply(get_embeddings)

In [None]:
# Transformer embeddings for the question, answer and summary text of the dev set.
for text_col, embedding_col in (
    ('question', 'question_embeddings'),
    ('answer', 'answer_embeddings'),
    ('summary', 'summary_embeddings'),
):
    df_dev[embedding_col] = df_dev[text_col].apply(get_embeddings)

In [None]:
# Transformer embeddings for the question, answer and summary text of the test set.
for text_col, embedding_col in (
    ('question', 'question_embeddings'),
    ('answer', 'answer_embeddings'),
    ('summary', 'summary_embeddings'),
):
    df_test[embedding_col] = df_test[text_col].apply(get_embeddings)

# Word2Vec embeddings

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize


# Tokenize the question, answer and summary text of every split (train, dev, test).
for frame in (df_train, df_dev, df_test):
    for text_col, token_col in (
        ('question', 'question_tokens'),
        ('answer', 'answer_tokens'),
        ('summary', 'summary_tokens'),
    ):
        frame[token_col] = frame[text_col].apply(word_tokenize)

my_embedding_size = 5

# Gather every token list into one training corpus: train, then dev, then test,
# each contributing its questions, answers and summaries in that order.
w2v_sentences = []
for frame in (df_train, df_dev, df_test):
    for token_col in ('question_tokens', 'answer_tokens', 'summary_tokens'):
        w2v_sentences.extend(frame[token_col].tolist())

# Train a Word2Vec model
word2vec_model = Word2Vec(
    sentences=w2v_sentences,
    vector_size=my_embedding_size,  # Size of each word vector
    window=7,                       # Context window size
    min_count=1,                    # Minimum word frequency to be included in the model
    workers=2,                      # Number of worker threads used during training
)


In [None]:
def get_word2vec_embeddings(tokens):
    """Look up the Word2Vec vector of every known token.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        A list with one vector per token found in ``word2vec_model``. When no
        token is known (including an empty token list), a flat list of zeros
        whose length is the model's vector size.
    """
    keyed_vectors = word2vec_model.wv
    embeddings = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not embeddings:
        # Size the fallback from the model itself. The previous code read
        # `your_embedding_size`, which is only defined later in the GloVe cell
        # and raised NameError on a fresh top-to-bottom run.
        return [0.0] * keyed_vectors.vector_size
    return embeddings

# Word2Vec embeddings for the tokenized questions, answers and summaries of the training set.
for token_col, embedding_col in (
    ('question_tokens', 'question_embeddings'),
    ('answer_tokens', 'answer_embeddings'),
    ('summary_tokens', 'summary_embeddings'),
):
    df_train[embedding_col] = df_train[token_col].apply(get_word2vec_embeddings)

In [None]:
# Word2Vec embeddings for the tokenized questions, answers and summaries of the
# test set, then the dev set.
for frame in (df_test, df_dev):
    for token_col, embedding_col in (
        ('question_tokens', 'question_embeddings'),
        ('answer_tokens', 'answer_embeddings'),
        ('summary_tokens', 'summary_embeddings'),
    ):
        frame[embedding_col] = frame[token_col].apply(get_word2vec_embeddings)

# GloVe embeddings

In [None]:
!pip install glove-python3

In [None]:
from glove import Glove
from glove import Corpus
from nltk.tokenize import word_tokenize

your_embedding_size = 5

# Tokenize the question, answer and summary text of the training set.
for text_col, token_col in (
    ('question', 'question_tokens'),
    ('answer', 'answer_tokens'),
    ('summary', 'summary_tokens'),
):
    df_train[token_col] = df_train[text_col].apply(word_tokenize)

# Combine all training token lists (questions, then answers, then summaries) for GloVe.
all_tokens = [
    token_list
    for token_col in ('question_tokens', 'answer_tokens', 'summary_tokens')
    for token_list in df_train[token_col].tolist()
]

# Build the co-occurrence corpus with a window of 10.
corpus = Corpus()
corpus.fit(all_tokens, window=10)

# Train the GloVe model on the co-occurrence matrix.
glove = Glove(
    no_components=your_embedding_size,  # Size of each word vector
    learning_rate=0.05,
)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

# Attach the word -> index dictionary so vectors can be looked up by word.
glove.add_dictionary(corpus.dictionary)

# Function to get GloVe embeddings
def get_glove_embeddings(tokens):
    """Look up the GloVe vector of every token present in ``glove.dictionary``.

    Returns a list with one vector per known token, or a flat list of
    ``your_embedding_size`` zeros when none of the tokens is known.
    """
    word_to_row = glove.dictionary
    vectors = []
    for token in tokens:
        if token in word_to_row:
            vectors.append(glove.word_vectors[word_to_row[token]])

    if len(vectors) == 0:
        # No token has an embedding: fall back to a zero vector.
        return [0.0] * your_embedding_size
    return vectors

# GloVe embeddings for the tokenized questions, answers and summaries of the training set.
for token_col, embedding_col in (
    ('question_tokens', 'question_embeddings'),
    ('answer_tokens', 'answer_embeddings'),
    ('summary_tokens', 'summary_embeddings'),
):
    df_train[embedding_col] = df_train[token_col].apply(get_glove_embeddings)


In [None]:
# Tokenize the dev-set text columns.
for text_col, token_col in (
    ('question', 'question_tokens'),
    ('answer', 'answer_tokens'),
    ('summary', 'summary_tokens'),
):
    df_dev[token_col] = df_dev[text_col].apply(word_tokenize)

# Look up GloVe embeddings for the dev-set tokens.
for token_col, embedding_col in (
    ('question_tokens', 'question_embeddings'),
    ('answer_tokens', 'answer_embeddings'),
    ('summary_tokens', 'summary_embeddings'),
):
    df_dev[embedding_col] = df_dev[token_col].apply(get_glove_embeddings)


# Similarity scores or Distance

In [None]:
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cityblock


def calculate_distance(row):
    """Average the city-block (Manhattan) distances question->answer and answer->summary.

    ``row`` must provide ``question_embeddings``, ``answer_embeddings`` and
    ``summary_embeddings``; the mean of the two distances is returned.
    """
    manhattan = {
        'question_answer': cityblock(row['question_embeddings'], row['answer_embeddings']),
        'answer_summary': cityblock(row['answer_embeddings'], row['summary_embeddings']),
    }
    return np.mean([manhattan['answer_summary'], manhattan['question_answer']])

# Row-wise mean city-block distance for the train, dev and test frames.
for frame in (df_train, df_dev, df_test):
    frame['similarity'] = frame.apply(calculate_distance, axis=1)

In [None]:
def calculate_cosine_similarity(embeddings1, embeddings2):
    """Cosine similarity between two embeddings after flattening each to one row.

    Both inputs are converted to NumPy arrays and reshaped to ``(1, -1)``, so
    they must contain the same total number of values. The mean of the
    resulting similarity matrix is returned.
    """
    flat_first = np.array(embeddings1).reshape(1, -1)
    flat_second = np.array(embeddings2).reshape(1, -1)

    pairwise = cosine_similarity(flat_first, flat_second)
    return np.mean(pairwise)

# Row-wise cosine similarities for the training set, plus their per-row mean.
df_train['question_answer_similarity'] = df_train.apply(
    lambda record: calculate_cosine_similarity(record['question_embeddings'], record['answer_embeddings']),
    axis='columns',
)
df_train['answer_summary_similarity'] = df_train.apply(
    lambda record: calculate_cosine_similarity(record['answer_embeddings'], record['summary_embeddings']),
    axis='columns',
)
df_train['mean_similarity'] = (
    df_train[['question_answer_similarity', 'answer_summary_similarity']].mean(axis='columns')
)

In [None]:
# Row-wise cosine similarities for the dev set, plus their per-row mean.
# May need a reshape based on type of embeddings
df_dev['question_answer_similarity'] = df_dev.apply(
    lambda record: calculate_cosine_similarity(record['question_embeddings'], record['answer_embeddings']),
    axis='columns',
)
df_dev['answer_summary_similarity'] = df_dev.apply(
    lambda record: calculate_cosine_similarity(record['answer_embeddings'], record['summary_embeddings']),
    axis='columns',
)
df_dev['mean_similarity'] = (
    df_dev[['question_answer_similarity', 'answer_summary_similarity']].mean(axis='columns')
)

In [None]:
# Row-wise cosine similarities for the test set, plus their per-row mean.
df_test['question_answer_similarity'] = df_test.apply(
    lambda record: calculate_cosine_similarity(record['question_embeddings'], record['answer_embeddings']),
    axis='columns',
)
df_test['answer_summary_similarity'] = df_test.apply(
    lambda record: calculate_cosine_similarity(record['answer_embeddings'], record['summary_embeddings']),
    axis='columns',
)
df_test['mean_similarity'] = (
    df_test[['question_answer_similarity', 'answer_summary_similarity']].mean(axis='columns')
)

In [None]:
def reshape_embedding(df):
    """Promote 1-D ``summary_embeddings`` entries to 2-D arrays of shape ``(1, n)``.

    Intended for rows where the explanation is not available and the stored
    embedding is a flat vector. Entries that are already multi-dimensional are
    left untouched.

    Args:
        df: DataFrame with a ``summary_embeddings`` column. Modified in place.

    Returns:
        The same ``df`` object.
    """
    for idx in df.index:
        embedding = np.array(df.at[idx, 'summary_embeddings'])
        if embedding.ndim == 1:
            # Flat vector: store it back as a single-row matrix.
            df.at[idx, 'summary_embeddings'] = embedding.reshape(1, -1)
    return df

# Reshape any 1-D summary embeddings in the test set to shape (1, n).
df_test = reshape_embedding(df_test)

# Assigning Predictions

In [None]:
def assign_predictions(df, score_column='mean_similarity', higher_is_better=False):
    """Flag the best-scoring candidate of every question with ``predictions = 1``.

    Rows are grouped by the value of their ``question_embeddings``. Within each
    group the row with the lowest score (or the highest, when
    ``higher_is_better`` is true) gets ``predictions = 1``; every other row
    gets 0. Groups whose scores are all NaN receive no positive prediction.

    Args:
        df: DataFrame with a ``question_embeddings`` column and the score
            column. Modified in place.
        score_column: Column holding the per-row score. Defaults to
            ``'mean_similarity'``.
        higher_is_better: ``False`` (default) picks the minimum, which suits a
            distance; ``True`` picks the maximum, which suits cosine similarity.

    Returns:
        The same ``df`` object, with the ``predictions`` column set.
    """
    # Create a new column 'predictions' initialized with 0
    df['predictions'] = 0

    # Bucket row positions by question embedding in a single pass, instead of
    # comparing every row against every other row.
    groups = {}
    for position, embedding in enumerate(df['question_embeddings']):
        # "+ 0.0" folds -0.0 into 0.0 so the byte key matches value equality.
        vector = np.asarray(embedding, dtype=np.float64) + 0.0
        groups.setdefault((vector.shape, vector.tobytes()), []).append(position)

    scores = df[score_column]
    for positions in groups.values():
        candidates = scores.iloc[positions].dropna()
        if candidates.empty:
            continue
        best_index = candidates.idxmax() if higher_is_better else candidates.idxmin()
        df.loc[best_index, 'predictions'] = 1
    return df

# Assign predictions on the dev, train and test frames (in that order).
df_dev, df_train, df_test = (
    assign_predictions(frame) for frame in (df_dev, df_train, df_test)
)

In [None]:
# Function to replace predictions that are most likely to be wrongly predicted (Cosine similarity based)

def assign_new_predictions(df, tie_margin=0.0005):
    """Assign predictions per question, preferring the runner-up on near-ties.

    Rows are grouped by the value of their ``question_embeddings``. Within each
    group the two highest ``mean_similarity`` scores are compared:

    * if the best score exceeds the runner-up by more than ``tie_margin``, the
      best row gets ``predictions = 1``;
    * otherwise the runner-up row gets ``predictions = 1``.

    Groups with fewer than two scored rows receive no positive prediction.

    Args:
        df: DataFrame with ``question_embeddings`` and ``mean_similarity``
            columns. Modified in place.
        tie_margin: Largest gap between the top two scores that still counts
            as a near-tie. Defaults to 0.0005.

    Returns:
        The same ``df`` object, with the ``predictions`` column set.
    """
    # Create a new column 'predictions' initialized with 0
    df['predictions'] = 0

    # Bucket row positions by question embedding in a single pass, instead of
    # comparing every row against every other row.
    groups = {}
    for position, embedding in enumerate(df['question_embeddings']):
        # "+ 0.0" folds -0.0 into 0.0 so the byte key matches value equality.
        vector = np.asarray(embedding, dtype=np.float64) + 0.0
        groups.setdefault((vector.shape, vector.tobytes()), []).append(position)

    scores = df['mean_similarity']
    for positions in groups.values():
        top_two = scores.iloc[positions].nlargest(2)
        if len(top_two) < 2:
            # No runner-up to compare against: the group stays all zeros.
            continue

        max_similarity_index, second_largest_similarity_index = top_two.index[0], top_two.index[1]
        if top_two.iloc[0] - top_two.iloc[1] <= tie_margin:
            chosen_index = second_largest_similarity_index
        else:
            chosen_index = max_similarity_index
        df.loc[chosen_index, 'predictions'] = 1
    return df

# Re-assign predictions with the near-tie rule on the dev, train and test frames.
df_dev, df_train, df_test = (
    assign_new_predictions(frame) for frame in (df_dev, df_train, df_test)
)

# Analysis

In [None]:
# Function to check distribution of Right (R) and Wrong (W) predictions
# Q indicates rows where Question-Answer similarity >= Summary-Answer similarity
# S indicates rows where Summary-Answer similarity >= Question-Answer similarity

def count_combinations(df):
    """Count rows per (``higher_score``, ``R/W``) pair.

    ``df`` must already contain the ``higher_score`` and ``R/W`` columns. The
    result has one row per observed pair, with the row count in ``count``.
    """
    pair_sizes = df.groupby(['higher_score', 'R/W']).size()
    return pair_sizes.reset_index(name='count')

# Count the (higher_score, R/W) combinations for the train and dev frames.
df_train_counts = count_combinations(df_train)
df_dev_counts = count_combinations(df_dev)

# Show each table under its heading.
print("Training Set Counts:", df_train_counts, sep="\n")
print("\nDevelopment Set Counts:", df_dev_counts, sep="\n")

In [None]:
from sklearn.metrics import f1_score, classification_report

# Macro-averaged F1 of the dev-set predictions against the labels.
macro_f1 = f1_score(y_true=df_dev['label'], y_pred=df_dev['predictions'], average='macro')
print("Macro F1 Score:", macro_f1)

# Per-class precision, recall and F1 for the dev set.
classification_rep = classification_report(y_true=df_dev['label'], y_pred=df_dev['predictions'])
print("Classification Report for dev set:\n", classification_rep)

In [None]:
from sklearn.metrics import f1_score, classification_report

# Macro-averaged F1 of the training-set predictions against the labels.
macro_f1 = f1_score(y_true=df_train['label'], y_pred=df_train['predictions'], average='macro')
print("Macro F1 Score:", macro_f1)

# Per-class precision, recall and F1 for the training set.
classification_rep = classification_report(y_true=df_train['label'], y_pred=df_train['predictions'])
print("Classification Report for train set:\n", classification_rep)

In [None]:
# Keep the test-set predictions in `y`.
y = df_test['predictions']