In [None]:
import numpy as np
import pandas as pd

import spacy
from nltk import ngrams as nltk_ngrams

from sklearn.metrics import cohen_kappa_score, confusion_matrix, classification_report

import evaluate
from sentence_transformers import SentenceTransformer, util
from transformers import T5Tokenizer, T5ForConditionalGeneration, PegasusTokenizer, PegasusForConditionalGeneration

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Classification metrics loaded through the Hugging Face `evaluate` library.
# NOTE(review): neither metric is referenced in the visible part of this notebook - confirm they are still needed.
precision_metric = evaluate.load('precision')
recall_metric = evaluate.load('recall')

# ROUGE metric, used further down to measure n-gram overlap between a response and the gold answer.
rouge_model = evaluate.load('rouge')
# bleu_model = evaluate.load("bleu")
# bleurt_model = evaluate.load("bleurt", module_type="metric")

# spaCy English pipeline; the preprocessing function reads token text, stop-word flags and lemmas from it.
spacy_model = spacy.load('en_core_web_sm')
# Sentence-embedding model whose embeddings feed the cosine-similarity feature.
similarity_model = SentenceTransformer('stsb-roberta-large')

In [None]:
# pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
# pegasus_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large')

# T5 (base) tokenizer and conditional-generation model; both are handed to generate_summary later on.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

def generate_summary(tokenizer, model, text, prefix="summarize: "):
    """Generate a beam-search summary of `text` with a seq2seq model.

    Parameters
    ----------
    tokenizer : object exposing `encode(...)` and `decode(...)` (e.g. a T5 tokenizer).
    model : object exposing `generate(...)` (e.g. a T5 conditional-generation model).
    text : the text to summarise; it is interpolated into the prompt as-is.
    prefix : task prefix prepended to `text`. Defaults to the lowercase
        "summarize: " prefix that T5 checkpoints were trained with; pass ""
        for models that do not expect a prefix.

    Returns
    -------
    str
        The decoded first generated sequence, with special tokens removed.
    """
    prompt = f"{prefix}{text}"
    # truncation=True clips over-long prompts to the tokenizer's maximum length.
    inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True)

    # Deterministic beam search. `temperature` is intentionally not passed: it only
    # takes effect when sampling is enabled and otherwise just triggers a warning.
    summary_ids = model.generate(
        inputs,
        max_length=150,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
        repetition_penalty=2.0,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Load the dataset and assign the column roles in a single rename:
#   'correct_answer'        -> 'gold_answer'
#   'correct_answer_vicuna' -> 'correct_answer'
# Alternative kept from the original notebook:
# df.rename({'correct_answer_vicuna': 'gold_answer'}, axis=1, inplace=True)
column_roles = {
    'correct_answer': 'gold_answer',
    'correct_answer_vicuna': 'correct_answer',
}
df = pd.read_csv('~/active-projects/textbook-question-generation/data/aqag-chatgpt-vicuna.csv').rename(columns=column_roles)
# df.head()

In [None]:
# df.shape, df.dropna(subset=['clean_text', 'question', 'correct_answer', 'incorrect_answer', 'gold_answer']).shape

In [None]:
# Keep only the rows in which every column used downstream is present.
required_columns = ['clean_text', 'question', 'correct_answer', 'incorrect_answer', 'gold_answer']
df = df.dropna(subset=required_columns)
# df.shape

In [None]:
# Positive examples: the text in 'correct_answer' plays the role of the student response (label 1).
# rename(...).assign(...) builds a new frame, so nothing is mutated in place on a column
# slice of `df` (the in-place version triggers pandas' SettingWithCopyWarning).
temp_correct_df = (
    df[['clean_text', 'question', 'gold_answer', 'correct_answer']]
    .rename(columns={'correct_answer': 'student_response'})
    .assign(true_label=1)
)
# temp_correct_df.head()

In [None]:
# Negative examples: the text in 'incorrect_answer' plays the role of the student response (label 0).
# rename(...).assign(...) builds a new frame, so nothing is mutated in place on a column
# slice of `df` (the in-place version triggers pandas' SettingWithCopyWarning).
temp_incorrect_df = (
    df[['clean_text', 'question', 'gold_answer', 'incorrect_answer']]
    .rename(columns={'incorrect_answer': 'student_response'})
    .assign(true_label=0)
)
# temp_incorrect_df.head()

In [None]:
# Sanity check: dimensions of the source frame and of the two labelled subsets built from it.
df.shape, temp_correct_df.shape, temp_incorrect_df.shape

In [None]:
# Stack the labelled subsets (label-1 rows first, then label-0 rows) and give the
# result a fresh 0..n-1 index. Note that `df` is rebound to the stacked frame here.
labelled_parts = [temp_correct_df, temp_incorrect_df]
df = pd.concat(labelled_parts, ignore_index=True)
df.head()

In [None]:
# Replace both answer columns with their T5 summaries (one model call per row, so this cell is slow).
for answer_column in ('gold_answer', 'student_response'):
    df[answer_column] = df[answer_column].apply(
        lambda answer_text: generate_summary(t5_tokenizer, t5_model, answer_text)
    )

#### Preprocessing

In [None]:
# preprocessing steps for similarity computation:
# 1. lower case
# 2. remove non-alphanumeric tokens, except tokens containing exactly one occurrence of a
#    context-bearing symbol - (['@', '#', '$', '%', '<', '>', '.', ',', '+', '-', '*'])
# 3. remove stopwords
# 4. lemmatize --- experiment

# Symbols that may carry meaning (e.g. the '.' in "3.5"); defined once instead of per token.
CONTEXT_SYMBOLS = ('@', '#', '$', '%', '<', '>', '.', ',', '+', '-', '*')


def func_preprocessing(text: str, lemmatize: bool = False, nlp=None) -> str:
    """Normalise `text` for similarity / overlap scoring.

    The text is lower-cased, stripped and tokenised. A token is kept when it is
    not flagged as a stop word AND it is either purely alphanumeric or contains
    exactly one occurrence of at least one symbol in CONTEXT_SYMBOLS.

    Parameters
    ----------
    text : the raw string to preprocess.
    lemmatize : when True, emit each kept token's `lemma_` instead of its `text`.
    nlp : optional callable turning a string into an iterable of tokens exposing
        `text`, `is_stop` and `lemma_`. Defaults to the notebook-level `spacy_model`.

    Returns
    -------
    str
        The kept tokens joined by single spaces ('' when nothing is kept).
    """
    if nlp is None:
        nlp = spacy_model

    kept_tokens = []
    for token in nlp(text.lower().strip()):
        token_text = token.text
        # count(...) == 1 already implies the symbol is present in the token.
        is_meaningful = token_text.isalnum() or any(
            token_text.count(symbol) == 1 for symbol in CONTEXT_SYMBOLS
        )
        if is_meaningful and not token.is_stop:
            kept_tokens.append(token.lemma_ if lemmatize else token_text)

    return ' '.join(kept_tokens)

# (target column, source column, lemmatize?) - listed in the order the columns are created.
preprocessing_plan = [
    ('processed_gold_answer', 'gold_answer', False),
    ('processed_student_response', 'student_response', False),
    ('processed_gold_answer_lemmatized', 'gold_answer', True),
    ('processed_student_response_lemmatized', 'student_response', True),
]
for target_column, source_column, use_lemmas in preprocessing_plan:
    df[target_column] = df[source_column].apply(func_preprocessing, lemmatize=use_lemmas)

df.head()

#### Computing Cosine Similarity

In [None]:
# cosine similarity between the two texts of a pair (used below as: gold answer vs. student response)
def compute_similarity(list_answers: list):
    """Return the cosine similarity between the embeddings of the first two texts.

    `list_answers` is encoded in one call to the sentence-embedding model; only
    the embeddings at positions 0 and 1 are compared. The result is returned as
    a plain Python number.
    """
    answer_vectors = similarity_model.encode(list_answers, batch_size=16)
    first_vector, second_vector = answer_vectors[0], answer_vectors[1]
    pairwise_scores = util.pytorch_cos_sim(first_vector, second_vector)
    # take the first row of the score matrix and unwrap its single value
    return pairwise_scores[0].item()

# target column -> (gold-answer column, student-response column)
similarity_plan = {
    'processed_similarity': ('processed_gold_answer', 'processed_student_response'),
    'processed_lemmatized_similarity': ('processed_gold_answer_lemmatized', 'processed_student_response_lemmatized'),
}
for target_column, (gold_column, response_column) in similarity_plan.items():
    df[target_column] = df.apply(
        lambda row, g=gold_column, r=response_column: compute_similarity([row[g], row[r]]),
        axis=1,
    )

df.head()

#### Computing ROUGE

In [None]:
# ROUGE-n helpers. The order n is chosen per pair of texts:
#   n = max(1, min(9, half the token count of text 1, half the token count of text 2))
# - capped at 9 (per the original note, the highest order the evaluate library can compute)
# - floored at 1 so that n can never be 0.

def compute_rouge(predictions, references, n):
    """Score one prediction against one reference with ROUGE-n.

    Returns whatever `rouge_model.compute` returns when asked for the single
    rouge type f'rouge{n}'.
    """
    return rouge_model.compute(
        predictions=[predictions],
        references=[references],
        rouge_types=[f'rouge{n}'],
    )


def get_n(t1, t2):
    """Pick the ROUGE order for two whitespace-tokenised texts (always between 1 and 9)."""
    half_lengths = [len(t1.split()) // 2, len(t2.split()) // 2]
    return max(1, min(9, *half_lengths))

def _rouge_for_row(row, prediction_column, reference_column):
    """ROUGE-n of one row's student response (prediction) against its gold answer (reference)."""
    prediction, reference = row[prediction_column], row[reference_column]
    scores = compute_rouge(prediction, reference, get_n(prediction, reference))
    # only one rouge type is requested, so take the first value of the result
    return next(iter(scores.values()))


df['processed_rouge'] = df.apply(
    lambda row: _rouge_for_row(row, 'processed_student_response', 'processed_gold_answer'),
    axis=1,
)
df['processed_lemmatized_rouge'] = df.apply(
    lambda row: _rouge_for_row(row, 'processed_student_response_lemmatized', 'processed_gold_answer_lemmatized'),
    axis=1,
)

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# df.to_csv('~/active-projects/textbook-question-generation/data/aqag-chatgpt-vicuna-with-rouge-and-sim-vga.csv', index=False)

In [None]:
# Histogram of the ROUGE-n scores on the non-lemmatized text; titled and labelled so the figure stands alone.
ax = df['processed_rouge'].plot(kind='hist', title='ROUGE-n distribution (processed text)')
ax.set_xlabel('processed_rouge');

In [None]:
# Histogram of the ROUGE-n scores on the lemmatized text; titled and labelled so the figure stands alone.
ax = df['processed_lemmatized_rouge'].plot(kind='hist', title='ROUGE-n distribution (processed + lemmatized text)')
ax.set_xlabel('processed_lemmatized_rouge');

In [None]:
# Histogram of the cosine similarities on the non-lemmatized text; titled and labelled so the figure stands alone.
ax = df['processed_similarity'].plot(kind='hist', title='Cosine similarity distribution (processed text)')
ax.set_xlabel('processed_similarity');

In [None]:
# Histogram of the cosine similarities on the lemmatized text; titled and labelled so the figure stands alone.
ax = df['processed_lemmatized_similarity'].plot(kind='hist', title='Cosine similarity distribution (processed + lemmatized text)')
ax.set_xlabel('processed_lemmatized_similarity');

#### Scoring

In [None]:
# scoring method - a response is marked correct when ANY of these rules holds
# (all bounds are inclusive):
# 1. similarity >= 0.90 and rouge >= 0.90
# 2. similarity >= 0.95 and rouge >= 0.85
# 3. similarity >= 0.85 and rouge >= 0.95
# otherwise it is marked incorrect.
# return 1 for correct and 0 for incorrect

def score(similarity_score: float, rouge_score: float):
    """Return 1 when the (similarity, rouge) pair satisfies any acceptance rule, else 0."""
    # each rule is (minimum similarity, minimum rouge)
    acceptance_rules = ((0.90, 0.90), (0.95, 0.85), (0.85, 0.95))
    for min_similarity, min_rouge in acceptance_rules:
        if similarity_score >= min_similarity and rouge_score >= min_rouge:
            return 1
    return 0

# target column -> (similarity column, rouge column)
scoring_plan = {
    'processed_score': ('processed_similarity', 'processed_rouge'),
    'processed_lemmatized_score': ('processed_lemmatized_similarity', 'processed_lemmatized_rouge'),
}
for target_column, (similarity_column, rouge_column) in scoring_plan.items():
    df[target_column] = df.apply(
        lambda row, s=similarity_column, r=rouge_column: score(row[s], row[r]),
        axis=1,
    )

In [None]:
# Confusion matrix of the true labels (first argument) against the non-lemmatized scores.
confusion_matrix(df['true_label'], df['processed_score'])

In [None]:
# Text report of classification metrics for the non-lemmatized scores (printed so the layout is kept).
print(classification_report(df['true_label'], df['processed_score']))

In [None]:
# Cohen's kappa between the true labels and the non-lemmatized scores.
cohen_kappa_score(df['true_label'], df['processed_score'])

In [None]:
# Confusion matrix of the true labels (first argument) against the lemmatized scores.
confusion_matrix(df['true_label'], df['processed_lemmatized_score'])

In [None]:
# Text report of classification metrics for the lemmatized scores (printed so the layout is kept).
print(classification_report(df['true_label'], df['processed_lemmatized_score']))

In [None]:
# Cohen's kappa between the true labels and the lemmatized scores.
cohen_kappa_score(df['true_label'], df['processed_lemmatized_score'])

In [None]:
# # scoring method -
# # 1. if similarity > 0.90 and rouge > 0.90 -> mark as correct
# # 2. if similarity > 0.95 and rouge > 0.85 -> mark as correct
# # 3. if similarity > 0.85 and rouge > 0.95 -> mark as correct
# # 4. else incorrect
# # return 1 for correct and 0 for incorrect

# def score(similarity_score: float, rouge_score: float):
    
#     for threshold in [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:
#     if (similarity_score >= 0.90 and rouge_score >= 0.90) or (similarity_score >= 0.95 and rouge_score >= 0.85) or (similarity_score >= 0.85 and rouge_score >= 0.95):
#         return 1
#     return 0

# df['processed_score'] = df.apply(lambda x: score(x['processed_similarity'], x['processed_rouge']), axis=1)
# df['processed_lemmatized_score'] = df.apply(lambda x: score(x['processed_lemmatized_similarity'], x['processed_lemmatized_rouge']), axis=1)