In [None]:
import os
# Value written to CUDA_VISIBLE_DEVICES for this kernel; by CUDA convention "0"
# limits the process to the first GPU. This cell precedes the library imports.
VISIBLE_GPU_IDS = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = VISIBLE_GPU_IDS

In [None]:
import numpy as np
import pandas as pd

import spacy
from nltk import ngrams as nltk_ngrams

from sklearn.metrics import cohen_kappa_score, confusion_matrix, classification_report

import evaluate
from sentence_transformers import SentenceTransformer, util

import seaborn as sns
import matplotlib.pyplot as plt

# precision_metric = evaluate.load('precision')
# recall_metric = evaluate.load('recall')
# rouge_model = evaluate.load('rouge')
# bleu_model = evaluate.load("bleu")
# The `evaluate` BLEURT module selects its checkpoint from `config_name`. A
# `checkpoint=` keyword is not consumed by the module, so it would silently fall
# back to the default checkpoint instead of BLEURT-20.
bleurt_model = evaluate.load("bleurt", config_name="BLEURT-20", module_type="metric")
# similarity_model = SentenceTransformer('stsb-roberta-large')

# spaCy pipeline used by `func_preprocessing` for tokenisation, stop-word flags
# and lemmas.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Read the CSV and relabel its answer columns in a single pass:
#   correct_answer        -> gold_answer
#   correct_answer_vicuna -> correct_answer
# df.rename({'correct_answer_vicuna': 'gold_answer'}, axis=1, inplace=True)
column_renames = {
    'correct_answer': 'gold_answer',
    'correct_answer_vicuna': 'correct_answer',
}
df = (
    pd.read_csv('~/active-projects/textbook-question-generation/data/aqag-chatgpt-vicuna.csv')
    .rename(columns=column_renames)
)
# df.head()

In [None]:
# df.shape, df.dropna(subset=['clean_text', 'question', 'correct_answer', 'incorrect_answer', 'gold_answer']).shape

In [None]:
# Discard rows that are missing any of the fields used by the later cells.
required_columns = ['clean_text', 'question', 'correct_answer', 'incorrect_answer', 'gold_answer']
df = df.dropna(subset=required_columns)
# df.shape

In [None]:
# Positive examples: the `correct_answer` column becomes the student response and
# every row is labelled 1. Built as a fresh frame via rename/assign rather than
# by mutating a column selection of `df`, which triggers SettingWithCopyWarning.
temp_correct_df = (
    df[['clean_text', 'question', 'gold_answer', 'correct_answer']]
    .rename(columns={'correct_answer': 'student_response'})
    .assign(true_label=1)
)
# temp_correct_df.head()

In [None]:
# Negative examples: the `incorrect_answer` column becomes the student response
# and every row is labelled 0. Built as a fresh frame via rename/assign rather
# than by mutating a column selection of `df`, which triggers SettingWithCopyWarning.
temp_incorrect_df = (
    df[['clean_text', 'question', 'gold_answer', 'incorrect_answer']]
    .rename(columns={'incorrect_answer': 'student_response'})
    .assign(true_label=0)
)
# temp_incorrect_df.head()

In [None]:
# Display the (rows, columns) shape of the source frame and of the two
# label-specific frames derived from it, side by side.
df.shape, temp_correct_df.shape, temp_incorrect_df.shape

In [None]:
# Stack the label-1 rows on top of the label-0 rows and renumber the index from 0.
labelled_frames = [temp_correct_df, temp_incorrect_df]
stacked = pd.concat(labelled_frames)
df = stacked.reset_index(drop=True)
df.head()

#### Preprocessing

In [None]:
# preprocessing steps for similarity computation:
# 1. lower case
# 2. drop non-alphanumeric tokens, except tokens that contain exactly one occurrence of a context-bearing symbol - (['@', '#', '$', '%', '<', '>', '.', ',', '+', '-', '*'])
# 3. remove stopwords
# 4. lemmatize --- experiment

def func_preprocessing(text:str, lemmatize:bool=False):
    """Normalise a piece of text into a space-separated string of kept tokens.

    The text is lower-cased, stripped and tokenised with the module-level
    ``nlp`` pipeline. A token is kept when it is not flagged as a stop word and
    either is purely alphanumeric or contains exactly one occurrence of at
    least one of the context symbols listed below.

    Args:
        text: Raw text to clean.
        lemmatize: When True, emit each kept token's ``lemma_`` instead of its
            surface ``text``.

    Returns:
        The kept tokens joined by single spaces ('' when nothing is kept).
    """
    context_symbols = ('@', '#', '$', '%', '<', '>', '.', ',', '+', '-', '*')

    def _is_kept(tok):
        # Stop words are always dropped, whatever characters they contain.
        if tok.is_stop:
            return False
        if tok.text.isalnum():
            return True
        # A count of exactly 1 also implies the symbol is present in the token.
        return any(tok.text.count(symbol) == 1 for symbol in context_symbols)

    parsed = nlp(text.lower().strip())
    kept_tokens = [
        tok.lemma_ if lemmatize else tok.text
        for tok in parsed
        if _is_kept(tok)
    ]
    return ' '.join(kept_tokens)

# (target column, source column, lemmatize?) in the order the columns are added
# to the frame: plain preprocessing first, then the lemmatized variants.
preprocessing_plan = [
    ('processed_gold_answer', 'gold_answer', False),
    ('processed_student_response', 'student_response', False),
    ('processed_lemmatized_gold_answer', 'gold_answer', True),
    ('processed_lemmatized_student_response', 'student_response', True),
]
for target_col, source_col, use_lemmas in preprocessing_plan:
    df[target_col] = df[source_col].apply(func_preprocessing, lemmatize=use_lemmas)

df.head()

#### Computing BLEURT

In [None]:
# Score every student response against its gold answer three times: on the raw
# text, on the preprocessed text, and on the preprocessed + lemmatized text.
# Each entry is (score column, prediction column, reference column).
bleurt_runs = [
    ('score', 'student_response', 'gold_answer'),
    ('processed_score', 'processed_student_response', 'processed_gold_answer'),
    ('processed_lemmatized_score', 'processed_lemmatized_student_response', 'processed_lemmatized_gold_answer'),
]
for score_col, prediction_col, reference_col in bleurt_runs:
    bleurt_output = bleurt_model.compute(
        predictions=df[prediction_col].tolist(),
        references=df[reference_col].tolist(),
    )
    df[score_col] = bleurt_output['scores']

In [None]:
# Preview the first rows of the frame, which now carries the three score columns.
df.head()

In [None]:
# df.to_csv('~/active-projects/textbook-question-generation/data/aqag-chatgpt-vicuna-bleurt.csv', index=False)

In [None]:
# df = pd.read_csv('~/active-projects/textbook-question-generation/data/aqag-chatgpt-vicuna-bleurt.csv')
# df.head()

In [None]:
df['score'].plot(kind='hist')

In [None]:
df['processed_score'].plot(kind='hist')

In [None]:
df['processed_lemmatized_score'].plot(kind='hist')

#### Scoring

In [None]:
# A response is labelled 1 when its score is strictly greater than the cut-off,
# otherwise 0; the same rule is applied to all three score columns.
SCORE_THRESHOLD = 0.8

def _score_to_label(value):
    """Map one score to a binary label using SCORE_THRESHOLD."""
    return 1 if value > SCORE_THRESHOLD else 0

for score_col, label_col in [
    ('score', 'label'),
    ('processed_score', 'processed_label'),
    ('processed_lemmatized_score', 'processed_lemmatized_label'),
]:
    df[label_col] = df[score_col].apply(_score_to_label)

In [None]:
# --- Labels derived from the raw-text score (`label`) ---
# Confusion matrix; `true_label` is passed first, i.e. as the reference labels.
confusion_matrix(df['true_label'], df['label'])

In [None]:
# Text classification report for the raw-text labels against `true_label`.
print(classification_report(df['true_label'], df['label']))

In [None]:
# Cohen's kappa between `true_label` and the raw-text labels.
cohen_kappa_score(df['true_label'], df['label'])

In [None]:
# --- Labels derived from the preprocessed-text score (`processed_label`) ---
# Confusion matrix; `true_label` is passed first, i.e. as the reference labels.
confusion_matrix(df['true_label'], df['processed_label'])

In [None]:
# Text classification report for the preprocessed-text labels against `true_label`.
print(classification_report(df['true_label'], df['processed_label']))

In [None]:
# Cohen's kappa between `true_label` and the preprocessed-text labels.
cohen_kappa_score(df['true_label'], df['processed_label'])

In [None]:
# --- Labels derived from the preprocessed + lemmatized score (`processed_lemmatized_label`) ---
# Confusion matrix; `true_label` is passed first, i.e. as the reference labels.
confusion_matrix(df['true_label'], df['processed_lemmatized_label'])

In [None]:
# Text classification report for the lemmatized-text labels against `true_label`.
print(classification_report(df['true_label'], df['processed_lemmatized_label']))

In [None]:
# Cohen's kappa between `true_label` and the lemmatized-text labels.
cohen_kappa_score(df['true_label'], df['processed_lemmatized_label'])

In [None]:
df[df['true_label'] == 1]['score'].plot(kind='hist');

In [None]:
df[df['true_label'] == 0]['score'].plot(kind='hist');