In [1]:
import numpy as np
import pandas as pd

import evaluate
from sentence_transformers import SentenceTransformer, util
from transformers import T5Tokenizer, T5ForConditionalGeneration, PegasusTokenizer, PegasusForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# rouge_model = evaluate.load('rouge')
# similarity_model = SentenceTransformer('stsb-roberta-large')

# pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
# pegasus_model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-large')

# t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
# t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

def generate_summary(tokenizer, model, text):

    text = f"Summarize: {text}"
    inputs = tokenizer.encode(text, return_tensors="pt", truncation=True)
    
    summary_ids = model.generate(inputs, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True, temperature=0.1, repetition_penalty=2.0)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    return summary

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
df = pd.read_csv('~/active-projects/textbook-question-generation/data/aqag-chatgpt-vicuna.csv')
# df.rename({'correct_answer_vicuna': 'gold_answer'}, axis=1, inplace=True)
df.rename({'correct_answer': 'gold_answer'}, axis=1, inplace=True)
df.rename({'correct_answer_vicuna': 'correct_answer'}, axis=1, inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,index,module,chapter,section,subsection,heading,raw_text,clean_text,slug,question,gold_answer,incorrect_answer,type,correct_answer
0,0,0,1,1,0,0,Decisions ... Decisions in the Social Media Age,Every day we are faced with a myriad of decisi...,Every day we are faced with a myriad of decisi...,decisions--decisions-in-the-social-media-age,How can social media alter how we make decisions?,Social media outlets like Facebook and Twitter...,Social media outlets like Facebook and Twitter...,recall,Social media can alter how we make decisions b...
1,1,1,1,1,0,1,Introduction,What is economics and why should you spend you...,What is economics and why should you spend you...,introduction,What is economics?,Economics is both a subject area and a way of ...,Economics is primarily about money or finance.,recall,Economics is a subject area that studies how p...
2,2,2,1,1,1,0,Overview,"import Alert from ""react-bootstrap/Alert"";\nim...","By the end of this section, you will be able t...",overview,What is scarcity?,Scarcity is a fact of life in which human want...,Scarcity is when resources are infinite and hu...,recall,Scarcity refers to the limited availability of...
3,3,3,1,1,1,1,Introduction to FRED,Data is very important in economics because it...,Data is very important in economics because it...,introduction-to-fred,What is the name of the database where most of...,The St. Louis Federal Reserve Bank's FRED data...,The US Census Bureau database.,recall,The data used for this course is obtained from...
4,4,4,1,1,1,2,The Problem of Scarcity,"Think about all the things you consume: food, ...","Think about all the things you consume: food, ...",the-problem-of-scarcity,What economic principle does Adam Smith first ...,Division and specialization of labor.,Scarcity.,recall,Adam Smith first put forth the division and sp...


In [4]:
# df.shape, df.dropna(subset=['clean_text', 'question', 'correct_answer', 'incorrect_answer', 'gold_answer']).shape

In [None]:
df.dropna(subset=['clean_text', 'question', 'correct_answer', 'incorrect_answer', 'gold_answer'], inplace=True)
# df.shape

In [5]:
temp_correct_df = df[['clean_text', 'question', 'gold_answer', 'correct_answer']]
temp_correct_df.rename({'correct_answer': 'student_response'}, axis=1, inplace=True)
temp_correct_df['true_label'] = 1
# temp_correct_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_correct_df.rename({'correct_answer': 'student_response'}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_correct_df['true_label'] = 1


In [6]:
temp_incorrect_df = df[['clean_text', 'question', 'gold_answer', 'incorrect_answer']]
temp_incorrect_df.rename({'incorrect_answer': 'student_response'}, axis=1, inplace=True)
temp_incorrect_df['true_label'] = 0
# temp_incorrect_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_incorrect_df.rename({'incorrect_answer': 'student_response'}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_incorrect_df['true_label'] = 0


In [7]:
# df.shape, temp_correct_df.shape, temp_incorrect_df.shape

In [8]:
df = pd.concat([temp_correct_df, temp_incorrect_df]).reset_index(drop=True)

In [9]:
# df['ga_summary'] = df['gold_answer'].apply(lambda x
i=1

In [17]:
df['gold_answer'][i:i+1].apply(lambda x: generate_summary(pegasus_tokenizer, pegasus_model, x))[i]

'Summarize: Economics is both a subject area and a way of viewing the world.'

In [18]:
df['gold_answer'][i:i+1].apply(lambda x: generate_summary(t5_tokenizer, t5_model, x))[i]

': Economics is both a subject area and a way of viewing the world.: Economics is both a subject area and a way of viewing the world.'

In [19]:
df['gold_answer'][i]

'Economics is both a subject area and a way of viewing the world.'

In [20]:
df['student_response'][i:i+1].apply(lambda x: generate_summary(pegasus_tokenizer, pegasus_model, x))[i]

'Summarize: Economics is a subject area that studies how people make choices and how those choices affect the economy.'

In [21]:
df['student_response'][i:i+1].apply(lambda x: generate_summary(t5_tokenizer, t5_model, x))[i]

': Economics is a subject area that studies how people make choices and how those choices affect the economy. To summarize: Economics is a subject area that studies how people make choices and how those choices affect the economy.'

In [22]:
df['student_response'][i]

'Economics is a subject area that studies how people make choices and how those choices affect the economy.'

In [24]:
generate_summary(pegasus_tokenizer, pegasus_model, df.iloc[0]['clean_text'])

'We rarely have the data we need to make perfect decisions, we are faced with what economists call “imperfect information,” but we still make hundreds of decisions a day.'

In [23]:
df.iloc[0]['clean_text']

'Every day we are faced with a myriad of decisions, from the simple question of what to have for breakfast to more complex choices like whether to double major. Our response to these decisions depends on the information we have available at any given moment. We rarely have the data we need to make perfect decisions, we are faced with what economists call “imperfect information,” but we still make hundreds of decisions a day.\nNow we have another avenue to gather information—social media. Outlets like Facebook and Twitter are altering how we make choices, how we spend our time, which products we buy, and more. How many of you chose a university without first checking its social media presence for information and feedback? As you will see in this course, what happens in economics is affected by how well and how fast information disseminates through channels like social media.\nThis chapter is an introduction to the world of making decisions, processing information, and understanding beha

#### Preprocessing

In [None]:
# preprocessing steps for similarity computation:
# 1. lower case
# 2. remove non-alphanumeric characters except those bringing in context - (['@', '#', '$', '%', '*', '<', '>', '.', ','])
# 3. remove stopwords
# 4. lemmatize --- experiment

def func_preprocessing(text:str, lemmatize:bool=False):

    return_list = list()
    spacy_document = spacy_model(text.lower().strip())
    for token in [token for token in spacy_document]:
        if (token.text.isalnum() or any(i in token.text and token.text.count(i) == 1 for i in ['@', '#', '$', '%', '<', '>', '.', ',', '+', '-', '*'])) and (not token.is_stop):
            if lemmatize:
                return_list.append(token.lemma_)
            else:
                return_list.append(token.text)
    
    return ' '.join(return_list)

df['processed_gold_answer'] = df['gold_answer'].apply(lambda x: func_preprocessing(x))
df['processed_student_response'] = df['student_response'].apply(lambda x: func_preprocessing(x))

df['processed_gold_answer_lemmatized'] = df['gold_answer'].apply(lambda x: func_preprocessing(x, lemmatize=True))
df['processed_student_response_lemmatized'] = df['student_response'].apply(lambda x: func_preprocessing(x, lemmatize=True))

df.head()

#### Computing Cosine Similarity

In [None]:
# computing similarity between correct and incorect answer
def compute_similarity(list_answers: list):
    # calculating embeddings for the list -> [correct answer, incorrect answer]
    embeddings = similarity_model.encode(list_answers, batch_size=16)
    # returning similarity
    return util.pytorch_cos_sim(embeddings[0], embeddings[1])[0].item()

df['processed_similarity'] = df.apply(lambda x: compute_similarity([x['processed_gold_answer'], x['processed_student_response']]), axis=1)
df['processed_lemmatized_similarity'] = df.apply(lambda x: compute_similarity([x['processed_gold_answer_lemmatized'], x['processed_student_response_lemmatized']]), axis=1)

df.head()

#### Computing ROUGE

In [None]:
# computing rouge_n. n is calculated as follows - 
# minimum of (9 OR 'half of number of tokens in correct answer' OR 'half of number of tokens in incorrect answer') - including 9 because that is the highest that evaluate library can compute
# maximum of (1 OR the output of above) - to prevent n from being equal to 0.

compute_rouge = lambda predictions, references, n: rouge_model.compute(predictions=[predictions], references=[references], rouge_types=[f'rouge{n}'])
get_n = lambda t1, t2: max(1, min(9, int(len(t1.split()) / 2), int(len(t2.split()) / 2)))

df['processed_rouge'] = df.apply(lambda x: list(compute_rouge(x['processed_student_response'], x['processed_gold_answer'], get_n(x['processed_student_response'], x['processed_gold_answer'])).values())[0], axis=1)
df['processed_lemmatized_rouge'] = df.apply(lambda x: list(compute_rouge(x['processed_student_response_lemmatized'], x['processed_gold_answer_lemmatized'], get_n(x['processed_student_response_lemmatized'], x['processed_gold_answer_lemmatized'])).values())[0], axis=1)

In [None]:
df.head()

In [None]:
df.to_csv('~/active-projects/textbook-question-generation/data/aqag-chatgpt-vicuna-with-rouge-and-sim-vga.csv', index=False)

In [None]:
df['processed_rouge'].plot(kind='hist')

In [None]:
df['processed_lemmatized_rouge'].plot(kind='hist')

In [None]:
df['processed_similarity'].plot(kind='hist')

In [None]:
df['processed_lemmatized_similarity'].plot(kind='hist')

#### Scoring

In [None]:
# scoring method -
# 1. if similarity > 0.90 and rouge > 0.90 -> mark as correct
# 2. if similarity > 0.95 and rouge > 0.85 -> mark as correct
# 3. if similarity > 0.85 and rouge > 0.95 -> mark as correct
# 4. else incorrect
# return 1 for correct and 0 for incorrect

def score(similarity_score: float, rouge_score: float):
    if (similarity_score >= 0.90 and rouge_score >= 0.90) or (similarity_score >= 0.95 and rouge_score >= 0.85) or (similarity_score >= 0.85 and rouge_score >= 0.95):
        return 1
    return 0

df['processed_score'] = df.apply(lambda x: score(x['processed_similarity'], x['processed_rouge']), axis=1)
df['processed_lemmatized_score'] = df.apply(lambda x: score(x['processed_lemmatized_similarity'], x['processed_lemmatized_rouge']), axis=1)

In [None]:
confusion_matrix(df['true_label'], df['processed_score'])

In [None]:
print(classification_report(df['true_label'], df['processed_score']))

In [None]:
cohen_kappa_score(df['true_label'], df['processed_score'])

In [None]:
confusion_matrix(df['true_label'], df['processed_lemmatized_score'])

In [None]:
print(classification_report(df['true_label'], df['processed_lemmatized_score']))

In [None]:
cohen_kappa_score(df['true_label'], df['processed_lemmatized_score'])

In [None]:
# scoring method -
# 1. if similarity > 0.90 and rouge > 0.90 -> mark as correct
# 2. if similarity > 0.95 and rouge > 0.85 -> mark as correct
# 3. if similarity > 0.85 and rouge > 0.95 -> mark as correct
# 4. else incorrect
# return 1 for correct and 0 for incorrect

def score(similarity_score: float, rouge_score: float):
    
    for threshold in [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:
    if (similarity_score >= 0.90 and rouge_score >= 0.90) or (similarity_score >= 0.95 and rouge_score >= 0.85) or (similarity_score >= 0.85 and rouge_score >= 0.95):
        return 1
    return 0

df['processed_score'] = df.apply(lambda x: score(x['processed_similarity'], x['processed_rouge']), axis=1)
df['processed_lemmatized_score'] = df.apply(lambda x: score(x['processed_lemmatized_similarity'], x['processed_lemmatized_rouge']), axis=1)

In [None]:
df = pd.read_csv('~/active-projects/textbook-question-generation/data/aqag-chatgpt-vicuna-with-rouge-and-sim-vga.csv')

In [None]:
df.head()

In [None]:
# from transformers import pipeline
summary_model = pipeline("summarization", model= "csebuetnlp/mT5_multilingual_XLSum")

In [None]:
i = 0
ga = df.iloc[1]['gold_answer']
sr = df.iloc[1]['student_response']

In [None]:
ga

In [None]:
sr

In [None]:
summary_model([ga, sr])

In [None]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_gigaword")
# model = AutoModelForSeq2SeqLM.from_pretrained("google/roberta2roberta_L-24_gigaword")

# input_ids = tokenizer(ga, return_tensors="pt").input_ids
# output_ids = model.generate(input_ids)[0]
# print(tokenizer.decode(output_ids, skip_special_tokens=True))