In [None]:
%%capture
!pip install transformers
# !pip install sentencepiece==0.1.96
# !pip install pytorch-lightning

In [None]:
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Mount Google Drive at /content/drive. The file paths used later in this notebook
# are written relative to the working directory ('drive/MyDrive/...').
drive.mount('/content/drive')

In [None]:
# this is another fine-tuned classifier based on Roberta, we're not using it here

# from transformers import pipeline
# classifier = pipeline('zero-shot-classification', model='roberta-large-mnli')

In [None]:
# sequence_to_classify = "one day I will see the world"
# candidate_labels = ['travel', 'cooking', 'dancing']
# classifier(sequence_to_classify, candidate_labels)

In [None]:
# sequence_to_classify = "The CEO had a strong handshake."
# candidate_labels = ['male', 'female']
# hypothesis_template = "This text speaks about a {} profession."
# classifier(sequence_to_classify, candidate_labels, hypothesis_template=hypothesis_template)

In [None]:
# 1 use case: score a single premise/hypothesis pair with a pretrained NLI model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

if __name__ == '__main__':
    # Upper bound on the encoded pair length; truncation=True cuts longer inputs.
    max_length = 256

    premise = "Two women are embracing while holding to go packages."
    hypothesis = "The men are fighting outside a deli."

    # Active checkpoint; the commented lines are alternative checkpoints.
    hg_model_hub_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
    # hg_model_hub_name = "ynie/albert-xxlarge-v2-snli_mnli_fever_anli_R1_R2_R3-nli"
    # hg_model_hub_name = "ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli"
    # hg_model_hub_name = "ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli"
    # hg_model_hub_name = "ynie/xlnet-large-cased-snli_mnli_fever_anli_R1_R2_R3-nli"

    tokenizer = AutoTokenizer.from_pretrained(hg_model_hub_name)
    model = AutoModelForSequenceClassification.from_pretrained(hg_model_hub_name)

    tokenized_input_seq_pair = tokenizer.encode_plus(premise, hypothesis,
                                                     max_length=max_length,
                                                     return_token_type_ids=True, truncation=True)

    # Build int64 tensors directly (no float round-trip); wrapping the id list in
    # another list gives the leading batch axis of size 1 that the model expects.
    input_ids = torch.tensor([tokenized_input_seq_pair['input_ids']], dtype=torch.long)
    # remember bart doesn't have 'token_type_ids', remove the line below if you are using bart.
    token_type_ids = torch.tensor([tokenized_input_seq_pair['token_type_ids']], dtype=torch.long)
    attention_mask = torch.tensor([tokenized_input_seq_pair['attention_mask']], dtype=torch.long)

    # Inference only: no_grad avoids recording autograd state for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        labels=None)
    # Note:
    # "id2label": {
    #     "0": "entailment",
    #     "1": "neutral",
    #     "2": "contradiction"
    # },

    # outputs[0] holds the logits; softmax over the class axis, then take the single batch row.
    predicted_probability = torch.softmax(outputs[0], dim=1)[0].tolist()  # batch_size only one

    print("Premise:", premise)
    print("Hypothesis:", hypothesis)
    print("Entailment:", predicted_probability[0])
    print("Neutral:", predicted_probability[1])
    print("Contradiction:", predicted_probability[2])


In [None]:
# !ls drive/MyDrive/ResearchLLM_UWM/RobertaNLI/

In [None]:
# Read the student essays from the Excel workbook stored on Drive.
# The 'Essay' column of this frame is what gets split into sentences later on.
essay_df = pd.read_excel('drive/MyDrive/ResearchLLM_UWM/RobertaNLI/StudentEssays.xlsx', engine='openpyxl')

In [None]:
# Display the loaded essays for a quick visual check.
essay_df

In [None]:
# import nltk
# from nltk.tokenize import sent_tokenize

# # # Sample data
# # data = {
# #     'Essay': [
# #         "The initial hill height that I choose was 45. Why? Because it seemed right.",
# #         "We chose a height of .8 meters. It was a good choice.",
# #         # ... (other essays)
# #     ]
# # }
# # df = pd.DataFrame(data)

# # Ensure you've downloaded the punkt tokenizer models
# nltk.download('punkt')

# # Tokenize sentences and expand the DataFrame
# df_sentences = essay_df['Essay'].apply(sent_tokenize).explode().reset_index(drop=True).to_frame(name='Sentence')

# print(df_sentences)


In [None]:
%%capture
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
# Partition essays into separate sentences (one output row per sentence).
import spacy

nlp = spacy.load("en_core_web_sm")

# Skip missing essays (NaN cells cannot be parsed by spaCy) and coerce the
# remaining cells to text so a stray non-string value does not abort the run.
essay_texts = essay_df['Essay'].dropna().astype(str)

# nlp.pipe processes the texts as a stream, which is the batch-friendly way to
# run a spaCy pipeline over many documents.
sentences = [
    sent.text
    for doc in nlp.pipe(essay_texts)
    for sent in doc.sents
    if sent.text.strip()  # drop whitespace-only segments; they carry nothing to classify
]

# Single-column frame with a fresh 0..n-1 index, as downstream cells expect.
df_sentences = pd.DataFrame({'Sentence': sentences})

In [None]:
# sentence = sum(essay_df.Essay.apply(lambda x: x.split(r'. ')).to_list(), [])
# pd.DataFrame({"Sentence":sentence, "Science_Concept":['Concept']*len(sentence)})

In [None]:
# Display the sentence-level DataFrame (single 'Sentence' column).
df_sentences

In [None]:
# official definitions

# Reference statements describing potential energy (PE). The whole multi-line
# string, including its leading and trailing newline, is stored as one value and
# later used as the hypothesis text for every sentence.
pe_definition = """
Potential energy is energy at rest.
Potential energy is energy that a body has because of its position relative to other bodies.
Potential energy is energy stored in the car at the top of the initial drop.
Potential energy is the stored energy of the rollercoaster car when it is not moving.
Potential energy is energy that has the potential to become another form of energy.
"""

In [None]:
# Reference statements describing kinetic energy (KE), kept as one multi-line string
# (leading and trailing newline included) and used as a hypothesis text.
ke_definition = """
Kinetic energy is energy in motion.
Kinetic energy is energy that the car has because it is moving.
Kinetic energy is the work needed to accelerate the rollercoaster car from rest.
Kinetic energy is determined by the mass of the car and the velocity with which it is moving.
Kinetic energy quantifies the work an object performs due to its motion.
"""

In [None]:
# Reference statements about "LCE" (presumably the law of conservation of energy —
# TODO confirm the expansion), kept as one multi-line string and used as a hypothesis text.
lce_definition = """
LCE says that energy cannot be created or destroyed, only transformed.
LCE states that the total energy of an isolated system remains constant.
LCE states that energy can be converted from one form to another, but never created or destroyed.
LCE says that if there were no friction, the potential energy at the top of the rollercoaster would be the same as the kinetic energy at the bottom of the drop.
LCE is a physical law that states that energy cannot be created or destroyed but only transformed.
"""

In [None]:
# Attach the three definition texts as constant columns: every sentence row
# carries the same PE, KE and LCE definition string, in that column order.
df_sentences = df_sentences.assign(
    pe_definition=pe_definition,
    ke_definition=ke_definition,
    lce_definition=lce_definition,
)

In [None]:
# Display the sentence frame again to inspect its current columns.
df_sentences

In [None]:
# Show the first sentence together with the PE definition it is compared against.
# A notebook cell only displays its last expression, so both values are returned
# as one tuple; on separate lines the first value would be silently discarded.
df_sentences.Sentence[0], df_sentences.pe_definition[0]

In [None]:
# from pretrained Roberta Large
# Load the tokenizer and the NLI sequence-classification model used by the
# sentence-scoring loop later in this notebook.

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Maximum token length handed to the tokenizer when encoding a text pair.
max_length = 256

# Hugging Face Hub id of the active checkpoint; the commented lines are alternatives.
hg_model_hub_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
# hg_model_hub_name = "ynie/albert-xxlarge-v2-snli_mnli_fever_anli_R1_R2_R3-nli"
# hg_model_hub_name = "ynie/bart-large-snli_mnli_fever_anli_R1_R2_R3-nli"
# hg_model_hub_name = "ynie/electra-large-discriminator-snli_mnli_fever_anli_R1_R2_R3-nli"
# hg_model_hub_name = "ynie/xlnet-large-cased-snli_mnli_fever_anli_R1_R2_R3-nli"

tokenizer = AutoTokenizer.from_pretrained(hg_model_hub_name)
model = AutoModelForSequenceClassification.from_pretrained(hg_model_hub_name)


# Note: class index -> label mapping of the model output
# "id2label": {
#     "0": "entailment",
#     "1": "neutral",
#     "2": "contradiction"
# },




In [None]:
# Scratch variables for inspection: the sentence column and the PE-definition column.
# Note that `premise` is reassigned per row inside the scoring loop further down.
premise = df_sentences.Sentence
hypothesis = df_sentences.pe_definition

In [None]:
# premise
# Print the PE-definition column (a pandas Series) for a quick look.
print(hypothesis)

In [None]:
# Loop through the sentence DataFrame and score every sentence against the
# PE, KE and LCE definitions with the NLI model.

from tqdm import tqdm

# Class order of the model's logits (index 0, 1, 2), per its id2label mapping.
labels = ('Entailment', 'Neutral', 'Contradiction')

# Keys of the per-concept probability dicts (the trailing colons are part of the
# stored keys and must stay as they are for anything reading the output).
stat_keys = ("Entailment:", "Neutral:", "Contradiction:")

# One entry per science concept:
# (column holding the hypothesis text, key inside "pred", name of the stats column)
concept_specs = (
    ('pe_definition', 'PE', 'stat_pe'),
    ('ke_definition', 'KE', 'stat_ke'),
    ('lce_definition', 'LCE', 'stat_lce'),
)


def nli_probabilities(premise, hypothesis):
    """Return the class probabilities of one premise/hypothesis pair.

    The pair is encoded with the notebook-level ``tokenizer`` (truncated to
    ``max_length`` tokens) and passed once through the notebook-level ``model``.

    Args:
        premise: Text used as the first sequence of the pair.
        hypothesis: Text used as the second sequence of the pair.

    Returns:
        A list of three floats: the softmax of the model logits, in the order
        entailment, neutral, contradiction.
    """
    encoded = tokenizer.encode_plus(premise, hypothesis,
                                    max_length=max_length,
                                    return_token_type_ids=True, truncation=True)

    # Create the inputs on whatever device the model lives on, so the helper keeps
    # working unchanged if the model is moved to a GPU.
    device = next(model.parameters()).device

    def as_batch(key):
        # int64 tensor with a leading batch axis of size 1, built without a float round-trip.
        return torch.tensor([encoded[key]], dtype=torch.long, device=device)

    # Inference only: no_grad avoids recording autograd state on every forward pass.
    with torch.no_grad():
        outputs = model(as_batch('input_ids'),
                        attention_mask=as_batch('attention_mask'),
                        token_type_ids=as_batch('token_type_ids'),
                        labels=None)

    # outputs[0] holds the logits; softmax over the class axis, then take the single batch row.
    return torch.softmax(outputs[0], dim=1)[0].tolist()


stats_list = []

for _, row in tqdm(df_sentences.iterrows(), total=df_sentences.shape[0], desc="Processing rows"):
    premise = row['Sentence']

    pred = {}
    stats_composed = {"pred": pred}
    for definition_col, concept, stat_col in concept_specs:
        probabilities = nli_probabilities(premise, row[definition_col])
        # Predicted label = class with the highest probability.
        pred[concept] = labels[int(np.argmax(probabilities))]
        stats_composed[stat_col] = dict(zip(stat_keys, probabilities))

    stats_list.append(stats_composed)

# Convert the list of dictionaries to a DataFrame. Columns are fixed explicitly
# (so an empty input still yields the expected columns) and the index is taken
# from df_sentences so the concat below lines up row by row.
result_columns = ["pred"] + [stat_col for _, _, stat_col in concept_specs]
stats_df = pd.DataFrame(stats_list, columns=result_columns, index=df_sentences.index)

# Concatenate the stats DataFrame with the original df_sentences. Result columns
# left over from a previous run of this cell are dropped first, so re-running the
# cell replaces them instead of appending duplicates.
df_sentences = pd.concat(
    [df_sentences.drop(columns=result_columns, errors="ignore"), stats_df],
    axis=1,
)

In [None]:
# Potentially, multiprocessing will accelerate the process
# (not implemented yet; the draft below is kept commented out)


# from tqdm import tqdm
# from multiprocessing import Pool, cpu_count, Lock
# import numpy as np

# # Worker function to process a chunk of the DataFrame
# def process_chunk(chunk):

#   stats_list = []
#   for _, row in chunk.iterrows():
#     premise = row['Sentence']
#     hypothesis1 = row['pe_definition']
#     hypothesis2 = row['ke_definition']
#     hypothesis3 = row['lce_definition']


#     tokenized_input_seq_pair1 = tokenizer.encode_plus(premise, hypothesis1,
#                                                       max_length=max_length,
#                                                       return_token_type_ids=True, truncation=True)

#     tokenized_input_seq_pair2 = tokenizer.encode_plus(premise, hypothesis2,
#                                                       max_length=max_length,
#                                                       return_token_type_ids=True, truncation=True)

#     tokenized_input_seq_pair3 = tokenizer.encode_plus(premise, hypothesis3,
#                                                       max_length=max_length,
#                                                       return_token_type_ids=True, truncation=True)


#     input_ids1 = torch.Tensor(tokenized_input_seq_pair1['input_ids']).long().unsqueeze(0)
#     token_type_ids1 = torch.Tensor(tokenized_input_seq_pair1['token_type_ids']).long().unsqueeze(0)
#     attention_mask1 = torch.Tensor(tokenized_input_seq_pair1['attention_mask']).long().unsqueeze(0)

#     input_ids2 = torch.Tensor(tokenized_input_seq_pair2['input_ids']).long().unsqueeze(0)
#     token_type_ids2 = torch.Tensor(tokenized_input_seq_pair2['token_type_ids']).long().unsqueeze(0)
#     attention_mask2 = torch.Tensor(tokenized_input_seq_pair2['attention_mask']).long().unsqueeze(0)

#     input_ids3 = torch.Tensor(tokenized_input_seq_pair3['input_ids']).long().unsqueeze(0)
#     token_type_ids3 = torch.Tensor(tokenized_input_seq_pair3['token_type_ids']).long().unsqueeze(0)
#     attention_mask3 = torch.Tensor(tokenized_input_seq_pair3['attention_mask']).long().unsqueeze(0)


#     outputs1 = model(input_ids1,
#                     attention_mask=attention_mask1,
#                     token_type_ids=token_type_ids1,
#                     labels=None)

#     outputs2 = model(input_ids2,
#                     attention_mask=attention_mask2,
#                     token_type_ids=token_type_ids2,
#                     labels=None)

#     outputs3 = model(input_ids3,
#                     attention_mask=attention_mask3,
#                     token_type_ids=token_type_ids3,
#                     labels=None)

#     labels = ('Entailment', 'Neutral', 'Contradiction')

#     predicted_probability1 = torch.softmax(outputs1[0], dim=1)[0].tolist()
#     predicted_probability2 = torch.softmax(outputs2[0], dim=1)[0].tolist()
#     predicted_probability3 = torch.softmax(outputs3[0], dim=1)[0].tolist()

#     label1 = labels[np.array(predicted_probability1).argmax()]
#     label2 = labels[np.array(predicted_probability2).argmax()]
#     label3 = labels[np.array(predicted_probability3).argmax()]


#     pred = {"PE":label1, "KE":label2, "LCE":label3}
#     # print("Premise:", premise)
#     # print("Hypothesis:", hypothesis)
#     stat_pe = {"Entailment:": predicted_probability1[0], "Neutral:": predicted_probability1[1], "Contradiction:": predicted_probability1[2]}
#     stat_ke = {"Entailment:": predicted_probability2[0], "Neutral:": predicted_probability2[1], "Contradiction:": predicted_probability2[2]}
#     stat_lce = {"Entailment:": predicted_probability3[0], "Neutral:": predicted_probability3[1], "Contradiction:": predicted_probability3[2]}
#     # print(stat_pe)
#     # print(stat_ke)
#     # print(stat_lce)
#     # print(pred)
#     stats_composed = {"pred":pred, "stat_pe":stat_pe, "stat_ke":stat_ke, "stat_lce":stat_lce}

#     # print("Entailment:", predicted_probability[0])
#     # print("Neutral:", predicted_probability[1])
#     # print("Contradiction:", predicted_probability[2])
#     stats_list.append(stats_composed)



#       # ... [rest of your processing code]
#       # stats_composed = {"pred":pred, "stat_pe":stat_pe, "stat_ke":stat_ke, "stat_lce":stat_lce}
#       # stats_list.append(stats_composed)
#   return stats_list

# # Split the DataFrame into chunks for parallel processing
# num_processes = cpu_count()
# chunk_size = int(df_sentences.shape[0] / num_processes)
# chunks = [df_sentences.iloc[i:i + chunk_size] for i in range(0, df_sentences.shape[0], chunk_size)]

# # Use a Pool of worker processes to process the chunks
# with Pool(processes=num_processes) as pool:
#     results = list(tqdm(pool.imap(process_chunk, chunks), total=len(chunks), desc="Processing chunks"))

# # Flatten the list of results and convert to a DataFrame
# stats_list = [item for sublist in results for item in sublist]
# stats_df = pd.DataFrame(stats_list)

# # Concatenate the stats DataFrame with the original df_sentences
# df_sentences = pd.concat([df_sentences, stats_df], axis=1)


In [None]:
# Build the final table: every column of df_sentences except the one at position 4.
# NOTE(review): with the column order produced by the earlier cells (Sentence, the three
# definition columns, pred, stat_pe, stat_ke, stat_lce) position 4 is 'pred' — confirm
# that dropping the predicted labels from the export is intended.
kept_positions = [pos for pos in range(df_sentences.shape[1]) if pos != 4]
df_sentences_final = df_sentences.iloc[:, kept_positions]

In [None]:
# Save the final table to Drive as an Excel workbook. `index=False` is not passed,
# so pandas' default of writing the row index as the first column applies.
df_sentences_final.to_excel('drive/MyDrive/ResearchLLM_UWM/RobertaNLI/EssayNLIRoberta.xlsx')

In [None]:
# Print the first row of the final table as a plain dict for a quick sanity check.
print(df_sentences_final.iloc[0].to_dict())