# 1. Prepare Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Peek at our student essay dataframe
# se = pd.read_csv('StudentEssays.xlsx')
se = pd.read_excel('StudentEssays.xlsx')

In [None]:
# There are 76 rows in total
# The essays are stored in the column "Essay"
# We need to output our classifications into the columns "PE", "KE", and "LCE"
se

In [None]:
# So, what is our "training data"?
# They are in the "Inference Categories and Examples" section

## Examples from our document

In [None]:
# PE (potential energy) example sentences, one list per rating:
#   PE_AC = acceptable, PE_UN = unacceptable, PE_IN = insufficient
PE_AC = [
    "Potential energy is energy at rest",
    "Potential energy is energy that a body has because of its position relative to other bodies.",
    "Potential energy is energy stored in the car at the top of the initial drop.",
    "Potential energy is the stored energy of the rollercoaster car when it is not moving.",
    "Potential energy is energy that has the potential to become another form of energy.",
]

PE_UN = [
    "Potential energy is energy in motion.",
    "Potential energy is the energy lost as the car goes down the hill.",
    "Potential energy is the opposite of kinetic energy.",
    "Potential energy is energy that is conserved by not moving.",
    "Potential energy is energy measured in joules.",
]

PE_IN = [
    "The potential energy at the top of the rollercoaster is 4.9 joules.",
    "Potential energy is measured in joules.",
    "PE = m*h*9.8",
    "There is more potential energy at the top of the hill than the bottom. ",
    "The potential energy changes into kinetic energy as the car goes down the hill.",
]

In [None]:
# KE (kinetic energy) example sentences, one list per rating:
#   KE_AC = acceptable, KE_UN = unacceptable, KE_IN = insufficient
KE_AC = [
    "Kinetic energy is energy in motion.",
    "Kinetic energy is energy that the car has because it is moving.",
    "Kinetic energy is the work needed to accelerate the rollercoaster car from rest.",
    "Kinetic energy is determined by the mass of the car and the velocity with which it is moving.",
    "Kinetic energy quantifies the work an object performs due to its motion.",
]

# NOTE(review): "Kinetic energy is measured in joules." is listed both here
# and in KE_IN, so the KE training data contains the same text with two
# different labels. Confirm which category it belongs to.
KE_UN = [
    "Kinetic energy is energy at rest.",
    "Kinetic energy is never lost or gained as the car moves through the rollercoaster.",
    "Kinetic energy is the opposite of potential energy.",
    "Kinetic energy is measured in joules.",
    "Kinetic energy is energy that is spent by moving up and down the hill.",
]

KE_IN = [
    "The kinetic energy at the bottom of the hill is 4.8 joules.",
    "Kinetic energy is measured in joules.",
    "KE = m*1/2v^2",
    "There is more kinetic energy at the bottom of the hill than at the top.",
    "Kinetic energy transforms into heat through friction.",
]

In [None]:
# LCE (law of conservation of energy) example sentences, one list per rating:
#   LCE_AC = acceptable, LCE_UN = unacceptable, LCE_IN = insufficient
LCE_AC = [
    "LCE says that energy cannot be created or destroyed, only transformed.",
    "LCE states that the total energy of an isolated system remains constant. ",
    "LCE states that energy can be converted from one form to another, but never created or destroyed.",
    "LCE says that if there were no friction, the potential energy at the top of the rollercoaster would be the same as the kinetic energy at the bottom of the drop.",
    "LCE is a physical law that states that energy cannot be created or destroyed but only transformed.",
]

LCE_UN = [
    "LCE says that energy can be created and destroyed.",
    "LCE states that the energy of a closed system will change.",
    "LCE says that in an open system, energy is conserved.",
]

LCE_IN = [
    "K1 + U1 = K2 + U2",
    "The potential energy transforms into kinetic energy because of the law of conservation of energy.",
    "As the car goes down the hill, some energy is lost to friction as heat.",
    "If there were no friction, the energy would be the same at the start and at the finish.",
]

## Make this into a dataframe

In [None]:
# Now, how do we turn this into training data?
# First, install transformers, datasets and evaluate
# !pip install transformers datasets evaluate

In [None]:
# Ans: The dataset is of the datatypes "DatasetDict" and "Dataset"
# First, we create our data
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import numpy as np

# For each "label", we have 0=unrelated, 1=unacceptable, 2=insufficient, 3=acceptable
# Create dataframe from our arrays for each label
def create_df(label):
    """Build the labelled training dataframe for one topic.

    Every example sentence of all three topics is included. Sentences from
    the requested topic keep their rating (3 = acceptable, 1 = unacceptable,
    2 = insufficient); sentences from the other two topics get 0 (unrelated).

    Args:
        label: Topic to build the data for; one of 'PE', 'KE' or 'LCE'.

    Returns:
        A pandas DataFrame with a 'text' column and an integer 'label' column.

    Raises:
        ValueError: If ``label`` is not 'PE', 'KE' or 'LCE'. Previously any
            other value was silently treated as 'LCE'.
    """
    topics = ('PE', 'KE', 'LCE')
    if label not in topics:
        raise ValueError(
            f"label must be one of {topics}, got {label!r}"
        )
    mark = topics.index(label)

    # One class id per example list, in the same order as `texts` below:
    # everything is 0 (unrelated) except the three lists of the chosen topic,
    # which are ordered acceptable, unacceptable, insufficient.
    classes = [0] * 9
    classes[mark * 3:mark * 3 + 3] = [3, 1, 2]

    texts = [PE_AC, PE_UN, PE_IN, KE_AC, KE_UN, KE_IN, LCE_AC, LCE_UN, LCE_IN]
    text = []
    lb = []
    for class_id, sentences in zip(classes, texts):
        text.extend(sentences)
        lb.extend([class_id] * len(sentences))

    return pd.DataFrame(data={'text': text, 'label': lb})

In [None]:
def create_datasetdict(label):
    """Wrap the training dataframe for ``label`` in a DatasetDict.

    The returned DatasetDict has a single 'train' split whose columns are
    'label' and 'text', taken from ``create_df(label)``.
    """
    frame = create_df(label)
    train_split = Dataset.from_dict({'label': frame['label'], 'text': frame['text']})
    return DatasetDict({'train': train_split})

In [None]:
data = create_datasetdict('PE')
data['train'][0] # works!

# 2. Finetuning Model (For PE)

In [None]:
# Now, onto finetuning the model
import torch
from transformers import AutoTokenizer, BloomForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with truncation enabled."""
    raw_text = examples["text"]
    encoded = tokenizer(raw_text, truncation=True)
    return encoded

In [None]:
tokenized_data = data.map(preprocess_function, batched=True)

In [None]:
tokenized_data['train'][0]

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Return the accuracy metric for a (predictions, labels) pair.

    The predicted class of each row is the index of its largest score along
    axis 1; those indices are compared against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Class ids used by the classifiers and their readable names; label2id is the
# exact inverse of id2label.
id2label = dict(enumerate(["unrelated", "unacceptable", "insufficient", "acceptable"]))
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
model = BloomForSequenceClassification.from_pretrained("bigscience/bloom-560m",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

In [None]:
#!pip install transformers[torch]

In [None]:
from transformers import TrainingArguments, Trainer
# Fine-tuning configuration for the PE classifier.
training_args = TrainingArguments(
    output_dir="finetuned-bloom-560m-PE",
    overwrite_output_dir= True,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases appear to rename this
    # argument to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    # NOTE(review): with save_strategy="no", save_total_limit below presumably
    # has no effect -- confirm against the TrainingArguments docs.
    save_strategy="no",
    load_best_model_at_end=False,
    save_total_limit = 2,
    #push_to_hub=True,
)

# The evaluation set is the training set itself, so the metric computed at
# each evaluation is training accuracy, not held-out accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
trainer.save_model()

# 3. Evaluation

Now we will run our model on the students' essays.

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
#nltk.download('popular')

# Returns a list of sentences of a given text
def split_sent(text):
    """Split ``text`` into sentences using ``sent_tokenize`` and return the result."""
    sentences = sent_tokenize(text)
    return sentences

In [None]:
# Load Model
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
model = BloomForSequenceClassification.from_pretrained("./finetuned-bloom-560m-PE",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

In [None]:
from transformers import pipeline
classifier = pipeline("text-classification", model=model, tokenizer = tokenizer)

In [None]:
#split_sent(se['Essay'][0])

In [None]:
#x = classifier(split_sent(se['Essay'][0]))

In [None]:
#y = [_['label'] for _ in x]
#y

In [None]:
#np.sum([_ == 'insufficient' for _ in y])

In [None]:
#np.sum([_ == 'unrelated' for _ in y])

In [None]:
#np.sum([_ == 'unacceptable' for _ in y])

In [None]:
#np.sum([_ == 'acceptable' for _ in y])

In [None]:
# Take an explicit copy of the essay column so that pandas does not treat the
# count columns added later as writes to a slice of `se`
# (which triggers SettingWithCopyWarning).
new_df = se[['Essay']].copy()

In [None]:
#new_df

In [None]:
# np.zeros(new_df.shape[0])

In [None]:
# Start every PE count column at 0.0. A scalar is broadcast to all rows, so
# this does not depend on new_df having a 0..n-1 index (assigning a fresh
# pd.Series aligns on the index and yields NaN where the labels do not match).
for column in ('PE_AC', 'PE_IN', 'PE_UN', 'PE_unrelated'):
    new_df[column] = 0.0
new_df

In [None]:
from collections import Counter

# For every essay, classify each of its sentences with the PE model and store
# how many sentences fall into each class in the matching PE_* column.
pe_column_by_class = {
    'insufficient': 'PE_IN',
    'acceptable': 'PE_AC',
    'unacceptable': 'PE_UN',
    'unrelated': 'PE_unrelated',
}
for i, essay in se['Essay'].items():
    # Non-text cells (e.g. NaN for a blank cell) cannot be sentence-split;
    # leave their counts untouched instead of crashing mid-run.
    if not isinstance(essay, str):
        continue
    res = classifier(split_sent(essay))
    tally = Counter(x['label'] for x in res)
    for class_name, column in pe_column_by_class.items():
        # Counter returns 0 for classes that were never predicted.
        new_df.at[i, column] = tally[class_name]

In [None]:
new_df

In [None]:
new_df.to_csv('StudentEssaysPE.csv')

# 4. Now, do the same for KE and LCE

## Kinetic Energy

In [None]:
data = create_datasetdict('KE')
data['train'][0] # works!

In [None]:
tokenized_data = data.map(preprocess_function, batched=True)

In [None]:
model = BloomForSequenceClassification.from_pretrained("bigscience/bloom-560m",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

In [None]:
from transformers import TrainingArguments, Trainer
# Fine-tuning configuration for the KE classifier; apart from output_dir the
# values are the same as in the PE run.
training_args = TrainingArguments(
    output_dir="finetuned-bloom-560m-KE",
    overwrite_output_dir= True,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases appear to rename this
    # argument to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    save_total_limit = 2,
    #push_to_hub=True,
)

# The evaluation set is the training set itself, so the metric computed at
# each evaluation is training accuracy, not held-out accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
trainer.save_model()

In [None]:
model = BloomForSequenceClassification.from_pretrained("./finetuned-bloom-560m-KE",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

In [None]:
classifier = pipeline("text-classification", model=model, tokenizer = tokenizer)

In [None]:
# Start every KE count column at 0.0. A scalar is broadcast to all rows, so
# this does not depend on new_df having a 0..n-1 index (assigning a fresh
# pd.Series aligns on the index and yields NaN where the labels do not match).
for column in ('KE_AC', 'KE_IN', 'KE_UN', 'KE_unrelated'):
    new_df[column] = 0.0
new_df

In [None]:
from collections import Counter

# For every essay, classify each of its sentences with the KE model and store
# how many sentences fall into each class in the matching KE_* column.
ke_column_by_class = {
    'insufficient': 'KE_IN',
    'acceptable': 'KE_AC',
    'unacceptable': 'KE_UN',
    'unrelated': 'KE_unrelated',
}
for i, essay in se['Essay'].items():
    # Non-text cells (e.g. NaN for a blank cell) cannot be sentence-split;
    # leave their counts untouched instead of crashing mid-run.
    if not isinstance(essay, str):
        continue
    res = classifier(split_sent(essay))
    tally = Counter(x['label'] for x in res)
    for class_name, column in ke_column_by_class.items():
        # Counter returns 0 for classes that were never predicted.
        new_df.at[i, column] = tally[class_name]

In [None]:
new_df

In [None]:
new_df.to_csv('StudentEssaysPEKE.csv')

## LCE

In [None]:
data = create_datasetdict('LCE')
data['train'][-1] # works!

In [None]:
tokenized_data = data.map(preprocess_function, batched=True)

In [None]:
model = BloomForSequenceClassification.from_pretrained("bigscience/bloom-560m",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

In [None]:
from transformers import TrainingArguments, Trainer
# Fine-tuning configuration for the LCE classifier; apart from output_dir the
# values are the same as in the PE and KE runs.
training_args = TrainingArguments(
    output_dir="finetuned-bloom-560m-LCE",
    overwrite_output_dir= True,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases appear to rename this
    # argument to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    save_total_limit = 2,
    #push_to_hub=True,
)

# The evaluation set is the training set itself, so the metric computed at
# each evaluation is training accuracy, not held-out accuracy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
trainer.save_model()

In [None]:
model = BloomForSequenceClassification.from_pretrained("./finetuned-bloom-560m-LCE",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

In [None]:
classifier = pipeline("text-classification", model=model, tokenizer = tokenizer)

In [None]:
# Start every LCE count column at 0.0. A scalar is broadcast to all rows, so
# this does not depend on new_df having a 0..n-1 index (assigning a fresh
# pd.Series aligns on the index and yields NaN where the labels do not match).
for column in ('LCE_AC', 'LCE_IN', 'LCE_UN', 'LCE_unrelated'):
    new_df[column] = 0.0
new_df

In [None]:
from collections import Counter

# For every essay, classify each of its sentences with the LCE model and store
# how many sentences fall into each class in the matching LCE_* column.
lce_column_by_class = {
    'insufficient': 'LCE_IN',
    'acceptable': 'LCE_AC',
    'unacceptable': 'LCE_UN',
    'unrelated': 'LCE_unrelated',
}
for i, essay in se['Essay'].items():
    # Non-text cells (e.g. NaN for a blank cell) cannot be sentence-split;
    # leave their counts untouched instead of crashing mid-run.
    if not isinstance(essay, str):
        continue
    res = classifier(split_sent(essay))
    tally = Counter(x['label'] for x in res)
    for class_name, column in lce_column_by_class.items():
        # Counter returns 0 for classes that were never predicted.
        new_df.at[i, column] = tally[class_name]

In [None]:
new_df

In [None]:
new_df.to_csv('StudentEssaysPEKELCE.csv')

In [None]:
# !pip3 install torch torchvision torchaudio

In [None]:
# !pip install --upgrade transformers

In [None]:
# Ok, a "multi-label classifier" can assign anywhere from 0 labels to all labels
# A "multi-class classifier" can assign only 1 label
# What I am "ideally" looking for is a "multi-class classification" inside each of 3 "multi-label classifiers"
# For now, let's have 3 multi-class classification models instead.