# 1. Prepare Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Peek at our student essay dataframe
# se = pd.read_csv('StudentEssays.xlsx')
se = pd.read_excel('StudentEssays.xlsx')

In [3]:
# There are 76 rows in total
# The essays are stored in the column "Essay"
# We need to output our classifications into the columns "PE", "KE", and "LCE"
se

Unnamed: 0,Essay ID,Essay,PE,KE,LCE
0,"L24-RCFinal proposal, L3b8-23_RCFinal proposal...",The initial hill height that I choose was 45 m...,,,
1,L3b8-21_RCFinal proposal,We chose a height of .8 meters because it is a...,,,
2,"L23-RCFinal proposal, L22-RCFinal proposal, L3...",The initial drop must be the highest point of ...,,,
3,L21-RCFinal proposal & L3b8-24_RCFinal proposal,The initial drop height that I have chosen is ...,,,
4,L3b8-17_RCFinal proposal,For our initial drop the height was .77m. PE=m...,,,
...,...,...,...,...,...
71,L3b6-11_RCFinal proposal/L1c2-12_RCFinal proposal,\nFor the Gonzales family we chose .6 meters f...,,,
72,L1c2-02_RCFinal proposal/L1c3-23_RCFinal propo...,"For mass, .74kg was chosen.The initial drop he...",,,
73,L1c1-23_RCFinal proposal/L1c2-01_RCFinal proposal,My team put our car mass at .50kg because if m...,,,
74,L1c1-22_RCFinal proposal/L1c1-21_RCFinal propo...,My team and I choose 0.2 kg car mass because t...,,,


In [4]:
# So, what is our "training data"?
# They are in "Interence Categories and Examples" sesstion

## Examples from our document

In [5]:
# PE
PE_AC = ['Potential energy is energy at rest',
'Potential energy is energy that a body has because of its position relative to other bodies.',
'Potential energy is energy stored in the car at the top of the initial drop.',
'Potential energy is the stored energy of the rollercoaster car when it is not moving.',
'Potential energy is energy that has the potential to become another form of energy.']

PE_UN = ["Potential energy is energy in motion.","Potential energy is the energy lost as the car goes down the hill.","Potential energy is the opposite of kinetic energy.","Potential energy is energy that is conserved by not moving.","Potential energy is energy measured in joules."]

PE_IN = ["The potential energy at the top of the rollercoaster is 4.9 joules.","Potential energy is measured in joules.","PE = m*h*9.8","There is more potential energy at the top of the hill than the bottom. ","The potential energy changes into kinetic energy as the car goes down the hill.",]

In [6]:
# KE
KE_AC = ["Kinetic energy is energy in motion.","Kinetic energy is energy that the car has because it is moving.","Kinetic energy is the work needed to accelerate the rollercoaster car from rest.","Kinetic energy is determined by the mass of the car and the velocity with which it is moving.","Kinetic energy quantifies the work an object performs due to its motion.",]

KE_UN = ["Kinetic energy is energy at rest.","Kinetic energy is never lost or gained as the car moves through the rollercoaster.","Kinetic energy is the opposite of potential energy.","Kinetic energy is measured in joules.","Kinetic energy is energy that is spent by moving up and down the hill.",]

KE_IN = ["The kinetic energy at the bottom of the hill is 4.8 joules.","Kinetic energy is measured in joules.","KE = m*1/2v^2","There is more kinetic energy at the bottom of the hill than at the top.","Kinetic energy transforms into heat through friction.",]

In [7]:
# LCE
LCE_AC = ["LCE says that energy cannot be created or destroyed, only transformed.","LCE states that the total energy of an isolated system remains constant. ","LCE states that energy can be converted from one form to another, but never created or destroyed.","LCE says that if there were no friction, the potential energy at the top of the rollercoaster would be the same as the kinetic energy at the bottom of the drop.","LCE is a physical law that states that energy cannot be created or destroyed but only transformed.",]

LCE_UN = ["LCE says that energy can be created and destroyed.","LCE states that the energy of a closed system will change.","LCE says that in an open system, energy is conserved.",]

LCE_IN = ["K1 + U1 = K2 + U2","The potential energy transforms into kinetic energy because of the law of conservation of energy.","As the car goes down the hill, some energy is lost to friction as heat.","If there were no friction, the energy would be the same at the start and at the finish."]

## Make this into a dataframe

In [8]:
# Now, how do we make this into a training data?
# First, install transformers, datasets and evaluate
# !pip install transformers datasets evaluate

In [9]:
# Ans: The dataset is in the datatype "Datasetdict" and "Dataset"
# First, we create our data
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import numpy as np

# For each "label", we have 0=unrelated, 1=unacceptable, 2=insufficient, 3=acceptable
# Create dataframe from our arrays for each label
def create_df(label):
    mark = 0
    if label=='PE':
        mark = 0
    elif label=='KE':
        mark = 1
    else:
        mark = 2
    
    classes = [0 for i in range(9)]
    classes[mark*3] = 3
    classes[mark*3+1] = 1
    classes[mark*3+2] = 2
    
    text = []
    lb = []
    texts = [PE_AC,PE_UN,PE_IN,KE_AC,KE_UN,KE_IN,LCE_AC,LCE_UN,LCE_IN]
    for i in range(len(texts)):
        for j in texts[i]:
            text.append(j)
            lb.append(classes[i])
    
    d = {'text': text, 'label': lb}
    return pd.DataFrame(data=d)

In [10]:
def create_datasetdict(label):
    df = create_df(label) # success!
    # Now, onto create data for training
    d = {'train':Dataset.from_dict({'label':df['label'],'text':df['text']}),}
    data = DatasetDict(d)
    return data

In [11]:
data = create_datasetdict('PE')
data['train'][0] # works!

{'label': 3, 'text': 'Potential energy is energy at rest'}

# 2. Finetuning Model (For PE)

In [12]:
# Now, onto finetuning the model
import torch
from transformers import AutoTokenizer, BloomForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")

In [13]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [14]:
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [15]:
tokenized_data['train'][0]

{'label': 3,
 'text': 'Potential energy is energy at rest',
 'input_ids': [20351, 8625, 15883, 632, 15883, 919, 3097],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [16]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
import evaluate

accuracy = evaluate.load("accuracy")

In [18]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [19]:
id2label = {0:"unrelated", 1:"unacceptable", 2:"insufficient", 3:"acceptable"}
label2id = {"unrelated":0, "unacceptable":1, "insufficient":2, "acceptable":3}

In [20]:
model = BloomForSequenceClassification.from_pretrained("bigscience/bloom-560m",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
#!pip install transformers[torch]

In [22]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="finetuned-bloom-560m-PE",
    overwrite_output_dir= True,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    save_total_limit = 2,
    #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,15.790364,0.571429
2,No log,7.950756,0.690476
3,No log,8.154753,0.547619
4,No log,2.868347,0.809524
5,No log,0.526321,0.904762
6,No log,0.368621,0.880952
7,No log,0.45017,0.904762
8,No log,0.41554,0.928571


TrainOutput(global_step=16, training_loss=8.139572143554688, metrics={'train_runtime': 218.1351, 'train_samples_per_second': 1.54, 'train_steps_per_second': 0.073, 'total_flos': 17819806728192.0, 'train_loss': 8.139572143554688, 'epoch': 8.0})

In [23]:
trainer.save_model()

# 3. Evaluation

Now we will use our model against the student's essays

In [24]:
from nltk.tokenize import sent_tokenize, word_tokenize
#nltk.download('popular')

# Returns a list of sentences of a given text
def split_sent(text):
    return sent_tokenize(text)

In [25]:
# Load Model
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
model = BloomForSequenceClassification.from_pretrained("./finetuned-bloom-560m-PE",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

In [26]:
from transformers import pipeline
classifier = pipeline("text-classification", model=model, tokenizer = tokenizer)

In [27]:
#split_sent(se['Essay'][0])

In [28]:
#x = classifier(split_sent(se['Essay'][0]))

In [29]:
#y = [_['label'] for _ in x]
#y

In [30]:
#np.sum([_ == 'insufficient' for _ in y])

In [31]:
#np.sum([_ == 'unrelated' for _ in y])

In [32]:
#np.sum([_ == 'unacceptable' for _ in y])

In [33]:
#np.sum([_ == 'acceptable' for _ in y])

In [34]:
new_df = se[['Essay']]

In [35]:
#new_df

In [36]:
# np.zeros(new_df.shape[0])

In [37]:
new_df['PE_AC'] = pd.Series(np.zeros(new_df.shape[0]))
new_df['PE_IN'] = pd.Series(np.zeros(new_df.shape[0]))
new_df['PE_UN'] = pd.Series(np.zeros(new_df.shape[0]))
new_df['PE_unrelated'] = pd.Series(np.zeros(new_df.shape[0]))
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['PE_AC'] = pd.Series(np.zeros(new_df.shape[0]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['PE_IN'] = pd.Series(np.zeros(new_df.shape[0]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['PE_UN'] = pd.Series(np.zeros(new_df.shape[0]))


Unnamed: 0,Essay,PE_AC,PE_IN,PE_UN,PE_unrelated
0,The initial hill height that I choose was 45 m...,0.0,0.0,0.0,0.0
1,We chose a height of .8 meters because it is a...,0.0,0.0,0.0,0.0
2,The initial drop must be the highest point of ...,0.0,0.0,0.0,0.0
3,The initial drop height that I have chosen is ...,0.0,0.0,0.0,0.0
4,For our initial drop the height was .77m. PE=m...,0.0,0.0,0.0,0.0
...,...,...,...,...,...
71,\nFor the Gonzales family we chose .6 meters f...,0.0,0.0,0.0,0.0
72,"For mass, .74kg was chosen.The initial drop he...",0.0,0.0,0.0,0.0
73,My team put our car mass at .50kg because if m...,0.0,0.0,0.0,0.0
74,My team and I choose 0.2 kg car mass because t...,0.0,0.0,0.0,0.0


In [38]:
# we will do this for every essay
# What column for each topic do we have?
# AC, IN, UN, unrelated
for i in range(new_df.shape[0]):
    sentences = split_sent(se['Essay'][i])
    res = classifier(sentences)
    predictions = pd.Series([x['label'] for x in res])
    new_df.at[i, 'PE_IN'] = np.sum([_ == 'insufficient' for _ in predictions])
    new_df.at[i, 'PE_AC'] = np.sum([_ == 'acceptable' for _ in predictions])
    new_df.at[i, 'PE_UN'] = np.sum([_ == 'unacceptable' for _ in predictions])
    new_df.at[i, 'PE_unrelated'] = np.sum([_ == 'unrelated' for _ in predictions])

In [39]:
new_df

Unnamed: 0,Essay,PE_AC,PE_IN,PE_UN,PE_unrelated
0,The initial hill height that I choose was 45 m...,0.0,0.0,0.0,11.0
1,We chose a height of .8 meters because it is a...,0.0,0.0,0.0,8.0
2,The initial drop must be the highest point of ...,0.0,0.0,0.0,11.0
3,The initial drop height that I have chosen is ...,0.0,0.0,0.0,9.0
4,For our initial drop the height was .77m. PE=m...,0.0,0.0,0.0,9.0
...,...,...,...,...,...
71,\nFor the Gonzales family we chose .6 meters f...,0.0,0.0,0.0,8.0
72,"For mass, .74kg was chosen.The initial drop he...",0.0,0.0,0.0,9.0
73,My team put our car mass at .50kg because if m...,0.0,0.0,0.0,9.0
74,My team and I choose 0.2 kg car mass because t...,0.0,0.0,0.0,13.0


In [40]:
new_df.to_csv('StudentEssaysPE.csv')

# 5. Now, do the same for KE and LCE

## Kinetic Energy

In [41]:
data = create_datasetdict('KE')
data['train'][0] # works!

{'label': 0, 'text': 'Potential energy is energy at rest'}

In [42]:
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [43]:
model = BloomForSequenceClassification.from_pretrained("bigscience/bloom-560m",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="finetuned-bloom-560m-KE",
    overwrite_output_dir= True,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    save_total_limit = 2,
    #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,10.670832,0.452381
2,No log,6.349006,0.714286
3,No log,18.948235,0.642857
4,No log,11.866325,0.52381
5,No log,5.265879,0.690476
6,No log,4.650671,0.785714
7,No log,4.081443,0.761905
8,No log,3.418178,0.785714


TrainOutput(global_step=16, training_loss=8.027241706848145, metrics={'train_runtime': 218.2178, 'train_samples_per_second': 1.54, 'train_steps_per_second': 0.073, 'total_flos': 17819806728192.0, 'train_loss': 8.027241706848145, 'epoch': 8.0})

In [45]:
trainer.save_model()

In [46]:
model = BloomForSequenceClassification.from_pretrained("./finetuned-bloom-560m-KE",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

In [47]:
classifier = pipeline("text-classification", model=model, tokenizer = tokenizer)

In [48]:
new_df['KE_AC'] = pd.Series(np.zeros(new_df.shape[0]))
new_df['KE_IN'] = pd.Series(np.zeros(new_df.shape[0]))
new_df['KE_UN'] = pd.Series(np.zeros(new_df.shape[0]))
new_df['KE_unrelated'] = pd.Series(np.zeros(new_df.shape[0]))
new_df

Unnamed: 0,Essay,PE_AC,PE_IN,PE_UN,PE_unrelated,KE_AC,KE_IN,KE_UN,KE_unrelated
0,The initial hill height that I choose was 45 m...,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0
1,We chose a height of .8 meters because it is a...,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
2,The initial drop must be the highest point of ...,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0
3,The initial drop height that I have chosen is ...,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0
4,For our initial drop the height was .77m. PE=m...,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
71,\nFor the Gonzales family we chose .6 meters f...,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
72,"For mass, .74kg was chosen.The initial drop he...",0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0
73,My team put our car mass at .50kg because if m...,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0
74,My team and I choose 0.2 kg car mass because t...,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0


In [49]:
# we will do this for every essay
# What column for each topic do we have?
# AC, IN, UN, unrelated
for i in range(new_df.shape[0]):
    sentences = split_sent(se['Essay'][i])
    res = classifier(sentences)
    predictions = pd.Series([x['label'] for x in res])
    new_df.at[i, 'KE_IN'] = np.sum([_ == 'insufficient' for _ in predictions])
    new_df.at[i, 'KE_AC'] = np.sum([_ == 'acceptable' for _ in predictions])
    new_df.at[i, 'KE_UN'] = np.sum([_ == 'unacceptable' for _ in predictions])
    new_df.at[i, 'KE_unrelated'] = np.sum([_ == 'unrelated' for _ in predictions])

In [50]:
new_df

Unnamed: 0,Essay,PE_AC,PE_IN,PE_UN,PE_unrelated,KE_AC,KE_IN,KE_UN,KE_unrelated
0,The initial hill height that I choose was 45 m...,0.0,0.0,0.0,11.0,0.0,0.0,1.0,10.0
1,We chose a height of .8 meters because it is a...,0.0,0.0,0.0,8.0,0.0,0.0,0.0,8.0
2,The initial drop must be the highest point of ...,0.0,0.0,0.0,11.0,0.0,0.0,0.0,11.0
3,The initial drop height that I have chosen is ...,0.0,0.0,0.0,9.0,0.0,0.0,0.0,9.0
4,For our initial drop the height was .77m. PE=m...,0.0,0.0,0.0,9.0,0.0,0.0,1.0,8.0
...,...,...,...,...,...,...,...,...,...
71,\nFor the Gonzales family we chose .6 meters f...,0.0,0.0,0.0,8.0,0.0,0.0,2.0,6.0
72,"For mass, .74kg was chosen.The initial drop he...",0.0,0.0,0.0,9.0,0.0,0.0,2.0,7.0
73,My team put our car mass at .50kg because if m...,0.0,0.0,0.0,9.0,0.0,0.0,1.0,8.0
74,My team and I choose 0.2 kg car mass because t...,0.0,0.0,0.0,13.0,0.0,0.0,2.0,11.0


In [51]:
new_df.to_csv('StudentEssaysPEKE.csv')

## LCE

In [52]:
data = create_datasetdict('LCE')
data['train'][-1] # works!

{'label': 2,
 'text': 'If there were no friction, the energy would be the same at the start and at the finish.'}

In [53]:
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

In [54]:
model = BloomForSequenceClassification.from_pretrained("bigscience/bloom-560m",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

Some weights of BloomForSequenceClassification were not initialized from the model checkpoint at bigscience/bloom-560m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="finetuned-bloom-560m-LCE",
    overwrite_output_dir= True,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    save_total_limit = 2,
    #push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,4.776058,0.809524
2,No log,6.599434,0.809524
3,No log,6.799906,0.809524
4,No log,4.219232,0.857143
5,No log,3.205742,0.809524
6,No log,2.357054,0.809524
7,No log,1.599576,0.833333
8,No log,1.318721,0.857143


TrainOutput(global_step=16, training_loss=4.567368507385254, metrics={'train_runtime': 211.685, 'train_samples_per_second': 1.587, 'train_steps_per_second': 0.076, 'total_flos': 17819806728192.0, 'train_loss': 4.567368507385254, 'epoch': 8.0})

In [56]:
trainer.save_model()

In [57]:
model = BloomForSequenceClassification.from_pretrained("./finetuned-bloom-560m-LCE",
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)

In [58]:
classifier = pipeline("text-classification", model=model, tokenizer = tokenizer)

In [59]:
new_df['LCE_AC'] = pd.Series(np.zeros(new_df.shape[0]))
new_df['LCE_IN'] = pd.Series(np.zeros(new_df.shape[0]))
new_df['LCE_UN'] = pd.Series(np.zeros(new_df.shape[0]))
new_df['LCE_unrelated'] = pd.Series(np.zeros(new_df.shape[0]))
new_df

Unnamed: 0,Essay,PE_AC,PE_IN,PE_UN,PE_unrelated,KE_AC,KE_IN,KE_UN,KE_unrelated,LCE_AC,LCE_IN,LCE_UN,LCE_unrelated
0,The initial hill height that I choose was 45 m...,0.0,0.0,0.0,11.0,0.0,0.0,1.0,10.0,0.0,0.0,0.0,0.0
1,We chose a height of .8 meters because it is a...,0.0,0.0,0.0,8.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
2,The initial drop must be the highest point of ...,0.0,0.0,0.0,11.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0
3,The initial drop height that I have chosen is ...,0.0,0.0,0.0,9.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0
4,For our initial drop the height was .77m. PE=m...,0.0,0.0,0.0,9.0,0.0,0.0,1.0,8.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,\nFor the Gonzales family we chose .6 meters f...,0.0,0.0,0.0,8.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.0
72,"For mass, .74kg was chosen.The initial drop he...",0.0,0.0,0.0,9.0,0.0,0.0,2.0,7.0,0.0,0.0,0.0,0.0
73,My team put our car mass at .50kg because if m...,0.0,0.0,0.0,9.0,0.0,0.0,1.0,8.0,0.0,0.0,0.0,0.0
74,My team and I choose 0.2 kg car mass because t...,0.0,0.0,0.0,13.0,0.0,0.0,2.0,11.0,0.0,0.0,0.0,0.0


In [60]:
# we will do this for every essay
# What column for each topic do we have?
# AC, IN, UN, unrelated
for i in range(new_df.shape[0]):
    sentences = split_sent(se['Essay'][i])
    res = classifier(sentences)
    predictions = pd.Series([x['label'] for x in res])
    new_df.at[i, 'LCE_IN'] = np.sum([_ == 'insufficient' for _ in predictions])
    new_df.at[i, 'LCE_AC'] = np.sum([_ == 'acceptable' for _ in predictions])
    new_df.at[i, 'LCE_UN'] = np.sum([_ == 'unacceptable' for _ in predictions])
    new_df.at[i, 'LCE_unrelated'] = np.sum([_ == 'unrelated' for _ in predictions])

In [61]:
new_df

Unnamed: 0,Essay,PE_AC,PE_IN,PE_UN,PE_unrelated,KE_AC,KE_IN,KE_UN,KE_unrelated,LCE_AC,LCE_IN,LCE_UN,LCE_unrelated
0,The initial hill height that I choose was 45 m...,0.0,0.0,0.0,11.0,0.0,0.0,1.0,10.0,5.0,0.0,0.0,6.0
1,We chose a height of .8 meters because it is a...,0.0,0.0,0.0,8.0,0.0,0.0,0.0,8.0,3.0,0.0,0.0,5.0
2,The initial drop must be the highest point of ...,0.0,0.0,0.0,11.0,0.0,0.0,0.0,11.0,3.0,0.0,0.0,8.0
3,The initial drop height that I have chosen is ...,0.0,0.0,0.0,9.0,0.0,0.0,0.0,9.0,5.0,0.0,0.0,4.0
4,For our initial drop the height was .77m. PE=m...,0.0,0.0,0.0,9.0,0.0,0.0,1.0,8.0,7.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,\nFor the Gonzales family we chose .6 meters f...,0.0,0.0,0.0,8.0,0.0,0.0,2.0,6.0,2.0,0.0,0.0,6.0
72,"For mass, .74kg was chosen.The initial drop he...",0.0,0.0,0.0,9.0,0.0,0.0,2.0,7.0,1.0,0.0,1.0,7.0
73,My team put our car mass at .50kg because if m...,0.0,0.0,0.0,9.0,0.0,0.0,1.0,8.0,4.0,0.0,1.0,4.0
74,My team and I choose 0.2 kg car mass because t...,0.0,0.0,0.0,13.0,0.0,0.0,2.0,11.0,5.0,0.0,3.0,5.0


In [62]:
new_df.to_csv('StudentEssaysPEKELCE.csv')

In [63]:
# !pip3 install torch torchvision torchaudio

In [64]:
# !pip install --upgrade transformers

In [65]:
# Ok, "multi-label classifier" = can be 0 label or all labels
# "Multi-class classifier" = can be only 1 label
# What I "ideally" looking for is a "multi-classification" inside 3 "multi-labbel classifiers"
# For now, let's have 3 models of multi-class classification instead.