In [1]:
!pip install -q simpletransformers 
!pip uninstall -q -y tqdm
!pip install -q tqdm==4.47
!pip install -q matplotlib

In [21]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging
import sklearn
import csv
import os
import random
import matplotlib.pyplot as plt
from csv import reader
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [34]:
random.seed(42)

## Reading Data

In [22]:
file = csv.reader(open('/content/sample_data/all_prompts.csv','r'))
first_line_flag = True
questions = []
for line in file:
  if first_line_flag:
      first_line_flag = False
      continue
  else:
    questions.append(line[2])

['The tight curriculum of our education system leaves no room for imagination and creativity. Write a response that expresses your thoughts on this statement. To what extent do you agree or disagree? Explain your reasoning.', 'Our society is disrupted by the ever-widening gap between rich and poor. One percent of the worlds population controls half of all global wealth, while a quarter of the worlds population struggles to feed themselves daily. Write a response describing the causes and consequences of this situation. What remedies might be effective?', 'Has technology become a new addiction? Have we become slaves to our own creation? Write a response that expresses your thoughts on this statement. To what extent do you agree or disagree? Explain your reasoning.', 'In the nuclear age, the production and development of weaponry challenge the very existence of humankind. How useful are weapons? Do the benefits outweigh the risks? Write a response explaining the pros and cons of the arms

In [23]:
total_data = []
with open('/content/sample_data/train.csv','r') as read_obj:
  read = reader(read_obj)
  first_line_flag = True
  for i in read:
    if first_line_flag:
      first_line_flag = False
      continue
    temp_array = []
    label = int(float(i[4])/0.5)
    if i[1] == '1':
      k = " ".join([questions[0],i[3]]) 
    elif i[1] == '2':
      k = " ".join([questions[1],i[3]])
    elif i[1] == '3':
      k = " ".join([questions[2],i[3]])
    elif i[1] == '5':
      k = " ".join([questions[3],i[3]])
    temp_array.append(k)
    temp_array.append(label)
    total_data.append(temp_array)



## Spliting into training and evaluation dataset

In [24]:
random.shuffle(total_data)
final_data = [ele for ele in total_data if ele != []]



In [25]:
len_dataset = len(final_data)
training_data = final_data[:int(len_dataset*0.8)]
evaluation_data = final_data[int(len_dataset*0.8):]

In [26]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Preparing train data
train_data = training_data
train_df = pd.DataFrame(train_data)
train_df.columns = ["text", "labels"]

In [27]:
# Preparing eval data
eval_data = evaluation_data
eval_df = pd.DataFrame(eval_data)
eval_df.columns = ["text", "labels"]

## Model Training

In [28]:
model_args = ClassificationArgs()
model_args.num_train_epochs = 25
model_args.regression = False
model_args.overwrite_output_dir = True
model_args.train_batch_size=64
model_args.save_model_every_epoch=False
# Create a ClassificationModel
model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=11,
    args=model_args,
    use_cuda = True,
    cuda_device = 0,
)

# Train the model
model.train_model(train_df)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out

HBox(children=(FloatProgress(value=0.0, max=992.0), HTML(value='')))

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_11_2





HBox(children=(FloatProgress(value=0.0, description='Epoch', max=25.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 25', max=16.0, style=ProgressStyle(des…

  model.parameters(), args.max_grad_norm





HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 25', max=16.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 25', max=16.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 25', max=16.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 25', max=16.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 25', max=16.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 25', max=16.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 25', max=16.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 25', max=16.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 25', max=16.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 10 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 11 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 12 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 13 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 14 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 15 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 16 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 17 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 18 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 19 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 20 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 21 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 22 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 23 of 25', max=16.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 24 of 25', max=16.0, style=ProgressStyle(de…





INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=248.0), HTML(value='')))

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_11_2





HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=31.0, style=ProgressStyle(descri…

INFO:simpletransformers.classification.classification_model:{'mcc': 0.12149368234700395, 'eval_loss': 3.1888585244455645}





In [29]:
evaluation_text = [text for text, _ in evaluation_data]
evaluation_label = [label for _, label in evaluation_data]

## Evaluation

In [30]:
pred_on_evaluation_set, _ = model.predict(evaluation_text)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=248.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))




In [33]:
print(confusion_matrix(pred_on_evaluation_set, evaluation_label))
print("accuracy = ", accuracy_score(pred_on_evaluation_set, evaluation_label))

[[ 2  2  0  1  1  1  1  0  0  0]
 [ 1  0  0  1  0  0  0  0  0  0]
 [ 1  0  2  1  1  0  0  0  0  0]
 [ 5  2  0  3  5  1  5  0  0  0]
 [ 8  5  3  6 17  7  5  2  0  0]
 [ 1  1  2  7 13 18  9  8  1  0]
 [ 1  0  2  2  6 16 24  8  4  1]
 [ 1  0  2  2  4 10 13  2  1  0]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0]]
accuracy =  0.27419354838709675


In [10]:
file = csv.reader(open('/content/sample_data/test.csv','r'))
first_line_flag = True
test_essay = []
for r in file:
   if first_line_flag:
      first_line_flag = False
      continue
   else:
    test_essay.append(r[3])

['Curriculum has been adopted in many schools. This curriculum had a great impact on the developement of the children.But nowadays we see that the education institutions had tightened the curriculum system in such a manner that the children andyoung people arenot getting enough time to be indulged in other activities. Due to the strict and tightened rules the capacity of imagination and creativity isaffected badly.They become Book worm.A good curriculum is good for the children but apart from this same preference should be given to other activities.Following curriculum is necessary but education system have to understand the need of creativity and imagination role in a life of a person.There should a enough amount of curriculum should be prepared by the education systems that it will not affect the children and he or she will not feel the cumbersome of the theories.Due to the tightened curriculum thereb is no space left for the imagination and creativity. A creative mind is better than

## Generate Test predictions

In [11]:
preds,_ = model.predict(test_essay)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


HBox(children=(FloatProgress(value=0.0, max=305.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




In [12]:
len_test = len(test_essay)
for i in range(len_test):
  preds[i] *= 0.5

In [13]:
df =  pd.read_csv('/content/sample_data/test.csv')
df["predicted_score"] = preds
df.to_csv('/content/sample_data/test.csv')

In [17]:
os.rename(r'/content/sample_data/test.csv',r'/content/sample_data/test_prediction.csv')