In [28]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
import torch.nn as nn

In [2]:
def _read_text(path):
    """Return the full contents of the text file at ``path`` as one string."""
    # An explicit encoding keeps the notebook reproducible across machines;
    # without it, open() falls back to a platform-dependent default.
    # NOTE(review): assumes the data files are UTF-8 (plain ASCII is a
    # subset) -- confirm against the actual dataset files.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


# Raw contents of each split; they are split into lines in a later cell.
train_data = _read_text("train.txt")
val_data = _read_text("val.txt")
test_data = _read_text("test.txt")

In [3]:
def preprocess_data(data):
    """Split raw ``"text;label"`` lines into parallel text and label-id lists.

    Args:
        data: Iterable of lines. Each usable line has exactly one ``;``
            separating the text from one of the label names in
            ``label_dict`` below.

    Returns:
        A ``(text, labels)`` tuple of two lists of equal length, where
        ``labels[i]`` is the integer id of the label belonging to ``text[i]``.

    Lines that cannot be parsed (empty lines, lines without exactly one
    ``;``, non-string entries) or that carry an unknown label are skipped.
    """
    label_dict = {'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise':5}
    text = []
    labels = []
    for d in data:
        if not isinstance(d, str):
            # Keep the original best-effort behaviour: ignore junk entries.
            continue
        try:
            t, l = d.split(";")
            # Resolve the label BEFORE touching either output list, so a
            # failed lookup can never leave a text without its label.
            label_id = label_dict[l]
        except (ValueError, KeyError):
            # ValueError: wrong number of ';' fields; KeyError: unknown label.
            continue
        text.append(t)
        labels.append(label_id)
    return text, labels

In [4]:
# Break every raw split into its individual lines first ...
train_samples, val_samples, test_samples = (
    raw.split("\n") for raw in (train_data, val_data, test_data)
)

# ... then turn each list of lines into parallel (texts, label ids) lists.
train_text, train_labels = preprocess_data(train_samples)
val_text, val_labels = preprocess_data(val_samples)
test_text, test_labels = preprocess_data(test_samples)

In [5]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint;
# num_labels=6 matches the six label ids produced by preprocess_data.
_checkpoint = 'albert-base-v2'
tokenizer = AlbertTokenizer.from_pretrained(_checkpoint)
model = AlbertForSequenceClassification.from_pretrained(_checkpoint, num_labels=6)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

In [6]:
def _encode(texts):
    """Tokenize a list of strings into a batch of padded PyTorch tensors."""
    # padding=True pads every sequence in the call to a common length.
    # truncation=True additionally caps sequences at the tokenizer's maximum
    # length, so one unusually long sample cannot produce an input the model
    # is unable to accept.
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")


# Each split is encoded (and therefore padded) independently.
train_tokens = _encode(train_text)
val_tokens = _encode(val_text)
test_tokens = _encode(test_text)

In [7]:
# Settings for a hand-written training loop.
# NOTE(review): the TrainingArguments/Trainer cells further down use their own
# literal values and are not handed `optimizer`, so nothing visible in this
# notebook reads these names -- confirm whether they are still needed.
batch_size = 4
epochs = 10
# Number of training rows times epochs, divided by the batch size
# (true division, so the result is a float).
iterations = train_tokens.input_ids.shape[0] * epochs / batch_size
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [8]:
class QuotesDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping from field name to a container indexable by sample
            position (a tensor or a nested list). Must contain the key
            ``"input_ids"``, whose length defines the dataset size.
        labels: Sequence with one label per sample, in the same order as the
            rows of ``encodings``.

    Raises:
        ValueError: If ``labels`` and ``encodings["input_ids"]`` differ in
            length, which would pair samples with the wrong labels.
    """

    def __init__(self, encodings, labels):
        n_samples = len(encodings["input_ids"])
        if len(labels) != n_samples:
            raise ValueError(
                f"Got {n_samples} encoded samples but {len(labels)} labels"
            )
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a UserWarning on every call;
            # clone().detach() is the recommended way to copy a tensor.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for sample ``idx`` plus ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (rows of ``input_ids``)."""
        return len(self.encodings["input_ids"])

# One dataset per split, each pairing that split's encodings with its labels.
train_dataset = QuotesDataset(encodings=train_tokens, labels=train_labels)
val_dataset = QuotesDataset(encodings=val_tokens, labels=val_labels)
test_dataset = QuotesDataset(encodings=test_tokens, labels=test_labels)

In [9]:
# Keyword arguments for TrainingArguments, gathered in one place so the run
# configuration can be read (or adjusted) at a glance.
_training_kwargs = dict(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
)
training_args = TrainingArguments(**_training_kwargs)

In [10]:
# Wire the model, the run configuration and the train/validation datasets
# into a single Trainer; the test split is kept aside for prediction later.
_trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
}
trainer = Trainer(**_trainer_kwargs)

In [11]:
# Run training with the Trainer built above. The call is the last expression
# in the cell, so its return value is shown as the cell output.
trainer.train()

  import sys


Step,Training Loss
500,1.447503
1000,1.145539
1500,0.824425
2000,0.736151
2500,0.64105
3000,0.588032
3500,0.519888
4000,0.447969
4500,0.418411
5000,0.395391


  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys
  import sys


TrainOutput(global_step=12000, training_loss=0.4607625770568848)

In [12]:
# Save the trained model via the Trainer, using "quotes" as the target path.
trainer.save_model("quotes")

In [13]:
# Evaluate the trained model. No dataset is passed here; the Trainer was
# constructed with eval_dataset=val_dataset. The returned metrics dict is
# displayed as the cell output.
trainer.evaluate()

  import sys


{'eval_loss': 0.26618361473083496, 'epoch': 3.0}

In [15]:
# Run the trained model over the held-out test split. Later cells read the
# first element of the result (pred[0]) to derive predicted labels.
pred = trainer.predict(test_dataset)

  import sys


In [34]:
# Pick, for every test sample, the index of the largest value along the last
# axis of pred[0] (presumably the per-class scores -- verify against the
# Trainer.predict return type).
pred_labels = torch.tensor(pred[0]).argmax(dim=-1)

In [35]:
## Calculate accuracy
# Element-wise comparison of the true label ids with the predicted ones;
# the fraction of matches is the cell's displayed result.
is_correct = np.array(test_labels) == pred_labels.numpy()
is_correct.sum() / len(test_labels)

0.9295