In [18]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [19]:
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = 'cpu'

In [20]:
texts, labels = [], []
with open("./emos/Emotion_Dataset_Custom.txt") as f:
    str = f.readline()
    while str:
        data = str.split("#$#")
        texts.append(data[0].strip())
        labels.append(data[1].strip())
        str = f.readline()

df = pd.DataFrame()
df["sentence"] = texts
df['label'] = labels
df['label'].replace(to_replace=['joy', 'fear', 'sadness', 
                                 'anger', 'disgust', 'surprise'], value=[0, 1, 2, 3, 4, 5], inplace=True)

In [21]:
train_texts = df.iloc[:250]['sentence'].values
train_labels = df.iloc[:250]['label'].values

valid_texts = df.iloc[250:275]['sentence'].values
valid_labels = df.iloc[250:275]['label'].values

test_texts = df.iloc[275:]['sentence'].values
test_labels = df.iloc[275:]['label'].values

In [22]:

valid_texts = df.iloc[250:275]['sentence'].values
valid_labels = df.iloc[250:275]['label'].values

test_texts = df.iloc[275:]['sentence'].values
test_labels = df.iloc[275:]['label'].values

In [23]:
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased'
)

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
test_encodings  = tokenizer(list(test_texts), truncation=True, padding=True)

In [24]:
class EMoSDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
            for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        
        return item
    
    def __len__(self):
        return len(self.labels)

In [25]:
train_dataset = EMoSDataset(train_encodings, train_labels)
valid_dataset = EMoSDataset(valid_encodings, valid_labels)
test_dataset = EMoSDataset(test_encodings, test_labels)


In [26]:
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=16, shuffle=True)

valid_loader = torch.utils.data.DataLoader(
valid_dataset, batch_size=16, shuffle=False)

test_loader = torch.utils.data.DataLoader(
test_dataset, batch_size=16, shuffle=False)

In [22]:
model = DistilBertForSequenceClassification.from_pretrained(
'distilbert-base-uncased', num_labels=6)
model.to(DEVICE)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():
        correct_pred, num_examples = 0, 0
        for batch_idx, batch in enumerate(data_loader):
            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = \
            batch['attention_mask'].to(device)

            labels = batch['labels'].to(device)
            outputs = model(input_ids,attention_mask=attention_mask)
            
            logits = outputs['logits']
            
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            
            correct_pred += \
            (predicted_labels == labels).sum()
        return correct_pred.float()/num_examples * 100

In [24]:
NUM_EPOCHS = 100
start_time = time.time()
for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        
        ### Forward pass
        outputs = model(input_ids,
        attention_mask=attention_mask,
        labels=labels)
        loss, logits = outputs['loss'], outputs['logits']
        
        ### Backward pass
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        if not batch_idx % 250:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d}'
            f' | Batch'
            f'{batch_idx:04d}/'
            f'{len(train_loader):04d} | '
            f'Loss: {loss:.4f}')
            model.eval()
            
    with torch.set_grad_enabled(False):
        print(f'Training accuracy: ' \
        f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
        f'\nValid accuracy: ' \
        f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')


print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 0001/0100 | Batch0000/0016 | Loss: 1.7886
Training accuracy: 66.00%
Valid accuracy: 60.00%
Time elapsed: 0.14 min
Epoch: 0002/0100 | Batch0000/0016 | Loss: 0.9554
Training accuracy: 79.20%
Valid accuracy: 72.00%
Time elapsed: 0.26 min
Epoch: 0003/0100 | Batch0000/0016 | Loss: 0.3841
Training accuracy: 97.60%
Valid accuracy: 64.00%
Time elapsed: 0.44 min
Epoch: 0004/0100 | Batch0000/0016 | Loss: 0.1941
Training accuracy: 98.80%
Valid accuracy: 52.00%
Time elapsed: 0.62 min
Epoch: 0005/0100 | Batch0000/0016 | Loss: 0.1129
Training accuracy: 98.80%
Valid accuracy: 32.00%
Time elapsed: 0.82 min
Epoch: 0006/0100 | Batch0000/0016 | Loss: 0.0466
Training accuracy: 99.60%
Valid accuracy: 60.00%
Time elapsed: 1.01 min
Epoch: 0007/0100 | Batch0000/0016 | Loss: 0.0261
Training accuracy: 99.60%
Valid accuracy: 68.00%
Time elapsed: 1.20 min
Epoch: 0008/0100 | Batch0000/0016 | Loss: 0.0345
Training accuracy: 100.00%
Valid accuracy: 60.00%
Time elapsed: 1.39 min
Epoch: 0009/0100 | Batch0000/00

KeyboardInterrupt: 

In [10]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#--------------------------------------------------------------------------------------
#----------------------------HUGGINGFACE DATASET --------------------------------------
#--------------------------------------------------------------------------------------

from datasets import *
dataset = load_dataset('json', split='train', data_files='./emos/data.jsonl')

# 90% train, 10% test + validation
train_testvalid = dataset.train_test_split(test_size=0.1)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

# # gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

train_test_valid_dataset

tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased'
)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = train_test_valid_dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_datasets["train"].shuffle(seed=42)
valid_dataset = tokenized_datasets["valid"].shuffle(seed=42)
test_dataset = tokenized_datasets["test"].shuffle(seed=42)

Map: 100%|██████████| 375128/375128 [00:33<00:00, 11349.86 examples/s]
Map: 100%|██████████| 20841/20841 [00:01<00:00, 11336.06 examples/s]
Map: 100%|██████████| 20840/20840 [00:01<00:00, 11242.39 examples/s]


In [11]:
model = DistilBertForSequenceClassification.from_pretrained(
'distilbert-base-uncased', num_labels=6)
model.to(DEVICE)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import numpy as np

import evaluate

metric = evaluate.load("accuracy")

In [13]:
def compute_metrics(eval_pred):

    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions, references=labels)

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [16]:
trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=train_dataset,

    eval_dataset=test_dataset,

    compute_metrics=compute_metrics,

)

In [17]:
trainer.train()

                                                     
  0%|          | 95/140673 [02:32<5:46:27,  6.76it/s] 

{'loss': 0.7026, 'learning_rate': 4.982228288299816e-05, 'epoch': 0.01}


                                                     
  0%|          | 95/140673 [03:42<5:46:27,  6.76it/s]  

{'loss': 0.3396, 'learning_rate': 4.964456576599632e-05, 'epoch': 0.02}


                                                     
  0%|          | 95/140673 [04:58<5:46:27,  6.76it/s]  

{'loss': 0.2939, 'learning_rate': 4.946684864899448e-05, 'epoch': 0.03}


                                                     
  0%|          | 95/140673 [06:11<5:46:27,  6.76it/s]  

{'loss': 0.2546, 'learning_rate': 4.9289131531992636e-05, 'epoch': 0.04}




KeyboardInterrupt: 