In [4]:
!python3 --version

Python 3.10.12


In [2]:
from transformers import DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
from pathlib import Path
import numpy as np
import time
from sklearn.metrics import f1_score
from collections import Counter
import copy
from tqdm import tqdm


# Input files, grouped by split. The X_* files are tab-separated text parsed by
# read_file; the y_* files are .npy arrays loaded with np.load.

# Training split.
X_train_filename = "./drive/MyDrive/CSCI467Data/X_train.tsv"
y_train_filename = "./drive/MyDrive/CSCI467Data/y_train.npy"

# Development (validation) split.
X_dev_filename = "./drive/MyDrive/CSCI467Data/X_dev.tsv"
y_dev_filename = "./drive/MyDrive/CSCI467Data/y_dev.npy"

# Test split.
X_test_filename = "./drive/MyDrive/CSCI467Data/X_test.tsv"
y_test_filename = "./drive/MyDrive/CSCI467Data/y_test.npy"

# Directory the fine-tuned model is written to by model.save_pretrained.
save_path = "./drive/MyDrive/CSCI467Data"



class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``X`` is any mapping from a field name to a per-example sequence (for
    instance the ``input_ids`` / ``attention_mask`` fields a tokenizer
    returns); ``y`` is the sequence of labels, indexed the same way.
    """

    def __init__(self, X, y, transform=None, target_transform=None):
        # `transform` and `target_transform` are accepted but never used; they
        # stay in the signature so existing callers keep working.
        self.encodings = X
        self.labels = y

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample



def read_file(filename, encoding="utf-8"):
    """Read a tab-separated text file into a list of rows.

    Every line is stripped of surrounding whitespace and then split on tab
    characters, so a blank line produces ``['']``.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale settings.

    Returns:
        A list with one entry per line; each entry is the list of that line's
        tab-separated fields.
    """
    # Passing the encoding explicitly keeps non-ASCII tokens from being
    # mis-decoded (or raising) where the default encoding is not UTF-8.
    with open(filename, encoding=encoding) as file:
        return [line.strip().split('\t') for line in file]


def get_class_weights(y, a, num_classes=6):
    """Compute normalised per-class weights from label frequencies.

    The weight of class ``k`` is ``count_k ** a / sum_j(count_j ** a)``, so the
    weights sum to 1. ``a = 1`` gives the plain class frequencies, ``a = 0``
    gives uniform weights, and values in between flatten the distribution.

    Args:
        y: Iterable of integer class labels.
        a: Exponent applied to each class count before normalising.
        num_classes: Number of classes; labels ``0 .. num_classes - 1`` are
            weighted. Defaults to 6, the value that used to be hard-coded.

    Returns:
        A list of ``num_classes`` weights, ordered by class id. A class that
        never occurs in ``y`` has count 0.

    Raises:
        ZeroDivisionError: If the powered counts sum to zero (for example an
            empty ``y`` with ``a > 0``), or if ``a`` is negative and some
            class does not occur in ``y``.
    """
    counts = Counter(y)
    # Raise each count to the power once and reuse it for both the numerator
    # and the normalising denominator.
    powered = [counts[label] ** a for label in range(num_classes)]
    total = sum(powered)
    return [value / total for value in powered]


# ---- Load data ----
# Each X_* file becomes a list of rows (lists of tab-separated fields); each
# y_* file is loaded as a NumPy array of labels.
X_train = read_file(X_train_filename)
X_dev = read_file(X_dev_filename)
X_test = read_file(X_test_filename)
y_train = np.load(y_train_filename)
y_dev = np.load(y_dev_filename)
y_test = np.load(y_test_filename)

# Per-class weights, used later to combine per-class F1 scores into one score.
# NOTE(review): these come from the full y_train, not only the first N_TRAIN
# labels that are actually trained on -- confirm that is intended.
class_weights = get_class_weights(y_train, a=0.25)

# Maximum number of examples taken from the start of each split.
N_TRAIN = 100000
N_DEV = 20000
N_TEST = 10000

# Re-join each row's fields into a single space-separated string.
train_texts = [" ".join(words) for words in X_train[:N_TRAIN]]
dev_texts = [" ".join(words) for words in X_dev[:N_DEV]]
test_texts = [" ".join(words) for words in X_test[:N_TEST]]

# Slice before converting so only the labels that are used become Python objects.
train_labels = y_train[:N_TRAIN].tolist()
dev_labels = y_dev[:N_DEV].tolist()
test_labels = y_test[:N_TEST].tolist()

# EmotionDataset takes its length from the labels, so a text/label length
# mismatch would otherwise surface later as an IndexError or as silently
# ignored examples.
assert len(train_texts) == len(train_labels), "train texts and labels differ in length"
assert len(dev_texts) == len(dev_labels), "dev texts and labels differ in length"
assert len(test_texts) == len(test_labels), "test texts and labels differ in length"

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# padding=True pads every example to the longest sequence within its own call;
# truncation=True caps sequences at the tokenizer's maximum length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_set = EmotionDataset(train_encodings, train_labels)
dev_set = EmotionDataset(dev_encodings, dev_labels)
test_set = EmotionDataset(test_encodings, test_labels)

print('Done with encodings')

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pretrained DistilBERT with a 6-way sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)
# Alternative: resume from a model previously written to save_path.
# model = DistilBertForSequenceClassification.from_pretrained(save_path)
model.to(device)
model.train()

train_loader = DataLoader(train_set, batch_size=128, shuffle=True)
# Accuracy and F1 do not depend on batch order, so the dev loader is left
# unshuffled; this also keeps evaluation from consuming random state.
dev_loader = DataLoader(dev_set, batch_size=128, shuffle=False)

optim = AdamW(model.parameters(), lr=5e-5)

# Best-checkpoint tracking, keyed on the class-weighted dev F1.
best_dev_f1 = -1
best_checkpoint_f1 = None
best_epoch_f1 = -1

# Class ids in the same order as class_weights. Passing them to f1_score fixes
# the length and order of the per-class F1 vector, so the dot product with
# class_weights stays valid even when a class is missing from a split.
class_ids = list(range(len(class_weights)))

for epoch in range(4):
    # The dev pass at the end of each epoch puts the model in eval mode. Switch
    # back to train mode here, otherwise every epoch after the first would
    # train with eval-mode behaviour (e.g. dropout disabled).
    model.train()

    epoch_loss = 0.0
    train_num_correct = 0
    all_labels = []
    all_preds = []

    for batch in tqdm(train_loader, desc=f"Epoch train: {epoch}"):
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits  # batch x num_labels

        loss.backward()
        optim.step()

        preds = torch.argmax(logits, dim=1)
        train_num_correct += torch.sum(preds == labels).item()

        # Accumulate a plain float; adding the loss tensor itself would keep
        # autograd history alive across the whole epoch.
        epoch_loss += loss.item()

        all_labels += labels.tolist()
        all_preds += preds.tolist()

    # Class-weighted F1 over everything seen during this epoch's training pass.
    train_f1s = f1_score(all_labels, all_preds, labels=class_ids, average=None)
    train_f1 = np.dot(class_weights, train_f1s)
    train_acc = train_num_correct / len(train_set)

    # ---- Dev evaluation ----
    model.eval()
    all_labels = []
    all_preds = []
    dev_num_correct = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in tqdm(dev_loader, desc=f'Epoch dev: {epoch}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits  # batch x num_labels

            preds = torch.argmax(logits, dim=1)
            dev_num_correct += torch.sum(preds == labels).item()

            all_labels += labels.tolist()
            all_preds += preds.tolist()

        dev_f1s = f1_score(all_labels, all_preds, labels=class_ids, average=None)
        dev_f1 = np.dot(class_weights, dev_f1s)
        dev_acc = dev_num_correct / len(dev_set)

        # Keep a copy of the weights from the epoch with the best weighted dev F1.
        if dev_f1 > best_dev_f1:
            best_dev_f1 = dev_f1
            best_checkpoint_f1 = copy.deepcopy(model.state_dict())
            best_epoch_f1 = epoch

    print(f'epoch: {epoch}, train_acc={train_acc:.5f} dev_acc={dev_acc:.5f} train_f1={train_f1:.5f} dev_f1={dev_f1:.5f} loss: {epoch_loss:.5f}')


print(f'Best epoch was: {best_epoch_f1}')
# Restore the weights from the best dev-F1 epoch and write them to save_path.
model.load_state_dict(best_checkpoint_f1)
model.save_pretrained(save_path)

# Alternative: evaluate a model previously written to save_path.
# model = DistilBertForSequenceClassification.from_pretrained(save_path).to(device)
print('\nEvaluating on Test set\n')
model.eval()
# Accuracy and F1 do not depend on batch order, so the test loader is not shuffled.
test_loader = DataLoader(test_set, batch_size=128, shuffle=False)
test_num_correct = 0
all_labels = []
all_preds = []
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in tqdm(test_loader, desc="Test"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits  # batch x num_labels

        preds = torch.argmax(logits, dim=1)
        test_num_correct += torch.sum(preds == labels).item()

        all_labels += labels.tolist()
        all_preds += preds.tolist()

# Passing the class ids fixes the length and order of the per-class F1 vector,
# so the dot product with class_weights stays valid even when a class is
# missing from the test labels and predictions.
test_f1s = f1_score(all_labels, all_preds, labels=list(range(len(class_weights))), average=None)
test_f1 = np.dot(class_weights, test_f1s)

test_acc = test_num_correct / len(test_set)
print(f'test_acc={test_acc:.5f} test_f1={test_f1:.5f}')



Done with encodings


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch train: 0: 100%|██████████| 782/782 [12:39<00:00,  1.03it/s]


Time for epoch: 760.11


Epoch dev: 0: 100%|██████████| 157/157 [00:40<00:00,  3.92it/s]


epoch: 0, train_acc=0.90580 dev_acc=0.94085 train_f1=0.87917 dev_f1=0.92022 loss: 174.29707


Epoch train: 1: 100%|██████████| 782/782 [12:27<00:00,  1.05it/s]


Time for epoch: 748.03


Epoch dev: 1: 100%|██████████| 157/157 [00:40<00:00,  3.91it/s]


epoch: 1, train_acc=0.94261 dev_acc=0.94195 train_f1=0.92027 dev_f1=0.92362 loss: 71.60219


Epoch train: 2: 100%|██████████| 782/782 [12:28<00:00,  1.05it/s]


Time for epoch: 748.36


Epoch dev: 2: 100%|██████████| 157/157 [00:40<00:00,  3.91it/s]


epoch: 2, train_acc=0.94639 dev_acc=0.94225 train_f1=0.92495 dev_f1=0.92075 loss: 67.27609


Epoch train: 3: 100%|██████████| 782/782 [12:28<00:00,  1.04it/s]


Time for epoch: 748.94


Epoch dev: 3: 100%|██████████| 157/157 [00:40<00:00,  3.89it/s]


epoch: 3, train_acc=0.94728 dev_acc=0.93980 train_f1=0.92593 dev_f1=0.91481 loss: 64.29540
Best epoch was: 1
EVALUATING


Test: 100%|██████████| 79/79 [00:23<00:00,  3.42it/s]

test_acc=0.93920 test_f1=0.92235



