# installations and imports

In [None]:
# install packages
!pip install -qq transformers

In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict

import torch
from torch import nn, optim, cuda
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

import logging
logging.basicConfig(level=logging.ERROR)

# Mount Google Drive: the dataset TSVs are read from, and the results file is
# written to, paths under /content/drive/MyDrive.
# NOTE(review): force_remount=True presumably replaces a mount left over from
# an earlier run of this cell — confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# pick the compute device: the GPU when CUDA is available, the CPU otherwise
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# seed NumPy and PyTorch with the same value so runs are repeatable
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

Mounted at /content/drive


<torch._C.Generator at 0x78c605805ab0>

In [None]:
# import data
DATA_DIR = '/content/drive/MyDrive/final_sets'


def load_split(filename, seed=RANDOM_SEED):
    """
    Read one tab-separated split from DATA_DIR and return it shuffled.

    Only the 'sentence' and 'label' columns are loaded. The rows are shuffled
    with a fixed seed and the index is rebuilt as 0..n-1, so the row order is
    the same on every run.

    Args:
        filename: name of the TSV file inside DATA_DIR.
        seed: random_state used for the shuffle; defaults to RANDOM_SEED so the
            shuffle stays in sync with the notebook-wide seed.

    Returns:
        The shuffled DataFrame.
    """
    frame = pd.read_csv(
        f'{DATA_DIR}/{filename}',
        delimiter='\t',
        encoding='utf-8',
        usecols=['sentence', 'label'],
    )
    # shuffle df
    return frame.sample(frac=1, random_state=seed).reset_index(drop=True)


df_train = load_split('slang-random_train.tsv')
df_dev = load_split('slang-random_dev.tsv')
df_test = load_split('full_test.tsv')

print(df_train.shape)
print(df_dev.shape)
print(df_test.shape)

print(df_train.head())
print(df_dev.head())

(2400, 2)
(300, 2)
(1000, 2)
                                            sentence  label
0        It's crazy how fast this year is flying by.      0
1  I started reading a new book last night, and I...      0
2  Sarah's birthday is this weekend, and I heard ...      1
3  I'm really looking forward to summer break, I ...      0
4  Did you end up going on that hiking trip you w...      0
                                            sentence  label
0  I'm thinking about taking up yoga to help with...      0
1  My family and I are planning a trip to Europe ...      0
2                            It got amazing reviews.      0
3  The way they're avoiding direct answers about ...      1
4  Did you hear about that new restaurant that op...      0


In [None]:
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df_train.info()
print('~~~~~~~~~~~~~~~~~~~')
df_dev.info()
print('~~~~~~~~~~~~~~~~~~~')
# class balance of the training and dev splits
print(df_train.label.value_counts())
print('~~~~~~~~~~~~~~~~~~~')
print(df_dev.label.value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  2400 non-null   object
 1   label     2400 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 37.6+ KB
None
~~~~~~~~~~~~~~~~~~~
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  300 non-null    object
 1   label     300 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.8+ KB
None
~~~~~~~~~~~~~~~~~~~
0    1200
1    1200
Name: label, dtype: int64
~~~~~~~~~~~~~~~~~~~
0    150
1    150
Name: label, dtype: int64


In [None]:
# choose model
model_name = "roberta-base"

# maximum number of tokens per example handed to the tokenizer
max_len = 512

# tokenizer and two-label sequence classifier, both loaded from the same checkpoint name
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class SlangDataset(Dataset):
    """
    Map-style PyTorch dataset over a dataframe of slang sentences.

    Each item is tokenized on access: the row's 'sentence' is encoded to a
    fixed length of `max_len` tokens (truncated or padded) and returned
    together with its integer 'label'.
    """
    def __init__(self, dataframe, tokenizer, max_len):
        # dataframe must expose 'sentence' and 'label' columns (read in __getitem__)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # one example per dataframe row
        return len(self.data)

    def __getitem__(self, index):
        # positional row lookup, done once and reused for both fields
        row = self.data.iloc[index]
        sentence = str(row['sentence'])
        target = int(row['label'])

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sentence, **tokenizer_options)

        # flatten each returned tensor to 1-D so the DataLoader can stack items into a batch
        token_ids = encoded['input_ids'].flatten()
        mask = encoded['attention_mask'].flatten()

        return {
            'text': sentence,
            'input_ids': token_ids,
            'attention_mask': mask,
            'label': torch.tensor(target, dtype=torch.long),
        }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
  """
  Create a PyTorch DataLoader for the Slang data.

  Args:
    df: dataframe with 'sentence' and 'label' columns.
    tokenizer: tokenizer handed to SlangDataset.
    max_len: fixed token length each example is truncated/padded to.
    batch_size: number of examples per batch.
    shuffle: when True the loader draws the examples in a new random order
      on every pass. Defaults to False, which keeps dataframe order.

  Returns:
    A DataLoader over a SlangDataset built from `df`.
  """
  ds = SlangDataset(
    dataframe=df,
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=4
  )


batch_size = 16

# Only the training loader reshuffles, so each epoch sees the examples in a
# different order. The dev and test loaders keep dataframe order because the
# predictions are later matched back to the dataframe rows by position.
train_data_loader = create_data_loader(df_train, tokenizer, max_len, batch_size, shuffle=True)
dev_data_loader = create_data_loader(df_dev, tokenizer, max_len, batch_size)
test_data_loader = create_data_loader(df_test, tokenizer, max_len, batch_size)

In [None]:
# pull a single batch to sanity-check what the training loader yields
data = next(iter(train_data_loader))

for tensor_key in ('input_ids', 'attention_mask'):
    print(data[tensor_key].shape)
print(data)

torch.Size([16, 512])
torch.Size([16, 512])
{'text': ["It's crazy how fast this year is flying by.", "I started reading a new book last night, and I can't put it down.", "Sarah's birthday is this weekend, and I heard her parents are letting her have a turnt sleepover.", "I'm really looking forward to summer break, I need some time off from all the stress of school.", 'Did you end up going on that hiking trip you were planning?', "The way Ethan keeps avoiding my texts is making me think he's up to something sus.", 'It was amazing!', 'Have you heard about that new volunteer opportunity at the animal shelter?', 'Have you started thinking about where you want to go to college yet?', 'My friend keeps texting his crush paragraphs about how amazing she is...he needs to chill out with the simping.', "I'm thinking of trying out for the school play this year; acting has always been a passion of mine.", "I don't know why, but the new girl in our class seems super sus to me.", "She's the biggest s

In [None]:
# move the model onto the device selected during setup ('cuda' when available, else 'cpu')
model = model.to(device)

In [None]:
# training hyperparameters
EPOCHS = 4
L2_lambda = 0.5  # handed to AdamW as weight_decay
learning_rate = 5e-05

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=L2_lambda,
)

# the scheduler is stepped once per batch, so the schedule spans every batch of every epoch
total_steps = EPOCHS * len(train_data_loader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
import torch
import numpy as np
import torch.nn as nn

def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """
    Train the model for one epoch.

    Args:
        model: callable module that accepts `input_ids`, `attention_mask` and
            `labels` keyword arguments and returns an object exposing
            `.logits` and `.loss`.
        data_loader: iterable of batches; each batch is a mapping with
            "input_ids", "attention_mask" and "label" tensors.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        Tuple `(accuracy, mean_loss)`: the number of correct predictions
        divided by `n_examples`, and the mean of the per-batch losses.
    """

    model = model.train()
    losses = []
    correct_predictions = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["label"].to(device)

        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=targets
        )

        loss = output.loss

        # Predicted class = argmax over the raw logits, the same rule
        # get_predictions uses. Softmax is monotonic, so it cannot change the
        # winner except by rounding two close logits into a tie.
        preds = torch.argmax(output.logits.detach(), dim=1).cpu().numpy()
        label_ids = targets.cpu().numpy()

        # NumPy accumulation keeps the returned accuracy a NumPy scalar
        correct_predictions += np.sum(preds == label_ids)

        losses.append(loss.item())
        loss.backward()
        # cap the global gradient norm at 1.0 before the parameter update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        # clear gradients so they do not accumulate into the next batch
        optimizer.zero_grad()

    return correct_predictions / n_examples, np.mean(losses)


In [None]:
import torch
import numpy as np
import torch.nn as nn

def eval_model(model, data_loader, device, n_examples):
    """
    Evaluate the model on a labelled dataset without updating its weights.

    Args:
        model: callable that accepts `input_ids`, `attention_mask` and
            `labels` keyword arguments and returns an object exposing
            `.logits` and `.loss`; it is switched to eval mode first.
        data_loader: iterable of batches; each batch is a mapping with
            "input_ids", "attention_mask" and "label" tensors.
        device: device the batch tensors are moved to.
        n_examples: denominator used for the accuracy.

    Returns:
        Tuple `(accuracy, mean_loss)`: the number of correct predictions
        divided by `n_examples`, and the mean of the per-batch losses.
    """

    model = model.eval()
    losses = []
    correct_predictions = 0

    # no gradients are needed for evaluation
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            targets = batch["label"].to(device)

            output = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=targets
            )

            # Predicted class = argmax over the raw logits, the same rule
            # get_predictions uses, so the accuracy reported here always
            # agrees with metrics computed from get_predictions' output.
            preds = torch.argmax(output.logits, dim=1).cpu().numpy()
            label_ids = targets.cpu().numpy()

            # NumPy accumulation keeps the returned accuracy a NumPy scalar
            correct_predictions += np.sum(preds == label_ids)
            losses.append(output.loss.item())

    return correct_predictions / n_examples, np.mean(losses)


In [None]:
def get_predictions(model, data_loader, device):
  """
  Run the model over every batch and collect its outputs.

  The model is put in eval mode and called without gradient tracking. The
  predicted class for each example is the argmax of its logits.

  Returns:
    Three lists with one entry per example, in loader order: predicted
    class ids, target labels, and the logits row for each example.
  """
  model = model.eval()
  predicted_labels, gold_labels, raw_logits = [], [], []

  with torch.no_grad():
    for batch in data_loader:
      ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      gold = batch["label"].to(device)

      batch_logits = model(
        input_ids=ids,
        attention_mask=mask,
        labels=gold
      ).logits

      # extend() unpacks each NumPy array into per-example entries
      predicted_labels.extend(batch_logits.argmax(dim=1).cpu().numpy())
      gold_labels.extend(gold.cpu().numpy())
      raw_logits.extend(batch_logits.cpu().numpy())

  return predicted_labels, gold_labels, raw_logits

In [None]:
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, f1_score

# per-epoch metrics, keyed by metric name
history = defaultdict(list)

# best dev accuracy seen so far; decides when a checkpoint is written
best_acc = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))

    print(f'Train loss {train_loss} Accuracy {train_acc}')

    dev_acc, dev_loss = eval_model(model, dev_data_loader, device, len(df_dev))

    print(f'Dev   loss {dev_loss} Accuracy {dev_acc}')

    # zero_division=0 is passed to all three metrics so they treat an
    # undefined score the same way instead of only precision doing so
    dev_predictions, dev_targets, dev_logits = get_predictions(model, dev_data_loader, device)
    dev_precision = precision_score(dev_targets, dev_predictions, zero_division=0)
    dev_recall = recall_score(dev_targets, dev_predictions, zero_division=0)
    dev_f1 = f1_score(dev_targets, dev_predictions, zero_division=0)

    print(f'Dev Precision Score: {dev_precision}')
    print(f'Dev Recall Score: {dev_recall}')
    print(f'Dev F1 Score: {dev_f1}')

    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['dev_acc'].append(dev_acc)
    history['dev_loss'].append(dev_loss)
    history['dev_precision'].append(dev_precision)
    history['dev_recall'].append(dev_recall)
    history['dev_f1'].append(dev_f1)

    # strict '>' keeps the earliest epoch that reaches the best dev accuracy
    if dev_acc > best_acc:
        torch.save(model.state_dict(), 'best_model.bin')
        best_acc = dev_acc

    print(dev_predictions)
    print(dev_targets)
    print("Dev Logits:\n", dev_logits[:10])


Epoch 1/4
----------
Train loss 0.11493491897776645 Accuracy 0.95
Dev   loss 0.0065007575771903105 Accuracy 0.9966666666666667
Dev Precision Score: 1.0
Dev Recall Score: 0.9933333333333333
Dev F1 Score: 0.9966555183946488

[0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,

In [None]:
# report the optimizer state together with the two hyperparameters it was built from
hyperparameter_lines = (
    ("L2 Lambda:", L2_lambda),
    ("Learning Rate:", learning_rate),
)
print(optimizer)
for caption, value in hyperparameter_lines:
    print(caption, value)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 0.0
    maximize: False
    weight_decay: 0.5
)
L2 Lambda: 0.5
Learning Rate: 5e-05


In [None]:
# show the per-epoch train/dev metrics collected by the training loop
print(history)

defaultdict(<class 'list'>, {'train_acc': [0.95, 0.9970833333333333, 1.0, 1.0], 'train_loss': [0.11493491897776645, 0.013076715621209586, 3.681391803790272e-05, 2.9390450484546213e-05], 'dev_acc': [0.9966666666666667, 1.0, 1.0, 1.0], 'dev_loss': [0.0065007575771903105, 2.080716196934717e-05, 1.4167374166917349e-05, 1.2724987037041788e-05], 'dev_precision': [1.0, 1.0, 1.0, 1.0], 'dev_recall': [0.9933333333333333, 1.0, 1.0, 1.0], 'dev_f1': [0.9966555183946488, 1.0, 1.0, 1.0]})


In [None]:
# checkpoint written by the training loop for the best dev accuracy
path1 = "/content/best_model.bin"

# rebuild the architecture, then overwrite its weights with the saved ones
model_name = "roberta-base"
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# map_location=device remaps the saved tensors onto the current device, so a
# checkpoint saved on a GPU runtime still loads on a CPU-only runtime
model.load_state_dict(torch.load(path1, map_location=device))

model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# accuracy of the reloaded checkpoint on the test split; the mean loss is discarded
test_acc, _ = eval_model(model, test_data_loader, device, len(df_test))
# eval_model returns a NumPy scalar; .item() unwraps it to a plain Python number for display
test_acc.item()

0.667

In [None]:
# precision / recall / F1 on the test split; zero_division=0 is passed to all
# three metrics so they treat an undefined score the same way
test_predictions, test_targets, test_logits = get_predictions(model, test_data_loader, device)
test_precision = precision_score(test_targets, test_predictions, zero_division=0)
test_recall = recall_score(test_targets, test_predictions, zero_division=0)
test_f1 = f1_score(test_targets, test_predictions, zero_division=0)

In [None]:
# one report line per test metric, printed in order
test_report_lines = [
    f'Test Precision Score: {test_precision}',
    f'Test Recall Score: {test_recall}',
    f'Test F1 Score: {test_f1}',
]
for report_line in test_report_lines:
    print(report_line)

Test Precision Score: 0.9513513513513514
Test Recall Score: 0.352
Test F1 Score: 0.5138686131386861


In [None]:
# predicted labels on the first line, target labels on the second
print(test_predictions, test_targets, sep='\n')

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 

In [None]:
# append test results to test dataframe and export
# test_data_loader walks df_test in row order without shuffling, so the
# predictions line up with the dataframe rows by position
df_test['prediction'] = test_predictions

# the comparison already yields a boolean column; no np.where wrapper is needed
df_test['match'] = df_test['label'] == df_test['prediction']

df_test.to_csv('/content/drive/MyDrive/final_sets/results.tsv', sep='\t', index=False)