In [1]:
import numpy as np
import scipy
import seaborn as sns
import datasets
import torch
from torch import nn
import evaluate
from tqdm.notebook import tqdm
import os
os.environ['WANDB_DISABLED'] = 'true'
# import wandb
from nltk.tokenize import word_tokenize
from collections import Counter

import utils

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

2023-01-04 07:00:49.337439: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64
2023-01-04 07:00:49.337486: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Unpack the three dataset splits returned by the project-local `utils.load_dataset()`.
# NOTE(review): the cache message printed below suggests this is civil_comments —
# confirm the split sizes and label definition in `utils`.
train_ds, valid_ds, test_ds = utils.load_dataset()

Found cached dataset civil_comments (/home/johnny/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Toggle: when True, every class is down-sampled to the number of label-1 rows.
balanced = True

if balanced:
    labels = np.array(train_ds['label'])
    num_minority = np.sum(labels == 1)

    # Fixed seed so the same rows are drawn on every run.
    np.random.seed(0)

    indexes = []
    for class_id in (0, 1):
        # Row positions of this class inside train_ds.
        class_rows = np.flatnonzero(labels == class_id)
        print(len(class_rows))

        # Draw without replacement so no comment is duplicated.
        indexes.append(np.random.choice(class_rows, size=num_minority, replace=False))

    indexes = np.concatenate(indexes)
    train_ds = train_ds.select(indexes)

91671
5649


In [5]:
# Lower-cased NLTK tokens of every comment in valid_ds, in corpus order.
# NOTE(review): the vocabulary is derived from valid_ds rather than train_ds —
# confirm this is intended.
all_words = [
    token.lower()
    for raw_text in valid_ds['text']
    for token in word_tokenize(raw_text)
]
counter = Counter(all_words)

# Keep tokens seen at least twice. Ids are assigned in sorted order starting at 1,
# which leaves 0 unused by the vocabulary (0 is the padding id used further down).
frequent_words = sorted(word for word, count in counter.items() if count >= 2)
vocab = dict(zip(frequent_words, range(1, len(frequent_words) + 1)))
reverse_vocab = {index: word for word, index in vocab.items()}
len(vocab)

10508

In [6]:
class CBOW(torch.nn.Module):
    """Bag-of-words classifier: averaged token embeddings -> MLP -> 2 logits.

    Args:
        vocab_size: Number of vocabulary entries. One extra row is allocated
            because id 0 is reserved for padding.
        embedding_dim: Width of each token embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        # Row 0 is the padding vector (kept at zero by `padding_idx`).
        self.embeddings = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=0)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()

        self.dropout = nn.Dropout(p=0.5)

        # Output layer produces raw logits; no activation is applied after it
        # because the loss / softmax downstream expect unbounded scores.
        self.linear3 = nn.Linear(128, 2)

    def forward(self, input_ids, label=None, attention_mask=None):
        """Return class logits of shape (batch, 2).

        Args:
            input_ids: Integer tensor of token ids, shape (batch, seq_len).
            label: Unused; accepted so a whole batch dict can be splatted in.
            attention_mask: Optional (batch, seq_len) tensor with 1 for real
                tokens and 0 for padding. When given, padding positions are
                excluded from the average; when omitted, a plain mean over
                the sequence axis is used.
        """
        embeds = self.embeddings(input_ids)

        if attention_mask is None:
            pooled = embeds.mean(dim=1)
        else:
            mask = attention_mask.unsqueeze(-1).to(embeds.dtype)
            # clamp avoids 0/0 for rows that contain no real token at all
            pooled = (embeds * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

        hidden = self.activation_function1(self.linear1(pooled))
        hidden = self.dropout(hidden)
        return self.linear3(hidden)

    def get_word_emdedding(self, word, word_to_ix=None):
        """Return the embedding of `word` as a (1, embedding_dim) tensor.

        Args:
            word: Token string to look up.
            word_to_ix: Optional mapping from token to embedding row. Defaults
                to the notebook-level `vocab` dictionary.

        Raises:
            KeyError: If `word` is not present in the mapping.
        """
        if word_to_ix is None:
            word_to_ix = vocab
        index = torch.tensor([word_to_ix[word]], device=self.embeddings.weight.device)
        return self.embeddings(index).view(1, -1)


# Bag-of-words baseline with 128-dimensional embeddings.
# NOTE: the SentimentNet cell further down rebinds `model`, so this instance is
# replaced before training unless that cell is skipped.
model = CBOW(vocab_size=len(vocab), embedding_dim=128)

In [7]:
class SentimentNet(nn.Module):
    """LSTM sequence classifier: embeddings -> stacked LSTM -> dropout -> linear.

    Args:
        vocab_size: Number of vocabulary entries (id 0 is reserved for padding,
            so one extra embedding row is allocated).
        output_size: Number of output logits per example.
        embedding_dim: Width of each token embedding.
        hidden_dim: LSTM hidden state width.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability, used between LSTM layers and before
            the output layer.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, input_ids, label=None, attention_mask=None):
        """Return logits of shape (batch, output_size).

        The LSTM output at the last real token of each sequence is used as the
        sequence summary. Previously every timestep was projected and flattened
        into (batch, seq_len * output_size), which the loss then treated as
        seq_len * output_size classes.

        Args:
            input_ids: Integer tensor of token ids, shape (batch, seq_len).
            label: Unused; accepted so a whole batch dict can be splatted in.
            attention_mask: Optional (batch, seq_len) tensor, 1 for real tokens
                and 0 for padding. Padding is assumed to be on the right. When
                omitted, the final timestep is used.
        """
        batch_size, seq_len = input_ids.size(0), input_ids.size(1)
        embeds = self.embedding(input_ids)
        hidden = self.init_hidden(batch_size)
        lstm_out, _ = self.lstm(embeds, hidden)

        if attention_mask is None:
            last_idx = torch.full((batch_size,), seq_len - 1, dtype=torch.long, device=lstm_out.device)
        else:
            # number of real tokens - 1; clamp keeps fully-padded rows at index 0
            last_idx = attention_mask.long().sum(dim=1).clamp(min=1).to(lstm_out.device) - 1

        rows = torch.arange(batch_size, device=lstm_out.device)
        last_out = lstm_out[rows, last_idx]

        out = self.dropout(last_out)
        return self.fc(out)

    def init_hidden(self, batch_size):
        """Return zero-initialised (h0, c0), each (n_layers, batch_size, hidden_dim).

        The tensors are created with the dtype and device of the module's own
        parameters, so they always match wherever the model currently lives.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, dtype=weight.dtype, device=weight.device)
        c0 = torch.zeros(shape, dtype=weight.dtype, device=weight.device)
        return (h0, c0)
    
# 2-layer LSTM classifier with 128-d embeddings / hidden state and 2 output logits.
# Moved to `device` up front because the training and evaluation loops move every
# batch there; leaving the weights on the CPU fails as soon as CUDA is available.
model = SentimentNet(len(vocab), 2, 128, 128, 2).to(device)

In [8]:
def tokenize_function(examples, cutoff=256):
    """Convert one example's text into fixed-length vocabulary ids plus a mask.

    The text is tokenised with NLTK, lower-cased, and mapped through the
    notebook-level `vocab`; tokens missing from `vocab` are dropped. The id
    list is then truncated or right-padded with 0 to exactly `cutoff` entries.

    Args:
        examples: Mapping with a 'text' entry holding a single string.
        cutoff: Output sequence length.

    Returns:
        Dict with 'input_ids' and 'attention_mask', both lists of length
        `cutoff`. The mask is 1 on real tokens and 0 on padding.
    """
    tokens = [token.lower() for token in word_tokenize(examples['text'])]
    ids = [vocab[token] for token in tokens if token in vocab][:cutoff]

    # Measure the real length BEFORE padding; computing the mask from the padded
    # list would mark every position as a real token.
    n_real = len(ids)
    n_pad = cutoff - n_real

    return {
        'input_ids': ids + [0] * n_pad,
        'attention_mask': [1] * n_real + [0] * n_pad,
    }

# Tokenise the train and validation splits one example at a time.
tokenized_train, tokenized_valid = (
    split.map(tokenize_function, batched=False) for split in (train_ds, valid_ds)
)

Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab/cache-a05ca6d1490976db.arrow
Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/civil_comments/default/0.9.0/e7a3aacd2ab7d135fa958e7209d10b1fa03807d44c486e3c34897aa08ea8ffab/cache-39f3d85df3eb9263.arrow


In [9]:
# Expose only the model inputs and the label, returned as torch tensors.
torch_columns = ["input_ids", "attention_mask", "label"]
for tokenized_split in (tokenized_train, tokenized_valid):
    tokenized_split.set_format(type="torch", columns=list(torch_columns))

In [10]:
# Hugging Face Trainer settings.
# NOTE(review): `training_args` is not referenced by any later cell shown here;
# training goes through the hand-written `train()` loop instead.
training_args = TrainingArguments(
    output_dir="trainer",
    per_device_train_batch_size=32,
    num_train_epochs=1,
    max_steps=1000,
    evaluation_strategy='steps',
    eval_steps=100,
    save_strategy='no',
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
# ROC-AUC metric object loaded through the `evaluate` library. It is read as a
# module-level global by compute_metrics() and _evaluate() below.
metric = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level `metric`.

    The logits are soft-maxed over the last axis and the column at index 1 is
    handed to the metric as the prediction score.
    """
    logits, labels = eval_pred
    positive_scores = scipy.special.softmax(logits, axis=-1)[:, 1]
    return metric.compute(references=labels, prediction_scores=positive_scores)

In [12]:
def _evaluate(model, val_dataloader):
    model.eval()
    val_pbar = tqdm(total=len(val_dataloader))

    scores = []
    for batch in val_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs
        predictions = torch.argmax(logits, dim=-1).cpu().tolist()

        batch_scores = torch.nn.Softmax()(logits)[:,-1]
        scores.append(batch_scores)

        metric.add_batch(prediction_scores=batch_scores, references=batch["label"])
        val_pbar.update(1)
    eval_dict = metric.compute()
    val_pbar.set_description('roc_auc: %.2f' % eval_dict['roc_auc'])

    eval_dict['predictions'] = torch.cat(scores)

    return eval_dict

In [13]:
# Hyper-parameters read as globals by train().
max_steps, eval_steps = 5000, 500   # optimisation steps in total / between evaluations
batch_size = 32
learning_rate = 1e-2

# AdamW settings.
# NOTE(review): the original remark here was "adam should default to
# correct_bias = True"; train() uses torch.optim.AdamW — confirm whether that
# remark still applies.
adam_beta1, adam_beta2 = 0.9, 0.999
adam_epsilon = 1e-6
weight_decay = 0.0

# Gradient-norm clipping threshold.
max_grad_norm = 1.0

In [14]:
def train(model, train_dataset, val_dataset, tolerance=3, metric_name='accuracy'):
    """Train `model` for up to `max_steps` steps with periodic early-stopping checks.

    Batch size, step counts and optimiser settings come from the notebook-level
    globals (`max_steps`, `eval_steps`, `batch_size`, `learning_rate`,
    `adam_*`, `weight_decay`, `max_grad_norm`, `device`).

    Args:
        model: Module called as `model(**batch)` that returns class logits.
        train_dataset: Dataset yielding dicts of tensors including 'label'.
        val_dataset: Dataset evaluated with `_evaluate` every `eval_steps` steps.
        tolerance: Number of consecutive evaluations without improvement after
            which training stops.
        metric_name: Key to read from the dict returned by `_evaluate`. The
            key must exist there (e.g. 'roc_auc'); a missing key raises
            KeyError at the first evaluation.
    """
    steps = 0
    epochs = 0
    best_acc = None
    patience = 0
    # Accumulated across ALL steps so the progress bar shows a true running mean.
    total_loss = 0.0

    # Batches are moved to `device` below, so the weights must live there too.
    model.to(device)

    # Pinned host memory only helps (and is only available) with CUDA.
    pin = torch.cuda.is_available()
    train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size, pin_memory=pin)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, shuffle=False, batch_size=batch_size, pin_memory=pin)
    it = iter(train_dataloader)

    xe = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate, betas=(adam_beta1, adam_beta2), eps=adam_epsilon, weight_decay=weight_decay)

    pbar = tqdm(total=max_steps)
    try:
        while steps < max_steps:
            # _evaluate() switches to eval mode, so re-enable training mode each step.
            model.train()

            try:
                batch = next(it)
            except StopIteration:
                # Only iterator exhaustion starts a new epoch; real errors propagate.
                epochs += 1
                it = iter(train_dataloader)
                batch = next(it)

            steps += 1
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = xe(outputs, batch['label'])
            loss.backward()
            # .item() stores a plain float instead of a tensor tied to the graph.
            total_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()

            pbar.set_description('Epoch: %d, Avg batch loss: %.2f' % (epochs, total_loss / steps))
            pbar.update(1)

            if steps % eval_steps == 0:
                eval_dict = _evaluate(model, val_dataloader)
                score = eval_dict[metric_name]

                # early stopping: `is None` so a legitimate 0.0 score is not
                # mistaken for "no best yet"; the counter restarts on improvement
                if best_acc is None or score > best_acc:
                    best_acc = score
                    patience = 0
                else:
                    patience += 1

                if patience >= tolerance:
                    break
    finally:
        pbar.close()

In [15]:
# Fit the current `model`, early-stopping on validation ROC-AUC.
train(model, train_dataset=tokenized_train, val_dataset=tokenized_valid, metric_name='roc_auc')

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  batch_scores = torch.nn.Softmax()(logits)[:,-1]


  0%|          | 0/157 [00:00<?, ?it/s]

  batch_scores = torch.nn.Softmax()(logits)[:,-1]


  0%|          | 0/157 [00:00<?, ?it/s]

  batch_scores = torch.nn.Softmax()(logits)[:,-1]


  0%|          | 0/157 [00:00<?, ?it/s]

  batch_scores = torch.nn.Softmax()(logits)[:,-1]
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
# Tokenise the test split exactly like train/validation.
tokenized_test = test_ds.map(tokenize_function, batched=False)
# Keep `attention_mask` in the exposed columns: the models' forward() takes it,
# and without it `model(**batch)` inside _evaluate() is missing an argument.
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Name kept as `val_dataloader` for compatibility, although it iterates the TEST split.
val_dataloader = torch.utils.data.DataLoader(tokenized_test, shuffle=False, batch_size=64, pin_memory=True)
eval_dict = _evaluate(model, val_dataloader)

  0%|          | 0/1804874 [00:00<?, ?ex/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Per-example positive-class scores from the test evaluation, as a NumPy array.
predictions = eval_dict['predictions'].cpu().numpy()
# np.save does not create missing directories, so make sure `scores/` exists first.
os.makedirs('scores', exist_ok=True)
# NOTE(review): the file is named 'cbow', but `model` was last bound to SentimentNet —
# confirm the intended file name.
np.save('scores/cbow', predictions)

In [None]:
# torch.save requires a destination; calling it with only the object raises TypeError.
# The state_dict is saved rather than the pickled module, so the checkpoint can be
# reloaded without the notebook-defined class being importable under the same path.
torch.save(model.state_dict(), 'model.pt')