In [None]:
!pip install keras

In [None]:
from IPython.utils import io

with io.capture_output() as captured:
    !pip install matplotlib transformers numpy torch sklearn nltk pytorch-pretrained-bert pytorch-nlp

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
#If there's a GPU available...
if torch.cuda.is_available():        
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")    
    print('There are %d GPU(s) available.' % torch.cuda.device_count())    
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [4]:
import torch
import numpy as np
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Pre-processing Code

In [5]:
import os

# Import PushIO CSV
neg_data = pd.read_csv("/bigtemp/rm5tx/nlp_project/2016-05_all.csv", usecols=['body'], dtype="string")
neg_data.rename(columns={"body":"data"}, inplace=True)
neg_data["label"] = 0

In [6]:
# Reddit Norm Violations
import pandas as pd
import os

pos_temp = []
directory = os.path.abspath("/bigtemp/rm5tx/nlp_project/reddit-norm-violations/data/macro-norm-violations/")
for root,dirs,files in os.walk(directory):
    for file in files:
        with open(root+ "/" +file) as f:
            pos_temp += f.readlines()
pos_data = pd.DataFrame(data=pos_temp, dtype = "string")
pos_data.rename(columns={0:"data"}, inplace=True)
pos_data["label"] = 1

In [7]:
###preprocessing -
print("Preprocessing... 1. split new lines, 2. convert to lowercase, and 3. strip numbers and punct")

### 1) remove newlines
pos_data['data'] = pos_data['data'].replace('\n', ' ', regex = True)

## 2) convert to lowercase
pos_data['data'] = pos_data['data'].str.lower()

# ### 3) remove punct and numbers: https://stackoverflow.com/questions/47947438/preprocessing-string-data-in-pandas-dataframe
import re
pos_data["data"] = pos_data.data.apply(lambda x : " ".join(re.findall('[\w]+',x)))

Preprocessing... 1. split new lines, 2. convert to lowercase, and 3. strip numbers and punct


In [8]:
frames = [neg_data, pos_data]
dataset = pd.concat(frames)
dataset.dropna(inplace=True)
# print(len(neg_data) + len(pos_data) == len(data))
dataset.head(3)

Unnamed: 0,data,label
0,thats 1 case per 5000 people. nice!,0
1,What's the difference between roast beef and p...,0
2,"Ya, you got tuh stick it to da man!",0


In [10]:
# 60% - train set, 20% - validation set, 20% - test set
train, validate, test = np.split(dataset.sample(frac=1, random_state=42), 
                       [int(.6*len(dataset)), int(.8*len(dataset))])
# train, test = train_test_split(shuffled_dataset, test_size=0.4)
# test, validate = train_test_split(test, test_size = 0.5)

In [11]:
X_train, y_train = train["data"], train["label"]
X_val, y_val = validate["data"], validate["label"]
X_test, y_test = test["data"], test["label"]

In [12]:
X_train = X_train[:1000]
y_train = y_train[:1000]
X_val = X_val[:1000]
y_val = y_val[:1000]
X_test = X_test[:1000]
y_test = y_test[:1000]

In [13]:
MODEL_NAME = "bert-base-uncased"
MAX_LEN = 512    # Bert Max Len input

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME, do_lower_case=True)

pre_train_input_ids = []
pre_val_input_ids = []
pre_test_input_ids = []

for sent in X_train:
    tokenized_text = tokenizer.encode(
                                    sent,                      # Sentence to encode
                                    add_special_tokens = True, # Add '[CLS]' and '[SEP]' tokens
                                    max_length = MAX_LEN,      # Truncate senences
                                    truncation=True,
                                    )
    pre_train_input_ids.append(tokenized_text)
    
for sent in X_val:
    tokenized_text = tokenizer.encode(
                                    sent,                    
                                    add_special_tokens = True, 
                                    max_length = MAX_LEN,
                                    truncation=True,
                                    )
    pre_val_input_ids.append(tokenized_text)

for sent in X_test:
    tokenized_text = tokenizer.encode(
                                    sent,                     
                                    add_special_tokens = True, 
                                    max_length = MAX_LEN,        
                                    truncation=True,
                                    )
    pre_test_input_ids.append(tokenized_text)

In [14]:
len(pre_train_input_ids)

1000

In [15]:
def trunc_n_pad(input_id_list):
    ret_list = []
    for input_id in input_id_list:
        if len(input_id) > MAX_LEN:
            ret_list.append(input_id[:MAX_LEN])
        elif len(input_id) < MAX_LEN:
            temp_sublist = input_id + [0] * (MAX_LEN - len(input_id))
            ret_list.append(temp_sublist)
        else:
            ret_list.append(input_id)
    return ret_list

In [16]:
# Appears that CS Serv don[t have tf version 2.2]
# from keras.preprocessing.sequence import pad_sequences

# # Pad our input tokens
# train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# val_input_ids = pad_sequences(val_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

train_input_ids = trunc_n_pad(pre_train_input_ids)
val_input_ids = trunc_n_pad(pre_val_input_ids)
test_input_ids = trunc_n_pad(pre_test_input_ids)

# Create attention masks
train_attention_masks = []
val_attention_masks = []
test_attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in train_input_ids:
    seq_mask = [float(i>0) for i in seq]
    train_attention_masks.append(seq_mask)
for seq in val_input_ids:
    seq_mask = [float(i>0) for i in seq]
    val_attention_masks.append(seq_mask)
for seq in test_input_ids:
    seq_mask = [float(i>0) for i in seq]
    test_attention_masks.append(seq_mask)

In [17]:
# print(train_input_ids[0])
print(len(train_input_ids))

1000


In [18]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_input_ids)
validation_inputs = torch.tensor(val_input_ids)

train_labels = torch.tensor(y_train.values.tolist())
validation_labels = torch.tensor(y_val.values.tolist())

train_masks = torch.tensor(train_attention_masks)
validation_masks = torch.tensor(val_attention_masks)

test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(y_test.values.tolist())

test_masks = torch.tensor(test_attention_masks)

In [19]:
print(len(train_inputs), len(train_masks), len(train_labels))

1000 1000 1000


In [20]:
# Create an iterator of our data with torch DataLoader. 
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
prediction_data = TensorDataset(test_inputs, test_masks, test_labels)

# dataset_dir = 'dataset/{}'.format(args.dataset)
# if not os.path.exists(dataset_dir):
#     os.makedirs(dataset_dir)

# torch.save(train_data, dataset_dir+'/train.pt')
# torch.save(validation_data, dataset_dir+'/val.pt')
# torch.save(prediction_data, dataset_dir+'/test.pt')

In [23]:
print(train_sampler)

<torch.utils.data.sampler.RandomSampler object at 0x7fe5d2577850>


In [24]:
TRAIN_BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=VAL_BATCH_SIZE)

prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=TEST_BATCH_SIZE)

In [26]:
from torch import nn

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    model = nn.DataParallel(model)
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Let's use 4 GPUs!


DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [30]:
learning_rate = 0.1
epochs = 3
weight_decay = 0.2

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate': weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.0}
]


optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=learning_rate, eps=1e-9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
t_total = len(train_dataloader) * epochs
# Store our loss and accuracy for plotting

best_val = -np.inf

In [32]:
seed = 7

# trange is a tqdm wrapper around the normal python range
for epoch in trange(epochs, desc="Epoch"): 
# Training
    # Set our model to training mode (as opposed to evaluation mode)
    # Tracking variables
    tr_loss =  0
    nb_tr_examples, nb_tr_steps = 0, 0
    model.train()

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        loss_ce = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
        if torch.cuda.device_count() > 1:
            loss_ce = loss_ce.mean()
        loss_ce.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient
        optimizer.step()

        # Update tracking variables
        tr_loss += loss_ce.item()

        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train cross entropy loss: {}".format(tr_loss/nb_tr_steps))
    
    # Validation
    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()
    # Tracking variables 
    eval_accurate_nb = 0
    nb_eval_examples = 0
    logits_list = []
    labels_list = []

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
        # Forward pass, calculate logit predictions
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0] 
            logits_list.append(logits)
            labels_list.append(b_labels)
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_nb = accurate_nb(logits, label_ids)

        eval_accurate_nb += tmp_eval_nb
        nb_eval_examples += label_ids.shape[0]
    eval_accuracy = eval_accurate_nb/nb_eval_examples
    print("Validation Accuracy: {}".format(eval_accuracy))
    scheduler.step(eval_accuracy)

    logits_ece = torch.cat(logits_list)
    labels_ece = torch.cat(labels_list)
    ece = ece_criterion(logits_ece, labels_ece).item()
    print('ECE on val data: {}'.format(ece))


    if eval_accuracy > best_val:
        dirname = '{}/BERT-base-{}'.format(dataset, seed)

        output_dir = './model_save/{}'.format(dirname)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print("Saving model to %s" % output_dir)
        model_to_save = model.module if hasattr(model, 'module') else model 
        model_to_save.save_pretrained(output_dir)   
        #tokenizer.save_pretrained(output_dir)

        best_val = eval_accuracy

# ##### test model on test data
    # Put model in evaluation mode
    model.eval()
    # Tracking variables 
    eval_accurate_nb = 0
    nb_test_examples = 0
    logits_list = []
    labels_list = []
    # Predict 
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
            logits_list.append(logits)
            labels_list.append(b_labels)
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_nb = accurate_nb(logits, label_ids)
        eval_accurate_nb += tmp_eval_nb
        nb_test_examples += label_ids.shape[0]

    print("Test Accuracy: {}".format(eval_accurate_nb/nb_test_examples))

    logits_ece = torch.cat(logits_list)
    labels_ece = torch.cat(labels_list)
    ece = ece_criterion(logits_ece, labels_ece).item()

    print('ECE on test data: {}'.format(ece))

Epoch:   0%|          | 0/3 [00:23<?, ?it/s]


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 1501, in forward
    outputs = self.bert(
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 971, in forward
    encoder_outputs = self.encoder(
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 568, in forward
    layer_outputs = layer_module(
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 456, in forward
    self_attention_outputs = self.attention(
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 387, in forward
    self_outputs = self.self(
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/u/lab/jw6qs/.conda/envs/ml_proj/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py", line 309, in forward
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 10.76 GiB total capacity; 2.08 GiB already allocated; 72.44 MiB free; 2.12 GiB reserved in total by PyTorch)


In [None]:
# Old Code below... delete soonish???

In [None]:
# # Function to calculate the accuracy of our predictions vs labels
# def accuracy(preds, labels):
#     pred_flat = np.argmax(preds, axis=1).flatten()
#     labels_flat = labels.flatten()
#     return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
# t = [] 

# # Store our loss and accuracy for plotting
# train_loss_set = []

# # Number of training epochs (authors recommend between 2 and 4)
# epochs = 4

# # trange is a tqdm wrapper around the normal python range
# for _ in trange(epochs, desc="Epoch"):
  
  
#   # Training
  
#   # Set our model to training mode (as opposed to evaluation mode)
#   model.train()
  
#   # Tracking variables
#   tr_loss = 0
#   nb_tr_examples, nb_tr_steps = 0, 0
  
#   # Train the data for one epoch
#   for step, batch in enumerate(train_dataloader):
#     # Add batch to GPU
#     batch = tuple(t.to(device) for t in batch)
#     # Unpack the inputs from our dataloader
#     b_input_ids, b_input_mask, b_labels = batch
#     # Clear out the gradients (by default they accumulate)
#     optimizer.zero_grad()
#     # Forward pass
#     loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
#     train_loss_set.append(loss.item())    
#     # Backward pass
#     loss.backward()
#     # Update parameters and take a step using the computed gradient
#     optimizer.step()
    
    
#     # Update tracking variables
#     tr_loss += loss.item()
#     nb_tr_examples += b_input_ids.size(0)
#     nb_tr_steps += 1

#   print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
#   # Validation

#   # Put model in evaluation mode to evaluate loss on the validation set
#   model.eval()

#   # Tracking variables 
#   eval_loss, eval_accuracy = 0, 0
#   nb_eval_steps, nb_eval_examples = 0, 0

#   # Evaluate data for one epoch
#   for batch in validation_dataloader:
#     # Add batch to GPU
#     batch = tuple(t.to(device) for t in batch)
#     # Unpack the inputs from our dataloader
#     b_input_ids, b_input_mask, b_labels = batch
#     # Telling the model not to compute or store gradients, saving memory and speeding up validation
#     with torch.no_grad():
#       # Forward pass, calculate logit predictions
#       logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
#     # Move logits and labels to CPU
#     logits = logits.detach().cpu().numpy()
#     label_ids = b_labels.to('cpu').numpy()

#     tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
#     eval_accuracy += tmp_eval_accuracy
#     nb_eval_steps += 1

#   print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

In [None]:
# plt.figure(figsize=(15,8))
# plt.title("Training loss")
# plt.xlabel("Batch")
# plt.ylabel("Loss")
# plt.plot(train_loss_set)
# plt.show()