In [1]:
!pip install keras matplotlib transformers numpy torch sklearn nltk pytorch-pretrained-bert pytorch-nlp

Collecting pytorch-pretrained-bert
  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 3.0MB/s eta 0:00:01
[?25hCollecting pytorch-nlp
  Downloading https://files.pythonhosted.org/packages/4f/51/f0ee1efb75f7cc2e3065c5da1363d6be2eec79691b2821594f3f2329528c/pytorch_nlp-0.5.0-py3-none-any.whl (90kB)
[K    100% |████████████████████████████████| 92kB 3.1MB/s eta 0:00:01
Collecting boto3 (from pytorch-pretrained-bert)
  Downloading https://files.pythonhosted.org/packages/aa/98/e9459d65ad8ab27886bf9d86a537e6f65b5bdbc4c7de68ba45b524ef74a1/boto3-1.17.52-py2.py3-none-any.whl (131kB)
[K    100% |████████████████████████████████| 133kB 3.0MB/s eta 0:00:01
Collecting s3transfer<0.4.0,>=0.3.0 (from boto3->pytorch-pretrained-bert)
  Downloading https://files.pythonhosted.org/packages/00/89/0cb4e92c239e6425b9b0035227b8cdf9d

In [1]:
from pytorch_lightning.loggers import TensorBoardLogger

ModuleNotFoundError: No module named 'pytorch_lightning'

In [1]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 4 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


# Pre-processing Code

In [3]:
%%time

# Import PushIO CSV
import pandas as pd

def get_pushio_dataset(path=""):
    """Load the PushIO CSV as the negative (label 0) examples.

    Args:
        path: CSV file to read. A falsy value (the default empty string)
            falls back to the project's default dump location.

    Returns:
        DataFrame with a string ``data`` column (the CSV's ``body`` column,
        renamed) and a ``label`` column filled with 0.
    """
    csv_path = path or "/bigtemp/rm5tx/nlp_project/2016-05_all.csv"
    comments = pd.read_csv(csv_path, usecols=['body'], dtype="string")

    # Share one column name with the positive set so both can be concatenated.
    negatives = comments.rename(columns={"body": "data"})
    negatives["label"] = 0
    return negatives

CPU times: user 506 ms, sys: 97.1 ms, total: 603 ms
Wall time: 5.8 s


In [4]:
%%time
# Reddit Norm Violations
import os
import re

def get_rnv_dataset(path=""):
    """Load the Reddit Norm Violations corpus as the positive (label 1) examples.

    Every line of every file found under the directory tree becomes one row.
    Each row is then normalised: newlines replaced by spaces, lower-cased, and
    reduced to its ``\\w+`` runs joined by single spaces.

    Args:
        path: Root directory to walk. A falsy value (the default empty string)
            falls back to the project's default corpus location.

    Returns:
        DataFrame with a ``data`` column holding the cleaned text and a
        ``label`` column filled with 1. An empty directory tree yields an
        empty frame that still has both columns.
    """
    directory = os.path.abspath(
        path or "/bigtemp/rm5tx/nlp_project/reddit-norm-violations/data/macro-norm-violations/"
    )

    lines = []
    for root, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary filesystem order; sorting keeps the
        # row order (and therefore any seeded shuffle applied later) reproducible.
        dirs.sort()
        for file_name in sorted(files):
            with open(os.path.join(root, file_name)) as f:
                lines.extend(f.readlines())

    # Name the column up front so an empty corpus still produces a "data" column.
    pos_data = pd.DataFrame({"data": pd.Series(lines, dtype="string")})
    pos_data["label"] = 1

    # RNV uses a special preprocess step
    print("Preprocessing... 1. split new lines, 2. convert to lowercase, and 3. strip numbers and punct")
    ### 1) remove newlines
    pos_data['data'] = pos_data['data'].replace('\n', ' ', regex=True)

    ## 2) convert to lowercase
    pos_data['data'] = pos_data['data'].str.lower()

    ### 3) keep only word-character runs:
    # https://stackoverflow.com/questions/47947438/preprocessing-string-data-in-pandas-dataframe
    # NOTE: \w matches letters, digits and underscore, so punctuation is removed
    # but digits are kept, despite what the message above says.
    word_pattern = re.compile(r'\w+')  # raw string: '\w' is an invalid escape in a plain literal
    pos_data["data"] = pos_data["data"].apply(lambda text: " ".join(word_pattern.findall(text)))
    return pos_data

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 7.63 µs


In [5]:
def concat_datasets(data_a, data_b):
    """Stack two frames vertically and drop every row that contains a missing value.

    The original row labels of both inputs are kept, so the result's index may
    contain duplicates.
    """
    return pd.concat([data_a, data_b]).dropna()

In [12]:
%%time
from transformers import BertTokenizerFast, BertForSequenceClassification

# Checkpoint name the shared tokenizer is loaded from.
MODEL_NAME = "bert-base-uncased"
# Token length every example is truncated/padded to in this notebook. This is a
# project choice, well below the 512 default of tokenize_datasets().
MAX_LEN = 32  # tokens per example after truncation/padding
# Single tokenizer instance reused for the train/validation/test splits.
TOKENIZER = BertTokenizerFast.from_pretrained(MODEL_NAME, do_lower_case=True)

def tokenize_datasets(X_dataset, tokenizer, max_len=512):
    """Encode each text in ``X_dataset`` with ``tokenizer.encode``.

    Args:
        X_dataset: Iterable of texts.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            max_length=..., truncation=...)``.
        max_len: Passed to the tokenizer as ``max_length``; longer inputs are
            truncated by the tokenizer.

    Returns:
        List with one ``tokenizer.encode`` result per input text, in order.
        Nothing is padded here.
    """
    def _encode(text):
        return tokenizer.encode(
            text,
            add_special_tokens=True,  # ask for the '[CLS]' and '[SEP]' tokens
            max_length=max_len,       # truncate sentences
            truncation=True,
        )

    return [_encode(text) for text in X_dataset]

CPU times: user 177 ms, sys: 6.78 ms, total: 184 ms
Wall time: 565 ms


In [13]:
%%time
# It appears the CS server doesn't have TF version 2.2,
# so we cannot use the convenient pad_sequences from Keras.

def trunc_n_pad(input_id_list, max_len=None, pad_id=0):
    """Force every token-id sequence to exactly ``max_len`` entries.

    Longer sequences are cut on the right; shorter ones are right-padded with
    ``pad_id``.

    Args:
        input_id_list: Iterable of token-id sequences.
        max_len: Target length. ``None`` (the default) uses the module-level
            ``MAX_LEN``, which is what the original implementation always did.
        pad_id: Value used for padding (default 0).

    Returns:
        A new list of new lists, each of length ``max_len``; the inputs are
        never returned by reference.
    """
    if max_len is None:
        max_len = MAX_LEN
    # A negative repeat count yields [], so over-long inputs get no padding.
    return [
        list(ids[:max_len]) + [pad_id] * (max_len - len(ids))
        for ids in input_id_list
    ]

CPU times: user 10 µs, sys: 1e+03 ns, total: 11 µs
Wall time: 17.6 µs


In [14]:
# Create attention masks
def create_attention_masks(input_ids):
    """Build one mask per sequence: 1.0 where the token id is > 0, else 0.0.

    Padding is expected to use id 0, so padded positions come out as 0.0.
    """
    return [[float(token_id > 0) for token_id in seq] for seq in input_ids]

In [15]:
import numpy as np

def accurate_nb(preds, labels):
    """Count how many rows of ``preds`` have their arg-max equal to the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of integer class labels; flattened before comparison.

    Returns:
        Number of correct predictions (a count, not a ratio).
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    hits = predicted_classes == labels.flatten()
    return np.sum(hits)

In [16]:
# Mini-batch sizes for the train / validation / test DataLoaders built in main().
TRAIN_BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

# Optimiser settings read by main().
# NOTE(review): 0.1 looks orders of magnitude above the learning rates usually
# used to fine-tune BERT (around 2e-5 to 5e-5) — confirm this is intentional.
LEARNING_RATE = 0.1
EPOCHS = 3  # number of passes over the training DataLoader
WEIGHT_DECAY = 0.2  # weight-decay coefficient handed to the optimiser's parameter groups

# Run seed; main() embeds it in the checkpoint directory name. The dataset
# shuffle in main() uses its own fixed random_state.
SEED = 7

In [17]:
%%time
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler
from torch import nn
from tqdm import trange 

def _make_dataloader(texts, labels, batch_size, shuffle=False):
    """Tokenise, pad and mask ``texts`` and wrap them with ``labels`` in a DataLoader.

    Args:
        texts: pandas Series of texts.
        labels: pandas Series of integer labels aligned with ``texts``.
        batch_size: Mini-batch size for the loader.
        shuffle: Use a RandomSampler when True, a SequentialSampler otherwise.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` tensor batches.
    """
    # Tokenise (not padded yet), then force every sequence to MAX_LEN tokens.
    input_ids = trunc_n_pad(tokenize_datasets(texts, TOKENIZER, MAX_LEN))
    attention_masks = create_attention_masks(input_ids)

    # Convert everything into torch tensors, the datatype the model requires.
    tensor_data = TensorDataset(
        torch.tensor(input_ids),
        torch.tensor(attention_masks),
        torch.tensor(labels.values.tolist()),
    )
    sampler = RandomSampler(tensor_data) if shuffle else SequentialSampler(tensor_data)
    return DataLoader(tensor_data, sampler=sampler, batch_size=batch_size)


def _evaluate_accuracy(model, dataloader):
    """Return the fraction of examples in ``dataloader`` that ``model`` classifies correctly.

    Puts the model in evaluation mode and runs without gradient tracking.
    Returns 0.0 when the loader yields no examples.
    """
    model.eval()
    n_correct = 0
    n_examples = 0
    for batch in dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        # No gradients are needed for evaluation; this saves memory and time.
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
        # Only CPU copies are kept, so no GPU tensors outlive the batch.
        n_correct += accurate_nb(logits.detach().cpu().numpy(), b_labels.cpu().numpy())
        n_examples += b_labels.size(0)
    return n_correct / n_examples if n_examples else 0.0


def main(subset_size=1000, dataset_name="rnv-pushio"):
    """Fine-tune BERT to separate norm-violating comments from ordinary ones.

    Loads both corpora, splits them 60/20/20 into train/validation/test,
    trains for ``EPOCHS`` epochs and prints the train loss plus validation and
    test accuracy after every epoch. The model is saved whenever validation
    accuracy improves.

    Args:
        subset_size: Maximum number of rows kept from each split (a small
            debugging subset by default). Pass ``None`` to use the full splits.
        dataset_name: Label used as a sub-directory of ``./model_save`` for
            the saved checkpoint.
    """
    # Seed the RNGs so weight initialisation and batch shuffling are repeatable.
    torch.manual_seed(SEED)
    np.random.seed(SEED)

    ###
    # Preprocessing Data
    ###
    neg_data = get_pushio_dataset()
    pos_data = get_rnv_dataset()
    dataset = concat_datasets(neg_data, pos_data)

    # 60% - train set, 20% - validation set, 20% - test set
    shuffled = dataset.sample(frac=1, random_state=42)
    train_end = int(.6 * len(shuffled))
    val_end = int(.8 * len(shuffled))
    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:val_end]
    test = shuffled.iloc[val_end:]

    # NOTE: the default subset_size is a small subset used for testing.
    if subset_size is not None:
        train = train.iloc[:subset_size]
        validate = validate.iloc[:subset_size]
        test = test.iloc[:subset_size]

    ###
    # Tokenization and DataLoaders
    ###
    train_dataloader = _make_dataloader(train["data"], train["label"], TRAIN_BATCH_SIZE, shuffle=True)
    validation_dataloader = _make_dataloader(validate["data"], validate["label"], VAL_BATCH_SIZE)
    prediction_dataloader = _make_dataloader(test["data"], test["label"], TEST_BATCH_SIZE)

    ###
    # Model And Param Optim.
    ###
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        model = nn.DataParallel(model)
    model.to(device)

    # Biases and LayerNorm weights are conventionally excluded from weight decay.
    # The parameter-group key must be 'weight_decay'; torch optimisers silently
    # ignore unknown keys such as 'weight_decay_rate'.
    no_decay = ['bias', 'LayerNorm.weight']
    named_params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
         'weight_decay': WEIGHT_DECAY},
        {'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    # AdamW applies decoupled weight decay.
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=1e-9)
    # The scheduler is stepped with validation accuracy, so a plateau means "stopped increasing".
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.1)

    best_val = -np.inf

    # trange is a tqdm wrapper around the normal python range
    for epoch in trange(EPOCHS, desc="Epoch"):
        # Training
        model.train()
        tr_loss = 0.0
        nb_tr_steps = 0

        for batch in train_dataloader:
            # Move the batch to the selected device and unpack it.
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            # Clear gradients from the previous step; otherwise they accumulate.
            optimizer.zero_grad()

            loss_ce = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
            # DataParallel returns one loss per replica; mean() is a no-op on a scalar.
            loss_ce = loss_ce.mean()
            loss_ce.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient
            optimizer.step()

            tr_loss += loss_ce.item()
            nb_tr_steps += 1

        print("Train cross entropy loss: {}".format(tr_loss / max(nb_tr_steps, 1)))

        # Validation
        eval_accuracy = _evaluate_accuracy(model, validation_dataloader)
        print("Validation Accuracy: {}".format(eval_accuracy))
        scheduler.step(eval_accuracy)

        if eval_accuracy > best_val:
            # Build the path from a dataset *name*; formatting the DataFrame
            # itself would put its whole printed repr into the path.
            output_dir = './model_save/{}/BERT-base-{}'.format(dataset_name, SEED)
            os.makedirs(output_dir, exist_ok=True)
            print("Saving model to %s" % output_dir)
            # Unwrap DataParallel so the underlying model is what gets saved.
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir)

            best_val = eval_accuracy

        # Test accuracy is reported every epoch for the current weights,
        # which are not necessarily those of the best saved checkpoint.
        test_accuracy = _evaluate_accuracy(model, prediction_dataloader)
        print("Test Accuracy: {}".format(test_accuracy))
# Run the whole pipeline as soon as this cell is executed.
main()

Preprocessing... 1. split new lines, 2. convert to lowercase, and 3. strip numbers and punct


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Let's use 4 GPUs!


RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 10.73 GiB total capacity; 362.40 MiB already allocated; 5.56 MiB free; 392.00 MiB reserved in total by PyTorch)

In [None]:
# NOTE(review): inside a notebook kernel __name__ is "__main__", so this cell
# runs main() again on top of the call made right after main() is defined —
# confirm the second full training run is intended.
if __name__ == "__main__":
    main()