In [9]:
# %%

#!pip install pytorch-lightning 

#!pip install pytorch-lightning transformers keras matplotlib numpy sklearn nltk pytorch-pretrained-bert pytorch-nlp matplotlib sklearn keras
#Lightning Imports
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl

#Other Imports
import pandas as pd
import re
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler
import numpy as np
from transformers import BertForSequenceClassification
# from transformers import BerttokenizerFast
from transformers import AutoTokenizer

from pytorch_lightning.loggers import TensorBoardLogger


# from torch import nn
# from tqdm import trange 

# Possible Bert-Light imports:
# from argparse import ArgumentParser
# from datetime import datetime
# from typing import Optional

# import datasets
# import numpy as np
# import pytorch_lightning as pl
# import torch
# from torch.utils.data import DataLoader
# from transformers import (
#     AdamW,
#     AutoModelForSequenceClassification,
#     AutoConfig,
#     AutoTokenizer,
#     get_linear_schedule_with_warmup,
# )

class ProjData(pl.LightningDataModule):

    def __init__(self, data_dir: str = "~/tmp", batch_size: int = 32, max_len : int = 64, ratio : int = 2):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.max_len = max_len # Bert Max Len input
        self.rat = ratio
        self.name = "a-" + str(max_len) + "-" + str(ratio)

    def setup(self, stage=None):
        
        # *** tokenizer isn't actually a constant and the do_lower_case should be redundant if preprocessing was correct.
        # self.tokenizer = BerttokenizerFast.from_pretrained(MODEL_NAME, do_lower_case=False)
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)


        ###
        # Preprocessing Data
        ###
        print("Getting pos")
        pos_data = self.get_rnv_dataset()
        print("Getting neg")
        neg_data = self.get_pushio_dataset()

        # *** TODO: Proper subset selection either in concat_datasets or in get_pushio_dataset
        print("Joining")
        dataset = self.concat_datasets(pos_data, neg_data)

        # 60% - train set, 20% - validation set, 20% - test set
        train, validate, test = np.split(dataset.sample(frac=1, random_state=42), 
                       [int(.6*len(dataset)), int(.8*len(dataset))])

        X_train, y_train = train["data"], train["label"]
        X_val, y_val = validate["data"], validate["label"]
        X_test, y_test = test["data"], test["label"]    


        

        # NOTE: This is a small subset used for testing... likely will remove in final ver.
        # *** Set to "None" to skip - if there any reason you take the tails of the lists rather than the heads?
        PROTOTYPE_NUM = 1000
        # **** NEED TO SHUFFLE FIRST IF YOU USE THIS
        if PROTOTYPE_NUM:
            X_train = X_train[:PROTOTYPE_NUM * 10]
            y_train = y_train[:PROTOTYPE_NUM * 10]
            X_val = X_val[:PROTOTYPE_NUM]
            y_val = y_val[:PROTOTYPE_NUM]
            X_test = X_test[:PROTOTYPE_NUM]
            y_test = y_test[:PROTOTYPE_NUM]

        ###
        # Tokenization
        ###
        # Convert texts into tokens. (These are not truncated or padded yet)
        print("Tokenizing")
        pre_train_input_ids = self.tokenize_datasets(X_train, self.tokenizer)
        pre_val_input_ids = self.tokenize_datasets(X_val, self.tokenizer)
        pre_test_input_ids = self.tokenize_datasets(X_test, self.tokenizer)
        
        # Truncate and Pad your tokens
        print("Padding")
        train_input_ids = self.trunc_n_pad(pre_train_input_ids)
        val_input_ids = self.trunc_n_pad(pre_val_input_ids)
        test_input_ids = self.trunc_n_pad(pre_test_input_ids)

        ###
        # Misc.
        ###
        # Create attention masks
        print("Creating masks")
        train_attention_masks = self.create_attention_masks(train_input_ids)
        val_attention_masks = self.create_attention_masks(val_input_ids)
        test_attention_masks = self.create_attention_masks(test_input_ids)

        # # Convert all of our data into torch tensors, the required datatype for our model
        train_inputs = torch.tensor(train_input_ids)
        validation_inputs = torch.tensor(val_input_ids)

        train_labels = torch.tensor(y_train.values.tolist())
        validation_labels = torch.tensor(y_val.values.tolist())

        train_masks = torch.tensor(train_attention_masks)
        validation_masks = torch.tensor(val_attention_masks)

        test_inputs = torch.tensor(test_input_ids)
        test_labels = torch.tensor(y_test.values.tolist())

        test_masks = torch.tensor(test_attention_masks)

        # Create an iterator of our data with torch DataLoader. 
        self.train = TensorDataset(train_inputs, train_masks, train_labels)
        self.val = TensorDataset(validation_inputs, validation_masks, validation_labels)
        self.test = TensorDataset(test_inputs, test_masks, test_labels)
    def save(self, path="/bigtemp/rm5tx/nlp_project/data_cache/"):
        path = "/bigtemp/jw6qs/"

        self.tokenizer.save_pretrained(os.path.abspath(path+self.name+"/tokenizer/"))
        torch.save(self.train, open(path+self.name+"train.pt", "wb"))
        torch.save(self.val, open(path+self.name+"val.pt", "wb"))
        torch.save(self.test, open(path+self.name+"test.pt", "wb"))

    def load(self, path="/bigtemp/rm5tx/nlp_project/data_cache/"):
        path = "/bigtemp/jw6qs/"

        self.tokenizer = AutoTokenizer.from_pretrained(os.path.abspath(path+self.name+"/tokenizer/", use_fast=True))
        self.train = torch.load(open(path+self.name+"train.pt", "rb"))
        self.val = torch.load(open(path+self.name+"val.pt", "rb"))
        self.test = torch.load(open(path+self.name+"test.pt", "rb"))     

    # Import PushIO CSV    
    def get_pushio_dataset(path=None):
        # if path:
            # neg_data = pd.read_csv(path, usecols=['body'], dtype="string")
        # else:
        neg_data = pd.read_csv("/bigtemp/rm5tx/nlp_project/2016-05_all.csv", usecols=['body'], dtype="string")
    
    
        # We want a unify col name for when we concat pos and neg data
        neg_data.rename(columns={"body":"data"}, inplace=True)
        neg_data["label"] = 0
        
        # RNV uses a special preprocess step
        print("Preprocessing... 1. split new lines, 2. convert to lowercase, and 3. strip numbers and punct")
        ### 1) remove newlines
        neg_data['data'] = neg_data['data'].replace('\n', ' ', regex = True)

        ## 2) convert to lowercase
        neg_data['data'] = neg_data['data'].str.lower()

#         print("Check HERE", neg_data['data'][0], type(neg_data['data'][0]))
        # ### 3) remove punct and numbers: https://stackoverflow.com/questions/47947438/preprocessing-string-data-in-pandas-dataframe
        neg_data["data"] = neg_data.data.apply(lambda x : " ".join(re.findall('[\w]+',str(x))))
        
        return neg_data

    # Reddit Norm Violations
    def get_rnv_dataset(path=None):
        # if path:
        #     directory = os.path.abspath(path)
        # else:
        directory = os.path.abspath("/bigtemp/rm5tx/nlp_project/reddit-norm-violations/data/macro-norm-violations/")

        pos_temp = []
        for root, dirs, files in os.walk(directory):
            for file in files:
                with open(root+ "/" +file) as f:
                    pos_temp += f.readlines()
        pos_data = pd.DataFrame(data=pos_temp, dtype = "string")
        pos_data.rename(columns={0:"data"}, inplace=True)
        pos_data["label"] = 1
        
        return pos_data

    def concat_datasets(self, data_a, data_b):
        if len(data_a.index) < len(data_b.index):
            data_a, data_b = data_b, data_a
        frames = [data_a, data_b[(self.rat * len(data_a.index)):]]
        print("Using ", len(data_a.index), (self.rat * len(data_a.index)), " samples.")
        dataset = pd.concat(frames)
        dataset.dropna(inplace=True) #???
        return dataset

    ###Pre-processing Code###
    def tokenize_datasets(self, X_dataset, tokenizer):
        input_ids = []
        for sent in X_dataset:
            tokenized_text = tokenizer.encode(
                                            sent,                      # Sentence to encode
                                            add_special_tokens = True, # Add '[CLS]' and '[SEP]' tokens
                                            max_length = self.max_len,      # Truncate senences
                                            truncation=True,
                                            )
            input_ids.append(tokenized_text)
        return input_ids


    # Appears that CS Serv don[t have tf version 2.2]
    # Thus, we cannot use the convenient pad_sequences from keras
    def trunc_n_pad(self, input_id_list):
        ret_list = []
        for input_id in input_id_list:
            if len(input_id) > self.max_len:
                ret_list.append(input_id[:self.max_len])
            elif len(input_id) < self.max_len:
                temp_sublist = input_id + [0] * (self.max_len - len(input_id))
                ret_list.append(temp_sublist)
            else:
                ret_list.append(input_id)
        return ret_list

    # Create attention masks
    def create_attention_masks(self, input_ids):
        attention_masks = []
        for seq in input_ids:
            # Create a mask of 1s for each token followed by 0s for padding
            seq_mask = [float(i>0) for i in seq]
            attention_masks.append(seq_mask)
        return attention_masks

    def train_dataloader(self, batch_size):
        return DataLoader(self.train, batch_size=self.batch_size)

    def val_dataloader(self, batch_size):
        return DataLoader(self.val, batch_size=self.batch_size)

    def test_dataloader(self, batch_size):
        return DataLoader(self.test, batch_size=self.batch_size)

class ProjModel(pl.LightningModule):
    def __init__(
        self,
        # model_name_or_path: str,
        num_labels: int = 2,
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-9,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        # eval_splits: Optional[list] = None,
        **kwargs
        
    ):
        super().__init__()

        self.save_hyperparameters()
        self.model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
        # self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
        # self.metric = ...

    def forward(self, x):
        pred =self.model(x)
        # logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
        # pred = torch.argmax(pred, 1)
        # return pred
        return pred

    def training_step(self, batch, batch_idx):
        b_input_ids, b_input_mask, b_labels = batch
        loss = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
        logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]

        acc = self.accurate_nb(logits, b_labels)

        self.log('train_acc', acc)
        self.log('train_loss', loss)
        return loss

    # *** Old Monitor, NYI -- Do we want to use a scheduler at all?
    # https://pytorch-lightning.readthedocs.io/en/latest/common/optimizers.html#automatic-optimization
    #     # for batch in validation_dataloader:
    #     #     with torch.no_grad():
    #     #         logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0] 

    #     #     tmp_eval_nb = accurate_nb(logits, b_labels)

    #     #     eval_accurate_nb += tmp_eval_nb
    #     #     nb_eval_examples += label_ids.shape[0]
    #     # eval_accuracy = eval_accurate_nb/nb_eval_examples
    #     # print("Validation Accuracy: {}".format(eval_accuracy))
    #     # scheduler.step(eval_accuracy)


    def validation_step(self, batch, batch_idx):
        b_input_ids, b_input_mask, b_labels = batch
        loss = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
        logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
        
        acc = self.accurate_nb(logits, b_labels)

        self.log('valid_acc', acc)
        self.log('valid_loss', loss, on_step=True)


    def test_step(self, batch, batch_idx):
        b_input_ids, b_input_mask, b_labels = batch
        loss = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
        logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]

        acc = self.accurate_nb(logits, b_labels)

        self.log('test_acc', acc)
        self.log('test_loss', loss)

    def configure_optimizers(self):
        ###
        # Param Optim.
        ###
        param_optimizer = list(self.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay_rate': self.hparams.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0}
        ]
        optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
        # return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'eval_acc'}
        return optimizer



    def accurate_nb(self, preds, labels):
        pred_flat = torch.argmax(preds, dim=1).flatten()
        labels_flat = labels.flatten()
        return torch.sum(pred_flat == labels_flat) / labels_flat.shape[0]

In [10]:
def main():
    TRAIN_BATCH_SIZE = 216
    VAL_BATCH_SIZE = 216
    TEST_BATCH_SIZE = 32

    LEARNING_RATE = 0.1
    EPOCHS = 3
    WEIGHT_DECAY = 0.2

    SEED = 7
    
    data = ProjData(max_len=128, ratio=2)
    try:
        data.load()
        print("Loaded Saved Data")
    except:
        data.setup()
        data.save()
    # data.setup()
    # data.save()
    
    model = ProjModel(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    logger = TensorBoardLogger("tb_logs", name="...")

    # trainer = pl.Trainer(logger=logger, accelerator='dp', gpus=4, fast_dev_run=False, weights_summary='full', gradient_clip_val=1.0)
    # trainer = pl.Trainer(logger=logger, accelerator='dp', gpus=4, gradient_clip_val=1.0, max_epochs=4)
    trainer = pl.Trainer(logger=logger, accelerator='dp', gpus=1, gradient_clip_val=1.0, max_epochs=20)
    trainer.fit(model, data.train_dataloader(batch_size=TRAIN_BATCH_SIZE), data.val_dataloader(batch_size=VAL_BATCH_SIZE))


In [11]:
torch.cuda.is_available()

True

In [12]:
if __name__ == '__main__':
    main()

Getting pos
Getting neg
Preprocessing... 1. split new lines, 2. convert to lowercase, and 3. strip numbers and punct
Joining
Using  24234469 48468938  samples.
Tokenizing
Padding
Creating masks


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 0; 10.76 GiB total capacity; 8.20 GiB already allocated; 46.44 MiB free; 8.34 GiB reserved in total by PyTorch)