In [3]:
import copy
import os
import random
import time

import numpy as np
import pandas as pd
import sklearn
import torch
import torch.optim as optim
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import MultiLabelBinarizer
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, ConcatDataset

# !pip install transformers
import transformers
from transformers import BertTokenizer, BertModel, BertConfig, AdamW
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig
from transformers import pipeline



# Reproducibility: seed every random number generator the notebook touches
# (Python, NumPy, PyTorch on CPU and on all CUDA devices) with the same value.
random.seed(25)
np.random.seed(25)
torch.manual_seed(25)
torch.cuda.manual_seed_all(25)

# Turn off cuDNN benchmarking and request deterministic cuDNN algorithms so
# GPU runs do not vary with the kernels picked at runtime.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True




In [5]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Notebook-wide configuration: pretrained checkpoint, tokenisation, batching,
# training length and cross-validation settings used by the cells below.

# --- Pretrained checkpoint and tokenizer ---
model_type = 'distilbert'
model_version = 'distilbert-base-uncased'
do_lower_case = True
tokenizer = DistilBertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

# Passed to instantiate_model() as its layer-lock argument.
freeze_layer_count = 11

# --- Sequence length and batching ---
MAX_LEN = 128           # author's note: 128 works ok
TRAIN_BATCH_SIZE = 6    # author's note: 10 and 32 also work
VALID_BATCH_SIZE = 6

# --- Training schedule ---
EPOCHS = 7              # author's note: 15 works (14)
LEARNING_RATE = None    # handed to the optimizer as `lr`; 1e-05 was tried earlier

# --- K-fold cross-validation ---
k_folds = 15            # author's note: 10 was tried earlier

# For fold results: filled by the cross-validation cell, keyed by fold index.
results = {}


In [6]:

# Hand-built DistilBERT configuration (the author's "second set" of
# hyper-parameters).
# NOTE(review): DistilBertClass loads the "distilbert-base-uncased" checkpoint
# with from_pretrained(), and instantiate_model() does not read its `config`
# argument, so these values do not appear to reach the model - confirm intent.
config = transformers.DistilBertConfig(
    # vocabulary / embeddings
    vocab_size=30522,
    pad_token_id=0,
    max_position_embeddings=1024,
    sinusoidal_pos_embds=False,
    initializer_range=0.02,
    # transformer stack
    n_layers=6,
    n_heads=6,
    dim=768,
    hidden_dim=3072,
    activation='gelu',
    # dropout rates
    dropout=0.1,
    attention_dropout=0.1,
    qa_dropout=0.1,
    seq_classif_dropout=0.2,
    # runtime / output options
    use_cache=True,
    output_hidden_states=True,
)

In [7]:
class DistilBertClass(torch.nn.Module):
    """Frozen DistilBERT encoder followed by a small trainable multi-label head.

    The hidden state of the first token is optionally concatenated with
    auxiliary "hate" and "sentiment" feature vectors, passed through
    Linear -> GELU -> Dropout, and projected to one logit per label.

    Args:
        concat: Which features feed the head.
            1 (or any other value): encoder embedding only,
            2: embedding + hate,
            3: embedding + hate + sentiment (default),
            4: embedding + sentiment.
        num_labels: Number of output logits.
        model_name: Checkpoint passed to ``DistilBertModel.from_pretrained``.
        hate_dim: Width of the ``hate`` feature vector.
        sent_dim: Width of the ``sent`` feature vector.
        hidden_units: Width of the intermediate fully connected layer.
        dropout: Dropout probability applied after the GELU activation.

    All defaults reproduce the previously hard-coded values (input widths
    768 / 771 / 773 / 770 for concat modes 1 / 2 / 3 / 4).
    """

    def __init__(self, concat=3, num_labels=22, model_name="distilbert-base-uncased",
                 hate_dim=3, sent_dim=2, hidden_units=1024, dropout=0.2):
        super(DistilBertClass, self).__init__()

        self.concat = concat

        # Pretrained encoder; every encoder weight is frozen so only the new
        # head layers below receive gradient updates.
        self.l1 = transformers.DistilBertModel.from_pretrained(model_name)
        for param in self.l1.parameters():
            param.requires_grad = False

        # Head input width = encoder hidden size + width of the appended features.
        extra_width = {2: hate_dim, 3: hate_dim + sent_dim, 4: sent_dim}.get(self.concat, 0)
        head_input_dim = self.l1.config.dim + extra_width

        self.pre_classifier = torch.nn.Linear(head_input_dim, hidden_units)
        self.classifier = torch.nn.Linear(hidden_units, num_labels)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, token_type_ids, hate, sent):
        """Compute label logits for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            token_type_ids: Accepted for call compatibility; not used.
            hate: Auxiliary feature tensor, read when ``concat`` is 2 or 3.
            sent: Auxiliary feature tensor, read when ``concat`` is 3 or 4.

        Returns:
            ``(logits, encoder_output)``: the raw (un-activated) classifier
            output and the unmodified output object of the encoder.
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)

        # First element of the encoder output is the last hidden state; keep
        # only the first token's vector for each sequence.
        hidden_state = output_1[0]
        first_token = hidden_state[:, 0]

        if self.concat == 2:
            extras = (hate,)
        elif self.concat == 3:
            extras = (hate, sent)
        elif self.concat == 4:
            extras = (sent,)
        else:
            extras = ()

        if extras:
            # The auxiliary features may arrive as integer tensors; cast them to
            # the embedding's dtype/device so the concatenation never depends on
            # implicit type promotion.
            aligned = [
                extra.to(device=first_token.device, dtype=first_token.dtype)
                for extra in extras
            ]
            features = torch.cat([first_token] + aligned, dim=1)
        else:
            features = first_token

        fc_output = self.pre_classifier(features)
        fc_output = torch.nn.functional.gelu(fc_output)
        fc_output = self.dropout(fc_output)

        # No activation here: the logits are returned as-is.
        output = self.classifier(fc_output)

        return output, output_1




In [10]:

def instantiate_model(config, lock_layer_count):
    """Create a DistilBertClass and print how many of its parameters are trainable.

    Args:
        config: Accepted for call compatibility; this function does not read it.
        lock_layer_count: Accepted for call compatibility; this function does
            not read it (freezing is done inside DistilBertClass itself).

    Returns:
        The newly constructed DistilBertClass instance.
    """
    model = DistilBertClass()

    # Count only the parameters that will actually be updated during training.
    trainable_sizes = [p.numel() for p in model.parameters() if p.requires_grad]
    print("Total Learnable Parms: " + str(sum(trainable_sizes)))

    return model


# Binary cross-entropy on raw logits: the sigmoid is applied inside the loss.
# https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html

def loss_fn(outputs, targets):
    """Return the mean binary cross-entropy between logits and multi-hot targets.

    Args:
        outputs: Raw (pre-sigmoid) logits.
        targets: Tensor of the same shape holding 0/1 labels. Integer or
            boolean tensors are accepted; they are cast to the dtype of
            ``outputs`` because the loss itself requires floating-point targets.

    Returns:
        A scalar tensor with the loss averaged over all elements.
    """
    targets = targets.to(dtype=outputs.dtype)
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)



In [11]:
# Build the classifier; the trainable-parameter count is printed as a side effect.
model = instantiate_model(config=config, lock_layer_count=freeze_layer_count)


Total Learnable Parms: 815126


In [12]:
# Show the names of the parameters that still require gradients, one per line.
trainable_parameter_names = (
    name for name, param in model.named_parameters() if param.requires_grad
)
for trainable_name in trainable_parameter_names:
    print(trainable_name)


pre_classifier.weight
pre_classifier.bias
classifier.weight
classifier.bias


In [None]:
# Move the model onto the selected device. As the last expression of the cell,
# the returned module is also displayed as the cell output.
model.to(device)

In [None]:
# Adafactor optimizer, used here in place of AdamW.
# Parameter notes follow the Hugging Face `transformers.Adafactor` documentation.
optimizer = transformers.Adafactor(
    model.parameters(),
    # --- learning-rate control ---
    # With relative_step=True Adafactor derives its own step size, so `lr`
    # stays at LEARNING_RATE (None in the config cell).
    lr=LEARNING_RATE,
    relative_step=True,
    warmup_init=True,       # the self-computed learning rate starts with a warm-up
    scale_parameter=True,   # scale the learning rate by the RMS of each parameter
    # --- update rule ---
    beta1=None,             # coefficient of the gradient running average; None disables it
    decay_rate=-0.8,        # coefficient of the squared-gradient running average
    eps=(1e-30, 1e-3),      # regularisation constants (squared gradient, parameter scale)
    clip_threshold=1.0,     # threshold on the RMS of the final update
    weight_decay=0.0002,    # L2 penalty, to limit overfitting
)

In [None]:
# Shuffled K-fold splitter. random_state is pinned to the same seed used for the
# other RNGs so the fold assignment no longer depends on how many random numbers
# were drawn from NumPy's global generator before this cell ran.
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=25)

In [None]:
# Report how many rows the training and validation frames contain.
print(f"Train Dataset: {len(df_training)}")
print(f"Val Dataset: {len(df_val)}")

In [None]:
def train_model(model, trn_loader, tst_loader, optimizer, num_epochs=5):
    """Train ``model`` with automatic mixed precision and evaluate it every epoch.

    Each epoch runs a training pass over ``trn_loader`` followed by an
    evaluation pass over ``tst_loader``. The weights with the best evaluation
    accuracy are restored into ``model`` once all epochs have finished, and a
    checkpoint is written to ``./model.pkl``.

    Every batch must be a mapping with the keys ``'ids'``, ``'mask'``,
    ``'hate'``, ``'sentiment'`` and ``'targets'``. The function relies on the
    notebook-level names ``device``, ``loss_fn``, ``metrics`` and ``wandb``.

    Args:
        model: Module called as ``model(ids, mask, None, hate, sent)`` and
            returning ``(logits, encoder_output)``.
        trn_loader: Iterable of training batches.
        tst_loader: Iterable of evaluation batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of epochs to run.

    Returns:
        ``(model, losses_train, losses_val, accuracy_train, accuracy_val,
        model_output_object)`` where the four lists hold one value per epoch
        and ``model_output_object`` is the encoder output of the last batch
        processed (``None`` if no batch was processed).
    """
    # Mixed precision: the scaler multiplies the loss before backward() so that
    # small float16 gradients do not underflow, and skips steps with inf/NaN.
    scaler = torch.cuda.amp.GradScaler()
    torch.cuda.empty_cache()
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    losses_train, losses_val = [], []
    accuracy_train, accuracy_val = [], []

    max_grad_norm = 1.25
    decision_threshold = 0.5  # applied to sigmoid probabilities, not raw logits

    # Defined up front so the checkpoint/return below also work for num_epochs == 0.
    epoch = -1
    epoch_loss = None
    model_output_object = None

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 20)

        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model.train()
                loader = trn_loader
            else:
                model.eval()
                loader = tst_loader

            running_loss = 0.0
            samples_seen = 0
            epoch_targets = []
            epoch_predictions = []

            for data in loader:
                ids = data['ids'].to(device, dtype=torch.long)
                mask = data['mask'].to(device, dtype=torch.long)
                hate = data['hate'].to(device, dtype=torch.long)
                sent = data['sentiment'].to(device, dtype=torch.long)
                labels = data['targets'].to(device)

                optimizer.zero_grad()

                # Only build the autograd graph while training.
                with torch.set_grad_enabled(is_training):
                    with torch.cuda.amp.autocast():
                        preds, model_output_object = model(ids, mask, None, hate, sent)
                        loss = loss_fn(preds, labels)

                if is_training:
                    try:
                        scaler.scale(loss).backward()
                        # Unscale first so clipping sees the true gradient norm.
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                        # scaler.step() calls optimizer.step() itself (and skips
                        # it on inf/NaN gradients); calling optimizer.step()
                        # again would apply every update twice.
                        scaler.step(optimizer)
                        scaler.update()
                    except AssertionError:
                        # Best effort: report and move on to the next batch.
                        print("scaler encountered inf, did not step, will try next iteration.")

                # Statistics over the whole epoch, weighted by the real batch size.
                batch_size = ids.size(0)
                running_loss += loss.item() * batch_size
                samples_seen += batch_size

                probabilities = torch.sigmoid(preds.detach().float())
                epoch_predictions.append(
                    (probabilities > decision_threshold).cpu().numpy().astype(int)
                )
                epoch_targets.append(
                    (labels.detach().cpu().numpy() > decision_threshold).astype(int)
                )

            if samples_seen:
                epoch_loss = running_loss / samples_seen
                epoch_acc = metrics.accuracy_score(
                    np.concatenate(epoch_targets), np.concatenate(epoch_predictions)
                )
            else:
                # Empty loader: nothing to average.
                epoch_loss = float('nan')
                epoch_acc = 0.0

            if is_training:
                losses_train.append(epoch_loss)
                accuracy_train.append(epoch_acc)
            else:
                losses_val.append(epoch_loss)
                accuracy_val.append(epoch_acc)
                # Log this epoch's scalars rather than the full history lists.
                wandb.log({"Test Accuracy": epoch_acc, "Test Loss": epoch_loss})

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # Remember the weights of the best evaluation epoch so far.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        internal_lr = optimizer.param_groups[0]["lr"]
        if internal_lr is None:
            internal_lr = 0.00
        print('Adjusted - Learning Rate:  {:.5f}'.format(internal_lr))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # Restore the best weights only after all epochs have run; doing this inside
    # the epoch loop would discard every epoch that did not improve accuracy.
    model.load_state_dict(best_model_wts)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': epoch_loss,
        }, "./model.pkl")
    return model, losses_train, losses_val, accuracy_train, accuracy_val, model_output_object

In [None]:

# K-fold cross-validation: train on each fold split of df_training, save the
# resulting weights, run the hold-out validation and record a per-fold score.

# Snapshot the model/optimizer state as it is when this cell starts, so every
# fold begins from the same point instead of continuing from the previous fold.
initial_model_state = copy.deepcopy(model.state_dict())
initial_optimizer_state = copy.deepcopy(optimizer.state_dict())

for fold, (kfold_cv_train_ids, kfold_cv_test_ids) in enumerate(kfold.split(df_training)):

    print(f'FOLD {fold}')
    print('--------------------------------')

    # Reset to the snapshot: folds must be trained independently of each other.
    model.load_state_dict(initial_model_state)
    optimizer.load_state_dict(initial_optimizer_state)
    torch.cuda.empty_cache()

    kfold_cv_train_loader, kfold_cv_test_loader, val_loader = create_data_loader_kfold(
        df_training, df_val, kfold_cv_train_ids, kfold_cv_test_ids
    )

    model, loss_train, loss_val, accuracy_train, accuracy_val, model_output_object_train = train_model(
        model, kfold_cv_train_loader, kfold_cv_test_loader, optimizer, num_epochs=EPOCHS
    )

    print('Training process has finished. Saving trained model.')
    print('Starting Validation')

    # Save this fold's weights locally and into the wandb run directory.
    save_path = f'./model-fold-{fold}.pth'
    torch.save(model.state_dict(), save_path)
    torch.save(model.state_dict(), os.path.join(wandb.run.dir, save_path))

    true_labels, true_bools, pred_labels, model_output_object_val = validation(model, val_loader)

    # Fold score = mean per-epoch accuracy on the fold's test split
    # (previously the validation *loss* was averaged and reported as accuracy).
    ave = sum(accuracy_val) / len(accuracy_val) if accuracy_val else 0.0
    print('Accuracy for fold %d: %d %%' % (fold, 100.0 * ave))
    print('--------------------------------')
    results[fold] = 100.0 * ave

# Print fold results
print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
print('--------------------------------')
total = 0.0
for key, value in results.items():
    print(f'Fold {key}: {value} %')
    total += value
print(f'Average: {total/len(results.items())} %')




    