# Setup
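Click the file icon on the left, then the upload file icon in the panel. Upload `implicit_hate_train.csv`, `implicit_hate_dev.csv`, and `implicit_hate_test.csv`.

**Note:** the loading cell below reads these three files from the Google Drive directory stored in `folder`, so they must be present in that directory (or `folder` must be changed to point at the upload location); otherwise the load fails with `FileNotFoundError`.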

In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive/ so the project folder configured below is readable from this runtime.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd

In [3]:
from imblearn.under_sampling import RandomUnderSampler

In [4]:
# Drive directory holding the project files. The CSV paths below are built by plain string
# concatenation (folder + file name), so the trailing slash is required.
folder = '/content/drive/MyDrive/CIS 530 project/'

In [5]:
import sys
# Put the project folder on the import path (presumably so the later `from evaluation import *`
# can find a module stored there - confirm).
sys.path.append(folder)

In [6]:
# Integer encoding of the three top-level classes, plus the inverse mapping for decoding.
major_label_2_idx = dict(not_hate=0, implicit_hate=1, explicit_hate=2)
major_idx_2_label = {index: label for label, index in major_label_2_idx.items()}

# One DataFrame per split, read from the project folder.
major_class_train_data = pd.read_csv(folder+"implicit_hate_train.csv")
major_class_dev_data = pd.read_csv(folder+"implicit_hate_dev.csv")
major_class_test_data = pd.read_csv(folder+"implicit_hate_test.csv")

# Add the integer target column to each frame in place; a `class` value missing
# from the mapping raises KeyError.
for df in (major_class_train_data, major_class_dev_data, major_class_test_data):
  df['labels'] = df['class'].apply(major_label_2_idx.__getitem__)

print(
  "major_class relevant keys: `post`, `class`",
  "  post: the tweet (str)",
  "  class: takes the values `explicit_hate`, `implicit_hate`, or `not_hate` (str)",
  sep="\n",
)

FileNotFoundError: ignored

In [None]:
# Integer encoding of the fine-grained implicit-hate categories, plus the inverse mapping.
minor_label_2_idx = {
  'incitement': 0,
  'inferiority': 1,
  'irony': 2,
  'stereotypical': 3,
  'threatening': 4,
  'white_grievance': 5,
  'other': 6,
}
minor_idx_2_label = {index: label for label, index in minor_label_2_idx.items()}

# Keep only the rows that carry an `implicit_class` value and renumber the index from 0.
minor_class_train_data = major_class_train_data.dropna(subset=['implicit_class']).reset_index(drop=True)
minor_class_dev_data = major_class_dev_data.dropna(subset=['implicit_class']).reset_index(drop=True)
minor_class_test_data = major_class_test_data.dropna(subset=['implicit_class']).reset_index(drop=True)

# Overwrite the `labels` column inherited from the major-class frames with the
# fine-grained encoding; an unmapped `implicit_class` value raises KeyError.
for df in (minor_class_train_data, minor_class_dev_data, minor_class_test_data):
  df['labels'] = df['implicit_class'].apply(minor_label_2_idx.__getitem__)

print(
  "minor_class relevant keys: `post`, `implicit_class`",
  "  post: the tweet (str)",
  "  implicit_class: takes the values `incitement`, `inferiority`, `irony`, `stereotypical`, `threatening`, or `white_grievance` (str)",
  sep="\n",
)

In [None]:
# Inspect the training frame, including the integer `labels` column added above.
major_class_train_data

In [None]:
# Balance the training splits by randomly dropping rows until every class has as
# many rows as the rarest one. The sampler is seeded so that the balanced sets
# (and therefore every experiment below) are reproducible across kernel restarts.
RANDOM_SEED = 42

major_rus = RandomUnderSampler(random_state=RANDOM_SEED)
# fit_resample returns (X_resampled, y_resampled); the labels already live in the
# `labels` column of the returned frame, so the second value is discarded.
major_train_balanced, _ = major_rus.fit_resample(major_class_train_data,major_class_train_data['labels'])

minor_rus = RandomUnderSampler(random_state=RANDOM_SEED)
minor_train_balanced, _ = minor_rus.fit_resample(minor_class_train_data,minor_class_train_data['labels'])

## Testing BERT finetuning levels

In [None]:
pip install transformers

In [None]:
pip install datasets

In [None]:
pip install wandb

In [None]:
import wandb

In [None]:
# Authenticate this runtime with Weights & Biases (may prompt for an API key).
wandb.login()

In [None]:
# Start a W&B run under the team project. Note that each model's train() below
# also calls wandb.init(...) with its own hyperparameter config.
wandb.init(project="test-project", entity="cis530-project")

In [None]:
import numpy as np
import torch

from torch.utils.data import Dataset, DataLoader
import datasets #huggingface
from transformers import BertTokenizer, BertModel, BertForSequenceClassification#DistilBertModel, DistilBertTokenizer #change to model type you want, e.g. Bert or AlBert
from evaluation import *

In [None]:
import torch.nn as nn

In [None]:
from tqdm.autonotebook import tqdm

In [None]:
import torch.nn.functional as F

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
def tokenize(data):
    """Tokenize the `post` field of a batch with the notebook-level `tokenizer`.

    Sequences are truncated and padded to the tokenizer's maximum length.
    `tokenizer` is resolved as a global when the function is called.
    """
    posts = data['post']
    return tokenizer(posts, padding='max_length', truncation=True)
def tokenize_dataset(dataset,tokenizer):
    """Tokenize the `post` column of `dataset` in batches with the given tokenizer.

    Params:
    -------
    dataset: object exposing `map(function, batched=True)` (e.g. a Hugging Face Dataset)
    tokenizer: callable accepting a list of texts plus `truncation` / `padding` keywords

    Returns:
    --------
    whatever `dataset.map` returns (for a Hugging Face Dataset: the dataset with
    the tokenizer's output columns added)
    """
    # Use the tokenizer that was passed in. Previously this argument was ignored
    # and the module-level `tokenizer` global was used instead.
    def _tokenize_batch(batch):
        return tokenizer(batch['post'], truncation=True, padding='max_length')

    return dataset.map(_tokenize_batch, batched=True)


In [None]:
#if needed, change the tokenizer here, this is used in the function to get CLS embeddings
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") #not worried about uppercase vs lowercase
#bert = BertModel.from_pretrained("bert-base-uncased") #if needed, change model here
#bert = BertForSequenceClassification.from_pretrained("bert-base-uncased")

In [None]:
#use pandas instead to map the classes

In [None]:
"""
#used to make datasets and dataloaders.
#x = non-tokenized text
#y = labels
class ImplicitHateDataset(Dataset):
  def __init__(self,text,labels):
    self.x = text
    self.y = labels
  def __len__(self):
    return len(self.y)
  def __getitem__(self,idx):
    
    return self.x[idx],self.y[idx]
"""

In [None]:
class FineTunedBertModel(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  The sentence representation is the mean of the encoder's final hidden states
  over every position except the first one (the [CLS] slot). When the batch
  carries an `attention_mask`, padded positions are excluded from that mean, so
  padding to a fixed length no longer dilutes the representation.
  """
  def __init__(self,bert_model, output_size):
    """
    Params:
    -------
    bert_model: encoder module; calling it with the batch as keyword arguments
                must return an object exposing `last_hidden_state`
    output_size: number of classes (width of the returned logits)
    """
    super(FineTunedBertModel,self).__init__()
    self.bert = bert_model
    self.dropout = nn.Dropout(p=0.5)
    # Read the hidden width from the encoder's config when it exposes one;
    # 768 (the previously hard-coded value) stays as the fallback.
    hidden_size = getattr(getattr(bert_model,'config',None),'hidden_size',768)
    self.feedforward = nn.Linear(hidden_size,output_size)
  
  def forward(self,data):
    """
    Params:
    -------
    data: dict of encoder inputs, forwarded to the encoder as keyword arguments

    Returns:
    --------
    logits: tensor with one row per example and `output_size` columns
    """
    embeddings = self.bert(**data).last_hidden_state
    # Drop position 0 so only the non-[CLS] positions are pooled.
    token_embeddings = embeddings[:,1:,:]
    attention_mask = data.get('attention_mask')
    if attention_mask is None:
      # No mask supplied: plain mean over all remaining positions.
      pooled = torch.mean(token_embeddings,1)
    else:
      # Masked mean: zero out padded positions and divide by the number of real
      # ones (clamped to 1 so an all-padding row cannot divide by zero).
      mask = attention_mask[:,1:].unsqueeze(-1).to(token_embeddings.dtype)
      pooled = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
    logits = self.feedforward(self.dropout(pooled))
    return logits

In [None]:
class FineTunedClassifier():
  """Trains and applies a `FineTunedBertModel` (BERT encoder + mean-pool head).

  Wraps tokenization, partial freezing of the encoder, the training loop
  (logged to Weights & Biases) and prediction back to string labels.
  """

  # Tokenizer outputs that are forwarded to the encoder; every other batch
  # column (labels, raw text, ...) is kept away from the model call.
  _MODEL_INPUT_KEYS = ('input_ids','attention_mask','token_type_ids')

  def __init__(self,tokenizer,bert_model,label_name,num_unfrozen_layers,epochs,lr,device,batch_size=8):
    """
    Params:
    -------
    tokenizer: callable tokenizer applied to the `post` column
    bert_model: encoder exposing `encoder.layer` and returning `last_hidden_state`
    label_name: 'class' (3-way) or 'implicit_class' (fine-grained)
    num_unfrozen_layers: number of top encoder layers left trainable;
                         an invalid value falls back to 0
    epochs: number of passes over the training data
    lr: Adam learning rate
    device: torch device used for the model and every batch
    batch_size: DataLoader batch size (also the value logged to W&B)

    Raises:
    -------
    ValueError: if label_name is neither 'class' nor 'implicit_class'
    """
    if label_name == 'class':
      self.label_2_idx = {'not_hate' : 0, 'implicit_hate': 1, 'explicit_hate' : 2}
      self.idx_2_label = {0: 'not_hate', 1:'implicit_hate',2:'explicit_hate'}
    elif label_name == 'implicit_class':
      self.label_2_idx = {'incitement':0,'inferiority':1,'irony':2,'stereotypical':3,'threatening':4,'white_grievance':5,'other':6}
      self.idx_2_label = {0:'incitement', 1:'inferiority',2:'irony', 3:'stereotypical',4:'threatening',5:'white_grievance',6:'other'}
    else:
      raise ValueError("label_name must be 'class' or 'implicit_class', got %r" % (label_name,))

    # One logit per mapped label. The implicit_class head used to be hard-coded
    # to 6 outputs although the mapping has 7 entries, so label index 6
    # ('other') was out of range for the loss.
    output_size = len(self.label_2_idx)

    self.label2idx_func = np.vectorize(lambda x: self.label_2_idx[x])
    self.idx2label_func = np.vectorize(lambda x: self.idx_2_label[x])

    self.tokenizer = tokenizer
    self.label_name = label_name
    # Keep the device on the instance: train/predict used to read a notebook
    # global named `device` instead of the constructor argument.
    self.device = device
    self.batch_size = batch_size
    self.epochs = epochs
    self.lr = lr

    self.finetuned_bert = FineTunedBertModel(bert_model,output_size).to(device)
    self._freeze_encoder_layers(self.finetuned_bert.bert.encoder.layer,num_unfrozen_layers)

  @staticmethod
  def _freeze_encoder_layers(encoder_layers,num_unfrozen_layers):
    """Freeze all but the top `num_unfrozen_layers` entries of `encoder_layers`.

    Valid values are 0..len(encoder_layers); anything else prints a notice and
    is treated as 0. Returns the number of layers actually left trainable.
    NOTE: only the encoder layers are touched; embeddings and the head keep
    whatever `requires_grad` they already had.
    see: https://discuss.huggingface.co/t/how-to-freeze-some-layers-of-bertmodel/917
    """
    num_layers = len(encoder_layers)
    if num_unfrozen_layers not in range(num_layers+1):
      print('invalid number of layers specified, will freeze all of Bert')
      num_unfrozen_layers = 0
    # The slice previously stopped one layer early, so even "0 unfrozen" left
    # the last encoder layer trainable.
    for param in encoder_layers[:num_layers - num_unfrozen_layers].parameters():
      param.requires_grad = False
    return num_unfrozen_layers

  def tokenize(self,data):
    """Tokenize a batch's `post` texts, truncated and padded to the tokenizer's maximum length."""
    return self.tokenizer(data['post'], truncation=True,padding='max_length')

  def tokenize_dataset(self,dataset):
    """Apply `tokenize` to `dataset` in batches."""
    return dataset.map(self.tokenize,batched=True)

  def preprocess(self, data, is_train):
    """Turn a DataFrame with `post` and `labels` columns into a DataLoader.

    is_train: whether this is the training set; it is shuffled only in that case.
    """
    # preserve_index=False keeps a non-default DataFrame index (e.g. on a
    # resampled frame) from being carried along as an extra dataset column.
    hf_dataset = datasets.Dataset.from_pandas(data[['post','labels']],preserve_index=False)
    print('Tokenizing data:')
    hf_dataset = self.tokenize_dataset(hf_dataset)
    hf_dataset.set_format('torch')
    return DataLoader(hf_dataset, shuffle=is_train, batch_size=self.batch_size)

  def _model_inputs(self,batch):
    """Select the encoder inputs present in `batch` and move them to the model's device."""
    # The batch values are used as they come from the loader; they used to be
    # re-wrapped in torch.Tensor(...), which is unnecessary for tensors.
    return {k: batch[k].to(self.device) for k in self._MODEL_INPUT_KEYS if k in batch}

  def train(self,data):
    """Fine-tune on `data` (DataFrame with `post` and `labels`), logging losses to W&B."""
    self.finetuned_bert.train()
    loader = self.preprocess(data,True)
    optimizer = torch.optim.Adam(self.finetuned_bert.parameters(),lr=self.lr)
    criterion = nn.CrossEntropyLoss()

    wandb.init(config={
        "epochs":self.epochs,
        "batch_size":self.batch_size,
        "learning_rate":self.lr
    })

    try:
      for epoch in range(self.epochs):
        print('Training epoch:',epoch+1)
        running_loss = 0
        for batch in tqdm(loader):
          optimizer.zero_grad()
          labels = batch['labels'].to(self.device)
          logits = self.finetuned_bert(self._model_inputs(batch))

          loss = criterion(logits,labels)
          running_loss += loss.item()
          loss.backward()
          optimizer.step()
          wandb.log({"loss":loss.item()})
        # Mean batch loss for the epoch; max(...) guards an empty loader.
        running_loss /= max(len(loader),1)
        wandb.log({"running_loss":running_loss})
        print('Loss for epoch',(epoch+1),':',running_loss)
    finally:
      # Close the W&B run even if training fails, so the next train() call
      # starts a fresh run.
      wandb.finish()

  def predict(self,data):
    """Return a numpy array with the predicted string label of every row of `data`."""
    self.finetuned_bert.eval()
    loader = self.preprocess(data,False)
    batch_predictions = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
      for batch in loader:
        logits = self.finetuned_bert(self._model_inputs(batch))
        batch_predictions.append(torch.argmax(logits,dim=1).cpu().numpy())
    if not batch_predictions:
      return np.array([],dtype=str)
    all_predictions = np.concatenate(batch_predictions,axis=0)
    return np.array([self.idx_2_label[int(i)] for i in all_predictions])
  

In [None]:
class ClsTokenBert():
  """Fine-tunes `BertForSequenceClassification` with only the top encoder layers trainable.

  Wraps tokenization, partial freezing of the encoder, the training loop
  (logged to Weights & Biases) and prediction back to string labels.
  """

  # Tokenizer outputs that are forwarded to the model.
  _MODEL_INPUT_KEYS = ('input_ids','attention_mask','token_type_ids')

  def __init__(self,tokenizer,label_name,num_unfrozen_layers,epochs,lr,device,batch_size=12):
    """
    Params:
    -------
    tokenizer: callable tokenizer applied to the `post` column
    label_name: 'class' (3-way) or 'implicit_class' (fine-grained)
    num_unfrozen_layers: number of top encoder layers left trainable;
                         an invalid value falls back to 0
    epochs: number of passes over the training data
    lr: Adam learning rate
    device: torch device used for the model and every batch
    batch_size: DataLoader batch size (also the value logged to W&B)

    Raises:
    -------
    ValueError: if label_name is neither 'class' nor 'implicit_class'
    """
    if label_name == 'class':
      self.label_2_idx = {'not_hate' : 0, 'implicit_hate': 1, 'explicit_hate' : 2}
      self.idx_2_label = {0: 'not_hate', 1:'implicit_hate',2:'explicit_hate'}
    elif label_name == 'implicit_class':
      self.label_2_idx = {'incitement':0,'inferiority':1,'irony':2,'stereotypical':3,'threatening':4,'white_grievance':5,'other':6}
      self.idx_2_label = {0:'incitement', 1:'inferiority',2:'irony', 3:'stereotypical',4:'threatening',5:'white_grievance',6:'other'}
    else:
      raise ValueError("label_name must be 'class' or 'implicit_class', got %r" % (label_name,))

    # One logit per mapped label. The implicit_class head used to be hard-coded
    # to 6 outputs although the mapping has 7 entries, so label index 6
    # ('other') was out of range for the loss.
    output_size = len(self.label_2_idx)

    self.label2idx_func = np.vectorize(lambda x: self.label_2_idx[x])
    self.idx2label_func = np.vectorize(lambda x: self.idx_2_label[x])

    self.tokenizer = tokenizer
    self.label_name = label_name
    # Keep the device on the instance: train/predict used to read a notebook
    # global named `device` instead of the constructor argument.
    self.device = device
    self.batch_size = batch_size
    self.epochs = epochs
    self.lr = lr

    self.finetuned_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=output_size).to(device)
    self._freeze_encoder_layers(self.finetuned_bert.bert.encoder.layer,num_unfrozen_layers)

  @staticmethod
  def _freeze_encoder_layers(encoder_layers,num_unfrozen_layers):
    """Freeze all but the top `num_unfrozen_layers` entries of `encoder_layers`.

    Valid values are 0..len(encoder_layers); anything else prints a notice and
    is treated as 0. Returns the number of layers actually left trainable.
    NOTE: only the encoder layers are touched; embeddings, pooler and
    classification head keep whatever `requires_grad` they already had.
    see: https://discuss.huggingface.co/t/how-to-freeze-some-layers-of-bertmodel/917
    """
    num_layers = len(encoder_layers)
    if num_unfrozen_layers not in range(num_layers+1):
      print('invalid number of layers specified, will freeze all of Bert')
      num_unfrozen_layers = 0
    # The slice previously stopped one layer early, so even "0 unfrozen" left
    # the last encoder layer trainable.
    for param in encoder_layers[:num_layers - num_unfrozen_layers].parameters():
      param.requires_grad = False
    return num_unfrozen_layers

  def tokenize(self,data):
    """Tokenize a batch's `post` texts, truncated and padded to the tokenizer's maximum length."""
    return self.tokenizer(data['post'], truncation=True,padding='max_length')

  def tokenize_dataset(self,dataset):
    """Apply `tokenize` to `dataset` in batches."""
    return dataset.map(self.tokenize,batched=True)

  def preprocess(self, data, is_train):
    """Turn a DataFrame with `post` and `labels` columns into a DataLoader.

    is_train: whether this is the training set; it is shuffled only in that case.
    """
    # preserve_index=False keeps a non-default DataFrame index (e.g. on a
    # resampled frame) from being carried along as an extra dataset column.
    hf_dataset = datasets.Dataset.from_pandas(data[['post','labels']],preserve_index=False)
    print('Tokenizing data:')
    hf_dataset = self.tokenize_dataset(hf_dataset)
    hf_dataset.set_format('torch')
    return DataLoader(hf_dataset, shuffle=is_train, batch_size=self.batch_size)

  def _model_inputs(self,batch):
    """Select the model inputs present in `batch` and move them to the model's device."""
    return {k: batch[k].to(self.device) for k in self._MODEL_INPUT_KEYS if k in batch}

  def train(self,data):
    """Fine-tune on `data` (DataFrame with `post` and `labels`), logging losses to W&B."""
    self.finetuned_bert.train()
    loader = self.preprocess(data,True)
    optimizer = torch.optim.Adam(self.finetuned_bert.parameters(),lr=self.lr)
    criterion = nn.CrossEntropyLoss()

    # Log the batch size the loader really uses (8 was logged while the loader used 12).
    wandb.init(config={
        "epochs":self.epochs,
        "batch_size":self.batch_size,
        "learning_rate":self.lr
    })

    try:
      for epoch in range(self.epochs):
        print('Training epoch:',epoch+1)
        running_loss = 0
        for batch in tqdm(loader):
          optimizer.zero_grad()
          labels = batch['labels'].to(self.device)
          logits = self.finetuned_bert(**self._model_inputs(batch)).logits

          loss = criterion(logits,labels)
          running_loss += loss.item()
          loss.backward()
          optimizer.step()
          wandb.log({"loss":loss.item()})
        # Mean batch loss for the epoch; max(...) guards an empty loader.
        running_loss /= max(len(loader),1)
        wandb.log({"running_loss":running_loss})
        print('Loss for epoch',(epoch+1),':',running_loss)
    finally:
      # Close the W&B run even if training fails.
      wandb.finish()

  def predict(self,data):
    """Return a numpy array with the predicted string label of every row of `data`."""
    self.finetuned_bert.eval()
    loader = self.preprocess(data,False)
    batch_predictions = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
      for batch in loader:
        logits = self.finetuned_bert(**self._model_inputs(batch)).logits
        batch_predictions.append(torch.argmax(logits,dim=1).cpu().numpy())
    if not batch_predictions:
      return np.array([],dtype=str)
    all_predictions = np.concatenate(batch_predictions,axis=0)
    return np.array([self.idx_2_label[int(i)] for i in all_predictions])

In [None]:
class UnfrozenBert():
  """Fine-tunes every parameter of `BertForSequenceClassification`.

  Wraps tokenization, the training loop (logged to Weights & Biases) and
  prediction back to string labels. No layer is frozen by this class.
  """

  # Tokenizer outputs that are forwarded to the model.
  _MODEL_INPUT_KEYS = ('input_ids','attention_mask','token_type_ids')

  def __init__(self,tokenizer,label_name,num_unfrozen_layers,epochs,lr,device,batch_size=12):
    """
    Params:
    -------
    tokenizer: callable tokenizer applied to the `post` column
    label_name: 'class' (3-way) or 'implicit_class' (fine-grained)
    num_unfrozen_layers: accepted for signature parity with the other wrappers
                         but not used here (nothing is frozen)
    epochs: number of passes over the training data
    lr: Adam learning rate
    device: torch device used for the model and every batch
    batch_size: DataLoader batch size (also the value logged to W&B)

    Raises:
    -------
    ValueError: if label_name is neither 'class' nor 'implicit_class'
    """
    if label_name == 'class':
      self.label_2_idx = {'not_hate' : 0, 'implicit_hate': 1, 'explicit_hate' : 2}
      self.idx_2_label = {0: 'not_hate', 1:'implicit_hate',2:'explicit_hate'}
    elif label_name == 'implicit_class':
      self.label_2_idx = {'incitement':0,'inferiority':1,'irony':2,'stereotypical':3,'threatening':4,'white_grievance':5,'other':6}
      self.idx_2_label = {0:'incitement', 1:'inferiority',2:'irony', 3:'stereotypical',4:'threatening',5:'white_grievance',6:'other'}
    else:
      raise ValueError("label_name must be 'class' or 'implicit_class', got %r" % (label_name,))

    # One logit per mapped label. The implicit_class head used to be hard-coded
    # to 6 outputs although the mapping has 7 entries, so label index 6
    # ('other') was out of range for the loss.
    output_size = len(self.label_2_idx)

    self.label2idx_func = np.vectorize(lambda x: self.label_2_idx[x])
    self.idx2label_func = np.vectorize(lambda x: self.idx_2_label[x])

    self.tokenizer = tokenizer
    self.label_name = label_name
    # Keep the device on the instance: train/predict used to read a notebook
    # global named `device` instead of the constructor argument.
    self.device = device
    self.batch_size = batch_size
    self.epochs = epochs
    self.lr = lr

    self.finetuned_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=output_size).to(device)

  def tokenize(self,data):
    """Tokenize a batch's `post` texts, truncated and padded to the tokenizer's maximum length."""
    return self.tokenizer(data['post'], truncation=True,padding='max_length')

  def tokenize_dataset(self,dataset):
    """Apply `tokenize` to `dataset` in batches."""
    return dataset.map(self.tokenize,batched=True)

  def preprocess(self, data, is_train):
    """Turn a DataFrame with `post` and `labels` columns into a DataLoader.

    is_train: whether this is the training set; it is shuffled only in that case.
    """
    # preserve_index=False keeps a non-default DataFrame index (e.g. on a
    # resampled frame) from being carried along as an extra dataset column.
    hf_dataset = datasets.Dataset.from_pandas(data[['post','labels']],preserve_index=False)
    print('Tokenizing data:')
    hf_dataset = self.tokenize_dataset(hf_dataset)
    hf_dataset.set_format('torch')
    return DataLoader(hf_dataset, shuffle=is_train, batch_size=self.batch_size)

  def _model_inputs(self,batch):
    """Select the model inputs present in `batch` and move them to the model's device."""
    return {k: batch[k].to(self.device) for k in self._MODEL_INPUT_KEYS if k in batch}

  def train(self,data):
    """Fine-tune on `data` (DataFrame with `post` and `labels`), logging losses to W&B."""
    self.finetuned_bert.train()
    loader = self.preprocess(data,True)
    optimizer = torch.optim.Adam(self.finetuned_bert.parameters(),lr=self.lr)
    criterion = nn.CrossEntropyLoss()

    # Log the batch size the loader really uses (8 was logged while the loader used 12).
    wandb.init(config={
        "epochs":self.epochs,
        "batch_size":self.batch_size,
        "learning_rate":self.lr
    })

    try:
      for epoch in range(self.epochs):
        print('Training epoch:',epoch+1)
        running_loss = 0
        for batch in tqdm(loader):
          optimizer.zero_grad()
          labels = batch['labels'].to(self.device)
          logits = self.finetuned_bert(**self._model_inputs(batch)).logits

          loss = criterion(logits,labels)
          running_loss += loss.item()
          loss.backward()
          optimizer.step()
          wandb.log({"loss":loss.item()})
        # Mean batch loss for the epoch; max(...) guards an empty loader.
        running_loss /= max(len(loader),1)
        wandb.log({"running_loss":running_loss})
        print('Loss for epoch',(epoch+1),':',running_loss)
    finally:
      # Close the W&B run even if training fails.
      wandb.finish()

  def predict(self,data):
    """Return a numpy array with the predicted string label of every row of `data`."""
    self.finetuned_bert.eval()
    loader = self.preprocess(data,False)
    batch_predictions = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
      for batch in loader:
        logits = self.finetuned_bert(**self._model_inputs(batch)).logits
        batch_predictions.append(torch.argmax(logits,dim=1).cpu().numpy())
    if not batch_predictions:
      return np.array([],dtype=str)
    all_predictions = np.concatenate(batch_predictions,axis=0)
    return np.array([self.idx_2_label[int(i)] for i in all_predictions])

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Testing with ALL of Bert unfrozen

In [None]:
# Hyperparameters for the fully-unfrozen run (note the smaller learning rate
# compared with the partially-frozen runs further down).
epochs, lr, num_unfrozen_layers = 10, 2e-6, 12

# One model per task: the 3-way `class` task and the fine-grained `implicit_class` task.
major_class_model = UnfrozenBert(
  tokenizer=tokenizer,
  label_name="class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)

minor_class_model = UnfrozenBert(
  tokenizer=tokenizer,
  label_name="implicit_class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)


In [None]:
# Fully-unfrozen model, 3-way `class` task. `train_and_evaluate` is not defined in this notebook;
# presumably it comes from `from evaluation import *` and trains on the 2nd argument and scores on the 3rd - confirm.
train_and_evaluate(major_class_model, major_train_balanced, major_class_test_data, "class")

In [None]:
# Fully-unfrozen model, fine-grained `implicit_class` task (helper presumably from `evaluation` - confirm).
train_and_evaluate(minor_class_model, minor_train_balanced, minor_class_test_data, "implicit_class")

Testing with no unfrozen Bert layers

In [None]:
# Hyperparameters for the run that requests zero unfrozen encoder layers.
# `epochs` and `lr` set here are also reused by the 1/2/3-layer cells below.
epochs, lr, num_unfrozen_layers = 10, 2e-5, 0

# One model per task: the 3-way `class` task and the fine-grained `implicit_class` task.
major_class_model = ClsTokenBert(
  tokenizer=tokenizer,
  label_name="class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)

minor_class_model = ClsTokenBert(
  tokenizer=tokenizer,
  label_name="implicit_class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)


In [None]:
# Zero-unfrozen-layers configuration, 3-way `class` task (helper presumably from `evaluation` - confirm).
train_and_evaluate(major_class_model, major_train_balanced, major_class_test_data, "class")

In [None]:
# Zero-unfrozen-layers configuration, fine-grained `implicit_class` task (helper presumably from `evaluation` - confirm).
train_and_evaluate(minor_class_model, minor_train_balanced, minor_class_test_data, "implicit_class")

Testing one unfrozen Bert layer

In [None]:
# Same setup with one unfrozen encoder layer requested; `epochs` and `lr` are
# reused from the preceding configuration cell.
num_unfrozen_layers = 1

major_class_model = ClsTokenBert(
  tokenizer=tokenizer,
  label_name="class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)

minor_class_model = ClsTokenBert(
  tokenizer=tokenizer,
  label_name="implicit_class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)

In [None]:
# One-unfrozen-layer configuration, 3-way `class` task (helper presumably from `evaluation` - confirm).
train_and_evaluate(major_class_model, major_train_balanced, major_class_test_data, "class")

In [None]:
# One-unfrozen-layer configuration, fine-grained `implicit_class` task (helper presumably from `evaluation` - confirm).
train_and_evaluate(minor_class_model, minor_train_balanced, minor_class_test_data, "implicit_class")

Testing two unfrozen Bert layers

In [None]:
# Same setup with two unfrozen encoder layers requested; `epochs` and `lr` are
# reused from the earlier configuration cell.
num_unfrozen_layers = 2

major_class_model = ClsTokenBert(
  tokenizer=tokenizer,
  label_name="class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)

minor_class_model = ClsTokenBert(
  tokenizer=tokenizer,
  label_name="implicit_class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)

In [None]:
# Two-unfrozen-layer configuration, 3-way `class` task (helper presumably from `evaluation` - confirm).
train_and_evaluate(major_class_model, major_train_balanced, major_class_test_data, "class")

In [None]:
# Two-unfrozen-layer configuration, fine-grained `implicit_class` task (helper presumably from `evaluation` - confirm).
train_and_evaluate(minor_class_model, minor_train_balanced, minor_class_test_data, "implicit_class")

Testing three unfrozen Bert layers

In [None]:
# Same setup with three unfrozen encoder layers requested; `epochs` and `lr` are
# reused from the earlier configuration cell.
num_unfrozen_layers = 3

major_class_model = ClsTokenBert(
  tokenizer=tokenizer,
  label_name="class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)

minor_class_model = ClsTokenBert(
  tokenizer=tokenizer,
  label_name="implicit_class",
  num_unfrozen_layers=num_unfrozen_layers,
  epochs=epochs,
  lr=lr,
  device=device,
)

In [None]:
# Three-unfrozen-layer configuration, 3-way `class` task (helper presumably from `evaluation` - confirm).
train_and_evaluate(major_class_model, major_train_balanced, major_class_test_data, "class")

In [None]:
# Three-unfrozen-layer configuration, fine-grained `implicit_class` task (helper presumably from `evaluation` - confirm).
train_and_evaluate(minor_class_model, minor_train_balanced, minor_class_test_data, "implicit_class")

A look at the bert encoder layers to see the architecture

In [None]:
# Print the encoder sub-module of the most recently built minor-class model to show its layer stack.
print(minor_class_model.finetuned_bert.bert.encoder)