In [29]:
# mount google drive

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
#Download the dataset
!wget https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv

In [None]:
#Install the transformers library
!pip install transformers

In [None]:
# Install contractions library
!pip install contractions

In [4]:
# Importing the necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import contractions
import re

from collections import Counter

import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertForSequenceClassification

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Setting up NLTK
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Read the dataset
df = pd.read_csv('labeled_data.csv', on_bad_lines='skip', index_col = 0)

[Data Description](https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/data/readme.md)<br>
Here, class 0 is hatespeech, 1 is offensive language and 2 is neither. We just want to detect hatespeech, so we'll extract a portion of the dataset.

In [7]:
print(len(df))
df.head()

24783


Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [8]:
df['class'].value_counts()

1    19190
2     4163
0     1430
Name: class, dtype: int64

In [9]:
hatespeech = df[df['class'] == 0]

offensive = df[df['class'] == 1]
offensive = offensive.sample(n = 715)

neither = df[df['class'] == 2]
neither = neither.sample(n = 715)

In [10]:
# create new dataset
data = pd.concat([hatespeech, offensive, neither])

In [11]:
data = data.sample(frac = 1)
data.reset_index(drop = True, inplace = True)
data['class'].replace(2, 1, inplace = True) # replace offensive language and neither category with a single label
data.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,1,#mt #commission #gouache #tiki #wahine #monkey...
1,3,2,1,0,0,"@_IAMKEN fuck Brett Farve redneck ass, he stuc..."
2,3,2,1,0,0,For all we know this nigga a rapist. Y'all hoe...
3,3,0,0,3,1,"""@worldtraveIs: Baby albino &#128034; http://t..."
4,3,2,0,1,0,@JGardenofEden @KySportsRadio @ryanlemond But....


In [12]:
len(data)

2860

In [13]:
data['class'].value_counts()

1    1430
0    1430
Name: class, dtype: int64

In [14]:
# remove emojis from the tweets
def remove_emoji(s):
    emoji_pattern = re.compile(
          '['
          u'\U0001F600-\U0001F64F'  # emoticons
          u'\U0001F300-\U0001F5FF'  # symbols & pictographs
          u'\U0001F680-\U0001F6FF'  # transport & map symbols
          u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
          u'\U00002702-\U000027B0'
          u'\U000024C2-\U0001F251'
          ']+',
          flags=re.UNICODE)
    return emoji_pattern.sub(r'', s)

# remove URL from the tweets
def remove_URL(s):
    url = re.compile(r'https?://\S+|www\.||pic.twitter.com\S+')
    return url.sub(r'', s)

# to remove any special characters and extra space
def normalizeString(s): 
    s = re.sub("@[A-Za-z0-9]+","",s)
    s = s.replace('&amp', '')
    s = remove_emoji(s)
    s = contractions.fix(s)
    s = remove_URL(s)
    s = s.replace("'s", 's')
    s = re.sub(r"[^a-zA-Z0-9.,? ]+", r" ", s)
    s = s.strip()
    
    return s

# cleaning tweets in the dataset
def preprocessing(text):
  for i in range(len(text)):
      text[i] = str(text[i])
      text[i] = ' '.join(text[i].split('\n'))
      text[i] = ' '.join(normalizeString(str).strip() for str in text[i].split())

  return text

In [15]:
tweet_text = data['tweet'].str.lower() # convert the tweets into lower case
tweet_text = tweet_text.tolist() # create a list of tweets

tweet_text = preprocessing(tweet_text)

In [16]:
tweet_text[1]

'iamken fuck brett farve redneck ass, he stuckup he do not give a damn lol he be on campus acting like he the shit'

In [17]:
data['processed_tweets'] = tweet_text

In [18]:
data.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,processed_tweets
0,3,0,0,3,1,#mt #commission #gouache #tiki #wahine #monkey...,mt commission gouache tiki wahine monkey tubed...
1,3,2,1,0,0,"@_IAMKEN fuck Brett Farve redneck ass, he stuc...","iamken fuck brett farve redneck ass, he stucku..."
2,3,2,1,0,0,For all we know this nigga a rapist. Y'all hoe...,for all we know this nigga a rapist. you all h...
3,3,0,0,3,1,"""@worldtraveIs: Baby albino &#128034; http://t...",baby albino 128034
4,3,2,0,1,0,@JGardenofEden @KySportsRadio @ryanlemond But....,but...country music listeners really are hi...


In [19]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=9)

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
                                          
encoded_data_train = tokenizer.batch_encode_plus(
    train_df.processed_tweets.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    test_df.processed_tweets.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_df['class'].values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(test_df['class'].values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [21]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [23]:
batch_size = 8

dataloader_train = DataLoader(dataset_train, 
                              sampler=SequentialSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler = SequentialSampler(dataset_val),
                                   batch_size=batch_size)

In [24]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-4, 
                  eps=1e-6)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

loss_fn = torch.nn.CrossEntropyLoss()

In [25]:
from sklearn.metrics import f1_score

label_dict = {'No Hate Speech': 1, 'Hate Speech': 0}

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [26]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        logits = outputs[1]
        loss = loss_fn(logits.cpu(), batch[2].cpu())
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [27]:
import torch.nn.functional as F 

for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        logits = outputs[1].cpu()
        loss = loss_fn(logits, batch[2].cpu())
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/286 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.574989821084521
Validation loss: 0.4611227913863129
F1 Score (Weighted): 0.8097222493613542


Epoch 2:   0%|          | 0/286 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.45547294399173527
Validation loss: 0.5090057846262224
F1 Score (Weighted): 0.8149029053138642


Epoch 3:   0%|          | 0/286 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.3943453873931007
Validation loss: 0.5342092715824643
F1 Score (Weighted): 0.8254338254338255


Epoch 4:   0%|          | 0/286 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.3404985636055886
Validation loss: 0.5955228974214859
F1 Score (Weighted): 0.8288494228252822


Epoch 5:   0%|          | 0/286 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.33222201422599335
Validation loss: 0.6559404027389569
F1 Score (Weighted): 0.8110780065005417


In [28]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load('finetuned_BERT_epoch_4.model', map_location=torch.device('cpu')))

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Class: Hate Speech
Accuracy: 232/264

Class: No Hate Speech
Accuracy: 242/308



In [30]:
torch.save(model.state_dict(), f'/content/drive/My Drive/Hack 36/finetuned_BERT_hatespeech.model')