In [None]:
# mount google drive

from google.colab import drive
drive.mount('/content/drive/')

In [None]:
#Download the dataset
!wget https://raw.githubusercontent.com/chantelmariediaz/Predicting-Cyberbulling-on-Twitter/master/cleanprojectdataset.csv

In [None]:
#Install the transformers library
!pip install transformers

In [None]:
# Install contractions library
!pip install contractions

In [5]:
# Importing the necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import contractions
import re

from collections import Counter

import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertForSequenceClassification

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Setting up NLTK
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Read the dataset

df = pd.read_csv('cleanprojectdataset.csv', on_bad_lines='skip')

In [8]:
print(len(df))
df.head()

1065


Unnamed: 0,Tweet,Text Label
0,.omg why are poc wearing fugly blue contacts s...,Non-Bullying
1,.Sorry but most of the runners popular right n...,Non-Bullying
2,".those jeans are hideous, and I?m afraid he?s ...",Non-Bullying
3,.I had to dress up for a presentation in class...,Non-Bullying
4,.Am I the only one who thinks justin bieber is...,Non-Bullying


In [9]:
df['Text Label'].value_counts()

Non-Bullying    638
Bullying        427
Name: Text Label, dtype: int64

In [10]:
# remove emojis from the tweets
def remove_emoji(s):
    emoji_pattern = re.compile(
          '['
          u'\U0001F600-\U0001F64F'  # emoticons
          u'\U0001F300-\U0001F5FF'  # symbols & pictographs
          u'\U0001F680-\U0001F6FF'  # transport & map symbols
          u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
          u'\U00002702-\U000027B0'
          u'\U000024C2-\U0001F251'
          ']+',
          flags=re.UNICODE)
    return emoji_pattern.sub(r'', s)

# remove URL from the tweets
def remove_URL(s):
    url = re.compile(r'https?://\S+|www\.||pic.twitter.com\S+')
    return url.sub(r'', s)

# to remove any special characters and extra space
def normalizeString(s): 
    s = re.sub("@[A-Za-z0-9]+","",s)
    s = s.replace('&amp', '')
    s = remove_emoji(s)
    s = contractions.fix(s)
    s = remove_URL(s)
    s = s.replace("'s", 's')
    s = re.sub(r"[^a-zA-Z0-9.,? ]+", r" ", s)
    s = s.strip()
    
    return s

# cleaning tweets in the dataset
def preprocessing(text):
  for i in range(len(text)):
      text[i] = str(text[i])
      text[i] = ' '.join(text[i].split('\n'))
      text[i] = ' '.join(normalizeString(str).strip() for str in text[i].split())

  return text

In [11]:
tweet_text = df['Tweet'].str.lower() # convert the tweets into lower case
tweet_text = tweet_text.tolist() # create a list of tweets

tweet_text = preprocessing(tweet_text)

In [12]:
tweet_text[5]

'we carry on? we as in fugly lookin unwanted people?'

In [13]:
df['processed_tweets'] = tweet_text

In [14]:
possible_labels = df['Text Label'].unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
    
label_dict

{'Bullying': 1, 'Non-Bullying': 0}

In [15]:
df['label'] = df['Text Label'].replace(label_dict)

In [16]:
df.head()

Unnamed: 0,Tweet,Text Label,processed_tweets,label
0,.omg why are poc wearing fugly blue contacts s...,Non-Bullying,.omg why are poc wearing fugly blue contacts s...,0
1,.Sorry but most of the runners popular right n...,Non-Bullying,.sorry but most of the runners popular right n...,0
2,".those jeans are hideous, and I?m afraid he?s ...",Non-Bullying,".those jeans are hideous, and i?m afraid he?s ...",0
3,.I had to dress up for a presentation in class...,Non-Bullying,.i had to dress up for a presentation in class...,0
4,.Am I the only one who thinks justin bieber is...,Non-Bullying,.am i the only one who thinks justin bieber is...,0


In [17]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=9)

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
                                          
encoded_data_train = tokenizer.batch_encode_plus(
    train_df.processed_tweets.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    test_df.processed_tweets.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_df.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(test_df.label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [21]:
batch_size = 8

dataloader_train = DataLoader(dataset_train, 
                              sampler=SequentialSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler = SequentialSampler(dataset_val),
                                   batch_size=batch_size)

In [22]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-4, 
                  eps=1e-6)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

loss_fn = torch.nn.CrossEntropyLoss()

In [23]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [24]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        logits = outputs[1]
        loss = loss_fn(logits.cpu(), batch[2].cpu())
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [25]:
import torch.nn.functional as F 

for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        logits = outputs[1].cpu()
        loss = loss_fn(logits, batch[2].cpu())
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/107 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.6228790797084292
Validation loss: 0.4815447093160064
F1 Score (Weighted): 0.8095667872887206


Epoch 2:   0%|          | 0/107 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.4672870650260805
Validation loss: 0.5066509561406242
F1 Score (Weighted): 0.8186190545006423


Epoch 3:   0%|          | 0/107 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.4245822963252594
Validation loss: 0.9095972752440031
F1 Score (Weighted): 0.7864508704290827


Epoch 4:   0%|          | 0/107 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.2686563767188133
Validation loss: 0.7959268861708956
F1 Score (Weighted): 0.8293021285608386


Epoch 5:   0%|          | 0/107 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.1664860560101779
Validation loss: 0.7602019086997542
F1 Score (Weighted): 0.8309859154929577


In [27]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load('finetuned_BERT_epoch_5.model', map_location=torch.device('cpu')))

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Class: Non-Bullying
Accuracy: 111/129

Class: Bullying
Accuracy: 66/84



In [28]:
torch.save(model.state_dict(), f'/content/drive/My Drive/Hack 36/finetuned_BERT_bullying.model')