In [None]:
pip install transformers datasets torch evaluate

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset,DatasetDict,load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
import re

from transformers import AutoTokenizer,AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

**LOADING THE DATA**

In [None]:
# Alternative source: pull the corpus from the Hugging Face hub instead of a local file.
# dataset = load_dataset("selimyagci/dynamic-hate-speech-data")
# dataframe = pd.DataFrame(dataset['train'])

# Local CSV; the cells below read its `text` and `label` columns.
dataframe = pd.read_csv(filepath_or_buffer='hateEn.csv')

**DATA OVERVIEW**

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
dataframe.info()

In [None]:
# Number of rows per label value (class balance).
dataframe['label'].value_counts()

In [None]:
import wordcloud
from wordcloud import WordCloud

# This cell runs before the dropna() step, so missing texts are still present here.
# str.join raises TypeError on a NaN (float) entry: drop them and coerce to str first.
words = ' '.join(dataframe['text'].dropna().astype(str))
wordCloud = WordCloud(width=500, height=300, random_state=21, max_font_size=110).generate(words)

plt.figure(figsize=(10, 8))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

**DATA PREPARATION**

Lowercase all text for consistency.


In [None]:
# Lower-case the text column in place; missing values stay NaN and are removed by the dropna() cell.
dataframe["text"] = dataframe["text"].str.lower()

In [None]:
# Drop every row that has a missing value in any column.
dataframe = dataframe.dropna()

In [None]:
# You can remove non alphanumeric characters if desired.
# Only the `text` column is cleaned: running re.sub over the whole frame raises
# TypeError on the numeric `label` column.
# URLs are stripped first, because once ':', '/' and '.' have been replaced by spaces
# the URL-removal step later in the notebook can no longer recognise them.
url_pattern = r'https?://\S+|www\.\S+'
pattern = '[^a-zA-Z0-9äöüÄÖÜß]'
dataframe = dataframe.assign(
    text=dataframe['text']
    .str.replace(url_pattern, '', regex=True)
    .str.replace(pattern, ' ', regex=True)
).sort_values('text')

Remove URLs

In [None]:
def remove_urls(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` token deleted.

    A URL is taken to run up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Strip URLs from every row of the text column.
dataframe["text"] = dataframe["text"].apply(remove_urls)

*Removing emojis*

In [None]:
#dataframe = dataframe.drop(dataframe[dataframe['text'].str.isspace()].index)
def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Only dedicated symbol/emoji blocks are matched. The previous catch-all range
    U+24C2..U+1F251 also spanned kana, CJK ideographs and Hangul, so ordinary
    non-Latin text was silently deleted along with the emoji.
    """
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics, flags, enclosed ideographs
        "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, circles)
        "\U000024C2"             # circled M
        "\U0000FE0F"             # emoji variation selector left behind by the above
        "]+"
    )
    return emoji_pattern.sub(r'', string)
# Strip emoji from every row of the text column.
dataframe["text"] = dataframe["text"].apply(remove_emoji)

**DATA SPLITTING**

In [None]:
# You can change split sizes, following is 80-10-10 split (train / validation / test).
# A fixed seed makes the split, and every metric computed from it, reproducible.
# Stratifying on the label keeps the class ratio the same in all three partitions.
SEED = 42
train, tst = train_test_split(
    dataframe, test_size=0.2, shuffle=True, random_state=SEED, stratify=dataframe['label']
)
valid, test = train_test_split(
    tst, test_size=0.5, random_state=SEED, stratify=tst['label']
)

**TOKENIZATION**

In [None]:
# Checkpoint name shared by the tokenizer here and the classification model loaded further down.
PRETRAINED = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)

In [None]:
def encode(docs, max_length=128):
    """Tokenize a list of strings into fixed-length model inputs.

    Args:
        docs: list of str to tokenize.
        max_length: every sequence is truncated or padded to exactly this many tokens.

    Returns:
        (input_ids, attention_masks): two PyTorch tensors, one row per document and
        ``max_length`` columns each.
    """
    # Calling the tokenizer directly is the supported API; batch_encode_plus is deprecated.
    encoded_dict = tokenizer(
        docs,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']

In [None]:
# Tokenize each split; every call yields an (input_ids, attention_mask) pair.
train_input_ids, train_att_masks = encode(train['text'].tolist())
valid_input_ids, valid_att_masks = encode(valid['text'].tolist())
test_input_ids, test_att_masks = encode(test['text'].tolist())

In [None]:
def _labels_as_tensor(frame):
    """Return the `label` column of ``frame`` as a LongTensor."""
    return torch.LongTensor(frame['label'].tolist())


train_y = _labels_as_tensor(train)
valid_y = _labels_as_tensor(valid)
test_y = _labels_as_tensor(test)

Creating dataloaders

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

BATCH_SIZE = 32 # You can change batch size i.e. 16, 64, 128, it affects the runtime and generalization


def _build_loader(input_ids, att_masks, labels, sampler_cls):
    """Bundle the three tensors into a dataset and wrap it in a batched DataLoader.

    Returns the (dataset, sampler, dataloader) triple so all three stay inspectable.
    """
    dataset = TensorDataset(input_ids, att_masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataset, sampler, loader


# Training batches are drawn in random order; validation and test keep the row order.
train_dataset, train_sampler, train_dataloader = _build_loader(
    train_input_ids, train_att_masks, train_y, RandomSampler)
valid_dataset, valid_sampler, valid_dataloader = _build_loader(
    valid_input_ids, valid_att_masks, valid_y, SequentialSampler)
test_dataset, test_sampler, test_dataloader = _build_loader(
    test_input_ids, test_att_masks, test_y, SequentialSampler)

In [None]:
# Pretrained encoder with a two-class classification head; attention and hidden-state outputs are switched off.
model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Move the model's parameters to the selected device (the cell also displays the module).
model.to(device)

**FINETUNING PRETRAINED MODEL**

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

EPOCHS = 3 # hyperparameter suggested between 2-5 epochs, if validation loss continues to decrease choose a higher epoch size
LEARNING_RATE = 1e-5 # optimal hyperparameter value for learning rate, it is step size for optimizer

optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

# One scheduler step is taken per training batch, so the schedule spans every batch of every epoch.
total_training_steps = EPOCHS * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [None]:
from torch.nn.utils import clip_grad_norm_
from tqdm.notebook import tqdm
import numpy as np
import math

# Mean loss of each epoch; read by the loss-curve cell that follows.
train_loss_per_epoch = []
val_loss_per_epoch = []

for epoch_num in range(EPOCHS):
    print('Epoch: ', epoch_num + 1)

    # ---- Training pass ----
    model.train()
    train_loss = 0
    for train_batches, batch_data in enumerate(tqdm(train_dataloader, desc='Training'), start=1):
        input_ids, att_mask, labels = (tensor.to(device) for tensor in batch_data)
        output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)

        batch_loss = output.loss
        train_loss += batch_loss.item()

        # Clear old gradients, backpropagate, clip the gradient norm to 1.0, then step
        # the optimizer and the learning-rate schedule (once per batch).
        model.zero_grad()
        batch_loss.backward()
        del batch_loss

        clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    mean_train_loss = train_loss / train_batches
    train_loss_per_epoch.append(mean_train_loss)

    # ---- Validation pass (no gradient tracking) ----
    model.eval()
    valid_loss = 0
    valid_pred = []
    with torch.no_grad():
        for valid_batches, batch_data in enumerate(tqdm(valid_dataloader, desc='Validation'), start=1):
            input_ids, att_mask, labels = (tensor.to(device) for tensor in batch_data)
            output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)

            loss = output.loss
            valid_loss += loss.item()

            # Predicted class = index of the largest logit in each row.
            batch_logits = output.logits.cpu().detach().numpy()
            valid_pred.append(np.argmax(batch_logits, axis=-1))

    mean_valid_loss = valid_loss / valid_batches
    val_loss_per_epoch.append(mean_valid_loss)
    valid_pred = np.concatenate(valid_pred)

    # ---- Epoch summary: batches processed / batches expected, and mean loss ----
    expected_train_batches = math.ceil(len(train) / BATCH_SIZE)
    expected_valid_batches = math.ceil(len(valid) / BATCH_SIZE)
    print("{0}/{1} train loss: {2} ".format(train_batches, expected_train_batches, mean_train_loss))
    print("{0}/{1} val loss: {2} ".format(valid_batches, expected_valid_batches, mean_valid_loss))

Training - Validation loss curves

In [None]:
from matplotlib import pyplot as plt

# One point per epoch for each of the two recorded loss histories.
epochs = range(1, EPOCHS + 1)
fig, ax = plt.subplots()
for loss_history, legend_label in (
    (train_loss_per_epoch, 'training loss'),
    (val_loss_per_epoch, 'validation loss'),
):
    ax.plot(epochs, loss_history, label=legend_label)
ax.set(title='Training and Validation loss', xlabel='Epochs', ylabel='Loss')
ax.legend()
plt.show()

**Test the performance on test set**

In [None]:
# Run the fine-tuned model over the test set without gradient tracking and collect
# the predicted class of every example, in dataloader order.
model.eval()
test_pred = []
test_loss = 0
with torch.no_grad():
    # Wrap the dataloader itself, not enumerate(...): an enumerate object has no length,
    # so tqdm could not show a total. This also matches the training/validation loops.
    for step_num, batch_data in enumerate(tqdm(test_dataloader, desc='Testing')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)

        loss = output.loss
        test_loss += loss.item()

        # Predicted class = index of the largest logit in each row.
        test_pred.append(np.argmax(output.logits.cpu().detach().numpy(), axis=-1))
test_pred = np.concatenate(test_pred)

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred) in that order. Passing the predictions
# first swaps precision with recall and reports support per predicted class.
print(classification_report(test['label'].to_numpy(), test_pred, target_names=['notmis','mis'])) # name your labels

              precision    recall  f1-score   support

      notmis       0.81      0.80      0.80       210
         mis       0.78      0.79      0.78       190

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.79      0.79       400



Show the misclassified test examples

In [None]:
# assign() builds a new frame, which avoids the SettingWithCopyWarning that column
# assignment on a split-derived frame can raise, and keeps the cell safe to re-run.
test = test.assign(pred=test_pred)

# The earlier bare `test.reset_index(level=0)` discarded its result and had no effect,
# so it is dropped; the original row index is kept for tracing rows back to the source data.
misclassified = test[test['label'] != test['pred']]
print(misclassified.shape)
misclassified[['text', 'label', 'pred']].head(10)

Saving the model
- either pushing to Hugging Face hub
- or you can save to your drive and download later

In [None]:
# Push model and tokenizer to the SAME repository so the checkpoint can be loaded with a
# single from_pretrained call. Replace the placeholder with '<your-namespace>/<model-name>'.
HUB_REPO_ID = 'user/your-model-name'
model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)

In [None]:
# Colab only: mount Google Drive at /content/gdrive/ so the saved model can be written there.
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
import os

model_save_name = 'your-model-name.pt'
path = F"/content/gdrive/MyDrive/hate/{model_save_name}"

# torch.save does not create missing parent folders; make sure the target directory exists.
os.makedirs(os.path.dirname(path), exist_ok=True)

# NOTE(review): this pickles the whole model object. Loading it back needs the same class
# definitions importable and, on recent PyTorch releases, torch.load(..., weights_only=False).
# model.save_pretrained(<dir>) or torch.save(model.state_dict(), path) is more portable.
torch.save(model, path)