In [1]:
import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, AutoModel
from torch.optim import AdamW
import pandas as pd
from torch.utils.data import random_split, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.tab_model import TabNetClassifier
import torch.nn as nn
from sklearn.utils import compute_class_weight

import os
import tqdm as notebook_tqdm

import numpy as np
import time
import datetime
import random

import matplotlib.pyplot as plt
from matplotlib import font_manager
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.manual_seed(42)

if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


In [None]:
model_name = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels=2,
                                                           output_hidden_states=True)

model.to(device)

In [4]:
def encode_label(y):
    encoder = LabelEncoder()
    encoder.fit(y)
    y = encoder.transform(y)
    print(f"Encoder has the following classes: {encoder.classes_}")
    print(f"The new data type for y is {type(y)}")

    return y

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def get_TP_FP_FN(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    TP = np.sum((pred_flat == 1) & (labels_flat == 1))
    FP = np.sum((pred_flat == 1) & (labels_flat == 0))
    FN = np.sum((pred_flat == 0) & (labels_flat == 1))

    return {'TP': TP,
            'FP': FP,
            'FN': FN}


def precision(TP, FP):
    if TP + FP == 0:
        return 0
    else:
        return TP / (TP + FP)


def recall(TP, FN):
    if TP + FN == 0:
        return 0
    else:
        return TP / (TP + FN)


def f1_score(precision, recall):
    if precision + recall == 0:
        return 0
    else:
        return 2 * precision * recall / (precision + recall)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [3]:
df = pd.read_csv("data/hate-dataset.csv")
df = df.drop(['text'], axis=1)
df.head()

Unnamed: 0,label,data
0,non-hate,nice buro yes i am हुसैन खान is me happy owesi...
1,hate,भवन k लोडे कटुए काट k फेंक देंगे बहनचोद तुझे सुअर
2,hate,ye india के सारे मुसलमान बाबर or अकबर की नागरा...
3,hate,भाई इनकी मां चोद do मील को आहा b मिले हिन्दू म...
4,hate,गलती हो गई बाबा साहब अंबेडकर se तुन जैसे lunf ...


In [6]:
X = df.data.values
y = df.label.values

y = encode_label(y)

class_weight = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight = torch.tensor(class_weight, dtype=torch.float)
class_weight = class_weight.to(device)

input_ids = []
attention_masks = []

# convert the sentences into tokens for MuRIL
# pre-processing, append input_ids and
# attention_masks into a list
for sentence in X:
    encoded_dict = tokenizer.encode_plus(
                                        sentence,
                                        add_special_tokens=True,
                                        max_length=128,
                                        padding='max_length',
                                        truncation=True,
                                        return_attention_mask=True,
                                        return_tensors='pt'
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])


# convert the lists into tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y)

# print a visualisation
print('Original: ', X[0])
print('Token IDs:', input_ids[0])
print('Label:', labels[0])

Encoder has the following classes: ['hate' 'non-hate']
The new data type for y is <class 'numpy.ndarray'>


Original:  nice buro yes i am हुसैन खान is me happy owesi brothers and cold and my friend is all is well with your video
Token IDs: tensor([   104,  19634,    173,  55696,  29768,    180,   3516,  26436,   3907,
          1121,   1868,   9971,    186, 107972,   1206,  19412,   1111,  15756,
          1111,   1725,   8462,   1121,   1375,   1121,   1999,   1147,   1427,
          3080,    105,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,     

In [7]:
batch_size = 32

dataset = TensorDataset(input_ids, attention_masks, labels)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f"{train_size} training samples")
print(f"{val_size} validation samples")

train_dataloader = DataLoader(
                train_dataset,
                sampler = RandomSampler(train_dataset),
                batch_size = batch_size
)

validation_dataloader = DataLoader(
                val_dataset,
                sampler = SequentialSampler(val_dataset),
                batch_size = batch_size
)

22672 training samples
5668 validation samples


In [8]:
epochs = 3

optimizer = AdamW(model.parameters(),
                lr = 2e-5,
                eps = 1e-8
                )

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [9]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

total_t0 = time.time()


for epoch_i in range(0, epochs):
    print(f'\n======== Epoch {epoch_i + 1} / {epochs} ========')
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            print('\tBatch {:>5,}  of  {:>5,}.\tElapsed: {:}.'.format(step, len(train_dataloader), elapsed))


        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()


        outputs = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)

        #loss = outputs.loss
        criterion = nn.CrossEntropyLoss(weight=class_weight, reduction='mean')
        
        loss = criterion(outputs.logits, b_labels)
        logits = outputs.logits

        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()


    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("\nAverage training loss: {0:.2f}".format(avg_train_loss))
    print("\nTraining epoch took: {:}".format(training_time))


    print("\nRunning validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    total_prec = 0
    total_rec = 0

    prec = 0
    rec = 0
    f1_val = 0

    TP = 0
    FP = 0
    FN = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)

            criterion = nn.CrossEntropyLoss(weight=class_weight, reduction='mean')

            loss = criterion(outputs.logits, b_labels)
            logits = outputs.logits

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

        counts = get_TP_FP_FN(logits, label_ids)

        TP = counts['TP']
        FP = counts['FP']
        FN = counts['FN']

        total_prec += precision(TP, FP)
        total_rec += recall(TP, FN)


    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("\tAccuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("\nValidation Loss: {0:.2f}".format(avg_val_loss))
    print("\nValidation took: {:}".format(validation_time))


    prec = total_prec / len(validation_dataloader)
    rec = total_rec / len(validation_dataloader)
    f1_val = f1_score(prec, rec)

    print("\nPrecision: {0:.2f}".format(prec))
    print("Recall: {0:.2f}".format(rec))
    print("F1 Score: {0:.2f}".format(f1_val))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time,
            'Precision': prec,
            'Recall': rec,
            'F1 Score': f1_val,
        }
    )

print("\nTraining complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


	Batch    40  of    709.	Elapsed: 0:00:14.
	Batch    80  of    709.	Elapsed: 0:00:28.
	Batch   120  of    709.	Elapsed: 0:00:42.
	Batch   160  of    709.	Elapsed: 0:00:56.
	Batch   200  of    709.	Elapsed: 0:01:10.
	Batch   240  of    709.	Elapsed: 0:01:24.
	Batch   280  of    709.	Elapsed: 0:01:39.
	Batch   320  of    709.	Elapsed: 0:01:53.
	Batch   360  of    709.	Elapsed: 0:02:07.
	Batch   400  of    709.	Elapsed: 0:02:21.
	Batch   440  of    709.	Elapsed: 0:02:35.
	Batch   480  of    709.	Elapsed: 0:02:49.
	Batch   520  of    709.	Elapsed: 0:03:04.
	Batch   560  of    709.	Elapsed: 0:03:18.
	Batch   600  of    709.	Elapsed: 0:03:32.
	Batch   640  of    709.	Elapsed: 0:03:46.
	Batch   680  of    709.	Elapsed: 0:04:00.

Average training loss: 0.60

Training epoch took: 0:04:10

Running validation...
	Accuracy: 0.76

Validation Loss: 0.53

Validation took: 0:00:19

Precision: 0.81
Recall: 0.80
F1 Score: 0.80

Training...
	Batch    40  of    709.	Elapsed: 0:00:14.
	Batch    80  of    7

In [14]:
output_dir = "./model_save/"

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

args = {"learning_rate": 2e-5, 
        "adam_epsilon": 1e-8}

torch.save(args, os.path.join(output_dir, 'training_args.bin'))

('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.txt',
 './model_save/added_tokens.json',
 './model_save/tokenizer.json')

In [3]:
output_dir = "./model_save/"

muril_model = AutoModelForSequenceClassification.from_pretrained(output_dir, num_labels=2, output_hidden_states=False, output_attentions=False)
tokenizer_model = AutoTokenizer.from_pretrained(output_dir)

muril_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(197285, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [19]:
#sent = "चूड़ियां पहन लो शर्म करो Baised ho तुम log मुल्लो pr एक action नहीं ले पाते संविधान की कसम खाते ho संविधान के रक्षक हो न की ममता या तृणमूल के इतनी चाटुकारिता सही नहीं"

sent = "muslims have good jawline क्यूंकि उनका muslim होता है"

input_ids = []
attention_masks = []

encoded_dict = tokenizer_model.encode_plus(
                                        sent,
                                        add_special_tokens=True,
                                        max_length=128,
                                        padding='max_length',
                                        truncation=True,
                                        return_attention_mask=True,
                                        return_tensors='pt'
)

input_id = encoded_dict['input_ids']
attention_mask = encoded_dict['attention_mask']


In [20]:
muril_model.eval()

with torch.no_grad():
    outputs = muril_model(input_id,
                    token_type_ids=None,
                    attention_mask=attention_mask
                    )

    logits = outputs.logits


pred_flat = np.argmax(logits, axis=1).flatten()
pred_flat

tensor([1])