In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import pandas as pd
import time

In [2]:
!pip install transformers --quiet

In [3]:
from transformers import (AutoTokenizer, AutoModel, 
                          AutoModelForSequenceClassification, 
                          DataCollatorWithPadding, AdamW, get_scheduler,
                          get_linear_schedule_with_warmup,
                          )

In [4]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import random

In [5]:
import numpy as np
# Seed every random number generator in play (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value so runs are repeatable.
seed_value = 42
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_value)

In [6]:
# Base checkpoint whose vocabulary/tokenizer the fine-tuned classifier relies on.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Padding collator built on the same tokenizer.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Load the tweets to score; the path is relative to the notebook's working directory.
tweets = pd.read_csv('tweets.csv')

In [8]:
# Preview the first rows to compare the raw `tweets` text with `cleaned_tweets`.
tweets.head()

Unnamed: 0,tweets,cleaned_tweets
0,b'RT @kaushikrj6: More disturbing videos comin...,bkaushikrj More disturbing videos coming from ...
1,b'RT @Priyankaind_: \xe0\xa4\xb5\xe0\xa4\xbe\x...,bPriyankaind xexaxbxexaxbexexaxaxexaxbe xexaxx...
2,b'RT @zoo_bear: Thanks to UP CM for transformi...,bzoobear Thanks to UP CM for transforming Wes...
3,b'RT @zoo_bear: \xe0\xa4\xae\xe0\xa4\xbf\xe0\x...,bzoobear xexaxaexexaxbfxexaxaxexaxbfxexaxb xex...
4,b'RT @AunindyoC: The Mystery of the Murderous ...,bAunindyoC The Mystery of the Murderous MobsOn...


In [9]:
# Tokenize and encode the cleaned tweets that will be scored by the classifier.
# Missing cleaned text (NaN after read_csv, e.g. a tweet that was cleaned down
# to nothing) is encoded as an empty string so the tokenizer only sees strings.
tweet_texts = tweets["cleaned_tweets"].fillna("").astype(str).tolist()

# Every sequence is padded/truncated to exactly 200 tokens so the encodings can
# be stacked into rectangular tensors afterwards.
sub_tokens = tokenizer(tweet_texts,
                       max_length=200,
                       padding='max_length',  # replaces the deprecated pad_to_max_length=True
                       truncation=True,
                       return_token_type_ids=False
                       )



In [None]:
# Show only which fields the encoding holds; the bare repr would print every
# token id and mask value for every tweet.
sub_tokens.keys()

In [10]:
# Turn the token ids and their attention masks into tensors, in that order.
sub_seq, sub_mask = (
    torch.tensor(sub_tokens[field])
    for field in ('input_ids', 'attention_mask')
)

In [None]:
# Inspect the token-id tensor built from the encoded tweets.
sub_seq

In [11]:
# Pair each tweet's token ids with its attention mask so they are batched together.
sub_data = TensorDataset(sub_seq, sub_mask)
# Prints only the dataset object's repr (type and memory address), not its contents.
print(sub_data)

<torch.utils.data.dataset.TensorDataset object at 0x7f94f86dbc10>


In [12]:
# Number of tweets scored per forward pass.
batch_size = 32

In [13]:
# DataLoader over the encoded tweets. Order is left unshuffled (the default),
# so row i of the predictions corresponds to row i of `tweets`.
sub_dataloader = DataLoader(
    dataset=sub_data,
    batch_size=batch_size,
    shuffle=False,
)

In [14]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
# Rebuild the 6-label DistilBERT classifier and restore the fine-tuned weights.
checkpoint = "distilbert-base-uncased"
PATH = "toxic_distilBERT_multilabel.pt"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 6)
# The input batches are moved to `device` before the forward pass, so the model
# must live there too; otherwise inference fails with a device mismatch as soon
# as CUDA is available.
model.to(device)
# Inference only: make evaluation mode (dropout disabled) explicit.
model.eval()
# Kept as the last expression so the cell still reports whether all keys matched.
model.load_state_dict(torch.load(PATH, map_location=device))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

<All keys matched successfully>

In [16]:
# Score every batch of tweets and collect the per-label probabilities.
# Measure how long the evaluation takes.
t0 = time.time()

# Inference only: make sure dropout is disabled.
model.eval()

# Per-batch probability arrays; concatenated once after the loop instead of
# re-copying a growing array with np.append on every step.
batch_probs = []

for step, batch in enumerate(sub_dataloader):
    # Progress update every 40 batches.
    if step % 40 == 0 and step != 0:
        # Elapsed wall-clock time as hh:mm:ss. Built from the already imported
        # `time` module because no `format_time` helper is defined in this notebook.
        elapsed = time.strftime('%H:%M:%S', time.gmtime(time.time() - t0))
        # Report progress.
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(sub_dataloader), elapsed))
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    with torch.no_grad():
        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
        # Multi-label setup: an independent sigmoid per label, not a softmax.
        pred_probs = torch.sigmoid(outputs.logits)
    batch_probs.append(pred_probs.cpu().numpy())

# One row per tweet, one column per label. An empty loader yields an empty
# (0, num_labels) array instead of leaving `predictions` undefined.
if batch_probs:
    predictions = np.concatenate(batch_probs, axis=0)
else:
    predictions = np.empty((0, model.config.num_labels), dtype=np.float32)

In [17]:
# Raw sigmoid probabilities: one row per tweet, one column per label.
predictions

array([[2.58184299e-02, 1.64508692e-05, 8.92855867e-04, 4.19273310e-05,
        2.51427846e-04, 8.40794601e-05],
       [2.10384160e-01, 2.16673332e-04, 3.90527607e-03, 4.71533567e-04,
        3.84325953e-03, 9.46234679e-04],
       [1.91101397e-04, 8.30880163e-06, 9.90097760e-05, 2.04638691e-05,
        5.26862787e-05, 2.91015022e-05],
       [4.13833827e-01, 3.14576493e-04, 6.43377844e-03, 6.14721503e-04,
        7.63138337e-03, 8.92808195e-04],
       [1.20736461e-03, 5.09690244e-06, 6.23194792e-05, 1.83295688e-05,
        7.43367200e-05, 3.72785980e-05],
       [3.57686728e-01, 4.02937403e-05, 3.13203433e-03, 9.71350455e-05,
        1.99661776e-03, 1.86942198e-04],
       [2.33035788e-01, 1.40861594e-04, 4.51837247e-03, 2.51069723e-04,
        3.27191409e-03, 3.96356016e-04],
       [1.74712122e-03, 8.13095630e-06, 1.27191277e-04, 2.83352820e-05,
        1.60767944e-04, 3.55688353e-05],
       [3.88994318e-04, 4.65011453e-06, 9.78781391e-05, 1.11067056e-05,
        3.93675073e-05, 

In [17]:
# Label names used as column headers for the six model outputs, in output order.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [18]:
# Wrap the probability matrix in a DataFrame with one named column per label.
predictions_df = pd.DataFrame(
    data=predictions,
    columns=categories,
)

In [19]:
# Display the per-tweet probabilities for each label.
predictions_df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.025818,0.000016,0.000893,0.000042,0.000251,0.000084
1,0.210384,0.000217,0.003905,0.000472,0.003843,0.000946
2,0.000191,0.000008,0.000099,0.000020,0.000053,0.000029
3,0.413834,0.000315,0.006434,0.000615,0.007631,0.000893
4,0.001207,0.000005,0.000062,0.000018,0.000074,0.000037
...,...,...,...,...,...,...
93,0.071086,0.000036,0.001254,0.000065,0.000678,0.000114
94,0.000182,0.000008,0.000099,0.000022,0.000051,0.000025
95,0.001478,0.000011,0.000211,0.000030,0.000077,0.000028
96,0.000255,0.000006,0.000108,0.000015,0.000042,0.000021


In [20]:
# Mean predicted "toxic" probability across all tweets. Uses the vectorized
# pandas mean instead of a Python-level sum()/len() over the column; an empty
# frame gives NaN rather than raising ZeroDivisionError.
toxic = predictions_df['toxic'].mean()
toxic

0.040565053058659886

In [21]:
# Mean predicted probability per label, reported as a percentage.
for name in categories:
    # Vectorized mean instead of a Python-level sum()/len() over the column.
    calc = predictions_df[name].mean() * 100
    print(name, calc)

toxic 4.056505305865989
severe_toxic 0.004277355670331658
obscene 0.07400146001526416
threat 0.03237981459464283
insult 0.058088189618910276
identity_hate 0.021685312323862324


In [39]:
# Per-tweet scores as percentages: add a `<label>_calc` column (probability * 100)
# for every label, next to the original probability columns.
predictions_df = predictions_df.assign(
    **{name + '_calc': predictions_df[name] * 100 for name in categories}
)
    

In [40]:
# Display the frame again, now including the percentage (`*_calc`) columns.
predictions_df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,calc,toxic_calc,severe_toxic_calc,obscene_calc,threat_calc,insult_calc,identity_hate_calc
0,0.025818,0.000016,0.000893,0.000042,0.000251,0.000084,0.008408,2.581843,0.001645,0.089286,0.004193,0.025143,0.008408
1,0.210384,0.000217,0.003905,0.000472,0.003843,0.000946,0.094623,21.038416,0.021667,0.390528,0.047153,0.384326,0.094623
2,0.000191,0.000008,0.000099,0.000020,0.000053,0.000029,0.002910,0.019110,0.000831,0.009901,0.002046,0.005269,0.002910
3,0.413834,0.000315,0.006434,0.000615,0.007631,0.000893,0.089281,41.383381,0.031458,0.643378,0.061472,0.763138,0.089281
4,0.001207,0.000005,0.000062,0.000018,0.000074,0.000037,0.003728,0.120736,0.000510,0.006232,0.001833,0.007434,0.003728
...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,0.071086,0.000036,0.001254,0.000065,0.000678,0.000114,0.011442,7.108600,0.003611,0.125433,0.006523,0.067757,0.011442
94,0.000182,0.000008,0.000099,0.000022,0.000051,0.000025,0.002495,0.018202,0.000812,0.009879,0.002222,0.005087,0.002495
95,0.001478,0.000011,0.000211,0.000030,0.000077,0.000028,0.002822,0.147843,0.001071,0.021145,0.002991,0.007657,0.002822
96,0.000255,0.000006,0.000108,0.000015,0.000042,0.000021,0.002079,0.025522,0.000594,0.010831,0.001452,0.004197,0.002079


In [48]:
# Count, per label, how many tweets score above the flagging threshold.
# One vectorized comparison replaces six copy-pasted filter expressions, and
# the cut-off is named instead of being repeated as a magic number.
FLAG_THRESHOLD_PCT = 20  # percent; compared against the `*_calc` columns

calc_columns = ['toxic_calc', 'severe_toxic_calc', 'obscene_calc',
                'threat_calc', 'insult_calc', 'identity_hate_calc']

# Summing the boolean mask column-wise gives the number of rows above the
# threshold; the order follows `calc_columns`.
toxics, severe_toxic, obscenes, threats, insults, identity_hates = (
    (predictions_df[calc_columns] > FLAG_THRESHOLD_PCT).sum().tolist()
)

In [49]:
# Number of tweets above the 20% threshold, in label order:
# toxic, severe_toxic, obscene, threat, insult, identity_hate.
print(toxics,severe_toxic,obscenes,threats,insults,identity_hates)

10 0 0 0 0 0
