In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import pandas as pd
import time

In [None]:
!pip install transformers --quiet

In [None]:
from transformers import (AutoTokenizer, AutoModel, 
                          AutoModelForSequenceClassification, 
                          DataCollatorWithPadding, AdamW, get_scheduler,
                          get_linear_schedule_with_warmup,
                          )

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import random

In [None]:
import numpy as np

# One seed shared by every random number generator used in this notebook:
# Python's `random`, NumPy, and PyTorch (CPU generator and all CUDA devices).
seed_value = 42
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed_value)

NameError: name 'args' is not defined

In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded here.
checkpoint = "distilbert-base-uncased"

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Collator built around that tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Load the tweets to be scored from the CSV file in the working directory.
tweets = pd.read_csv(filepath_or_buffer='tweets.csv')

In [None]:
# Preview the first rows of the loaded frame to check what was read from the CSV.
tweets.head()

In [None]:
# Tokenize and encode the tweets that are going to be scored.
# Every sequence is padded or truncated to exactly MAX_SEQ_LEN tokens so the
# encoded lists are rectangular and can be turned into tensors directly.
MAX_SEQ_LEN = 200

# Empty fields in a CSV are read back by pandas as NaN (a float), which the
# tokenizer rejects; replace them with empty strings and force str values.
tweet_texts = tweets["cleaned_tweets"].fillna("").astype(str).tolist()

# Calling the tokenizer directly is the current API for batch encoding;
# `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
sub_tokens = tokenizer(tweet_texts,
                       max_length=MAX_SEQ_LEN,
                       padding="max_length",
                       truncation=True,
                       return_token_type_ids=False
                       )

In [None]:
# Summarise the encoding (number of encoded rows per field) instead of echoing
# every padded token id for every tweet, which floods the notebook output.
{field: len(rows) for field, rows in sub_tokens.items()}

In [None]:
# Turn the encoded token ids and attention masks into tensors, in that order.
sub_seq, sub_mask = (
    torch.tensor(sub_tokens[field])
    for field in ('input_ids', 'attention_mask')
)

In [None]:
# Inspect the tensor built from the encoded 'input_ids'.
sub_seq

In [None]:
# Pair each row of token ids with its attention mask so both are batched together.
sub_data = TensorDataset(sub_seq, sub_mask)

# Show how many examples the dataset holds; printing the dataset object itself
# only yields its class name and memory address.
len(sub_data)

In [None]:
# Number of examples per batch; passed to the DataLoader that feeds the model.
batch_size = 32

In [57]:
# DataLoader over the encoded tweets. Shuffling stays off (the default), so
# batches come out in dataset order.
sub_dataloader = DataLoader(
    sub_data,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [51]:
checkpoint = "distilbert-base-uncased"
PATH = "toxic_distilBERT_multilabel.pt"

# Build the 6-label sequence-classification model from the base checkpoint,
# then replace its weights with the state dict stored at PATH.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 6)

# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints that come from a trusted source.
# The state dict is first materialised on the CPU so loading works on any machine.
load_result = model.load_state_dict(torch.load(PATH, map_location=torch.device('cpu')))

# Move the model to the device the inference loop sends its inputs to;
# leaving it on the CPU causes a device-mismatch error when CUDA is available.
model.to(device)

# Show the key-matching report of load_state_dict as the cell output.
load_result

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

<All keys matched successfully>

In [None]:
# Run the model over every batch of encoded tweets and collect the per-label
# sigmoid probabilities into `predictions` (one row per example).

# Measure how long the inference pass takes.
t0 = time.time()

# Evaluation mode switches off dropout, so repeated runs give the same scores.
model.eval()

# Send inputs to whichever device the model's parameters live on, so the
# forward pass never mixes CPU and GPU tensors.
model_device = next(model.parameters()).device

# Per-batch outputs are gathered in a list and joined once at the end;
# growing an array with np.append copies it on every iteration.
batch_probs = []

for step, batch in enumerate(sub_dataloader):
    # Progress update every 40 batches.
    if step % 40 == 0 and step != 0:
        # Elapsed wall-clock time formatted as h:mm:ss.
        elapsed_s = int(round(time.time() - t0))
        minutes, seconds = divmod(elapsed_s, 60)
        hours, minutes = divmod(minutes, 60)
        elapsed = '{:d}:{:02d}:{:02d}'.format(hours, minutes, seconds)
        # Report progress.
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(sub_dataloader), elapsed))
    b_input_ids = batch[0].to(model_device)
    b_input_mask = batch[1].to(model_device)
    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        # An independent sigmoid per label turns each logit into a probability.
        pred_probs = torch.sigmoid(outputs.logits)
    batch_probs.append(pred_probs.cpu().numpy())

if batch_probs:
    predictions = np.concatenate(batch_probs, axis=0)
else:
    # Empty loader: keep `predictions` defined with zero rows and one column per label.
    predictions = np.empty((0, model.config.num_labels), dtype=np.float32)

In [None]:
# Inspect the array of sigmoid probabilities collected by the inference loop.
predictions

In [64]:
# Label names, in the order used for the columns of the prediction array.
categories = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [65]:
# Wrap the prediction array in a DataFrame with one named column per category.
predictions_df = pd.DataFrame(data=predictions, columns=categories)

In [66]:
# Display the predicted scores, one row per tweet and one column per category.
predictions_df

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.025818,0.000016,0.000893,0.000042,0.000251,0.000084
1,0.210384,0.000217,0.003905,0.000472,0.003843,0.000946
2,0.000191,0.000008,0.000099,0.000020,0.000053,0.000029
3,0.413834,0.000315,0.006434,0.000615,0.007631,0.000893
4,0.001207,0.000005,0.000062,0.000018,0.000074,0.000037
...,...,...,...,...,...,...
93,0.071086,0.000036,0.001254,0.000065,0.000678,0.000114
94,0.000182,0.000008,0.000099,0.000022,0.000051,0.000025
95,0.001478,0.000011,0.000211,0.000030,0.000077,0.000028
96,0.000255,0.000006,0.000108,0.000015,0.000042,0.000021


In [68]:
# Mean predicted 'toxic' probability across all scored tweets.
# Series.mean() is vectorised and yields NaN for an empty frame instead of
# raising ZeroDivisionError; the float64 cast keeps the averaging in double
# precision, and float() keeps the displayed value a plain Python float.
toxic = float(predictions_df['toxic'].astype('float64').mean())
toxic

0.040565053058659886

In [74]:
# Mean predicted probability per category, expressed as a percentage.
# All column means are computed in one vectorised call (in float64) rather
# than with a hand-rolled sum/len per column, which also avoids a
# ZeroDivisionError when the frame is empty.
category_pct = predictions_df[categories].astype('float64').mean() * 100
for name, calc in category_pct.items():
    print(name, calc)

toxic 4.056505305865989
severe_toxic 0.004277355670331658
obscene 0.07400146001526416
threat 0.03237981459464283
insult 0.058088189618910276
identity_hate 0.021685312323862324
