## Import Libraries

In [1]:
import logging

CONSOLE_LEVEL = logging.INFO
LOGFILE_LEVEL = logging.DEBUG

In [2]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re
import nltk

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

In [3]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


## Connect to Google Drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd /content/drive/My Drive/

/content/drive/My Drive


In [6]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [7]:
synthetic_train_df = pd.read_csv("synthetic_train.csv")

df = pd.concat([train_df, synthetic_train_df], axis=0)

## Categories and Subcategories

In [None]:
category_set = list(set(list(df["category"])))
subcategory_set = list(set(list(df["sub_category"])))

In [None]:
from collections import defaultdict
category_to_subcategory_map = defaultdict(set)

In [None]:
for index, row in df.iterrows():
  category = row.category
  subcategory = row.sub_category

  category_to_subcategory_map[category].add(subcategory)

In [None]:
category_count = defaultdict(int)
subcategory_count = defaultdict(int)

for index, row in df.iterrows():
  category = row.category
  subcategory = row.sub_category

  category_count[category] += 1
  subcategory_count[subcategory] += 1

In [None]:
category_to_subcategory_map

defaultdict(set,
            {'Online and Social Media Related Crime': {'Cheating by Impersonation',
              'Cyber Bullying  Stalking  Sexting',
              'EMail Phishing',
              'FakeImpersonating Profile',
              'Impersonating Email',
              'Intimidating Email',
              'Online Job Fraud',
              'Online Matrimonial Fraud',
              'Profile Hacking Identity Theft',
              'Provocative Speech for unlawful acts'},
             'Online Financial Fraud': {'Business Email CompromiseEmail Takeover',
              'DebitCredit Card FraudSim Swap Fraud',
              'DematDepository Fraud',
              'EWallet Related Fraud',
              'Fraud CallVishing',
              'Internet Banking Related Fraud',
              'UPI Related Frauds'},
             'Online Gambling  Betting': {'Online Gambling  Betting'},
             'RapeGang Rape RGRSexually Abusive Content': {nan},
             'Any Other Cyber Crime': {'Other'},
 

In [None]:
subcategory_count

defaultdict(int,
            {'Cyber Bullying  Stalking  Sexting': 4089,
             'Fraud CallVishing': 5803,
             'Online Gambling  Betting': 544,
             'Online Job Fraud': 912,
             'UPI Related Frauds': 26856,
             'Internet Banking Related Fraud': 8872,
             nan: 6691,
             'Other': 10878,
             'Profile Hacking Identity Theft': 2073,
             'DebitCredit Card FraudSim Swap Fraud': 10805,
             'EWallet Related Fraud': 4047,
             'Data Breach/Theft': 484,
             'Cheating by Impersonation': 1988,
             'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks': 504,
             'FakeImpersonating Profile': 2299,
             'Cryptocurrency Fraud': 480,
             'Malware Attack': 521,
             'Business Email CompromiseEmail Takeover': 390,
             'Email Hacking': 449,
             'Hacking/Defacement': 540,
             'Unauthorised AccessData Breach': 1114,
      

## Data Preprocessing

In [8]:
sw = nltk.download('stopwords')
hinglish_stops = set(stopwords.words('hinglish'))
english_stops = set(stopwords.words('english'))


def clean_text(text):

    text = text.lower()

    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs
    #text = re.sub(r"http", "",text)

    html=re.compile(r'<.*?>')

    text = html.sub(r'',text) #Removing html tags

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'

    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations

    text = [word.lower() for word in text.split() if ((word.lower() not in hinglish_stops) and (word.lower() not in english_stops))]

    text = " ".join(text) #removing stopwords

    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis

    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# Remove all columns where crimeaditionalinfo column is nan
df = df[df['crimeaditionalinfo'].notna()]

df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(lambda x: clean_text(x))

In [10]:
messages = df.crimeaditionalinfo.values
categories = df.category.values
subcategories = df.sub_category.values

In [11]:
# Use the BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
print(' Original: ', messages[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(messages[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(messages[0])))

 Original:  continue received random calls abusive messages whatsapp added number unknown facebook group girls calls unknown numbers pls sort issue possible
Tokenized:  ['continue', 'received', 'random', 'calls', 'abusive', 'messages', 'what', '##sa', '##pp', 'added', 'number', 'unknown', 'facebook', 'group', 'girls', 'calls', 'unknown', 'numbers', 'pl', '##s', 'sort', 'issue', 'possible']
Token IDs:  [3613, 2363, 6721, 4455, 20676, 7696, 2054, 3736, 9397, 2794, 2193, 4242, 9130, 2177, 3057, 4455, 4242, 3616, 20228, 2015, 4066, 3277, 2825]


In [13]:
max_len = 512

In [12]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

labels = list(categories)
categories_encoded = label_encoder.fit_transform(labels)

In [None]:
label_encoder = LabelEncoder()

labels = list(subcategories)
subcategories_encoded = label_encoder.fit_transform(labels)

In [None]:
# Use the BERT Tkenizer to tokenize the messages

input_ids = []

# For every message..
for msg in messages:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        msg,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
categories = torch.tensor(categories_encoded)

# Print sentence 0, now as a list of IDs.
print('Original: ', messages[0])
print('Token IDs:', input_ids[0])

Original:  continue received random calls abusive messages whatsapp added number unknown facebook group girls calls unknown numbers pls sort issue possible
Token IDs: tensor([  101,  3613,  2363,  6721,  4455, 20676,  7696,  2054,  3736,  9397,
         2794,  2193,  4242,  9130,  2177,  3057,  4455,  4242,  3616, 20228,
         2015,  4066,  3277,  2825,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,    

## Class Imbalance

In [None]:
from imblearn.pipeline import Pipeline

smote_sampling_count = {9: 20000, 7: 65000, 8: 6000, 11: 10000,
                  0: 18000, 3: 9000, 2: 6000, 13: 8000, 14: 8000,
                  5: 9000, 4: 6000, 1: 7000, 6: 6000, 10: 5000, 12: 5000}

undersampling_count = {9: 14000, 7: 62000, 8: 4000, 11: 7000,
                  0: 15000, 3: 7000, 2: 3000, 13: 4000, 14: 4000,
                  5: 5000, 4: 3000, 1: 4000, 6: 4000, 10: 2500, 12: 2500}


from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

over = SMOTE(sampling_strategy=smote_sampling_count)
under = RandomUnderSampler(sampling_strategy=undersampling_count)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

input_ids_resampled, categories_resampled = pipeline.fit_resample(input_ids, categories)

In [None]:
len(input_ids_resampled)

141000

In [None]:
# Create the attention masks manually

attention_masks = []


for i in range(len(input_ids_resampled)):

  input = input_ids_resampled[i]

  for j in range(len(input)):
    if input[j] == 0:
      index = j
      break

  attention = [1]*index + [0]*(max_len-index)
  attention_list = [attention]
  attention_tensor = torch.tensor(attention_list)
  attention_masks.append(attention_tensor)

In [None]:
# Convert the arrays to tensors
categories = torch.tensor(categories_resampled)
attention_masks = torch.cat(attention_masks, dim=0)
input_ids = torch.tensor(list(input_ids_resampled))

  input_ids = torch.tensor(list(input_ids_resampled))


In [None]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, categories)

# Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset)  - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

126,900 training samples
14,100 validation samples


In [None]:
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [None]:
# Code for Focal Loss (Not used)

"""
import numpy as np


class_counts = np.array([10877, 379, 480, 3608, 161, 1710, 183, 57416, 444, 12138, 56, 2822, 1, 1552, 1838])
total = np.sum(class_counts)

# Class frequencies
frequencies = class_counts / total

# Inverse frequencies (to give more weight to minority classes)
inverse_frequencies = 1.0 / frequencies
alpha = inverse_frequencies / np.sum(inverse_frequencies)
"""

'\nimport numpy as np\n\n\nclass_counts = np.array([10877, 379, 480, 3608, 161, 1710, 183, 57416, 444, 12138, 56, 2822, 1, 1552, 1838])\ntotal = np.sum(class_counts)\n\n# Class frequencies\nfrequencies = class_counts / total\n\n# Inverse frequencies (to give more weight to minority classes)\ninverse_frequencies = 1.0 / frequencies\nalpha = inverse_frequencies / np.sum(inverse_frequencies)\n'

In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 15, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# if device == "cuda:0":
# # Tell pytorch to run this model on the GPU.
#     model = model.cuda()
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )



In [None]:
# Code for Focal Loss (Not used finally)
"""
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        '''
        Args:
            gamma (float, optional): Focusing parameter. Default is 2.0.
            alpha (float or list, optional): Class balancing factor. If set, should be a float or a list of class-wise weights. Default is None.
            reduction (string, optional): Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default is 'mean'.
        '''
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

        if isinstance(alpha, (list, torch.Tensor)):
            self.alpha = torch.tensor(alpha)

    def forward(self, inputs, targets):
        # Apply softmax to get the probabilities
        if inputs.dim() > 2:
            inputs = inputs.view(inputs.size(0), inputs.size(1), -1)  # N,C,H,W -> N,C,H*W
            inputs = inputs.transpose(1, 2)    # N,C,H*W -> N,H*W,C
            inputs = inputs.contiguous().view(-1, inputs.size(-1))  # N,H*W,C -> N*H*W,C
        targets = targets.view(-1, 1)

        # Compute the log probability
        logpt = F.log_softmax(inputs, dim=-1)
        logpt = logpt.gather(1, targets)
        logpt = logpt.view(-1)
        pt = logpt.exp()

        # Compute the focal loss
        if self.alpha is not None:
            if self.alpha.type() != inputs.data.type():
                self.alpha = self.alpha.type_as(inputs.data)
            at = self.alpha.gather(0, targets.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt) ** self.gamma * logpt

        # Apply the reduction method
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss
"""

In [None]:
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [None]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

## Fine tune the BERT Model

In [None]:
seed_val = 74
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the device using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels)

        loss = output.loss
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 1.16
  Training epoch took: 0:38:20

Running Validation...
  Accuracy: 0.64

Training...

  Average training loss: 1.01
  Training epoch took: 0:38:18

Running Validation...
  Accuracy: 0.65

Training complete!
Total training took 1:19:25 (h:mm:ss)


In [None]:
import pickle

filename = 'bert_model_smote_1.pkl'
pickle.dump(model, open(filename, 'wb'))

## Testing

In [14]:
df_test = pd.read_csv("test.csv")
df_test = df_test[df_test['crimeaditionalinfo'].notna()]
df_test = df_test[df_test['category'].notna()]

df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(lambda x: clean_text(x))
df_test = df_test[df_test['category'] != 'Crime Against Women & Children']

test_categories = df_test.category.values
test_messages = df_test.crimeaditionalinfo.values

test_categories_encoded = label_encoder.transform(list(categories))

In [15]:
test_input_ids = []
test_attention_masks = []

for msg in messages:
    encoded_dict = tokenizer.encode_plus(
                        msg,
                        add_special_tokens = True,
                        max_length = 512,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])

test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = 32
        )

In [18]:
import pickle
model = pickle.load(open("bert_model_smote_1.pkl", 'rb'))

In [21]:
predictions = []
for batch in test_dataloader:
  b_input_ids = batch[0].to(device)
  b_input_mask = batch[1].to(device)

  with torch.no_grad():
    output= model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = output.logits
    logits = logits.detach().cpu().numpy()
    pred_flat = np.argmax(logits, axis=1).flatten()
    predictions.extend(list(pred_flat))

In [27]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

In [23]:
f1 = f1_score(list(test_categories_encoded), predictions, average='micro')

In [25]:
f1

0.7942520884779068

In [28]:
conf_matrix = confusion_matrix(list(test_categories_encoded), predictions)

In [32]:
print(conf_matrix)

[[ 3469     2    24     0     0   141     0  6191     9  1032     1     1
      1     0     6]
 [   17   235     0     0     0     1     0    39     1   157     0     2
      0     0    27]
 [   38     0   161     0     0     3     0   267     4     5     2     0
      0     0     0]
 [    0     0     0  3608     0     0     0     0     0     0     0     0
      0     0     0]
 [   51     0     0     0    93     5     5    71     0    31     0     0
      5     0     0]
 [  252     0     0     0     0   833     0   537     0   376     9     0
      0     0     3]
 [   20     1     0     0     2     2    94   104     0    60     0     0
      0     0     0]
 [ 1097     1    25     0     0    67     1 55829     4   492     0     0
      0     0     0]
 [   37     0     1     0     0     4     0   308   163    31     0     0
      0     0     0]
 [  827     8     7     0     0   153     3  3439     5  8150     0     8
      3     0    35]
 [    2     0     0     0     0    12     1     8 