In [1]:
import logging

CONSOLE_LEVEL = logging.INFO
LOGFILE_LEVEL = logging.DEBUG

In [2]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re
import nltk

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [4]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
%cd /content/drive/My Drive/

/content/drive/My Drive


In [7]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [8]:
synthetic_train_df = pd.read_csv("synthetic_train.csv")

df = pd.concat([train_df, synthetic_train_df], axis=0)

In [9]:
category_set = list(set(list(df["category"])))
subcategory_set = list(set(list(df["sub_category"])))

In [10]:
from collections import defaultdict
category_to_subcategory_map = defaultdict(set)

In [12]:
for index, row in df.iterrows():
  category = row.category
  subcategory = row.sub_category

  category_to_subcategory_map[category].add(subcategory)

In [13]:
category_count = defaultdict(int)
subcategory_count = defaultdict(int)

for index, row in df.iterrows():
  category = row.category
  subcategory = row.sub_category

  category_count[category] += 1
  subcategory_count[subcategory] += 1

In [14]:
category_to_subcategory_map

defaultdict(set,
            {'Online and Social Media Related Crime': {'Cheating by Impersonation',
              'Cyber Bullying  Stalking  Sexting',
              'EMail Phishing',
              'FakeImpersonating Profile',
              'Impersonating Email',
              'Intimidating Email',
              'Online Job Fraud',
              'Online Matrimonial Fraud',
              'Profile Hacking Identity Theft',
              'Provocative Speech for unlawful acts'},
             'Online Financial Fraud': {'Business Email CompromiseEmail Takeover',
              'DebitCredit Card FraudSim Swap Fraud',
              'DematDepository Fraud',
              'EWallet Related Fraud',
              'Fraud CallVishing',
              'Internet Banking Related Fraud',
              'UPI Related Frauds'},
             'Online Gambling  Betting': {'Online Gambling  Betting'},
             'RapeGang Rape RGRSexually Abusive Content': {nan},
             'Any Other Cyber Crime': {'Other'},
 

In [15]:
subcategory_count

defaultdict(int,
            {'Cyber Bullying  Stalking  Sexting': 4089,
             'Fraud CallVishing': 5803,
             'Online Gambling  Betting': 544,
             'Online Job Fraud': 912,
             'UPI Related Frauds': 26856,
             'Internet Banking Related Fraud': 8872,
             nan: 6691,
             'Other': 10878,
             'Profile Hacking Identity Theft': 2073,
             'DebitCredit Card FraudSim Swap Fraud': 10805,
             'EWallet Related Fraud': 4047,
             'Data Breach/Theft': 484,
             'Cheating by Impersonation': 1988,
             'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks': 504,
             'FakeImpersonating Profile': 2299,
             'Cryptocurrency Fraud': 480,
             'Malware Attack': 521,
             'Business Email CompromiseEmail Takeover': 390,
             'Email Hacking': 449,
             'Hacking/Defacement': 540,
             'Unauthorised AccessData Breach': 1114,
      

## Data Preprocessing

In [11]:
sw = nltk.download('stopwords')
hinglish_stops = set(stopwords.words('hinglish'))
english_stops = set(stopwords.words('english'))


def clean_text(text):

    text = text.lower()

    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs
    #text = re.sub(r"http", "",text)

    html=re.compile(r'<.*?>')

    text = html.sub(r'',text) #Removing html tags

    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'

    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations

    text = [word.lower() for word in text.split() if ((word.lower() not in hinglish_stops) and (word.lower() not in english_stops))]

    text = " ".join(text) #removing stopwords

    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis

    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
# Remove all columns where crimeaditionalinfo column is nan
df = df[df['crimeaditionalinfo'].notna()]

df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(lambda x: clean_text(x))

In [17]:
df = df[df['sub_category'].notna()]

In [18]:
messages = df.crimeaditionalinfo.values
categories = df.category.values
subcategories = df.sub_category.values

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [21]:
print(' Original: ', messages[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(messages[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(messages[0])))

 Original:  continue received random calls abusive messages whatsapp added number unknown facebook group girls calls unknown numbers pls sort issue possible
Tokenized:  ['continue', 'received', 'random', 'calls', 'abusive', 'messages', 'what', '##sa', '##pp', 'added', 'number', 'unknown', 'facebook', 'group', 'girls', 'calls', 'unknown', 'numbers', 'pl', '##s', 'sort', 'issue', 'possible']
Token IDs:  [3613, 2363, 6721, 4455, 20676, 7696, 2054, 3736, 9397, 2794, 2193, 4242, 9130, 2177, 3057, 4455, 4242, 3616, 20228, 2015, 4066, 3277, 2825]


In [14]:
max_len = 512

In [28]:
social_media_crime_df = df[df["category"] == "Online and Social Media Related Crime"]
financial_fraud_df = df[df["category"] == "Online Financial Fraud"]
cyber_attack_df    = df[df["category"] == "Cyber Attack/ Dependent Crimes"]
hacking_damage_df  = df[df["category"] == "Hacking  Damage to computercomputer system etc"]

In [24]:
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [25]:
def encode_input(messages, max_len):

  input_ids = []

  # For every message..
  for msg in messages:
      # `encode_plus` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      #   (5) Pad or truncate the sentence to `max_length`
      #   (6) Create attention masks for [PAD] tokens.
      encoded_dict = tokenizer.encode_plus(
                          msg,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = max_len,           # Pad & truncate all sentences.
                          pad_to_max_length = True,
                          return_tensors = 'pt',     # Return pytorch tensors.
                    )

      # Add the encoded sentence to the list.
      input_ids.append(encoded_dict['input_ids'])

  return input_ids

In [26]:
def transform_df_into_vectors(df):

  messages = df.crimeaditionalinfo.values
  subcategories = df.sub_category.values

  label_encoder = LabelEncoder()
  labels = list(subcategories)
  subcategories_encoded = label_encoder.fit_transform(labels)

  input_ids = encode_input(messages, 512)

  df_dict = {'messages': input_ids, 'sub_category': subcategories_encoded}
  df = pd.DataFrame(df_dict)

  return df

In [27]:
def count_subcategory_datapoints_in_df(df):

  subcategories = df.sub_category.values

  subcategory_count = defaultdict(int)

  for i in subcategories:
    subcategory_count[i] += 1

  return subcategory_count

In [28]:
social_media_crime_df_transform = transform_df_into_vectors(social_media_crime_df)

financial_fraud_df_transform = transform_df_into_vectors(financial_fraud_df)

cyber_attack_df_transform = transform_df_into_vectors(cyber_attack_df)

hacking_damage_df_transform = transform_df_into_vectors(hacking_damage_df)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [29]:
subcategory_count_1 = count_subcategory_datapoints_in_df(social_media_crime_df_transform)

subcategory_count_2 = count_subcategory_datapoints_in_df(financial_fraud_df_transform)

subcategory_count_3 = count_subcategory_datapoints_in_df(cyber_attack_df_transform)

subcategory_count_4 = count_subcategory_datapoints_in_df(hacking_damage_df_transform)

In [30]:
num_labels = {1: 10, 2: 7, 3: 7, 4: 5}

In [52]:
subcategory_count_3

defaultdict(int, {0: 484, 1: 504, 3: 521, 2: 540, 5: 508, 4: 534, 6: 517})

In [57]:
smote_sampling_count_1 = {1: 4100, 6: 1200, 8: 2500,
                          0: 2400, 3: 2500, 9: 900,
                          7: 600, 4: 600, 2: 600,
                          5: 600}

undersampling_count_1 = {1: 3000, 6: 1000, 8: 2200,
                          0: 2000, 3: 2200, 9: 900,
                          7: 600, 4: 600, 2: 600,
                          5: 600}


smote_sampling_count_2 = {4: 8000, 6: 32000, 5: 12000, 1: 14000,
                          3: 7000, 0: 2000, 2: 4000}

undersampling_count_2 = {4: 6000, 6: 25000, 5: 7000, 1: 10000,
                          3: 7000, 0: 2000, 2: 3000}

smote_sampling_count_3 = {0: 484, 1: 504, 3: 521, 2: 540, 5: 508, 4: 534, 6: 517}

undersampling_count_3 = {0: 484, 1: 504, 3: 521, 2: 540, 5: 508, 4: 534, 6: 517}

smote_sampling_count_4 = {1: 500, 3: 1300, 4: 500, 0: 500, 2: 300}

undersampling_count_4 =  {1: 430, 3: 1100, 4: 500, 0: 400, 2: 300}


smote_sampling_count = {1: smote_sampling_count_1, 2: smote_sampling_count_2, 3: smote_sampling_count_3, 4: smote_sampling_count_4}
undersampling_count  = {1: undersampling_count_1, 2: undersampling_count_2, 3: undersampling_count_3, 4: undersampling_count_4}

In [56]:
def balance_subcatgeories_in_df(df, smote_sampling_count, undersampling_count):

  input_ids = list(df.messages.values)
  subcategories = list(df.sub_category.values)


  input_ids = torch.cat(input_ids, dim=0)
  subcategories = torch.tensor(subcategories)


  over = SMOTE(sampling_strategy=smote_sampling_count)
  under = RandomUnderSampler(sampling_strategy=undersampling_count)

  steps = [('o', over), ('u', under)]
  pipeline = Pipeline(steps=steps)

  messages_resampled, subcategories_resampled = pipeline.fit_resample(input_ids, subcategories)

  return messages_resampled, subcategories_resampled

In [33]:
def prepare_df_for_training(input_ids, subcategories):

  attention_masks = []


  for i in range(len(input_ids)):

    input = input_ids[i]

    for j in range(len(input)):
      if input[j] == 0:
        index = j
        break

    attention = [1]*index + [0]*(max_len-index)
    attention_list = [attention]
    attention_tensor = torch.tensor(attention_list)
    attention_masks.append(attention_tensor)

  categories = torch.tensor(subcategories)
  attention_masks = torch.cat(attention_masks, dim=0)
  input_ids = torch.tensor(list(input_ids))

  dataset = TensorDataset(input_ids, attention_masks, categories)

  # Calculate the number of samples to include in each set.
  train_size = int(0.9 * len(dataset))
  val_size = len(dataset)  - train_size

  # Divide the dataset by randomly selecting samples.
  train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

  batch_size = 32

  # Create the DataLoaders for our training and validation sets.
  # We'll take training samples in random order.
  train_dataloader = DataLoader(
              train_dataset,  # The training samples.
              sampler = RandomSampler(train_dataset), # Select batches randomly
              batch_size = batch_size # Trains with this batch size.
          )

  # For validation the order doesn't matter, so we'll just read them sequentially.
  validation_dataloader = DataLoader(
              val_dataset, # The validation samples.
              sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
              batch_size = batch_size # Evaluate with this batch size.
          )

  return train_dataloader, validation_dataloader

In [34]:
def bert_model(num_labels, train_dataloader, epochs):

  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = num_labels, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
    )

  model = model.to(device)

  optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                )

  total_steps = len(train_dataloader) * epochs
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

  return model, optimizer, scheduler

In [35]:
seed_val = 74
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [36]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [37]:
def train_model(train_dataloader, validation_dataloader, model, optimizer, scheduler, epochs):

  training_stats = []

  # Measure the total training time for the whole run.
  total_t0 = time.time()

  # For each epoch...
  for epoch_i in range(0, epochs):

      # ========================================
      #               Training
      # ========================================
      # Perform one full pass over the training set.
      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
      print('Training...')

      # Measure how long the training epoch takes.
      t0 = time.time()
      total_train_loss = 0
      model.train()

      for step, batch in enumerate(train_dataloader):
          # Unpack this training batch from our dataloader.
          #
          # As we unpack the batch, we'll also copy each tensor to the device using the
          # `to` method.
          #
          # `batch` contains three pytorch tensors:
          #   [0]: input ids
          #   [1]: attention masks
          #   [2]: labels
          b_input_ids = batch[0].to(device)
          b_input_mask = batch[1].to(device)
          b_labels = batch[2].to(device)
          optimizer.zero_grad()
          output = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

          loss = output.loss
          total_train_loss += loss.item()
          # Perform a backward pass to calculate the gradients.
          loss.backward()
          # Clip the norm of the gradients to 1.0.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          # Update parameters and take a step using the computed gradient.
          # The optimizer dictates the "update rule"--how the parameters are
          # modified based on their gradients, the learning rate, etc.
          optimizer.step()
          # Update the learning rate.
          scheduler.step()

      # Calculate the average loss over all of the batches.
      avg_train_loss = total_train_loss / len(train_dataloader)

      # Measure how long this epoch took.
      training_time = format_time(time.time() - t0)
      print("")
      print("  Average training loss: {0:.2f}".format(avg_train_loss))
      print("  Training epoch took: {:}".format(training_time))
      # ========================================
      #               Validation
      # ========================================
      # After the completion of each training epoch, measure our performance on
      # our validation set.
      print("")
      print("Running Validation...")
      t0 = time.time()
      # Put the model in evaluation mode--the dropout layers behave differently
      # during evaluation.
      model.eval()
      # Tracking variables
      total_eval_accuracy = 0
      best_eval_accuracy = 0
      total_eval_loss = 0
      nb_eval_steps = 0
      # Evaluate data for one epoch
      for batch in validation_dataloader:
          b_input_ids = batch[0].to(device)
          b_input_mask = batch[1].to(device)
          b_labels = batch[2].to(device)
          # Tell pytorch not to bother with constructing the compute graph during
          # the forward pass, since this is only needed for backprop (training).
          with torch.no_grad():
              output= model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask,
                                    labels=b_labels)
          loss = output.loss
          total_eval_loss += loss.item()
          # Move logits and labels to CPU if we are using GPU
          logits = output.logits
          logits = logits.detach().cpu().numpy()
          label_ids = b_labels.to('cpu').numpy()
          # Calculate the accuracy for this batch of test sentences, and
          # accumulate it over all batches.
          total_eval_accuracy += flat_accuracy(logits, label_ids)
      # Report the final accuracy for this validation run.
      avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
      print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
      # Calculate the average loss over all of the batches.
      avg_val_loss = total_eval_loss / len(validation_dataloader)
      # Measure how long the validation run took.
      validation_time = format_time(time.time() - t0)
      if avg_val_accuracy > best_eval_accuracy:
          torch.save(model, 'bert_model')
          best_eval_accuracy = avg_val_accuracy
      #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
      #print("  Validation took: {:}".format(validation_time))
      # Record all statistics from this epoch.
      training_stats.append(
          {
              'epoch': epoch_i + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Valid. Accur.': avg_val_accuracy,
              'Training Time': training_time,
              'Validation Time': validation_time
          }
      )
  print("")
  print("Training complete!")

  print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

  return model

In [38]:
import pickle

In [54]:
def training_pipeline(transform_df, df_number, epochs):

  smote_sampling_dict = smote_sampling_count[df_number]
  undersampling_dict = undersampling_count[df_number]

  num_classes = num_labels[df_number]

  messages_resampled, subcategories_resampled = balance_subcatgeories_in_df(transform_df, smote_sampling_dict, undersampling_dict)

  train_dataloader, validation_dataloader = prepare_df_for_training(messages_resampled, subcategories_resampled)

  model, optimizer, scheduler = bert_model(num_classes, train_dataloader, epochs)

  model = train_model(train_dataloader, validation_dataloader, model, optimizer, scheduler, epochs)


  filename = f'subcategory_model_{df_number}.pkl'
  pickle.dump(model, open(filename, 'wb'))

In [42]:
training_pipeline(social_media_crime_df_transform, 1, 4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

  Average training loss: 1.57
  Training epoch took: 0:03:44

Running Validation...
  Accuracy: 0.53

Training...

  Average training loss: 1.22
  Training epoch took: 0:03:43

Running Validation...
  Accuracy: 0.57

Training...

  Average training loss: 1.08
  Training epoch took: 0:03:43

Running Validation...
  Accuracy: 0.58

Training...

  Average training loss: 0.98
  Training epoch took: 0:03:43

Running Validation...
  Accuracy: 0.58

Training complete!
Total training took 0:15:29 (h:mm:ss)


In [49]:
training_pipeline(financial_fraud_df_transform, 2, 4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

  Average training loss: 1.14
  Training epoch took: 0:16:18

Running Validation...
  Accuracy: 0.62

Training...

  Average training loss: 1.00
  Training epoch took: 0:16:18

Running Validation...
  Accuracy: 0.63

Training...

  Average training loss: 0.91
  Training epoch took: 0:16:18

Running Validation...
  Accuracy: 0.63

Training...

  Average training loss: 0.82
  Training epoch took: 0:16:18

Running Validation...
  Accuracy: 0.62

Training complete!
Total training took 1:07:34 (h:mm:ss)


In [59]:
training_pipeline(cyber_attack_df_transform, 3, 7)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

  Average training loss: 1.96
  Training epoch took: 0:00:59

Running Validation...
  Accuracy: 0.16

Training...

  Average training loss: 1.96
  Training epoch took: 0:00:59

Running Validation...
  Accuracy: 0.17

Training...

  Average training loss: 1.95
  Training epoch took: 0:00:59

Running Validation...
  Accuracy: 0.16

Training...

  Average training loss: 1.95
  Training epoch took: 0:00:59

Running Validation...
  Accuracy: 0.15

Training...

  Average training loss: 1.94
  Training epoch took: 0:00:59

Running Validation...
  Accuracy: 0.15

Training...

  Average training loss: 1.94
  Training epoch took: 0:00:59

Running Validation...
  Accuracy: 0.11

Training...


KeyboardInterrupt: 

In [51]:
training_pipeline(hacking_damage_df_transform, 4, 4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

  Average training loss: 1.24
  Training epoch took: 0:00:45

Running Validation...
  Accuracy: 0.60

Training...

  Average training loss: 0.98
  Training epoch took: 0:00:45

Running Validation...
  Accuracy: 0.63

Training...

  Average training loss: 0.84
  Training epoch took: 0:00:45

Running Validation...
  Accuracy: 0.68

Training...

  Average training loss: 0.75
  Training epoch took: 0:00:45

Running Validation...
  Accuracy: 0.68

Training complete!
Total training took 0:03:09 (h:mm:ss)


## Focal Loss Code (Not used)

In [74]:
"""
import numpy as np


class_counts = np.array([10877, 379, 480, 3608, 161, 1710, 183, 57416, 444, 12138, 56, 2822, 1, 1552, 1838])
total = np.sum(class_counts)

# Class frequencies
frequencies = class_counts / total

# Inverse frequencies (to give more weight to minority classes)
inverse_frequencies = 1.0 / frequencies
alpha = inverse_frequencies / np.sum(inverse_frequencies)
"""

'\nimport numpy as np\n\n\nclass_counts = np.array([10877, 379, 480, 3608, 161, 1710, 183, 57416, 444, 12138, 56, 2822, 1, 1552, 1838])\ntotal = np.sum(class_counts)\n\n# Class frequencies\nfrequencies = class_counts / total\n\n# Inverse frequencies (to give more weight to minority classes)\ninverse_frequencies = 1.0 / frequencies\nalpha = inverse_frequencies / np.sum(inverse_frequencies)\n'

In [76]:
"""
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        '''
        Args:
            gamma (float, optional): Focusing parameter. Default is 2.0.
            alpha (float or list, optional): Class balancing factor. If set, should be a float or a list of class-wise weights. Default is None.
            reduction (string, optional): Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default is 'mean'.
        '''
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

        if isinstance(alpha, (list, torch.Tensor)):
            self.alpha = torch.tensor(alpha)

    def forward(self, inputs, targets):
        # Apply softmax to get the probabilities
        if inputs.dim() > 2:
            inputs = inputs.view(inputs.size(0), inputs.size(1), -1)  # N,C,H,W -> N,C,H*W
            inputs = inputs.transpose(1, 2)    # N,C,H*W -> N,H*W,C
            inputs = inputs.contiguous().view(-1, inputs.size(-1))  # N,H*W,C -> N*H*W,C
        targets = targets.view(-1, 1)

        # Compute the log probability
        logpt = F.log_softmax(inputs, dim=-1)
        logpt = logpt.gather(1, targets)
        logpt = logpt.view(-1)
        pt = logpt.exp()

        # Compute the focal loss
        if self.alpha is not None:
            if self.alpha.type() != inputs.data.type():
                self.alpha = self.alpha.type_as(inputs.data)
            at = self.alpha.gather(0, targets.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt) ** self.gamma * logpt

        # Apply the reduction method
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss
"""

"\nclass FocalLoss(nn.Module):\n    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):\n        '''\n        Args:\n            gamma (float, optional): Focusing parameter. Default is 2.0.\n            alpha (float or list, optional): Class balancing factor. If set, should be a float or a list of class-wise weights. Default is None.\n            reduction (string, optional): Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'. Default is 'mean'.\n        '''\n        super(FocalLoss, self).__init__()\n        self.gamma = gamma\n        self.alpha = alpha\n        self.reduction = reduction\n\n        if isinstance(alpha, (list, torch.Tensor)):\n            self.alpha = torch.tensor(alpha)\n\n    def forward(self, inputs, targets):\n        # Apply softmax to get the probabilities\n        if inputs.dim() > 2:\n            inputs = inputs.view(inputs.size(0), inputs.size(1), -1)  # N,C,H,W -> N,C,H*W\n            inputs = inputs.transpose(1, 2)    # N,C,H*

## Testing

In [20]:
df_test = pd.read_csv("test.csv")
df_test = df_test[df_test['crimeaditionalinfo'].notna()]
df_test = df_test[df_test['category'].notna()]

df['crimeaditionalinfo'] = df['crimeaditionalinfo'].apply(lambda x: clean_text(x))
df_test = df_test[df_test['category'] != 'Crime Against Women & Children']

In [21]:
df_test_1 = df_test[df_test["category"] == "Online and Social Media Related Crime"]
df_test_2 = df_test[df_test["category"] == "Online Financial Fraud"]
df_test_3 = df_test[df_test["category"] == "Cyber Attack/ Dependent Crimes"]
df_test_4 = df_test[df_test["category"] == "Hacking  Damage to computercomputer system etc"]

In [25]:
df_test_1.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
16,Online and Social Media Related Crime,Cheating by Impersonation,SO AGRI...
33,Online and Social Media Related Crime,EMail Phishing,Dear sir\r\nMera name simarjeet pannu h meri G...
34,Online and Social Media Related Crime,Profile Hacking Identity Theft,all my social media accounts were hacked and m...
35,Online and Social Media Related Crime,Profile Hacking Identity Theft,I was unaware of my account being hacked when ...
45,Online and Social Media Related Crime,Cheating by Impersonation,I am getting sms continuously from pretending...


In [29]:
test_categories_1 = df_test_1.sub_category.values
test_messages_1 = df_test_1.crimeaditionalinfo.values

test_categories_2 = df_test_2.sub_category.values
test_messages_2 = df_test_2.crimeaditionalinfo.values

test_categories_3 = df_test_3.sub_category.values
test_messages_3 = df_test_3.crimeaditionalinfo.values

test_categories_4 = df_test_4.sub_category.values
test_messages_4 = df_test_4.crimeaditionalinfo.values

In [31]:
subcategories_1 = social_media_crime_df.sub_category.values
subcategories_2 = financial_fraud_df.sub_category.values
subcategories_3 = cyber_attack_df.sub_category.values
subcategories_4 = hacking_damage_df.sub_category.values

In [32]:
label_encoder_1 = LabelEncoder()
labels = list(subcategories_1)
subcategories_encoded = label_encoder_1.fit_transform(labels)

In [34]:
label_encoder_2 = LabelEncoder()
labels = list(subcategories_2)
subcategories_encoded = label_encoder_2.fit_transform(labels)

In [35]:
label_encoder_3 = LabelEncoder()
labels = list(subcategories_3)
subcategories_encoded = label_encoder_3.fit_transform(labels)

In [36]:
label_encoder_4 = LabelEncoder()
labels = list(subcategories_4)
subcategories_encoded = label_encoder_4.fit_transform(labels)

In [37]:
test_categories_encoded_1 = label_encoder_1.transform(list(test_categories_1))
test_categories_encoded_2 = label_encoder_2.transform(list(test_categories_2))
test_categories_encoded_3 = label_encoder_3.transform(list(test_categories_3))
test_categories_encoded_4 = label_encoder_4.transform(list(test_categories_4))

In [22]:
def get_embeddings_attention_masks(messages):

  test_input_ids = []
  test_attention_masks = []

  for msg in messages:
      encoded_dict = tokenizer.encode_plus(
                          msg,
                          add_special_tokens = True,
                          max_length = 512,
                          pad_to_max_length = True,
                          return_attention_mask = True,
                          return_tensors = 'pt',
                    )
      test_input_ids.append(encoded_dict['input_ids'])
      test_attention_masks.append(encoded_dict['attention_mask'])

  test_input_ids = torch.cat(test_input_ids, dim=0)
  test_attention_masks = torch.cat(test_attention_masks, dim=0)
  return test_input_ids, test_attention_masks

In [38]:
test_input_ids_1, test_attention_masks_1 = get_embeddings_attention_masks(test_messages_1)
test_input_ids_2, test_attention_masks_2 = get_embeddings_attention_masks(test_messages_2)
test_input_ids_3, test_attention_masks_3 = get_embeddings_attention_masks(test_messages_3)
test_input_ids_4, test_attention_masks_4 = get_embeddings_attention_masks(test_messages_4)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [39]:
test_dataset_1 = TensorDataset(test_input_ids_1, test_attention_masks_1)
test_dataloader_1 = DataLoader(
            test_dataset_1,
            sampler = SequentialSampler(test_dataset_1),
            batch_size = 32
        )

test_dataset_2 = TensorDataset(test_input_ids_2, test_attention_masks_2)
test_dataloader_2 = DataLoader(
            test_dataset_2,
            sampler = SequentialSampler(test_dataset_2),
            batch_size = 32
        )

test_dataset_3 = TensorDataset(test_input_ids_3, test_attention_masks_3)
test_dataloader_3 = DataLoader(
            test_dataset_3,
            sampler = SequentialSampler(test_dataset_3),
            batch_size = 32
        )

test_dataset_4 = TensorDataset(test_input_ids_4, test_attention_masks_4)
test_dataloader_4 = DataLoader(
            test_dataset_4,
            sampler = SequentialSampler(test_dataset_4),
            batch_size = 32
        )

In [40]:
import pickle
model_1 = pickle.load(open("subcategory_model_1.pkl", 'rb'))
model_2 = pickle.load(open("subcategory_model_2.pkl", 'rb'))
model_3 = pickle.load(open("subcategory_model_3.pkl", 'rb'))
model_4 = pickle.load(open("subcategory_model_4.pkl", 'rb'))

In [41]:
def predict_labels(test_dataloader, model):
  predictions = []

  for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    with torch.no_grad():
      output= model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      logits = output.logits
      logits = logits.detach().cpu().numpy()
      pred_flat = np.argmax(logits, axis=1).flatten()
      predictions.extend(list(pred_flat))

  return predictions

In [42]:
predictions_1 = predict_labels(test_dataloader_1, model_1)
predictions_2 = predict_labels(test_dataloader_2, model_2)
predictions_3 = predict_labels(test_dataloader_3, model_3)
predictions_4 = predict_labels(test_dataloader_4, model_4)

In [43]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

In [44]:
f11 = f1_score(list(test_categories_encoded_1), predictions_1, average='micro')
f12 = f1_score(list(test_categories_encoded_2), predictions_2, average='micro')
f13 = f1_score(list(test_categories_encoded_3), predictions_3, average='micro')
f14 = f1_score(list(test_categories_encoded_4), predictions_4, average='micro')

In [45]:
conf_matrix_1 = confusion_matrix(list(test_categories_encoded_1), predictions_1)
conf_matrix_2 = confusion_matrix(list(test_categories_encoded_2), predictions_2)
conf_matrix_3 = confusion_matrix(list(test_categories_encoded_3), predictions_3)
conf_matrix_4 = confusion_matrix(list(test_categories_encoded_4), predictions_4)

In [58]:
print(conf_matrix_4)

[[  6   5   0  27   1]
 [  3  93   0  29   5]
 [  3   3   0   7   1]
 [ 37  30   0 298   5]
 [  5   8   0  22   4]]


In [61]:
predictions_3

[3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
