In [2]:
!pip install transformers
!pip install accelerate
!pip install datasets
!pip install rouge_score
!pip install peft
!pip install trl
!pip install bitsandbytes

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
from google.colab import userdata
hf_token = userdata.get('HF_TOKEN')

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import statistics
import os
import transformers
import torch
from datasets import load_dataset, Dataset, DatasetDict
from trl import SFTTrainer
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [5]:
!wget https://indiaai.s3.ap-south-1.amazonaws.com/docs/test.csv
!wget https://indiaai.s3.ap-south-1.amazonaws.com/docs/train.csv

df_train = pd.read_csv('/content/train.csv')
df_test = pd.read_csv('/content/test.csv')

df_train.loc[df_train["sub_category"].isnull(), "sub_category"] = df_train["category"]
df_train= df_train.dropna(how='any')

df_test.loc[df_test["sub_category"].isnull(), "sub_category"] = df_test["category"]
df_test= df_test.dropna(how='any')

# Ensure the sub categories in train and test are the same. Remove from train, if it doesn't exist in test and vice versa.
sub_cat_train = sorted(set(df_train["sub_category"].unique().tolist()))
sub_cat_test = sorted(set(df_test["sub_category"].unique().tolist()))

not_in_test = [c for c in sub_cat_train if c not in sub_cat_test]
not_in_train = [c for c in sub_cat_test if c not in sub_cat_train]

df_train= df_train.apply(lambda row: row[~df_train['sub_category'].isin(not_in_test)])
df_test= df_test.apply(lambda row: row[~df_test['sub_category'].isin(not_in_train)])

sub_cat_train = sorted(set(df_train["sub_category"].unique().tolist()))
sub_cat_test = sorted(set(df_test["sub_category"].unique().tolist()))

sub_cat_train == sub_cat_test

--2024-11-21 02:16:23--  https://indiaai.s3.ap-south-1.amazonaws.com/docs/test.csv
Resolving indiaai.s3.ap-south-1.amazonaws.com (indiaai.s3.ap-south-1.amazonaws.com)... 3.5.212.1, 3.5.210.138, 3.5.208.135, ...
Connecting to indiaai.s3.ap-south-1.amazonaws.com (indiaai.s3.ap-south-1.amazonaws.com)|3.5.212.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14069068 (13M) [text/csv]
Saving to: ‘test.csv’


2024-11-21 02:16:27 (5.49 MB/s) - ‘test.csv’ saved [14069068/14069068]

--2024-11-21 02:16:27--  https://indiaai.s3.ap-south-1.amazonaws.com/docs/train.csv
Resolving indiaai.s3.ap-south-1.amazonaws.com (indiaai.s3.ap-south-1.amazonaws.com)... 3.5.212.1, 3.5.210.138, 3.5.208.135, ...
Connecting to indiaai.s3.ap-south-1.amazonaws.com (indiaai.s3.ap-south-1.amazonaws.com)|3.5.212.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42124051 (40M) [text/csv]
Saving to: ‘train.csv’


2024-11-21 02:16:32 (9.77 MB/s) - ‘train.csv’ saved [4212405

True

In [36]:
# Download Gemma 2b-it base model
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config = bnb_config,
                                             device_map={"":0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [37]:
# Configure LoRA
lora_config = LoraConfig(
    r = 8,
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
    task_type = "CAUSAL_LM"
)

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")


1,515,268,096 total parameters.
524,363,776 training parameters.


In [38]:
categories = ', '.join(sub_cat_train)
categories

'Business Email CompromiseEmail Takeover, Cheating by Impersonation, Child Pornography CPChild Sexual Abuse Material CSAM, Cryptocurrency Fraud, Cyber Bullying  Stalking  Sexting, Cyber Terrorism, Damage to computer computer systems etc, Data Breach/Theft, DebitCredit Card FraudSim Swap Fraud, DematDepository Fraud, Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks, EMail Phishing, EWallet Related Fraud, Email Hacking, FakeImpersonating Profile, Fraud CallVishing, Hacking/Defacement, Impersonating Email, Internet Banking Related Fraud, Intimidating Email, Malware Attack, Online Gambling  Betting, Online Job Fraud, Online Matrimonial Fraud, Online Trafficking, Other, Profile Hacking Identity Theft, Provocative Speech for unlawful acts, Ransomware, Ransomware Attack, RapeGang Rape RGRSexually Abusive Content, SQL Injection, Sexually Explicit Act, Sexually Obscene material, Tampering with computer source documents, UPI Related Frauds, Unauthorised AccessData Breach, Web

In [22]:
# ZERO SHOT TESTING
device = "cuda"
prompt = "Calssify the following Text into only one of the 38 Categories separated by comma"
df_test['generated_category'] = None
def generate_category(crimeinfo):
    input_ids = tokenizer.encode(prompt + "\nText: " + crimeinfo + "\nCategories: " + categories +
                                 "\nClassified as:", return_tensors='pt').to(device)
    attention_mask = torch.ones(input_ids.shape, device=device)
    output = model.generate(input_ids, attention_mask = attention_mask, max_new_tokens=6)
    gen_text = tokenizer.decode(output[0], skip_special_tokens=True)
    text, gen_category = gen_text.split('\nClassified as:', maxsplit=1)
    return gen_category

df_test['generated_category'] = df_test['crimeaditionalinfo'][:3].apply(generate_category)
for index, row in df_test[:3].iterrows():
  print(f'Text: {row.iloc[2]} \nActual-Category: {row.iloc[1]} \nGenerated_Category: {row.iloc[3]} \n')

Text: Sir namaskar  mein Ranjit Kumar PatraPaise nehi the tho sir kuch din pehele online loan aap Credit pearl loan aap se money loan kiya thalekin sir  loan bolke jub  loan diye tho mein turant return kar diya thalekin din baad whats app pe message aya payment karomein bola  diye the aap mein wo de diyawo gali diye tho  v return kar diyafir v message karke bolte hai full payment karo half payment nehi chalegarape case mein daldenge etcFake or illigal se contact number v hack kar dete haibol rahehai sab ko message karenge ye rapist hai bolke sirpls sir small ammount ke liye goggle play store se loan apply kiya thaFake loan aap v hai socha nehi thapls sir request kar rahahun action lo Sir  mera number  hai jo v proof chahiye dunga Sir  
Actual-Category: RapeGang Rape RGRSexually Abusive Content 
Generated_Category:  Business Email CompromiseEmail Takeover 

Text: KOTAK MAHINDRA BANK FRAUD
FRAUD AMOUNT 
Actual-Category: DebitCredit Card FraudSim Swap Fraud 
Generated_Category:  Business

In [35]:
# Tokenize the Dataset

import json

train_data = []; test_data = []
for index, row in df_train[:1000].iterrows(): # remove slicing for loading complete train dataset
  train_data += [{'crimeinfo': row.iloc[2], 'category' : row.iloc[1]}]
with open('train_file.json', 'w') as file:
    json.dump(train_data, file)

for index, row in df_test[:100].iterrows(): # remove slicing for loading complete test dataset
  test_data += [{'crimeinfo': row.iloc[2], 'category' : row.iloc[1]}]
with open('test_file.json', 'w') as file:
    json.dump(test_data, file)

from datasets import load_dataset
dataset = load_dataset('json', data_files = {'train':'train_file.json', 'test':'test_file.json'})
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    inputs = [prompt + "\nText: " + cr + "\nCategories: " + categories + "\nClassified as:" + cat
              + tokenizer.eos_token for cr, cat in zip(examples['crimeinfo'], examples['category'])]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = model_inputs['input_ids'].copy()
    model_inputs['labels'] = labels
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [43]:
from transformers import Trainer, TrainingArguments

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Training arguments configuration
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy= "steps", #'epoch',
    eval_strategy="steps", #"epoch",  # Evaluate at the end of each epoch
    logging_steps = 100,  # change this proportioantely based on the num of samples.
    save_strategy = 'epoch',
    save_total_limit=1,
    fp16=True
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

trainer.train()

Step,Training Loss,Validation Loss
100,1.4687,1.202882
200,1.0851,1.112503


TrainOutput(global_step=250, training_loss=1.2222558135986328, metrics={'train_runtime': 277.1992, 'train_samples_per_second': 3.608, 'train_steps_per_second': 0.902, 'total_flos': 3059236208640000.0, 'train_loss': 1.2222558135986328, 'epoch': 1.0})

In [57]:
# After training - trying zero shot testing again
model.eval()
device = "cuda"
prompt = "Calssify the following Text into only one of the 38 Categories separated by comma"
df_test['generated_category'] = None
def generate_category(crimeinfo):
    input_ids = tokenizer.encode(prompt + "\nText: " + crimeinfo + "\nCategories: " + categories +
                                 "\nClassified as:", return_tensors='pt').to(device)
    attention_mask = torch.ones(input_ids.shape, device=device)
    output = model.generate(input_ids, attention_mask = attention_mask, max_new_tokens=10)
    gen_text = tokenizer.decode(output[0], skip_special_tokens=True)
    text, gen_category = gen_text.split('\nClassified as:', maxsplit=1)
    return gen_category

df_test['generated_category'] = df_test['crimeaditionalinfo'][:3].apply(generate_category) # trying for forst 3 samples only. Remove slicing to generate for all test samples
for index, row in df_test[:3].iterrows():
  print(f'Text: {row.iloc[2]} \nActual-Category: {row.iloc[1]} \nGenerated_Category: {row.iloc[3]} \n')

Text: Sir namaskar  mein Ranjit Kumar PatraPaise nehi the tho sir kuch din pehele online loan aap Credit pearl loan aap se money loan kiya thalekin sir  loan bolke jub  loan diye tho mein turant return kar diya thalekin din baad whats app pe message aya payment karomein bola  diye the aap mein wo de diyawo gali diye tho  v return kar diyafir v message karke bolte hai full payment karo half payment nehi chalegarape case mein daldenge etcFake or illigal se contact number v hack kar dete haibol rahehai sab ko message karenge ye rapist hai bolke sirpls sir small ammount ke liye goggle play store se loan apply kiya thaFake loan aap v hai socha nehi thapls sir request kar rahahun action lo Sir  mera number  hai jo v proof chahiye dunga Sir  
Actual-Category: RapeGang Rape RGRSexually Abusive Content 
Generated_Category: DebitCredit Card FraudSim Swap Fraud 

Text: KOTAK MAHINDRA BANK FRAUD
FRAUD AMOUNT 
Actual-Category: DebitCredit Card FraudSim Swap Fraud 
Generated_Category: UPI Related F

In [58]:
from sklearn.metrics import accuracy_score, f1_score
test_labels = df_test['generated_category'].tolist()
pred_labels = df_test["sub_category"].tolist()
accuracy = accuracy_score(test_labels, pred_labels)
f1 = f1_score(test_labels, pred_labels, average="weighted")

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.0000
F1 Score: 0.0000
