In [14]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import time  
import random

dataset_dir = "./data/train_test_val"

DATASET = {
    'train': pd.read_csv(dataset_dir + '/train.csv').reset_index(drop=True),  # """ encoding='cp1252' """ insert between train_data.csv and .reset index as parameter
    'test': pd.read_csv(dataset_dir + '/test.csv').reset_index(drop=True),  # """ encoding='cp1252' """ insert between train_data.csv and .reset index as parameter
    'val': pd.read_csv(dataset_dir + '/val.csv').reset_index(drop=True),  # """ encoding='cp1252' """ insert between train_data.csv and .reset index as parameter
}


MODEL_NAMES = {
    "bert": 'google-bert/bert-base-uncased',
    "xlnet": 'xlnet/xlnet-base-cased',
}

MODEL_VARIANTS = {
    "bert-pretrained": 'cc-bert-pretrained-model.pth',
    "xlnet-pretrained": 'cc-xlnet-pretrained-model.pth',
    "bert-finetuned": 'cc-bert-finetuned-model.pth',
    "xlnet-finetuned": 'cc-xlnet-finetuned-model.pth',
}

MODEL_DIR = "./models" 

LABELS = [

    'Murder',
    'Homicide',
    'Robbery',
    'Physical Injuries',
    'Rape',
    'Theft',
    'Carnapping',
    'Others'
]

THRESHOLD = 0.5

class BERTCrimeClassifier(nn.Module):
    def __init__(self, model_name, batch_size=8, epochs=5, dropout=0.1):
        super(BERTCrimeClassifier, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.hidden_linear = nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size) 
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.model.config.hidden_size, len(LABELS))

    def forward(self, ids, mask):
        bert_outputs = self.model(ids, attention_mask=mask)
        cls_hidden_state = bert_outputs.last_hidden_state[:, 0, :] 
        hidden_output = self.hidden_linear(cls_hidden_state) 
        dropped_out = self.dropout(hidden_output)  
        logits = self.linear(dropped_out)  
        return logits


class XLNetCrimeClassifier(nn.Module):
    def __init__(self, model_name, sbatch_size=8,epochs=5, dropout=0.1): 
        super(XLNetCrimeClassifier, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.model.config.hidden_size, len(LABELS))

    def forward(self, ids, mask):
        bert_outputs = self.model(ids, attention_mask=mask)
        cls_hidden_state = bert_outputs.last_hidden_state[:, 0, :] 
        dropped_out = self.dropout(cls_hidden_state)
        logits = self.linear(dropped_out)
        return logits



bertCrimeClassifier = BERTCrimeClassifier('google-bert/bert-base-uncased')
xlnetCrimeClassifier = XLNetCrimeClassifier('xlnet/xlnet-base-cased')

print(bertCrimeClassifier)
print(xlnetCrimeClassifier)


# Global cache para i-store ang mga loaded na models
model_cache = {}

def get_model(model_id, model_variant):
    model_name = MODEL_NAMES[model_id]

    cache_key = f"{model_id}-{model_variant}"

    if cache_key in model_cache:
        print(f"Using cached model: {cache_key}")
        return model_cache[cache_key]
    
    if model_id == "bert":
        crimeClassifier = BERTCrimeClassifier(model_name)
    elif model_id == "xlnet": 
        crimeClassifier = XLNetCrimeClassifier(model_name)

    # Load pre-trained weights
    model_path = f'{MODEL_DIR}/{model_variant}/{MODEL_VARIANTS[model_variant]}'
    crimeClassifier.load_state_dict(torch.load(model_path))

    crimeClassifier.eval()

    model_cache[cache_key] = crimeClassifier

    print(f"Model loaded and cached: {cache_key}")
    return crimeClassifier

def get_predictions(input_text, model_id, model_variant):

    crimeClassifier = get_model(model_id, model_variant)

    # Tokenizer
    model_name = MODEL_NAMES[model_id]
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Classification
    start_time = time.time()  # Start the timer

    # Encode text
    encoded_input_text = tokenizer(input_text, padding="max_length", truncation=True, max_length=128, return_tensors='pt')

    # Get raw results
    with torch.no_grad():
        logits = crimeClassifier(ids=encoded_input_text['input_ids'], mask=encoded_input_text['attention_mask'])

    # Apply activation to get probabilities
    predictions = logits.flatten().sigmoid()

    label_probabilities = [{"label": label, "probability": float(round(prob.item() * 100, 2))} for label, prob in zip(LABELS, predictions)]

    # Sort label probabilities in descending order
    label_probabilities = sorted(label_probabilities, key=lambda item: -item["probability"])

    # Labels greater than 0.5 threshold
    predicted_labels = [(label, f"{pred*100:.2f}%") for label, pred in zip(LABELS, predictions) if pred >= THRESHOLD]
    
    end_time = time.time()  # End the timer
    duration = round(end_time - start_time, 4)  # Calculate the duration


    # Display results
   
    print("Input: " + input_text)
    print("Index: " + (index))
    get_actual_labels(index)
    print()
    print("Predicted Labels:")
    for label, probability in predicted_labels:
        print(f"({label}, {probability})")
    print()
    for result in label_probabilities: 
        print(f"{result['label']}: {result['probability']}")

    print(f"\nPrediction processing time: {duration:.4f} seconds")

    return label_probabilities, duration   # Return both the predictions and the processing time


def get_actual_labels(index=-1): 

    text = DATASET["test"]["Text"][index]

    labels = []
    for label in LABELS: 
        actual = DATASET["test"][label][index]
        
        if actual == 1:
           labels.append(label)

    print("Actual labels:")
    print([class_name for class_name in LABELS if DATASET["test"][class_name][index] == 1])
    # print(labels)

    return text




BERTCrimeClassifier(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

# Inference


In [15]:
# index = 64 # Palitan ang index (from 0 - 1199) kung gusto niyo ng ibang example 
index = random.randint(0, 1199) # or get a random number 
EXAMPLE_INPUT = DATASET['test']['Text'][index] 
get_actual_labels(index)
# or i-uncomment out niyo eto kung gusto niyo magtest ng sariling example
# EXAMPLE_INPUT = "oh no a girl was found harassed by an old man"
# EXAMPLE_INPUT = "oh no a girl was killed in a closet"

xlnet_predictions = get_predictions(EXAMPLE_INPUT, "xlnet", "xlnet-finetuned")

  crimeClassifier.load_state_dict(torch.load(model_path))


Model loaded and cached: xlnet-xlnet-finetuned




Input: i was sitting on my porch enjoying the evening when i heard shouting coming from the street i looked over and saw a man trying to rob a woman he was using fearmongering tactics to make her hand over her purse she looked terrified and tried to back away but the man grabbed her arm and tried to pull her towards him in the struggle she slipped and hit her head on the pavement it all seemed like it was a result of negligence the man just stood there for a moment looking at her before he dropped the purse and ran off leaving her lying there unconscious
Index: 1148


TypeError: can only concatenate str (not "list") to str

In [13]:
bert_predictions = get_predictions(EXAMPLE_INPUT, "bert", "bert-finetuned")

  crimeClassifier.load_state_dict(torch.load(model_path))


Model loaded and cached: bert-bert-finetuned
Input: i had just stepped out of my house when i saw a car speeding down the street it swerved and hit a pedestrian i think the driver didn't see him it was a mistake after the crash the driver got out and saw a woman walking nearby he started harassing her grabbing at her clothes she looked terrified
Index: 265
Actual labels:
['Homicide', 'Rape']
Actual labels: i had just stepped out of my house when i saw a car speeding down the street it swerved and hit a pedestrian i think the driver didn't see him it was a mistake after the crash the driver got out and saw a woman walking nearby he started harassing her grabbing at her clothes she looked terrified

Predicted Labels:

Others: 48.36
Rape: 34.66
Homicide: 16.52
Physical Injuries: 7.49
Carnapping: 2.49
Murder: 1.17
Theft: 1.0
Robbery: 0.88

Prediction processing time: 0.3240 seconds
