In [4]:
import pandas as pd

df = pd.read_csv('final_balanced_with_unique_normals.csv')

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Category'])


print(df[['Category', 'label']].drop_duplicates())


                       Category  label
0             Tech Support Scam      5
1          Debt Collection Scam      0
3                Extortion Scam      1
4     Workplace Harassment Scam      6
7                  Romance Scam      4
12    Fake Law Enforcement Scam      2
2933                     Normal      3


In [5]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer


train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Message'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
import torch
from transformers import BertForSequenceClassification

class ScamDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

train_dataset = ScamDataset(train_encodings, train_labels)
val_dataset = ScamDataset(val_encodings, val_labels)


num_labels = df['label'].nunique()
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir='./results',

    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m1by23ai178[0m ([33mmalo2841[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.1029
1000,0.0029


TrainOutput(global_step=1032, training_loss=0.05128777627616601, metrics={'train_runtime': 122.8895, 'train_samples_per_second': 67.036, 'train_steps_per_second': 8.398, 'total_flos': 114307357148100.0, 'train_loss': 0.05128777627616601, 'epoch': 3.0})

In [8]:
import numpy as np
from sklearn.metrics import classification_report


predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)


print(classification_report(val_labels, preds, target_names=label_encoder.classes_))


                           precision    recall  f1-score   support

     Debt Collection Scam       1.00      0.99      0.99        94
           Extortion Scam       1.00      1.00      1.00        95
Fake Law Enforcement Scam       0.99      1.00      0.99        90
                   Normal       1.00      1.00      1.00        93
             Romance Scam       1.00      1.00      1.00        98
        Tech Support Scam       1.00      1.00      1.00       116
Workplace Harassment Scam       1.00      1.00      1.00       101

                 accuracy                           1.00       687
                macro avg       1.00      1.00      1.00       687
             weighted avg       1.00      1.00      1.00       687



In [9]:
!pip install git+https://github.com/openai/whisper.git

import whisper


whisper_model = whisper.load_model("base")

def predict_from_audio(audio_path):

    result = whisper_model.transcribe(audio_path)
    text = result["text"]
    print("Transcript:", text)


    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()


    predicted_category = label_encoder.inverse_transform([pred])[0]
    print("Predicted Category:", predicted_category)

    return text, predicted_category


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-fyz1sbp3
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-fyz1sbp3
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [10]:
def predict_from_audio(audio_path):

    result = whisper_model.transcribe(audio_path)
    text = result["text"]
    print("Transcript:", text)


    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}


    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()


    predicted_category = label_encoder.inverse_transform([pred])[0]
    print("Predicted Category:", predicted_category)

    return text, predicted_category


In [11]:
predict_from_audio("output3.1.mp3")


Transcript:  Send Puntry State the money or I'll share your picture online.
Predicted Category: Extortion Scam


(" Send Puntry State the money or I'll share your picture online.",
 'Extortion Scam')

In [12]:
!pip install gradio

import gradio as gr

def classify_audio(audio_file):
    result = whisper_model.transcribe(audio_file)
    text = result['text']

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    category = label_encoder.inverse_transform([pred])[0]

    return text,category


interface = gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(label="Transcript"), gr.Textbox(label="Predicted Category")],
    title="Guardian AI - Harassment Detection",
    description="Upload an audio file to detect possible harassment/scam types.",
)

interface.launch(share=True)


Collecting gradio
  Downloading gradio-5.25.2-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

