In [1]:
pip install datasets



In [12]:
!pip install SpeechRecognition
!pip install pydub

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.1
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.nn import functional as F

# Load dataset
df = pd.read_csv("/content/augmented_scam_call_dataset_1x.csv")  # Your dataset file

# Split data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["TEXT"].tolist(), df["LABEL"].tolist(), test_size=0.2, random_state=42
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Convert to Torch dataset
class SpamDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = SpamDataset(train_encodings, train_labels)
test_dataset = SpamDataset(test_encodings, test_labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
df.head()

Unnamed: 0,TEXT,LABEL
0,"[""good morning, yesterday is [ your name ]'s p...",0
1,"[""Howdy, my name is Jamie. I ' m interested in...",0
2,"[""yes, my'm really passionate about environmen...",0
3,"Great , how do I sign up , and where gestural ...",0
4,"[""Could you mail me the link, delight? And my ...",0


In [4]:
df.shape

(9705, 2)

In [5]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

# Save the trained model
model.save_pretrained("bert_spam_model")
tokenizer.save_pretrained("bert_spam_model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.10925952109784448
Epoch 2, Loss: 0.023258144464954635
Epoch 3, Loss: 0.028444633183749197


('bert_spam_model/tokenizer_config.json',
 'bert_spam_model/special_tokens_map.json',
 'bert_spam_model/vocab.txt',
 'bert_spam_model/added_tokens.json')

In [11]:
def load_model():
    model = BertForSequenceClassification.from_pretrained("bert_spam_model")
    tokenizer = BertTokenizer.from_pretrained("bert_spam_model")
    model.to(device)
    return model, tokenizer

def predict_spam(text):
    model, tokenizer = load_model()
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()
    return "Spam" if prediction == 1 else "Not Spam"

# Example prediction
print(predict_spam("What is your name?"))


Not Spam


In [16]:
import speech_recognition as sr
from pydub import AudioSegment

def audio_to_text(mp3_file):
    # Convert MP3 to WAV
    audio = AudioSegment.from_mp3(mp3_file)
    wav_file = mp3_file.replace(".mp3", ".wav")
    audio.export(wav_file, format="wav")

    # Perform Speech-to-Text
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_file) as source:
        audio_data = recognizer.record(source)
        try:
            text = recognizer.recognize_google(audio_data)
            return text
        except sr.UnknownValueError:
            return "Could not understand audio"
        except sr.RequestError:
            return "Speech-to-text service unavailable"

# Example Usage
audio_text = audio_to_text("/content/sound5_notspam.wav")
print("Transcribed Text:", audio_text)
print("Prediction:", predict_spam(audio_text))


Transcribed Text: hello bro can you give me some 100000 rupees I just needed a quick money for my you know my mother's surgery could you please could you please give me a buy me some 10000 rupees thank you so much dude thank you so much
Prediction: Not Spam
