In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [2]:
# Read the labelled SMS dataset from the data_ingestion artifacts folder and preview it
DATA_PATH = '../artifacts/data_ingestion/bongo_scam.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Category,Sms
0,trust,"Nipigie baada ya saa moja, tafadhali."
1,scam,Naomba unitumie iyo Hela kwenye namba hii ya A...
2,scam,"666,KARIBU FREEMASON UTIMIZE NDOTO KATIKA BIAS..."
3,trust,Watoto wanapenda sana zawadi ulizowaletea.
4,scam,IYO PESA ITUME KWENYE NAMBA HII 0657538690 JIN...


In [3]:
# Instantiate the LaBSE sentence encoder (multilingual BERT) used for every embedding below
MODEL_NAME = 'sentence-transformers/LaBSE'
model = SentenceTransformer(MODEL_NAME)

Now that the model is loaded, we generate an embedding for every SMS. These embeddings are then used as input features for a separate downstream classifier that performs the actual scam/trust classification.

In [4]:
def generate_embeddings_batch(texts, batch_size=32) -> np.ndarray:
    """Encode texts with the notebook-level ``model`` in fixed-size batches.

    Args:
        texts: Iterable of strings to embed. A single bare string is treated
            as a one-element input rather than being sliced into substrings.
        batch_size: Number of texts handed to ``model.encode`` per call.
            Must be a positive integer.

    Returns:
        A 2-D array with one row per input text, in input order. An empty
        input yields an array with zero rows instead of raising.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Materialize once so that generators and pandas Series (whose labels may
    # not start at 0) are always sliced positionally as a plain list.
    texts = [texts] if isinstance(texts, str) else list(texts)

    if not texts:
        # np.vstack raises on an empty list, so build the zero-row result
        # explicitly; the column count comes from the encoder when it is known.
        dim = model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim))

    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        all_embeddings.append(model.encode(batch))

    return np.vstack(all_embeddings)

# Embed every SMS and build the binary target (1 = 'scam', 0 = any other category)
print("Generating embeddings...")
sms_texts = df['Sms'].tolist()
X = generate_embeddings_batch(sms_texts)
y = df['Category'].eq('scam').astype(int)

Generating embeddings...


100%|██████████| 48/48 [00:04<00:00,  9.84it/s]


In [5]:
## Hold out 20% of the samples for testing, stratified on the label
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
## Train a basic logistic regression model. Use a cross-validation strategy to evaluate the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score

clf = LogisticRegression(max_iter=100)

## Stratified 5-fold cross-validation on the training split only, so the test
## split stays untouched. cross_val_score fits clones of `clf`, leaving `clf`
## itself unfitted until the explicit fit below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='f1')
print(f"Cross-validated F1 (5 folds): {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")

## Fit on the full training split and evaluate once on the held-out test split
clf.fit(X_train, y_train)

## Generate a classification report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00       200

    accuracy                           1.00       302
   macro avg       1.00      1.00      1.00       302
weighted avg       1.00      1.00      1.00       302



In [7]:
## Confusion matrix for the held-out test predictions (rows: true label, columns: predicted label)
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)


[[102   0]
 [  0 200]]


In [17]:
## Try the classifier on a spam SMS that I received recently.
new_sms = "HELLO. Ungana na wakenya wengi wanoSHINDA katika PICK A BOX.2024 END YEAR Bonus NI from 50,000. BONYEZA *201# BILA Credo upick BOX YAKO.STOP *456*9*5#"

# Encode as a one-element batch so the classifier receives a 2-D input
new_sms_embedding = model.encode([new_sms])
probability = clf.predict_proba(new_sms_embedding)
prediction = clf.predict(new_sms_embedding)

# Show the predicted label first, then the per-class probabilities
for result in (prediction, probability):
    print(result)

[1]
[[0.11013348 0.88986652]]


In [19]:
## Try the classifier on a ham SMS that I received recently.
new_sms = "Leo siko kazi."

# Encode as a one-element batch so the classifier receives a 2-D input
new_sms_embedding = model.encode([new_sms])
probability = clf.predict_proba(new_sms_embedding)
prediction = clf.predict(new_sms_embedding)

# Show the predicted label first, then the per-class probabilities
for result in (prediction, probability):
    print(result)


[0]
[[0.88813314 0.11186686]]


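A simple logistic regression classifier works very well on the LaBSE (multilingual BERT) embeddings: it classifies all 302 held-out messages correctly and assigns roughly 0.89 probability to the correct class for both hand-picked messages. A more complex model does not appear necessary for this dataset, although a perfect score on such a small test set should be confirmed (for example, by checking for duplicate messages shared between the training and test splits).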