### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2025 Semester 1

## Assignment 1: Scam detection with naive Bayes


##### **Student ID(s):** 1353984


This iPython notebook is a template which you will use for your Assignment 1 submission.

**NOTE: YOU SHOULD ADD YOUR RESULTS, GRAPHS, AND FIGURES FROM YOUR OBSERVATIONS IN THIS FILE TO YOUR REPORT (the PDF file).** Results, figures, etc. which appear in this file but are NOT included in your report will not be marked.

**Adding proper comments to your code is MANDATORY.**

## 1. Supervised model training


In [101]:
import pandas as pd

# Read the labelled training set.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on the author's machine.
train_df = pd.read_csv('/Users/jessicale/Desktop/machinelearning/COMP30027_2025_asst1_data/sms_supervised_train.csv')

# Class label of every training message.
train_labels = train_df['class']

# Preprocessed message text: missing entries become empty strings and every
# remaining value is coerced to str so that .split() is always available.
train_texts = train_df['textPreprocessed'].fillna('').astype(str)

In [114]:
# Build the vocabulary: every distinct whitespace-delimited token that occurs
# in the training texts, in sorted order, then report its size.
vocab = sorted({token for text in train_texts for token in text.split()})
vocab_size = len(vocab)
print("vocab size:", vocab_size)

vocab size: 2006


In [115]:
# Build the bag-of-words count matrix for the training set.
import numpy as np

# Map each vocabulary word to its column index.
word_to_idx = dict(zip(vocab, range(len(vocab))))

# One row per message, one column per vocabulary word, all counts start at 0.
num_messages = len(train_texts)
count_matrix = np.zeros((num_messages, vocab_size), dtype=int)

# Tally every token of every message into its (row, column) cell.
for row, text in enumerate(train_texts):
    for token in text.split():
        col = word_to_idx.get(token)
        if col is not None:
            count_matrix[row, col] += 1

print("Count matrix shape:", count_matrix.shape)
print(count_matrix)


Count matrix shape: (2000, 2006)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [116]:
# Compute the prior probability of each class.

# Distinct class labels present in the training data.
classes = train_labels.unique()
num_total_messages = len(train_texts)

# priors[label] = (number of messages with that label) / (total messages)
priors = {}
for label in classes:
    n_in_class = (train_labels == label).sum()
    priors[label] = n_in_class / num_total_messages
    print(f"{label}:{priors[label]}")

0:0.8
1:0.2


In [117]:
# Estimate P(word | class) for every vocabulary word, with Laplace smoothing.

# Laplace smoothing parameter
alpha = 1

likelihoods = {}
for c in classes:
    # rows of the count matrix belonging to messages of class c
    rows_in_class = count_matrix[train_labels == c]

    # per-word totals (column sums) and the grand total of word occurrences
    per_word_totals = rows_in_class.sum(axis=0)
    all_words_total = rows_in_class.sum()

    # smoothed estimate: (count + alpha) / (total + |V| * alpha)
    denominator = all_words_total + vocab_size * alpha
    likelihoods[c] = (per_word_totals + alpha) / denominator

# Show the first ten likelihoods of the first class as a sanity check.
first_class = classes[0]
print(first_class, likelihoods[first_class][:10])

0 [0.01718733 0.00559848 0.00537454 0.00044788 0.00016795 0.01310044
 0.00391893 0.00061583 0.00011197 0.00033591]


In [118]:
#finding the 10 most probable words per class and prob values of each 

def most_probable(likelihood, vocab, top_n=10):
    """Return the ``top_n`` most probable words, most probable first.

    Args:
        likelihood: array of per-word likelihoods, aligned with ``vocab``.
        vocab: sequence of words; ``vocab[i]`` corresponds to ``likelihood[i]``.
        top_n: how many words to return (defaults to 10).

    Returns:
        A list of ``(word, likelihood)`` tuples ordered from highest to
        lowest likelihood; it holds fewer than ``top_n`` entries when the
        vocabulary is smaller than ``top_n``.
    """
    # argsort is ascending, so reverse it and keep the leading top_n indices.
    # Slicing after the reversal also behaves for top_n == 0 (empty result).
    ranked = np.argsort(likelihood)[::-1][:top_n]
    return [(vocab[i], likelihood[i]) for i in ranked]

# Print the most probable words (and their likelihoods) for each class.
for c in classes:
    print(f"\nTop words for class {c}:")
    for word, prob in most_probable(likelihoods[c], vocab):
        print(f" {word}: {prob:.4f}")


Top words for class 0:
 .: 0.0793
 ,: 0.0260
 ?: 0.0256
 u: 0.0189
 ...: 0.0188
 !: 0.0172
 ..: 0.0149
 ;: 0.0132
 &: 0.0131
 go: 0.0111

Top words for class 1:
 .: 0.0565
 !: 0.0244
 ,: 0.0235
 call: 0.0205
 £: 0.0139
 free: 0.0105
 /: 0.0091
 2: 0.0088
 &: 0.0087
 ?: 0.0085


In [119]:
# Most predictive words of each class, ranked by likelihood ratio.
scam_class = 1
okay_class = 0  # non-malicious

# ratios[word] = P(word | scam) / P(word | non-malicious)
ratios = dict(zip(vocab, likelihoods[scam_class] / likelihoods[okay_class]))

# Words with the largest ratio are the strongest indicators of scam.
top_scam_predictions = sorted(
    ratios.items(), key=lambda pair: pair[1], reverse=True
)[:10]
print("\ntop 10 words predictive of scam:")
for word, ratio in top_scam_predictions:
    print(f"{word}: {ratio:.4f}")

# For non-malicious messages the ratio is inverted, so the largest inverse
# ratio marks the strongest indicator of a non-malicious message.
ratios_okay = {word: 1 / ratio for word, ratio in ratios.items()}
top_okay_predictions = sorted(
    ratios_okay.items(), key=lambda pair: pair[1], reverse=True
)[:10]
print("\ntop 10 words predictive of non-malicious:")
for word, ratio in top_okay_predictions:
    print(f"{word}: {ratio:.4f}")


top 10 words predictive of scam:
prize: 99.0284
tone: 64.0772
£: 49.7084
select: 46.6016
claim: 45.9543
paytm: 36.8929
code: 34.9512
award: 32.0386
won: 31.0677
18: 29.1260

top 10 words predictive of non-malicious:
;: 60.5130
...: 57.5088
gt: 54.0754
lt: 53.5604
:): 47.8954
ü: 31.9302
lor: 28.8402
hope: 24.7202
ok: 24.7202
d: 21.1152


In [120]:
import math
import numpy as np

def classify_message(message, priors, likelihoods, word_to_idx, vocab):
    """Classify one preprocessed message with multinomial naive Bayes.

    Args:
        message: message text; tokens are obtained with ``str.split()``.
        priors: mapping ``class -> prior probability``.
        likelihoods: mapping ``class -> per-word likelihoods``, indexed by the
            positions stored in ``word_to_idx``.
        word_to_idx: mapping ``word -> index``; words missing from it are
            ignored.
        vocab: the vocabulary; only its length is used (size of the count
            vector).

    Returns:
        ``(predicted_class, log_posteriors)`` where ``log_posteriors`` maps
        each class to ``log P(c) + sum(count_w * log P(w | c))`` (not
        normalised). Returns ``(None, None)`` when the message contains no
        known word.
    """
    # Build the word-count vector for the message being analysed.
    counts = np.zeros(len(vocab))
    for word in message.split():
        idx = word_to_idx.get(word)
        if idx is not None:
            counts[idx] += 1

    # Nothing to score if no token is in the vocabulary.
    if counts.sum() == 0:
        return None, None

    # Only words that actually occur contribute to the score, so find them
    # once instead of scanning the whole vocabulary for every class.
    present = np.flatnonzero(counts)

    log_posteriors = {}
    for c in priors:
        # Start from the log prior, then add count * log-likelihood per word.
        log_prob = math.log(priors[c])
        for i in present:
            log_prob += counts[i] * math.log(likelihoods[c][i])
        log_posteriors[c] = log_prob

    # The predicted class is the one with the highest log posterior.
    predicted_class = max(log_posteriors, key=log_posteriors.get)
    return predicted_class, log_posteriors


## 2. Supervised model evaluation

In [121]:
# Evaluate the supervised model on the held-out test set.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the test data; missing texts become empty strings, everything is str.
# NOTE(review): absolute, machine-specific path.
test_df = pd.read_csv('/Users/jessicale/Desktop/machinelearning/COMP30027_2025_asst1_data/sms_test.csv')
test_labels = test_df['class']
test_texts = test_df['textPreprocessed'].fillna('').astype(str)

# Predict a class for every test message; None marks a message that contains
# no in-vocabulary word and therefore could not be classified.
predictions = [
    classify_message(text, priors, likelihoods, word_to_idx, vocab)[0]
    for text in test_texts
]
skipped = sum(p is None for p in predictions)

print("number of test messages skipped:", skipped)

# Skipped messages are excluded from the metrics below, so the scores are
# computed only over messages that received a prediction.
valid_idx = [i for i, p in enumerate(predictions) if p is not None]
valid_preds = [predictions[i] for i in valid_idx]
valid_true = test_labels.iloc[valid_idx]

# accuracy, confusion matrix and per-class precision / recall / F1
acc = accuracy_score(valid_true, valid_preds)
cm = confusion_matrix(valid_true, valid_preds)
print("accuracy on test set:", acc)
print("confusion Matrix:\n", cm)
print("classification Report:\n", classification_report(valid_true, valid_preds))

number of test messages skipped: 0
accuracy on test set: 0.975
confusion Matrix:
 [[785  15]
 [ 10 190]]
classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       800
           1       0.93      0.95      0.94       200

    accuracy                           0.97      1000
   macro avg       0.96      0.97      0.96      1000
weighted avg       0.98      0.97      0.98      1000



In [122]:
def compute_confidence_ratio(log_posteriors, scam_class=1, okay_class=0):
    """Return the posterior ratio R = P(scam | x) / P(non-malicious | x).

    Args:
        log_posteriors: mapping ``class -> log posterior`` for one message.
        scam_class: key of the scam class in ``log_posteriors``.
        okay_class: key of the non-malicious class in ``log_posteriors``.

    Returns:
        ``exp(log P(scam | x) - log P(non-malicious | x))`` as a float.
        ``inf`` when the ratio is too large for a float; ``0.0`` when it
        underflows.
    """
    log_ratio = log_posteriors[scam_class] - log_posteriors[okay_class]
    try:
        return math.exp(log_ratio)
    except OverflowError:
        # math.exp raises once the gap exceeds ~709 (float range). Such a
        # message is overwhelmingly scam-like, so report an infinite ratio
        # instead of aborting the whole loop over messages.
        return math.inf

# Collect example test messages at different confidence levels.
high_conf_scam = []
high_conf_okay = []
boundary = []

for text in test_texts:
    pred, log_post = classify_message(text, priors, likelihoods, word_to_idx, vocab)
    # messages without any in-vocabulary word cannot be scored
    if pred is None:
        continue
    ratio = compute_confidence_ratio(log_post, scam_class=1, okay_class=0)

    # Choose the bucket: confident scam (R > 10), confident non-malicious
    # (R < 0.1), or near the decision boundary (R close to 1).
    if pred == 1 and ratio > 10:
        bucket = high_conf_scam
    elif pred == 0 and ratio < 0.1:
        bucket = high_conf_okay
    elif 0.9 < ratio < 1.1:
        bucket = boundary
    else:
        bucket = None
    if bucket is not None:
        bucket.append((text, ratio))

    # stop once every bucket holds at least three examples
    if min(len(high_conf_scam), len(high_conf_okay), len(boundary)) >= 3:
        break

# Print up to ten examples per bucket.
for heading, examples in (
    ("High confidence scam examples:", high_conf_scam),
    ("\nHigh confidence non-malicious examples:", high_conf_okay),
    ("\nBoundary examples:", boundary),
):
    print(heading)
    for text, r in examples[:10]:
        print(f"R = {r:.4f} : {text}")


High confidence scam examples:
R = 30464191.5694 : 1 . . reply free free , , , message 's yes text : hello orange month access game news sport plus 10 20 photo term apply
R = 483730.3741 : . ! call customer important service announcement freephone 0800 542
R = 2426496.5084 : . . 4 send : win easy just question hmv hmv hmv bonus special 500 pound vouchers answer 86688 info
R = 14375010869.1781 : . . . . u u send know know year ! / / / / find msg 86688 150p 18 rcvd chat hg suite342/2lands row w1j6hl ldn
R = 13.8230 : ? reply single single break plus answer fight
R = 156993185172664064.0000 : ? . please call award end 350 todays voda number select receive match 08712300220 quoting claim code standard rate app
R = 156891569358454.4688 : please call rs. guarantee customer prize service team fl1pkart w0n representative 6200992462 10am-9pm cash
R = 8296831.6617 : ? . 4 4 u week £ txt new question name 100 draw thanks enter cash continue support an
R = 90446758236257714176.0000 : . call £ £ gu

## 3. Extending the model with semi-supervised training

In [123]:
# ATTEMPTING LABEL PROPAGATION OPTION

# Load the unlabelled dataset.
# NOTE(review): absolute, machine-specific path.
unlabelled_df = pd.read_csv('/Users/jessicale/Desktop/machinelearning/COMP30027_2025_asst1_data/sms_unlabelled.csv')

# Drop missing texts, then coerce what remains to str so every entry supports
# .split(), consistent with how the training and test texts are loaded.
unlabelled_texts = unlabelled_df['textPreprocessed'].dropna().astype(str)


In [124]:
# Classify the unlabelled dataset with classify_message from Q1 and keep only
# the predictions the model is confident about.

pseudo_labels = []
pseudo_texts = []
confidence_threshold = 10

for text in unlabelled_texts:
    pred, log_post = classify_message(text, priors, likelihoods, word_to_idx, vocab)
    # messages without any in-vocabulary word cannot be scored
    if pred is None:
        continue
    ratio = compute_confidence_ratio(log_post)

    # confident means R > threshold (scam) or R < 1/threshold (non-malicious)
    is_confident = ratio > confidence_threshold or ratio < 1 / confidence_threshold
    if not is_confident:
        continue
    pseudo_labels.append(pred)
    pseudo_texts.append(text)

print(f"Added {len(pseudo_labels)} high-confidence pseudo-labelled instances.")



Added 1858 high-confidence pseudo-labelled instances.


In [125]:
# Combine the labelled training data with the pseudo-labelled data.

# Texts: original training texts followed by the pseudo-labelled texts.
# The lists are wrapped in Series so they can be concatenated, and the index
# is renumbered from 0.
pseudo_texts_series = pd.Series(pseudo_texts)
combined_texts = pd.concat(
    [train_texts, pseudo_texts_series],
    ignore_index=True,
)

# Labels: same order as the texts, so positions stay aligned.
pseudo_labels_series = pd.Series(pseudo_labels)
combined_labels = pd.concat(
    [train_labels, pseudo_labels_series],
    ignore_index=True,
)



In [126]:
# Rebuild the vocabulary and the count matrix on the combined data.

# build the new vocabulary: all distinct tokens, sorted
combined_tokens = set()
for text in combined_texts:
    combined_tokens.update(text.split())
combined_vocab = sorted(combined_tokens)
word_to_idx_combined = dict(zip(combined_vocab, range(len(combined_vocab))))
V_combined = len(combined_vocab)

# build the new count matrix: one row per message, one column per word
N_combined = len(combined_texts)
count_matrix_combined = np.zeros((N_combined, V_combined), dtype=int)

for row, text in enumerate(combined_texts):
    for token in text.split():
        col = word_to_idx_combined.get(token)
        if col is not None:
            count_matrix_combined[row, col] += 1


In [127]:
# Re-estimate priors and likelihoods on the combined data.

# priors: fraction of combined messages carrying each label
combined_classes = combined_labels.unique()
combined_priors = {}
for c in combined_classes:
    n_in_class = (combined_labels == c).sum()
    combined_priors[c] = n_in_class / N_combined

# likelihoods with Laplace smoothing
alpha = 1
combined_likelihoods = {}

for c in combined_classes:
    # rows of the combined count matrix belonging to class c
    rows_in_class = count_matrix_combined[combined_labels == c]
    per_word_totals = rows_in_class.sum(axis=0)
    all_words_total = rows_in_class.sum()

    # smoothed estimate: (count + alpha) / (total + |V| * alpha)
    denominator = all_words_total + V_combined * alpha
    combined_likelihoods[c] = (per_word_totals + alpha) / denominator


In [128]:
# Most predictive words under the semi-supervised model.

# likelihood vectors of the two classes (1 = scam, 0 = non-malicious)
likelihood_scam = combined_likelihoods[1]
likelihood_okay = combined_likelihoods[0]

# ratios[word] = P(word | scam) / P(word | non-malicious).
# No explicit zero guard is applied here; the likelihoods were computed with
# Laplace smoothing (alpha = 1), which keeps every denominator above zero.
ratios = dict(zip(combined_vocab, likelihood_scam / likelihood_okay))

# top 10 predictive words for scam (highest ratios)
top_scam_words = sorted(ratios.items(), key=lambda pair: pair[1], reverse=True)[:10]
print("\nTop 10 predictive words for scam (Q3):")
for word, r in top_scam_words:
    print(f"{word}: R = {r:.4f}")

# top 10 predictive words for non-malicious (lowest ratios)
top_okay_words = sorted(ratios.items(), key=lambda pair: pair[1])[:10]
print("\nTop 10 predictive words for non-malicious (Q3):")
for word, r in top_okay_words:
    print(f"{word}: R = {r:.4f}")



Top 10 predictive words for scam (Q3):
prize: R = 234.3411
tone: R = 133.3320
claim: R = 96.2953
award: R = 77.7770
code: R = 76.7669
guaranteed: R = 74.7467
£: R = 72.1494
18: R = 70.7064
paytm: R = 66.6660
winner: R = 60.6055

Top 10 predictive words for non-malicious (Q3):
;: R = 0.0094
gt: R = 0.0109
lt: R = 0.0109
:): R = 0.0122
ü: R = 0.0168
...: R = 0.0198
lor: R = 0.0206
da: R = 0.0217
come: R = 0.0297
wat: R = 0.0311


## 4. Semi-supervised model evaluation