In [4]:
import pandas as pd

df = pd.read_csv("/content/spam.csv", encoding="latin-1")

In [5]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [8]:
df.v1.unique()

array(['ham', 'spam'], dtype=object)

In [10]:
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [11]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# GloVe + LSTM

In [16]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import os
import requests
from zipfile import ZipFile
from io import BytesIO

nltk.data.path.append('/tmp/nltk_data')
os.environ['NLTK_DATA'] = '/tmp/nltk_data'

try:
    nltk.download('punkt', download_dir='/tmp/nltk_data', quiet=True)
except:
    pass

def download_glove():
    if not os.path.exists('glove'):
        os.makedirs('glove')

    if not os.path.exists('glove/glove.6B.50d.txt'):
        url = 'https://nlp.stanford.edu/data/glove.6B.zip'
        response = requests.get(url)
        with ZipFile(BytesIO(response.content)) as zip_file:
            zip_file.extractall('glove')

def load_glove(dimension=50):
    download_glove()

    embeddings = {}
    with open(f'glove/glove.6B.{dimension}d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

def vectorize_text(text, embeddings, dimension=50):
    try:
        nltk.download('punkt', quiet=True)
    except:
        nltk.download('punkt', download_dir='/tmp/nltk_data')
        nltk.data.path.append('/tmp/nltk_data')

    if pd.isna(text):
        return np.zeros(dimension)

    try:
        tokens = word_tokenize(text.lower())
    except:
        tokens = text.lower().split()

    vectors = [embeddings.get(token, np.zeros(dimension)) for token in tokens]

    if not vectors:
        return np.zeros(dimension)

    return np.mean(vectors, axis=0)

In [17]:
dimension = 50
glove_embeddings = load_glove(dimension)

df['v2_vectorized'] = df['v2'].apply(lambda x: vectorize_text(x, glove_embeddings, dimension))

In [18]:
df

Unnamed: 0,v1,v2,v2_vectorized
0,ham,"Go until jurong point, crazy.. Available only ...","[0.1864135002717376, 0.26192750558257105, -0.0..."
1,ham,Ok lar... Joking wif u oni...,"[-0.21402766307195029, 0.15854466954867044, 0...."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[-0.1973253640213183, 0.20823492775005953, -0...."
3,ham,U dun say so early hor... U c already then say...,"[0.1481227224523371, 0.06637636233459819, 0.35..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[0.23352846331321275, 0.24408138493219247, -0...."
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[0.09497379853079717, 0.13324066797892253, 0.1..."
5568,ham,Will Ì_ b going to esplanade fr home?,"[0.3109863706631586, 0.24635337246581912, 0.06..."
5569,ham,"Pity, * was in mood for that. So...any other s...","[-0.005740398168563842, 0.22621999979019164, -..."
5570,ham,The guy did some bitching but I acted like i'd...,"[0.2590248707252053, -0.043575228800853856, -0..."


In [20]:
vector_df = pd.DataFrame(df['v2_vectorized'].tolist(), columns=[f'dim_{i}' for i in range(dimension)])
result_df = pd.concat([df, vector_df], axis=1)

In [21]:
result_df

Unnamed: 0,v1,v2,v2_vectorized,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,...,dim_40,dim_41,dim_42,dim_43,dim_44,dim_45,dim_46,dim_47,dim_48,dim_49
0,ham,"Go until jurong point, crazy.. Available only ...","[0.1864135002717376, 0.26192750558257105, -0.0...",0.186414,0.261928,-0.053090,0.103147,0.040546,-0.291268,-0.161999,...,0.117588,-0.070454,0.055971,-0.096880,-0.204149,0.109231,-0.053217,-0.304996,0.171188,0.156786
1,ham,Ok lar... Joking wif u oni...,"[-0.21402766307195029, 0.15854466954867044, 0....",-0.214028,0.158545,0.097882,0.100438,0.177207,-0.521728,0.299651,...,0.086815,-0.227790,-0.152800,0.173536,0.003913,-0.085223,0.246900,0.122419,0.139491,0.401155
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[-0.1973253640213183, 0.20823492775005953, -0....",-0.197325,0.208235,-0.052291,0.154483,0.090836,-0.145471,-0.155031,...,0.246669,-0.231278,0.035385,0.069831,-0.223485,0.191765,0.076454,0.104324,0.158678,-0.094890
3,ham,U dun say so early hor... U c already then say...,"[0.1481227224523371, 0.06637636233459819, 0.35...",0.148123,0.066376,0.355986,0.056194,0.210209,-0.192467,-0.145177,...,-0.028865,0.077548,-0.127852,0.095358,-0.023309,0.060392,-0.085055,-0.223813,0.012678,0.106840
4,ham,"Nah I don't think he goes to usf, he lives aro...","[0.23352846331321275, 0.24408138493219247, -0....",0.233528,0.244081,-0.081947,-0.460037,0.393072,-0.338189,-0.421148,...,-0.213076,0.232316,0.047981,0.139653,0.009535,0.079278,-0.146741,-0.290199,-0.013955,0.128624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[0.09497379853079717, 0.13324066797892253, 0.1...",0.094974,0.133241,0.193542,0.059391,0.361622,0.026216,-0.129463,...,0.012206,0.001128,0.096869,0.018781,-0.144595,0.196450,-0.109313,-0.034947,0.025582,0.029699
5568,ham,Will Ì_ b going to esplanade fr home?,"[0.3109863706631586, 0.24635337246581912, 0.06...",0.310986,0.246353,0.067219,-0.037426,0.222317,-0.243073,-0.112763,...,-0.044378,-0.044856,-0.074689,0.030762,-0.178449,0.316938,-0.045347,-0.298866,0.051652,0.194269
5569,ham,"Pity, * was in mood for that. So...any other s...","[-0.005740398168563842, 0.22621999979019164, -...",-0.005740,0.226220,-0.100474,0.061500,0.145362,-0.033474,-0.157289,...,-0.199610,0.053856,0.001906,-0.050816,0.147330,0.112577,-0.103760,-0.034280,0.163602,0.014863
5570,ham,The guy did some bitching but I acted like i'd...,"[0.2590248707252053, -0.043575228800853856, -0...",0.259025,-0.043575,-0.015679,-0.212860,0.392709,-0.017993,-0.576073,...,-0.116881,0.126679,0.045446,0.200592,-0.012973,0.058527,-0.252643,0.030968,-0.137889,0.188220


In [46]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go

X = result_df[['dim_' + str(i) for i in range(50)]].values
y = result_df['v1'].values

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

explained_variance = pca.explained_variance_ratio_
explained_variance_sum = sum(explained_variance)

pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
pca_df['label'] = y
pca_df['text'] = df['v2']
pca_df['color'] = ['Spam' if label == 1 else 'Ham' for label in y]

fig = px.scatter(
    pca_df,
    x='PC1',
    y='PC2',
    color='color',
    color_discrete_map={'Ham': 'blue', 'Spam': 'red'},
    hover_data=['text'],
    title=f'PCA of Text Messages (Explained Variance: {explained_variance_sum:.2%})',
)

fig.update_layout(
    legend_title_text='Message Type',
    xaxis_title=f'PC1 ({explained_variance[0]:.2%})',
    yaxis_title=f'PC2 ({explained_variance[1]:.2%})',
    width=900,
    height=700
)

fig.show()

In [24]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 53 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   v1             5572 non-null   object 
 1   v2             5572 non-null   object 
 2   v2_vectorized  5572 non-null   object 
 3   dim_0          5572 non-null   float64
 4   dim_1          5572 non-null   float64
 5   dim_2          5572 non-null   float64
 6   dim_3          5572 non-null   float64
 7   dim_4          5572 non-null   float64
 8   dim_5          5572 non-null   float64
 9   dim_6          5572 non-null   float64
 10  dim_7          5572 non-null   float64
 11  dim_8          5572 non-null   float64
 12  dim_9          5572 non-null   float64
 13  dim_10         5572 non-null   float64
 14  dim_11         5572 non-null   float64
 15  dim_12         5572 non-null   float64
 16  dim_13         5572 non-null   float64
 17  dim_14         5572 non-null   float64
 18  dim_15  

In [25]:
result_df.describe()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_40,dim_41,dim_42,dim_43,dim_44,dim_45,dim_46,dim_47,dim_48,dim_49
count,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,...,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0
mean,0.148088,0.135342,0.076954,-0.116301,0.269437,-0.075056,-0.263896,-0.001808,-0.165217,0.036694,...,-0.037473,0.040041,-0.02776,0.160657,0.002944,0.067745,-0.037483,-0.096245,0.004929,0.171872
std,0.16062,0.144251,0.156134,0.169382,0.177214,0.180469,0.189067,0.152014,0.154299,0.138471,...,0.160354,0.117114,0.125996,0.165804,0.13179,0.127087,0.136979,0.17243,0.115134,0.187735
min,-1.5754,-0.826856,-0.658226,-1.5782,-0.82308,-1.073226,-1.156273,-0.74383,-0.98244,-1.10655,...,-0.79416,-0.806975,-0.678185,-1.12163,-0.93908,-0.93644,-0.82783,-0.897523,-0.62396,-0.701415
25%,0.060921,0.047857,-0.019577,-0.224526,0.167973,-0.178334,-0.385513,-0.093019,-0.265097,-0.041669,...,-0.144039,-0.02519,-0.101259,0.056523,-0.071069,0.0,-0.123023,-0.204194,-0.065742,0.047521
50%,0.155334,0.133903,0.075855,-0.124633,0.286329,-0.061698,-0.27922,0.0,-0.169622,0.034314,...,-0.051513,0.039617,-0.025621,0.163015,0.001974,0.068789,-0.040089,-0.093779,-0.001551,0.162421
75%,0.24795,0.222761,0.1743,-0.01198,0.383681,0.039531,-0.159154,0.096205,-0.063502,0.115174,...,0.061209,0.106575,0.04828,0.264202,0.075987,0.13871,0.040849,0.008227,0.065656,0.286718
max,0.90243,0.87547,1.0707,0.91309,0.956066,0.85495,1.4195,0.558565,0.727175,1.07568,...,1.4647,0.6284,1.0063,0.880477,1.296187,0.87,1.1006,0.7893,0.910569,1.1584


In [27]:
unique_labels = result_df['v1'].unique()
label_map = {unique_labels[0]: 0, unique_labels[1]: 1}
result_df['v1'] = result_df['v1'].map(label_map)

In [28]:
result_df

Unnamed: 0,v1,v2,v2_vectorized,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,...,dim_40,dim_41,dim_42,dim_43,dim_44,dim_45,dim_46,dim_47,dim_48,dim_49
0,0,"Go until jurong point, crazy.. Available only ...","[0.1864135002717376, 0.26192750558257105, -0.0...",0.186414,0.261928,-0.053090,0.103147,0.040546,-0.291268,-0.161999,...,0.117588,-0.070454,0.055971,-0.096880,-0.204149,0.109231,-0.053217,-0.304996,0.171188,0.156786
1,0,Ok lar... Joking wif u oni...,"[-0.21402766307195029, 0.15854466954867044, 0....",-0.214028,0.158545,0.097882,0.100438,0.177207,-0.521728,0.299651,...,0.086815,-0.227790,-0.152800,0.173536,0.003913,-0.085223,0.246900,0.122419,0.139491,0.401155
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[-0.1973253640213183, 0.20823492775005953, -0....",-0.197325,0.208235,-0.052291,0.154483,0.090836,-0.145471,-0.155031,...,0.246669,-0.231278,0.035385,0.069831,-0.223485,0.191765,0.076454,0.104324,0.158678,-0.094890
3,0,U dun say so early hor... U c already then say...,"[0.1481227224523371, 0.06637636233459819, 0.35...",0.148123,0.066376,0.355986,0.056194,0.210209,-0.192467,-0.145177,...,-0.028865,0.077548,-0.127852,0.095358,-0.023309,0.060392,-0.085055,-0.223813,0.012678,0.106840
4,0,"Nah I don't think he goes to usf, he lives aro...","[0.23352846331321275, 0.24408138493219247, -0....",0.233528,0.244081,-0.081947,-0.460037,0.393072,-0.338189,-0.421148,...,-0.213076,0.232316,0.047981,0.139653,0.009535,0.079278,-0.146741,-0.290199,-0.013955,0.128624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,"[0.09497379853079717, 0.13324066797892253, 0.1...",0.094974,0.133241,0.193542,0.059391,0.361622,0.026216,-0.129463,...,0.012206,0.001128,0.096869,0.018781,-0.144595,0.196450,-0.109313,-0.034947,0.025582,0.029699
5568,0,Will Ì_ b going to esplanade fr home?,"[0.3109863706631586, 0.24635337246581912, 0.06...",0.310986,0.246353,0.067219,-0.037426,0.222317,-0.243073,-0.112763,...,-0.044378,-0.044856,-0.074689,0.030762,-0.178449,0.316938,-0.045347,-0.298866,0.051652,0.194269
5569,0,"Pity, * was in mood for that. So...any other s...","[-0.005740398168563842, 0.22621999979019164, -...",-0.005740,0.226220,-0.100474,0.061500,0.145362,-0.033474,-0.157289,...,-0.199610,0.053856,0.001906,-0.050816,0.147330,0.112577,-0.103760,-0.034280,0.163602,0.014863
5570,0,The guy did some bitching but I acted like i'd...,"[0.2590248707252053, -0.043575228800853856, -0...",0.259025,-0.043575,-0.015679,-0.212860,0.392709,-0.017993,-0.576073,...,-0.116881,0.126679,0.045446,0.200592,-0.012973,0.058527,-0.252643,0.030968,-0.137889,0.188220


In [37]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

X = result_df[['dim_' + str(i) for i in range(50)]].values
y = result_df['v1'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

model = Sequential()
model.add(LSTM(256, input_shape=(1, 50), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=0.001,
    restore_best_weights=True
)

history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

Epoch 1/20


  super().__init__(**kwargs)


[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.8489 - loss: 0.4527 - val_accuracy: 0.9327 - val_loss: 0.1577
Epoch 2/20
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9480 - loss: 0.1312 - val_accuracy: 0.9439 - val_loss: 0.1423
Epoch 3/20
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9548 - loss: 0.1213 - val_accuracy: 0.9451 - val_loss: 0.1385
Epoch 4/20
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9598 - loss: 0.1072 - val_accuracy: 0.9484 - val_loss: 0.1244
Epoch 5/20
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9674 - loss: 0.0920 - val_accuracy: 0.9585 - val_loss: 0.1139
Epoch 6/20
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9708 - loss: 0.0796 - val_accuracy: 0.9630 - val_loss: 0.1103
Epoch 7/20
[1m112/112[0m [32m━━━━━━

In [34]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Test Loss: 0.1324
Test Accuracy: 0.9650
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       965
           1       0.88      0.86      0.87       150

    accuracy                           0.97      1115
   macro avg       0.93      0.92      0.92      1115
weighted avg       0.96      0.97      0.96      1115


Confusion Matrix:
[[947  18]
 [ 21 129]]


In [35]:
def predict_text_class(new_vector, model, scaler):
    scaled_vector = scaler.transform([new_vector])
    reshaped_vector = scaled_vector.reshape(1, 1, 50)
    prediction = model.predict(reshaped_vector)[0][0]
    return 1 if prediction > 0.5 else 0

In [38]:
def predict_single_text(text, model, scaler, glove_embeddings, dimension=50):
    from nltk.tokenize import word_tokenize
    import nltk

    try:
        nltk.download('punkt', download_dir='/tmp/nltk_data', quiet=True)
        nltk.data.path.append('/tmp/nltk_data')
    except:
        pass

    try:
        tokens = word_tokenize(text.lower())
    except:
        tokens = text.lower().split()

    vectors = [glove_embeddings.get(token, np.zeros(dimension)) for token in tokens]

    if not vectors:
        text_vector = np.zeros(dimension)
    else:
        text_vector = np.mean(vectors, axis=0)

    scaled_vector = scaler.transform([text_vector])
    reshaped_vector = scaled_vector.reshape(1, 1, dimension)
    prediction = model.predict(reshaped_vector)[0][0]
    predicted_class = 1 if prediction > 0.5 else 0

    result = {
        'text': text,
        'prediction_score': float(prediction),
        'predicted_class': int(predicted_class),
        'is_scam': 'Yes' if predicted_class == 1 else 'No'
    }

    return result

In [43]:
text_to_check = "Click here to claim your free prize! You've won $10,000!"
result = predict_single_text(text_to_check, model, scaler, glove_embeddings)
result

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


{'text': "Click here to claim your free prize! You've won $10,000!",
 'prediction_score': 0.981732964515686,
 'predicted_class': 1,
 'is_scam': 'Yes'}

In [44]:
messages = [
    "Call me back regarding your account",
    "URGENT: Your account has been compromised. Click here to reset your password",
    "Meeting at 3pm tomorrow in the conference room"
]

for msg in messages:
    result = predict_single_text(msg, model, scaler, glove_embeddings)
    print(f"Text: {msg}")
    print(f"Scam probability: {result['prediction_score']:.4f}")
    print(f"Is scam: {result['is_scam']}")
    print("-" * 50)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Text: Call me back regarding your account
Scam probability: 0.0004
Is scam: No
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Text: URGENT: Your account has been compromised. Click here to reset your password
Scam probability: 0.0090
Is scam: No
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Text: Meeting at 3pm tomorrow in the conference room
Scam probability: 0.0000
Is scam: No
--------------------------------------------------


# BERT + MLP

In [49]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

def load_bert_model():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    return tokenizer, model

def vectorize_text_bert(text, tokenizer, model, max_length=128):
    if pd.isna(text):
        return np.zeros(768)

    inputs = tokenizer(text, return_tensors="pt", max_length=max_length,
                      padding="max_length", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state[:, 0, :].numpy().flatten()
    return embeddings

tokenizer, model = load_bert_model()

df2 = df.copy()

unique_labels = df2['v1'].unique()
label_map = {unique_labels[0]: 0, unique_labels[1]: 1}
df['v1_encoded'] = df['v1'].map(label_map)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  40%|####      | 178M/440M [00:00<?, ?B/s]

In [51]:
unique_labels = df2['v1'].unique()
label_map = {unique_labels[0]: 0, unique_labels[1]: 1}
df2['v1_encoded'] = df2['v1'].map(label_map)

df2['v2_vectorized'] = df2['v2'].apply(lambda x: vectorize_text_bert(x, tokenizer, model))

vector_df = pd.DataFrame(df2['v2_vectorized'].tolist(), columns=[f'dim_{i}' for i in range(768)])
result_df = pd.concat([df2, vector_df], axis=1)

In [52]:
result_df

Unnamed: 0,v1,v2,v2_vectorized,v1_encoded,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,...,dim_758,dim_759,dim_760,dim_761,dim_762,dim_763,dim_764,dim_765,dim_766,dim_767
0,0,"Go until jurong point, crazy.. Available only ...","[-0.15130673, -0.32292148, 0.18984665, -0.1814...",0,-0.151307,-0.322921,0.189847,-0.181466,-0.331167,-0.283335,...,0.009398,-0.553711,-0.293213,-0.691019,-0.071334,-0.071222,-0.270227,-0.506180,0.637855,0.544040
1,0,Ok lar... Joking wif u oni...,"[-0.12379805, 0.34354445, -0.00094410765, -0.5...",0,-0.123798,0.343544,-0.000944,-0.570006,-0.705344,-0.033496,...,0.321728,-0.585733,-0.354311,-0.171751,-0.012980,-0.039756,-0.099514,-0.294352,0.314726,0.567615
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[-0.50309587, -0.28293756, 0.5747776, -0.14806...",1,-0.503096,-0.282938,0.574778,-0.148062,-0.356201,-0.366296,...,0.223085,-0.344779,0.276835,-0.286550,0.175028,-0.215764,-0.861642,-0.209744,0.280155,0.652472
3,0,U dun say so early hor... U c already then say...,"[0.06464517, 0.57689357, 0.3410961, -0.4666418...",0,0.064645,0.576894,0.341096,-0.466642,-0.481207,-0.069156,...,0.463334,-0.486881,0.022841,0.094647,0.010169,0.127587,-0.062866,-0.135028,0.445602,0.714602
4,0,"Nah I don't think he goes to usf, he lives aro...","[0.09474505, 0.38972345, 0.038936254, -0.33427...",0,0.094745,0.389723,0.038936,-0.334271,-0.610230,-0.298817,...,-0.363345,-0.397210,0.447765,-0.359783,0.086355,-0.053478,-0.050675,-0.515646,1.176274,0.252792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,"[-0.07247542, 0.055050947, 0.62025595, -0.3170...",1,-0.072475,0.055051,0.620256,-0.317046,-0.232876,-0.450038,...,0.414639,-0.536056,0.438631,-0.122778,-0.033932,0.465827,-0.261360,-0.338232,0.168457,0.989371
5568,0,Will Ì_ b going to esplanade fr home?,"[-0.008886666, 0.13748202, 0.27079245, -0.2207...",0,-0.008887,0.137482,0.270792,-0.220712,-0.068850,-0.078778,...,0.047202,-0.131978,0.106154,-0.138994,0.318368,-0.219576,-0.173396,-0.372755,0.395842,0.494934
5569,0,"Pity, * was in mood for that. So...any other s...","[-0.046375293, 0.07157645, -0.2811314, -0.4813...",0,-0.046375,0.071576,-0.281131,-0.481317,-0.609164,-0.134605,...,-0.018864,-0.127699,-0.096401,-0.058212,0.246049,-0.159947,-0.204754,-0.446099,0.692653,0.492611
5570,0,The guy did some bitching but I acted like i'd...,"[0.28872028, 0.21530701, -0.35321584, 0.423628...",0,0.288720,0.215307,-0.353216,0.423629,-0.190779,0.098727,...,-0.557828,-0.611266,0.230194,-0.093744,0.132807,0.164040,-0.172481,-0.305890,0.025765,0.052798


In [54]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go

X = result_df[['dim_' + str(i) for i in range(767)]].values
y = result_df['v1'].values

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

explained_variance = pca.explained_variance_ratio_
explained_variance_sum = sum(explained_variance)

pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
pca_df['label'] = y
pca_df['text'] = df['v2']
pca_df['color'] = ['Spam' if label == 1 else 'Ham' for label in y]

fig = px.scatter(
    pca_df,
    x='PC1',
    y='PC2',
    color='color',
    color_discrete_map={'Ham': 'blue', 'Spam': 'red'},
    hover_data=['text'],
    title=f'PCA of Text Messages (Explained Variance: {explained_variance_sum:.2%})',
)

fig.update_layout(
    legend_title_text='Message Type',
    xaxis_title=f'PC1 ({explained_variance[0]:.2%})',
    yaxis_title=f'PC2 ({explained_variance[1]:.2%})',
    width=900,
    height=700
)

fig.show()

In [62]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

class EmbeddingClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dims=[512, 256, 128]):
        super(EmbeddingClassifier, self).__init__()

        layers = []
        prev_dim = input_dim

        for dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, dim))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(dim))
            layers.append(nn.Dropout(0.3))
            prev_dim = dim

        layers.append(nn.Linear(prev_dim, 1))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

def train_model(X_train, y_train, X_val, y_val, input_dim, device, epochs=1000):
    model = EmbeddingClassifier(input_dim).to(device)

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

    X_train_tensor = torch.FloatTensor(X_train).to(device)
    y_train_tensor = torch.FloatTensor(y_train).unsqueeze(1).to(device)

    X_val_tensor = torch.FloatTensor(X_val).to(device)
    y_val_tensor = torch.FloatTensor(y_val).unsqueeze(1).to(device)

    best_val_loss = float('inf')
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)

        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val_tensor)
            val_loss = criterion(val_outputs, y_val_tensor)
            val_preds = (val_outputs > 0.5).float()
            val_acc = (val_preds == y_val_tensor).float().mean()

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict().copy()

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, Val Acc: {val_acc.item():.4f}")

    model.load_state_dict(best_model_state)
    return model

def predict_vector(model, vector, scaler, device):
    model.eval()
    scaled_vector = scaler.transform([vector])
    vector_tensor = torch.FloatTensor(scaled_vector).to(device)

    with torch.no_grad():
        output = model(vector_tensor)

    probability = output.item()
    prediction = 1 if probability > 0.5 else 0

    return {
        "prediction": "Spam" if prediction == 1 else "Ham",
        "probability": probability
    }

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

dimension_cols = ['dim_' + str(i) for i in range(767)]
X = result_df[dimension_cols].values
y = result_df['v1'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

model = train_model(X_train, y_train, X_val, y_val, input_dim=767, device=device)

Using device: cuda
Epoch 5/1000, Loss: 0.3417, Val Loss: 0.5352, Val Acc: 0.9473
Epoch 10/1000, Loss: 0.2382, Val Loss: 0.4057, Val Acc: 0.9496
Epoch 15/1000, Loss: 0.1840, Val Loss: 0.2692, Val Acc: 0.9675
Epoch 20/1000, Loss: 0.1471, Val Loss: 0.1812, Val Acc: 0.9742
Epoch 25/1000, Loss: 0.1174, Val Loss: 0.1369, Val Acc: 0.9787
Epoch 30/1000, Loss: 0.0915, Val Loss: 0.1118, Val Acc: 0.9787
Epoch 35/1000, Loss: 0.0730, Val Loss: 0.0982, Val Acc: 0.9821
Epoch 40/1000, Loss: 0.0575, Val Loss: 0.0925, Val Acc: 0.9821
Epoch 45/1000, Loss: 0.0463, Val Loss: 0.0899, Val Acc: 0.9832
Epoch 50/1000, Loss: 0.0368, Val Loss: 0.0923, Val Acc: 0.9832
Epoch 55/1000, Loss: 0.0341, Val Loss: 0.0973, Val Acc: 0.9809
Epoch 60/1000, Loss: 0.0324, Val Loss: 0.1017, Val Acc: 0.9809
Epoch 65/1000, Loss: 0.0321, Val Loss: 0.1054, Val Acc: 0.9809
Epoch 70/1000, Loss: 0.0314, Val Loss: 0.1082, Val Acc: 0.9809
Epoch 75/1000, Loss: 0.0317, Val Loss: 0.1102, Val Acc: 0.9809
Epoch 80/1000, Loss: 0.0317, Val Loss

In [63]:
model.eval()
X_test_tensor = torch.FloatTensor(X_test).to(device)
y_test_tensor = torch.FloatTensor(y_test).unsqueeze(1).to(device)

with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_preds = (test_outputs > 0.5).float().cpu().numpy().flatten()

accuracy = accuracy_score(y_test, test_preds)
f1 = f1_score(y_test, test_preds)
precision = precision_score(y_test, test_preds)
recall = recall_score(y_test, test_preds)

print(f"\nTest Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Test Results:
Accuracy: 0.9865
F1 Score: 0.9508
Precision: 0.9295
Recall: 0.9732


In [64]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import StandardScaler

def complete_spam_prediction_pipeline(text_messages, bert_model=None, bert_tokenizer=None,
                                     classifier_model=None, scaler=None):
    if bert_model is None or bert_tokenizer is None:
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_model = BertModel.from_pretrained('bert-base-uncased')
        bert_model.eval()

    if classifier_model is None:
        print("Using a placeholder classifier - replace with your trained model")
        classifier_model = EmbeddingClassifier(input_dim=768)
        classifier_model.eval()

    if scaler is None:
        print("Using a placeholder scaler - replace with your trained scaler")
        scaler = StandardScaler()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert_model = bert_model.to(device)
    classifier_model = classifier_model.to(device)

    results = []

    for message in text_messages:
        inputs = bert_tokenizer(message, return_tensors="pt",
                              padding="max_length", truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = bert_model(**inputs)

        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

        try:
            scaled_embedding = scaler.transform([embedding])
        except:
            scaled_embedding = np.array([embedding])

        embedding_tensor = torch.FloatTensor(scaled_embedding).to(device)

        with torch.no_grad():
            output = classifier_model(embedding_tensor)

        probability = output.item()
        prediction = 1 if probability > 0.5 else 0

        results.append({
            "message": message,
            "prediction": "Spam" if prediction == 1 else "Ham",
            "probability": probability
        })

    return results

example_messages = [
    "URGENT: You have won £10000 in our monthly prize draw! Call 09123456789 now to claim your prize!",
    "Free entry to the biggest music festival this weekend! Just SMS WIN to 12345 to claim your tickets now!",
    "Hey, are we still meeting for coffee tomorrow at 2pm?",
    "Your Amazon order #A12345 has been shipped. Track your delivery at amazon.com/orders",
    "FINAL NOTICE: Your payment is overdue. Click here immediately: payment-secure.co/verify",
    "Can you pick up some milk on your way home?",
    "Congratulations! You've been selected to receive a free iPhone 15. Click here to claim: bit.ly/claim-prize",
    "The team meeting has been moved to 3pm tomorrow. Please adjust your schedule accordingly."
]

results = complete_spam_prediction_pipeline(example_messages)

print("Complete Spam Detection Pipeline Results:")
print("-" * 80)
for i, result in enumerate(results):
    print(f"Example {i+1}:")
    print(f"Message: {result['message']}")
    print(f"Prediction: {result['prediction']}")
    print(f"Confidence: {result['probability']:.4f}")
    print("-" * 80)

Using a placeholder classifier - replace with your trained model
Using a placeholder scaler - replace with your trained scaler
Complete Spam Detection Pipeline Results:
--------------------------------------------------------------------------------
Example 1:
Message: URGENT: You have won £10000 in our monthly prize draw! Call 09123456789 now to claim your prize!
Prediction: Spam
Confidence: 0.5003
--------------------------------------------------------------------------------
Example 2:
Message: Free entry to the biggest music festival this weekend! Just SMS WIN to 12345 to claim your tickets now!
Prediction: Spam
Confidence: 0.5024
--------------------------------------------------------------------------------
Example 3:
Message: Hey, are we still meeting for coffee tomorrow at 2pm?
Prediction: Spam
Confidence: 0.5025
--------------------------------------------------------------------------------
Example 4:
Message: Your Amazon order #A12345 has been shipped. Track your delivery 