# **STEP 2 — Import Libraries**

In [None]:
# Standard library
import csv
import os
import re
from collections import Counter  # word-frequency counting for the vocabulary

# Numerical / data handling
import numpy as np
import pandas as pd

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# **STEP 3 — Load and Explore Dataset**

In [None]:
# Load dataset: tab-separated file with no header row, one "<label>\t<message>"
# record per line.
#
# quoting=csv.QUOTE_NONE makes the parser treat double quotes as ordinary
# characters. Some messages contain unbalanced double quotes; with pandas'
# default quote handling a lone quote opens a quoted field that can swallow the
# following line breaks, silently merging several messages into one row.
data = pd.read_csv('/content/SMSSpamCollection',
                   sep='\t', header=None, names=['label', 'text'],
                   quoting=csv.QUOTE_NONE)

print(data.head())
print("Total samples:", len(data))
print("Class distribution:\n", data['label'].value_counts())

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Total samples: 5572
Class distribution:
 label
ham     4825
spam     747
Name: count, dtype: int64


# **STEP 4 — Text Preprocessing**

In [None]:
def clean_text(text):
    """Normalise a raw message before tokenisation.

    The text is lower-cased, then every character that is not an ASCII
    letter ``a``-``z`` or whitespace is deleted. Digits, punctuation and
    non-ASCII characters are removed outright (not replaced by a space).

    Args:
        text: The raw message string.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z\s]', '', lowered)

# Replace each raw message with its cleaned form.
data['text'] = data['text'].map(clean_text)

# Tokenization: split every cleaned message on runs of whitespace.
data['tokens'] = data['text'].map(str.split)

In [None]:
# Fixed sequence length in tokens: SMSDataset pads shorter messages with 0 and
# truncates longer ones to this many ids.
MAX_LEN = 50

# **STEP 5 — Vocabulary and Embedding Preparation**

In [None]:
# Flatten all tokenised messages into one list of words and count them.
all_words = [w for message_tokens in data['tokens'] for w in message_tokens]
word_counts = Counter(all_words)

# Distinct words receive ids 1..N in order of first appearance; id 0 is
# reserved for the padding token.
vocab = {word: idx for idx, word in enumerate(word_counts, start=1)}
vocab['<PAD>'] = 0

vocab_size = len(vocab)

# **Download and Load GloVe**

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2026-02-19 10:02:23--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2026-02-19 10:02:24--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2026-02-19 10:02:24--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’

Ar

In [None]:
def load_glove(glove_file, embedding_dim):
    """Load word vectors from a GloVe-format text file.

    Every useful line holds a word followed by ``embedding_dim``
    whitespace-separated numbers. Blank lines are ignored, and lines with a
    different number of fields (for example a header line) are skipped.

    Args:
        glove_file: Path of the text file; it is read as UTF-8.
        embedding_dim: Number of vector components expected on each line.

    Returns:
        dict mapping each word to a float32 NumPy array of shape
        ``(embedding_dim,)``.

    Raises:
        ValueError: If the file contains non-blank lines but none of them has
            ``embedding_dim`` components, which almost certainly means the
            wrong file or the wrong dimension was given.
    """
    embeddings = {}
    skipped = 0
    with open(glove_file, encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line; indexing values[0] would fail here.
                continue
            if len(values) != embedding_dim + 1:
                # Not "<word> + embedding_dim numbers": header or malformed entry.
                skipped += 1
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

    if skipped and not embeddings:
        # Fail fast instead of handing back vectors of an unexpected size.
        raise ValueError(
            f"No {embedding_dim}-dimensional vectors found in {glove_file!r}; "
            "check that embedding_dim matches the file."
        )
    return embeddings

# Vector size used throughout the notebook; it must match the GloVe file
# loaded on the next line (the "100d" variant).
embedding_dim = 100
# glove: dict mapping word -> NumPy vector, as returned by load_glove.
glove = load_glove('glove.6B.100d.txt', embedding_dim)

# **Embedding Matrix**

In [None]:
# Row i holds the GloVe vector of the vocabulary word with id i.
# Rows for words that have no GloVe entry are left as all zeros.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, idx in vocab.items():
    if word in glove:
        embedding_matrix[idx] = glove[word]


# **STEP 6 — Train-Test Split**

In [None]:
# Features are the token lists; labels are encoded as ham -> 0, spam -> 1.
label_ids = {'ham':0, 'spam':1}
X, y = data['tokens'], data['label'].map(label_ids)

# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split identical across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# **STEP 7 — Build 1D CNN Model**

In [None]:
class SMSDataset(Dataset):
    """Torch dataset over tokenised messages and their labels.

    Indexing yields ``(ids, label)``: ``ids`` is a tensor of exactly
    ``max_len`` vocabulary indices and ``label`` is the matching label
    wrapped in a tensor.
    """

    def __init__(self, texts, labels, vocab, max_len):
        """Store the data.

        Args:
            texts: Sequence of token lists; accessed positionally via ``.iloc``.
            labels: Labels aligned with ``texts``; only ``.values`` is kept.
            vocab: Mapping from word to integer id.
            max_len: Length every encoded sequence is padded/truncated to.
        """
        self.max_len = max_len
        self.vocab = vocab
        self.labels = labels.values
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts.iloc[idx]
        label = torch.tensor(self.labels[idx])
        return self.encode(tokens), label

    def encode(self, tokens):
        """Turn a token list into a tensor of ``max_len`` ids.

        Words missing from the vocabulary map to 0, the same id used for
        padding. Long inputs are cut off; short ones are right-padded with 0.
        """
        ids = [self.vocab.get(word, 0) for word in tokens][:self.max_len]
        # No-op when the sequence already has max_len entries.
        ids.extend([0] * (self.max_len - len(ids)))
        return torch.tensor(ids)


# **CNN Model**

In [None]:
class TextCNN(nn.Module):
    """1D-CNN binary classifier on top of pretrained word embeddings.

    Pipeline: embedding lookup -> Conv1d (128 filters, kernel width 5) ->
    ReLU -> max pool down to one value per filter -> linear layer -> sigmoid.
    The forward pass returns one value in (0, 1) per input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, embedding_matrix):
        """Build the layers.

        Args:
            vocab_size: Accepted for interface compatibility; not used here,
                the embedding table is sized by ``embedding_matrix`` itself.
            embedding_dim: Width of each embedding vector (Conv1d input channels).
            embedding_matrix: Array-like of initial embedding weights.
        """
        super().__init__()

        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        # freeze=False: the pretrained weights stay trainable.
        self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=False)

        self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=128, kernel_size=5)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(in_features=128, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        # Swap the last two axes so the embedding dimension becomes the
        # channel axis that Conv1d convolves over.
        channels_first = embedded.permute(0, 2, 1)
        features = torch.relu(self.conv(channels_first))
        # The pooled tensor has a trailing length-1 axis; drop it.
        pooled = self.pool(features).squeeze(2)
        return self.sigmoid(self.fc(pooled))


# **STEP 8 — Model Training**

In [None]:
# Training hyper-parameters, gathered here so they are easy to find and tune.
SEED = 42
BATCH_SIZE = 32
LEARNING_RATE = 0.001
NUM_EPOCHS = 5

# Seed PyTorch's global RNG before the model is created and before the
# shuffling DataLoader is iterated, so weight initialisation and batch order
# no longer change from one run of this cell to the next.
torch.manual_seed(SEED)

train_dataset = SMSDataset(X_train, y_train, vocab, MAX_LEN)
test_dataset = SMSDataset(X_test, y_test, vocab, MAX_LEN)

# Only the training data is shuffled; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

model = TextCNN(vocab_size, embedding_dim, embedding_matrix)
# BCELoss expects probabilities, which the model's final sigmoid provides.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0

    for inputs, labels in train_loader:
        # Match the model output: float dtype with a trailing size-1 axis.
        labels = labels.float().unsqueeze(1)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")


Epoch 1, Loss: 0.20403586458414794
Epoch 2, Loss: 0.05254475425962093
Epoch 3, Loss: 0.02229563178261742
Epoch 4, Loss: 0.011001757635468883
Epoch 5, Loss: 0.005619726374321284


# **STEP 9 — Model Evaluation**

In [None]:
# Switch to inference mode and collect predictions over the whole test set.
model.eval()
preds = []
true = []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # Threshold the sigmoid output at 0.5: 1 = spam, 0 = ham.
        predicted = (outputs > 0.5).int()

        # squeeze(1) removes only the trailing size-1 output axis. A bare
        # squeeze() would also collapse a final batch holding a single sample
        # into a 0-d tensor, whose .tolist() is a plain int that
        # list.extend() rejects with a TypeError.
        preds.extend(predicted.squeeze(1).tolist())
        true.extend(labels.tolist())

acc = accuracy_score(true, preds)
prec = precision_score(true, preds)
rec = recall_score(true, preds)
f1 = f1_score(true, preds)
cm = confusion_matrix(true, preds)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1 Score:", f1)
print("Confusion Matrix:\n", cm)


Accuracy: 0.9829596412556054
Precision: 0.9710144927536232
Recall: 0.8993288590604027
F1 Score: 0.9337979094076655
Confusion Matrix:
 [[962   4]
 [ 15 134]]


# **STEP 10 — Result Analysis**

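The model achieved strong classification performance using pretrained GloVe embeddings: on the held-out test set it reached about 98.3% accuracy, 97.1% precision, 89.9% recall, and an F1 score of 0.934. Pretrained embeddings give the model a useful starting point because semantically similar words have similar vector representations. Training converged quickly and the loss decreased smoothly over the five epochs; note, however, that this notebook does not train a randomly initialized baseline, so the comparison with random initialization is an expectation rather than a measured result. The CNN was able to capture important local features such as spam keywords and short phrase patterns, and its convolutional filters handled short SMS texts effectively. The precision and recall values indicate good spam detection capability, although recall is lower than precision: 15 spam messages were classified as ham, whereas only 4 ham messages were flagged as spam. Words not present in GloVe were initialized to zero vectors, which may slightly reduce performance. The dataset is also imbalanced (far more ham than spam), so accuracy alone overstates performance; precision, recall, and F1 are the more informative metrics. Overall, pretrained embeddings are expected to improve generalization compared to training embeddings from scratch, and confirming this would require running that baseline.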