# SMS Spam Text Message Classification using Deep Learning

## Dataset

**Spam Text Message Classification Dataset**: A collection of labeled SMS messages, categorized as "spam" or "ham". [Dataset Link](https://www.kaggle.com/uciml/sms-spam-collection-dataset)


### 1. Package and Module Installation

In [None]:
!pip install scikit-learn tensorflow



### 2. Data Loading and Preprocessing

In [None]:
# Mount Google Drive so the dataset stored there can be read
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

# Read the raw CSV, decoding it as latin-1
file_path = '/content/drive/My Drive/REATBFZR_Filbert Naldo Wijaya/spam.csv'
data = pd.read_csv(file_path, encoding='latin-1')

# Preview of the raw frame (not rendered: it is not the last expression of the cell)
data.head()

# Discard the three unnamed columns, leaving two columns
unnamed_columns = [f'Unnamed: {i}' for i in range(2, 5)]
data = data.drop(unnamed_columns, axis=1)

# Give the two remaining columns descriptive names
data.columns = ['label', 'message']

# Per-column count of missing entries
missing_values = data.isna().sum()

# Number of messages per class
label_distribution = data['label'].value_counts()

# Cell output: both summaries as a tuple
missing_values, label_distribution

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


(label      0
 message    0
 dtype: int64,
 label
 ham     4825
 spam     747
 Name: count, dtype: int64)

### 3. Feature Extraction and Train/Test Split

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.optim as optim

# Turn the string labels into integer class ids, stored back into the frame
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Hold out 20% of the messages for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vectorizer is fitted on the training messages only,
# then the same fitted vectorizer is applied to the test messages
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train).toarray()
X_test_vec = vectorizer.transform(X_test).toarray()

# Dense float32 feature tensors
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_vec, X_test_vec)
)

# Float32 target tensors with a trailing dimension of size 1
y_train_tensor, y_test_tensor = (
    torch.tensor(labels.to_numpy(), dtype=torch.float32).unsqueeze(1)
    for labels in (y_train, y_test)
)

### 4. Model Definition and Training

In [None]:
# Training loss captured by the training loop below (stays 0 until training runs)
dl_loss_value = 0

# Building Model
class SpamClassifier(nn.Module):
    """Feed-forward binary classifier with two hidden layers.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim1: Width of the first hidden layer.
        hidden_dim2: Width of the second hidden layer.
        dropout_p: Probability of zeroing an activation after the first hidden
            layer while the module is in training mode. Defaults to 0.5, the
            value that used to be hard-coded.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim1: int,
        hidden_dim2: int,
        dropout_p: float = 0.5,
    ) -> None:
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.dropout = nn.Dropout(dropout_p)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of feature vectors to values between 0 and 1.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1, holding the sigmoid of the final linear layer.
        """
        x = torch.relu(self.fc1(x))
        # Dropout is applied only after the first hidden layer
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return self.sigmoid(x)

# Layer sizes: the input width equals the number of TF-IDF features;
# the two hidden widths are fixed
input_dim = X_train_vec.shape[1]
hidden_dim1 = 64
hidden_dim2 = 32

# Instantiate model, loss function, and optimizer
model = SpamClassifier(input_dim=input_dim, hidden_dim1=hidden_dim1, hidden_dim2=hidden_dim2)
# BCELoss takes probabilities, which the model's final sigmoid produces
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: each epoch is one forward/backward pass over the whole training tensor
num_epochs = 100
log_every = 20  # print progress once per this many epochs

# Training mode enables dropout; nothing inside the loop changes the mode, so set it once
model.train()
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Record the loss every epoch so the stored value is always the final one,
    # even when num_epochs is not a multiple of the logging interval
    dl_loss_value = loss.item()

    if (epoch + 1) % log_every == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {dl_loss_value:.4f}')

Epoch [20/100], Loss: 0.6063
Epoch [40/100], Loss: 0.4712
Epoch [60/100], Loss: 0.2216
Epoch [80/100], Loss: 0.0732
Epoch [100/100], Loss: 0.0283


### 5. Model Evaluation

In [None]:
# Test-set accuracy; stays 0 until the evaluation below has run
dl_accuracy = 0

# Evaluation mode turns dropout off; gradients are not needed for inference
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)

# Round the outputs to hard 0/1 predictions and compare them with the true labels
predicted = torch.round(test_outputs).squeeze().numpy()
expected = y_test_tensor.squeeze().numpy()
accuracy = (predicted == expected).mean()
print(f'Accuracy on test set: {accuracy:.4f}')
dl_accuracy = accuracy

Accuracy on test set: 0.9812
