In [1]:
pip install fastai

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from fastai.text.all import *

In [3]:
# loading dataset 
df = pd.read_csv("spam.csv",encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


In [4]:
#delete column 
df = df.drop('Unnamed: 2', axis=1)
df = df.drop('Unnamed: 3', axis=1)
df = df.drop('Unnamed: 4', axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
#checking null 
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [6]:
#change column name
df.rename(columns={'v1': 'type', 'v2': 'text'}, inplace=True)

print(df)

      type  \
0      ham   
1      ham   
2     spam   
3      ham   
4      ham   
...    ...   
5567  spam   
5568   ham   
5569   ham   
5570   ham   
5571   ham   

                                                                                                                                                                   text  
0                                                       Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...  
1                                                                                                                                         Ok lar... Joking wif u oni...  
2           Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's  
3                                                                                                                     U dun say so early hor... U c alr

In [7]:
# encoding the type to 0,1
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['type'])

df['label'] = le.transform(df['type'])
df.head()

Unnamed: 0,type,text,label
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives around here though",0


In [8]:
from sklearn.model_selection import train_test_split
label = df['label']
X_train, X_temp, y_train, y_temp = train_test_split(df, label, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [9]:
# Language model data preparation
dls_lm = TextDataLoaders.from_df(df, text_col='text', is_lm=True)

In [10]:
# Fine-tuning the language model (ULMFiT step 1)
learn = language_model_learner(dls_lm, AWD_LSTM, drop_mult=0.3)

In [11]:
# Fine-tuning the classifier (ULMFiT step 2)
dls_clas = TextDataLoaders.from_df(df, text_col='text', label_col='label', valid_pct=0.2, text_vocab=dls_lm.vocab)
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

In [12]:
import torch.nn.functional as F

class CustomLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bidirectional=True, batch_first=True, dropout_prob=0.5):
        super(CustomLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bidirectional=bidirectional, batch_first=batch_first)
        
        # Add a dense (fully connected) layer
        self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, 1)  # Output size is 1 for binary classification
        
        # Add ReLU activation function
        self.relu = nn.ReLU()
        
        # Add dropout layer
        self.dropout = nn.Dropout(0.5)
        
        # Add sigmoid activation for binary classification
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        # LSTM layer
        output, _ = self.lstm(x)
        
        # Apply ReLU activation
        output = self.relu(output)
        
        # Apply dropout
        output = self.dropout(output)
        
        # Apply dense layer
        output = self.fc(output)
        
        # Apply sigmoid activation for binary classification
        output = self.sigmoid(output)
        
        return output


In [13]:
vocab_size = len(dls_clas.vocab[0])
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 2944


In [14]:
# Add custom LSTM layers
custom_lstm = CustomLSTM(2944, hidden_size=64, num_layers=2, bidirectional=True, batch_first=True)
learn.model[0].encoder = custom_lstm

In [15]:
# Train the classifier
learn.fine_tune(1)

epoch,train_loss,valid_loss,accuracy,time
0,0.620267,0.295711,0.920108,00:10


  real_bs = (input[:,i] != self.pad_idx).long().sum()


epoch,train_loss,valid_loss,accuracy,time
0,0.511637,0.306925,0.931777,00:21


In [16]:
learn.fine_tune(epochs=10, cbs=EarlyStoppingCallback(patience=3, min_delta=0.01))

epoch,train_loss,valid_loss,accuracy,time
0,0.440895,0.231381,0.969479,00:14


epoch,train_loss,valid_loss,accuracy,time
0,0.323534,0.126061,0.98474,00:21
1,0.274581,0.133531,0.983842,00:21
2,0.216738,0.089368,0.98474,00:22
3,0.162766,0.072543,0.979354,00:22
4,0.115973,0.059552,0.986535,00:21
5,0.090327,0.065826,0.985637,00:23
6,0.071969,0.06012,0.983842,00:21
7,0.052213,0.060813,0.983842,00:22


No improvement since epoch 4: early stopping


In [17]:
# Make predictions on the validation dataset
preds, targets = learn.get_preds(ds_idx=1)  # Use ds_idx=1 for the validation dataset

# Convert predictions to binary labels (0 or 1)
pred_labels = (preds[:, 1] > 0.5).int()

# Compute accuracy
accuracy = (pred_labels == targets).float().mean().item()
print("Accuracy:", accuracy)


# Compute accuracy
accuracy = (pred_labels == targets).float().mean().item()
print("Accuracy:", accuracy)


# You can also compute other metrics using sklearn.metrics
from sklearn.metrics import classification_report, confusion_matrix

# Compute and print a classification report
report = classification_report(targets, pred_labels, target_names=["Non-Spam", "Spam"])
print("Classification Report:\n", report)

# Compute and print a confusion matrix
confusion = confusion_matrix(targets, pred_labels)
print("Confusion Matrix:\n", confusion)


Accuracy: 0.9838420152664185
Accuracy: 0.9838420152664185
Classification Report:
               precision    recall  f1-score   support

    Non-Spam       0.99      0.99      0.99       980
        Spam       0.95      0.91      0.93       134

    accuracy                           0.98      1114
   macro avg       0.97      0.95      0.96      1114
weighted avg       0.98      0.98      0.98      1114

Confusion Matrix:
 [[974   6]
 [ 12 122]]


In [18]:
# Predict on new SMS messages
sms_messages = ["win a free gift!!"]
results = learn.predict(sms_messages)
for message, label in zip(sms_messages, results[0]):
    print(f"Message: {message} / Predicted Label: {learn.dls.vocab[1][int(label)]}")

Message: win a free gift!! / Predicted Label: 0


In [21]:
from sklearn.metrics import accuracy_score

# Function to validate the model
def validate_model(learn, X_valid, y_valid):
    # Put the model in evaluation mode
    learn.model.eval()
    
    # Make predictions on the validation data
    preds, _ = learn.get_preds(ds_idx=1)  # Use ds_idx=1 for the validation dataset
    pred_labels = (preds[:, 1] > 0.5).int()  # Convert probabilities to binary labels
    
    # Calculate accuracy
    accuracy = accuracy_score(y_valid, pred_labels)
    
print("Validation Accuracy:", accuracy)

# Function to test the model
def test_model(learn, X_test, y_test):
    # Put the model in evaluation mode
    learn.model.eval()
    
    # Make predictions on the test data
    preds, _ = learn.get_preds(ds_idx=2)  # Use ds_idx=2 for the test dataset
    pred_labels = (preds[:, 1] > 0.5).int()  # Convert probabilities to binary labels
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, pred_labels)
    
print("Test Accuracy:", accuracy)


Validation Accuracy: 0.9838420152664185
Test Accuracy: 0.9838420152664185
