In [33]:
# Importing the numpy library for handling numerical operations
import numpy as np

# Importing pandas for handling and manipulating data in tabular form
import pandas as pd

# Importing a function to split the dataset into training and test sets
from sklearn.model_selection import train_test_split

# Importing TfidfVectorizer for converting text data into numerical vectors using TF-IDF technique
from sklearn.feature_extraction.text import TfidfVectorizer

# Importing LogisticRegression, a classification algorithm used to build the model
from sklearn.linear_model import LogisticRegression

# Importing accuracy_score to evaluate how well our model performs
from sklearn.metrics import accuracy_score

In [35]:
# Step 1: Loading and Preparing the Data

# Load the dataset from a CSV file into a pandas DataFrame
raw_mail_data = pd.read_csv('mail_data.csv')
print(raw_mail_data)  # Print the raw data to understand its structure

# Replace any missing/null values with an empty string to avoid errors during processing
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)), '')

# Show the first 5 rows of the cleaned dataset
mail_data.head()

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
# Print the shape of the DataFrame (number of rows and columns)
mail_data.shape


(5572, 2)

In [39]:
# Step 2: Preprocessing - Label Encoding

# Replace text labels with numerical values

# 'spam' is labeled as 0, meaning a bad/unwanted email
# 'ham' is labeled as 1, meaning a good/normal email
mail_data.loc[mail_data['Category'] == 'spam', 'Category'] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category'] = 1

In [41]:
# Step 3: Separating Text Data and Labels

# Feature - the actual email messages
X = mail_data['Message']

# Label - 0 or 1 depending on spam or ham
Y = mail_data['Category']

# Print the email messages and their corresponding labels
print(X)
print("\n")
print(Y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [43]:
# Step 4: Splitting the Data into Training and Testing Sets

# 80% training, 20% testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

# Display the size of original data, training data, and test data
print(X.shape)         # Total data
print(X_train.shape)   # Training data
print(X_test.shape)    # Test data

(5572,)
(4457,)
(1115,)


In [45]:
# Step 5: Feature Extraction using TF-IDF

# TF-IDF = Term Frequency - Inverse Document Frequency
# Converts text data into numerical vectors while reducing the impact of common words
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit on training data and transform both training and testing text
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert label values to integers for training
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

# Print a sample of the transformed data
print("Original data")
print(X_train) # Original text data
print("\n")
print("Transformed TF-IDF data")
print(X_train_features)    # Transformed TF-IDF features

Original data
3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object


Transformed TF-IDF data
  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347
  (1, 3811)	0.17419952275504033
  (1, 3046)	0.2503712792613518
  (1, 1991)	0.33036995955537024
  (1, 2956)	0.33036995955537

In [47]:
# Step 6: Model Training - Logistic Regression

# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model using the training data
model.fit(X_train_features, Y_train)

# Step 7: Evaluating the Model

# Predict labels for training data
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)
print('Accuracy on training data : ', accuracy_on_training_data)

# Predict labels for test data
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on training data :  0.9676912721561588
Accuracy on test data :  0.9668161434977578


In [49]:
# Step 8: Predicting a New Input
# ------------------------------------

# New input email to classify
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# Convert the input text to TF-IDF feature vector (same transformation as training data)
input_data_features = feature_extraction.transform(input_mail)

# Use the trained model to predict the label
prediction = model.predict(input_data_features)
print(prediction)  # Output will be 0 or 1

# Interpret the prediction
if (prediction[0] == 1):
    print('Ham mail')  # Non-spam
else:
    print('Spam mail')  # Spam

[1]
Ham mail


In [51]:
# Step 8: Predicting a New Input
# ------------------------------------

# New input email to classify
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

# Convert the input text to TF-IDF feature vector (same transformation as training data)
input_data_features = feature_extraction.transform(input_mail)

# Use the trained model to predict the label
prediction = model.predict(input_data_features)
print(prediction)  # Output will be 0 or 1

# Interpret the prediction
if (prediction[0] == 1):
    print('Ham mail')  # Non-spam
else:
    print('Spam mail')  # Spam

[0]
Spam mail
