# Building an email spam detection using supervised learning
### The mail_data.csv provided contains 2 columns namely
- Message : This contains description of a email content. Collected from various sources
- Category: This contains the classification
    - spam: unwanted, unsolicited junk messages. 
    - ham: not a spam and might be required for the user to keep. Good message

In [2]:
# Task From https://www.youtube.com/watch?v=FkF2jhaRJIs
# Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split # To split the data as training data set and testing data set
from sklearn.feature_extraction.text import TfidfVectorizer # Feature of extraction technique in NLP to extract data from a text and tokanize it as a numerical representation
from sklearn.linear_model import LogisticRegression # The model which will be used to detect the spam. 
from sklearn.metrics import accuracy_score # This will tell you the accuracy of the model. To evaluate the performance of the model

In [3]:
df = pd.read_csv("./Data/mail_data.csv")
df.head() # Check if the data is imported properly
nRecords = int(df.shape[0])
nColumns = int(df.shape[1])
type(df["Category"].describe())

pandas.core.series.Series

In [4]:
df.isna().count()

Category    5572
Message     5572
dtype: int64

In [5]:
df.isna().count()

Category    5572
Message     5572
dtype: int64

In [29]:
# Check if the data has null values especially categorization. If yes then we need to remove those records
data = df.where(pd.notnull(df), '') # rename all null values to blanks

In [30]:
data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [31]:
# Replace the values spam as 0 and ham as 1
# Method 1 to replace
# data.loc[data["Category"] == 'spam', 'Category',] = 0
# data.loc[data["Category"] == 'ham', 'Category',] = 1
# data

# Method 2 to replace
data['Category'] = data['Category'].map({'ham': 1, 'spam': 0})
data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [32]:
# Converting the data into series which will become an input for the training and testing dataset
X = data["Message"]
Y = data["Category"]

print ('Type(X):', type(X), X.shape, '\nType(Y):', type(Y), Y.shape)
print ('X:\n',X.head(3))
print ('Y:\n',Y.head(3))



Type(X): <class 'pandas.core.series.Series'> (5572,) 
Type(Y): <class 'pandas.core.series.Series'> (5572,)
X:
 0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
Name: Message, dtype: object
Y:
 0    1
1    1
2    0
Name: Category, dtype: int64


In [33]:
# Random state can range from 0 to 2^(32-1)
# train_size should be 80% of the data which is denoted as train_size= .8
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3) 

In [34]:
print(X_train)

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object


In [35]:
print ('X:', X.shape, 'Y:', Y.shape)
print('X Training:', X_train.shape, 'Y Training:',Y_train.shape)
print('X Testing:', X_test.shape, 'Y Testing:',Y_test.shape)

X: (5572,) Y: (5572,)
X Training: (4457,) Y Training: (4457,)
X Testing: (1115,) Y Testing: (1115,)


In [36]:
# Vectorize the messages and give a mapping to all the words
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_Train_Features = feature_extraction.fit_transform(X_train) # fit and transform the training set to the algorithm
X_Test_Features = feature_extraction.fit_transform(X_test) # fit and transform the testing set to the algorithm

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [37]:
print(X_Train_Features.shape)
print(X_Test_Features.shape)

(4457, 7431)
(1115, 3296)


In [38]:
# Train the model
model = LogisticRegression()

In [39]:
model.fit(X_Train_Features, Y_train)

In [40]:
print(X_Train_Features.shape)
print(Y_train.shape)

(4457, 7431)
(4457,)


In [41]:
print(X_Test_Features.shape)
print(Y_test.shape)

(1115, 3296)
(1115,)


In [42]:
# Evaluation of the training model
train_data_prediction = model.predict(X_Train_Features)
accuracy_on_training_data = accuracy_score(Y_train, train_data_prediction)
print("Accuracy on predicted training data: ", accuracy_on_training_data)

Accuracy on predicted training data:  0.9670181736594121


In [43]:
test_data_prediction = model.predict(X_Test_Features)
accuracy_on_test_data = accuracy_score(Y_test, test_data_prediction)

print("Accuracy on predicted test data: ", accuracy_on_test_data)


ValueError: X has 3296 features, but LogisticRegression is expecting 7431 features as input.

In [1]:
# Test with a new Mail information
mail = ["this is the second time we have tried to contact you"]
input_data_features = feature_extraction.transform(mail)

mail_prediction = model.predict(input_data_features)

print(mail_prediction)


NameError: name 'feature_extraction' is not defined