# Email spam classifier 

In [123]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [124]:
# Load the raw labelled messages; the path is relative to the notebook's working directory
df = pd.read_csv("mail_data.csv")

In [125]:
# Display the loaded frame to eyeball the Category and Message columns
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [126]:
# Keep every non-null cell as is and substitute an empty string for missing ones;
# the result is a new frame, so the raw `df` stays untouched.
data = df.where(df.notna(), '')

In [127]:
# Frame dimensions as (rows, columns)
data.shape 

(5572, 2)

In [128]:
# Column dtypes and non-null counts, to confirm no missing values remain after the fill
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [129]:
# Encode the target labels in place: spam -> 0, ham -> 1.
# The column keeps its object dtype here; it is cast to int when Y is built.
for label_name, label_code in (('spam', 0), ('ham', 1)):
    data.loc[data['Category'] == label_name, 'Category'] = label_code

In [130]:
# Inspect the encoded labels (0 = spam, 1 = ham); the column is still object dtype at this point
data['Category']

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [131]:
# Separate the independent variable (message text) from the target variable (label)
X = data['Message']  # independent variable: raw message text

Y = data['Category'].astype(int)  # target: cast from object dtype to int, which the models expect

In [132]:
# Preview the first five messages
X.head(5)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [133]:
# Preview the first five labels and confirm the integer dtype
Y.head(5)

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: int32

In [134]:
# Vectorization: TF-IDF over lowercased text with English stop words removed.
# min_df=1 keeps every term that occurs in at least one training message.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [135]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [136]:
# Learn the vocabulary and IDF weights from the training messages only, then reuse
# that fitted vectorizer on the test messages so no test information leaks into the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [137]:
# Row counts of the full, training and test message sets
for message_set in (X, X_train, X_test):
    print(message_set.shape)

(5572,)
(4457,)
(1115,)


In [138]:
# Row counts of the full, training and test label sets (should mirror the message sets)
for label_set in (Y, Y_train, Y_test):
    print(label_set.shape)

(5572,)
(4457,)
(1115,)


# Train the model

In [139]:
# Instantiate both classifiers with default hyperparameters; fitting happens in the next cells
lr_model = LogisticRegression()
nb_model = MultinomialNB()

In [140]:
# Fit logistic regression on the TF-IDF training features
lr_model.fit(X_train_features, Y_train)

In [141]:
# Fit multinomial Naive Bayes on the same TF-IDF training features
nb_model.fit(X_train_features, Y_train)

In [142]:
# Predict labels for the held-out test features with both fitted models

lr_pred = lr_model.predict(X_test_features)
nb_pred = nb_model.predict(X_test_features)

# Model Evaluation

In [143]:
# Test-set accuracy of the logistic regression model

accuracy_lr = accuracy_score(y_true=Y_test, y_pred=lr_pred)
print('Accuracy:', accuracy_lr)

Accuracy: 0.9659192825112107


In [144]:
# Test-set accuracy of the multinomial Naive Bayes model

accuracy_nb = accuracy_score(y_true=Y_test, y_pred=nb_pred)
print('Accuracy:', accuracy_nb)

Accuracy: 0.9730941704035875


# Build a predictive system

In [239]:
# Draw 20 held-out messages to use as manual test inputs. The fixed seed keeps the
# sample identical across kernel restarts, so the displayed rows are reproducible.
# Look the row indices up in the labelled dataset to compare the true category
# with the model's prediction.
X_test.sample(20, random_state=3)

4674    I forgot 2 ask ü all smth.. There's a card on ...
3109    Hello hun how ru? Its here by the way. Im good...
3265    I will send them to your email. Do you mind  &...
2823    ROMCAPspam Everyone around should be respondin...
261                                                   Yup
1048    1000's flirting NOW! Txt GIRL or BLOKE & ur NA...
4485         Shopping? Eh ger i toking abt syd leh...Haha
1044    Mmm thats better now i got a roast down me! i...
1030                           Its good, we'll find a way
2621                                            How come?
3670    Yeah imma come over cause jay wants to do some...
1176    Horrible u eat macs eat until u forgot abt me ...
2072         Good night my dear.. Sleepwell&amp;Take care
2658                          Dai  &lt;#&gt;  naal eruku.
2959    U have a secret admirer. REVEAL who thinks U R...
1493    In the end she might still vomit but its okay....
4636           How come i din c ü... Yup i cut my hair...
229     Life i

In [325]:
# Message(s) to classify: replace the placeholder string with the mail text.
# Kept as a list because it is passed directly to feature_extraction.transform.
input_mail = ["  "]

In [300]:
# Vectorize the input with the TF-IDF vectorizer fitted on the training split,
# then classify it with the logistic regression model.
input_data_features = feature_extraction.transform(input_mail)
lr_prediction = lr_model.predict(input_data_features)

print("logistic Regression", lr_prediction)
# Report a verdict for every entry of input_mail, not only the first one.
# Labels were encoded as 1 = ham, 0 = spam.
for lr_label in lr_prediction:
    print('Ham mail' if lr_label == 1 else 'spam mail')

logistic Regression [1]
Ham mail


In [314]:
# Vectorize the input with the TF-IDF vectorizer fitted on the training split,
# then classify it with the multinomial Naive Bayes model.
input_data_features = feature_extraction.transform(input_mail)
nb_prediction = nb_model.predict(input_data_features)

print("NaiveBayes", nb_prediction)
# Report a verdict for every entry of input_mail, not only the first one.
# Labels were encoded as 1 = ham, 0 = spam.
for nb_label in nb_prediction:
    print('Ham mail' if nb_label == 1 else 'spam mail')

NaiveBayes [1]
Ham mail
