In [41]:
#importing packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [42]:
#Data collection and preprocessing

raw_data=pd.read_csv('maildata.csv')

In [43]:
raw_data

Unnamed: 0,Category,Message
0,ham,"Nah I don't think he goes to usf, he lives aro..."
1,ham,Even my brother is not like to speak with me. ...
2,ham,As per your request 'Melle Melle (Oru Minnamin...
3,ham,I'm gonna be home soon and i don't want to tal...
4,ham,I've been searching for the right words to tha...
...,...,...
1490,spam,Want explicit SEX in 30 secs? Ring 02073162414...
1491,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
1492,spam,Had your contract mobile 11 Mnths? Latest Moto...
1493,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [44]:
#repalce the mail values with a null string

mail_data=raw_data.where((pd.notnull(raw_data)),'') 

In [46]:
#printing the first five rows of the dataffram

mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Nah I don't think he goes to usf, he lives aro..."
1,ham,Even my brother is not like to speak with me. ...
2,ham,As per your request 'Melle Melle (Oru Minnamin...
3,ham,I'm gonna be home soon and i don't want to tal...
4,ham,I've been searching for the right words to tha...


In [47]:
#Checking the number of rows and columns in the dataframe

mail_data.shape

(1495, 2)

In [48]:
#Lable Encoding
#label spam mail as 1; ham mail as 0;
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()


mail_data['Category']= le.fit_transform(mail_data['Category'])
mail_data['Category'].value_counts()

0    748
1    747
Name: Category, dtype: int64

In [49]:
#Spam=1
#ham=0
#Seperating the data as text and labels

X=mail_data['Message']
print(X)

0       Nah I don't think he goes to usf, he lives aro...
1       Even my brother is not like to speak with me. ...
2       As per your request 'Melle Melle (Oru Minnamin...
3       I'm gonna be home soon and i don't want to tal...
4       I've been searching for the right words to tha...
                              ...                        
1490    Want explicit SEX in 30 secs? Ring 02073162414...
1491    ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
1492    Had your contract mobile 11 Mnths? Latest Moto...
1493    REMINDER FROM O2: To get 2.50 pounds free call...
1494    This is the 2nd time we have tried 2 contact u...
Name: Message, Length: 1495, dtype: object


In [50]:
Y=mail_data['Category']
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
1490    1
1491    1
1492    1
1493    1
1494    1
Name: Category, Length: 1495, dtype: int32


In [51]:
#Splitting the data into training data and testing data

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=3)

In [52]:
print(X.shape)
print(X_test.shape)
print(X_train.shape)

(1495,)
(299,)
(1196,)


In [53]:
#Feature Extraction
#Transform the text data to feature vectors that can be used as input to the logistic regression model

feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase='True')
X_train_features=feature_extraction.fit_transform(X_train)
X_test_features=feature_extraction.transform(X_test)

In [54]:
#Convert Y_train and Y_test values as Integers

Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')

In [55]:
print(X_train)

740                           Dude we should go sup again
51                     I plane to give on this month end.
824     Today's Offer! Claim ur £150 worth of discount...
566     Yes baby! We can study all the positions of th...
548                 Ya very nice. . .be ready on thursday
                              ...                        
789     Ringtone Club: Get the UK singles chart on you...
256     Oh and by the way you do have more food in you...
968     You have won a Nokia 7250i. This is what you g...
952     Congratulations ur awarded either £500 of CD g...
1273    You have been specially selected to receive a ...
Name: Message, Length: 1196, dtype: object


In [56]:
print(X_train_features)

  (0, 3248)	0.7267512887334335
  (0, 1399)	0.6869006946599293
  (1, 1435)	0.5053855521037061
  (1, 2342)	0.5118671671424306
  (1, 2601)	0.6946779447531173
  (2, 2823)	0.14876146644957278
  (2, 3513)	0.2522714632762162
  (2, 298)	0.17867417501990981
  (2, 3214)	0.24678660408443692
  (2, 0)	0.24678660408443692
  (2, 132)	0.2752579216512388
  (2, 1227)	0.18735378662229996
  (2, 2324)	0.14782789379384276
  (2, 2473)	0.23370598056998457
  (2, 2276)	0.25860326852853616
  (2, 2917)	0.24678660408443692
  (2, 585)	0.2522714632762162
  (2, 3758)	0.19114806034353965
  (2, 3326)	0.1442923423417189
  (2, 3589)	0.21125646354883518
  (2, 1342)	0.24194861540583357
  (2, 3724)	0.22684416243284067
  (2, 287)	0.22380014570941428
  (2, 3525)	0.14026210934890468
  (2, 1111)	0.14923633218537297
  :	:
  (1194, 1677)	0.2451380739483686
  (1194, 250)	0.2037880980117859
  (1194, 1384)	0.2051539894181722
  (1194, 472)	0.1975259056784092
  (1194, 817)	0.2051539894181722
  (1194, 1602)	0.13777879221430384
  (1194,

In [57]:
#Training The Model
#Logistic Regression

model=LogisticRegression()

In [58]:
#Training the Logistic Regression Model with the training data

model.fit(X_train_features,Y_train)

LogisticRegression()

In [59]:
#Evaluating the trained model
#Prediction on training data
prediction_on_training_data=model.predict(X_train_features)
accuracy_on_training_data=accuracy_score(Y_train,prediction_on_training_data)

In [60]:
print('Accuracy on training data:',accuracy_on_training_data*100)

Accuracy on training data: 98.24414715719062


In [61]:
#Prediction on training data

prediction_on_test_data=model.predict(X_test_features)
accuracy_on_test_data=accuracy_score(Y_test,prediction_on_test_data)

In [62]:
print('Accuracy on test data:',accuracy_on_test_data)

Accuracy on test data: 0.9498327759197325


In [66]:
#Building a predictive System

input_mail=["Reminder: You have not downloaded the content you have already paid for. Goto http://doit. mymoby. tv/ to collect your content"]


In [67]:
#convert text to feature vectors

input_data_features=feature_extraction.transform(input_mail)

In [68]:
#making predictions

prediction=model.predict(input_data_features)
print(prediction)

if (prediction[0]==0):
    print('Ham mail')
else:
    print('Spam mail')

[1]
Spam mail
