In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

Data Preprocessing

In [None]:
# Read the e-mail corpus from CSV into a pandas DataFrame.
raw_mail_data = pd.read_csv('spam_ham_dataset.csv')
# Keep every non-null cell as it is and put a single-space string in place of each null.
mail_data = raw_mail_data.where(raw_mail_data.notna(), ' ')

In [None]:
# Show the DataFrame's dimensions as a (rows, columns) tuple.
mail_data.shape

(5171, 4)

In [None]:
# Preview the first rows of the DataFrame to check the columns and their contents.
mail_data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
# Encode the target column in place: spam -> 0, ham (non-spam) -> 1.
mail_data.loc[mail_data['label'].eq('spam'), 'label'] = 0
mail_data.loc[mail_data['label'].eq('ham'), 'label'] = 1


In [None]:
# Separate the data into inputs and targets: x --> e-mail text, y --> label.
x, y = mail_data['text'], mail_data['label']

In [None]:
# Print the texts, a dotted divider, then the labels, one item per line.
print(x, '........', y, sep='\n')

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object
........
0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: object


Train Test Split

In [None]:
from sklearn.model_selection import  train_test_split

In [None]:
# Hold out 20% of the samples for testing and train on the other 80%;
# the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=3,
)

Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [None]:
# Transform the raw e-mail text into TF-IDF feature vectors that can be used as
# input to the SVM model.
# NOTE: `lowercase` expects a boolean. The string 'True' only worked because any
# non-empty string is truthy, and scikit-learn releases that validate parameter
# types reject it, so pass the real boolean.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer for the test split so no test information leaks into the features.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# The label Series is object-dtype at this point; cast it to an integer dtype
# before handing it to the classifier and the metric.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

Training the model -> Support Vector Machine

In [None]:
# Train a linear support vector classifier on the TF-IDF training features.
# random_state is pinned (same seed value as the train/test split) because the
# solver behind LinearSVC uses a pseudo-random number generator while fitting;
# without a seed, repeated "Restart & Run All" runs can give slightly different
# models and accuracy figures.
model = LinearSVC(random_state=3)
model.fit(x_train_features, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

Evaluation of the Model

In [None]:
# Score the fitted model on the data it was trained on.
prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)
print('Accuracy on training data:', accuracy_on_training_data)


Accuracy on training data: 1.0


In [None]:
# Score the fitted model on the held-out test split.
prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)
print('Accuracy on testing data:', accuracy_on_test_data)

Accuracy on testing data: 0.9864734299516909


Prediction on new mail

In [None]:
# One raw e-mail to classify, wrapped in a list because the vectorizer is given a collection of documents.
input_mail = ["Subject: photoshop , windows , office . cheap . main trending abasements darer prudently fortuitous undergone lighthearted charm orinoco taster railroad affluent pornographic cuvier irvin parkhouse blame worthy chlorophyll robed diagrammatic fogarty clears bayda inconveniencing managing represented smartness hashish academies shareholders unload badness danielson pure caffein spaniard chargeable levin"]
# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit).
input_mail_features = feature_extraction.transform(input_mail)
# Predict the label and show the raw prediction array.
prediction = model.predict(input_mail_features)
print(prediction)
# Label encoding used in this notebook: 1 means ham; any other value is reported as spam.
print('HAM MAIL' if prediction[0] == 1 else 'SPAM MAIL')

[0]
SPAM MAIL
