In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split #function for training and testing
from sklearn.feature_extraction.text import TfidfVectorizer #text to numerical data
from sklearn.linear_model import LogisticRegression #the algorithm that we use
from sklearn.metrics import accuracy_score #to evaluate our model
import pickle

In [46]:
# Load the labelled messages (columns: Category, Message) from CSV.
# The path is written as a raw string so every backslash is taken literally:
# in a normal string "\w", "\m" and "\s" are invalid escape sequences that only
# work by accident and trigger a warning on newer Python versions.
# NOTE(review): this is an absolute, machine-specific path — adjust it (or make
# it relative to the notebook) before running elsewhere.
MAIL_DATA_PATH = r"D:\workspace python\mlprojects\spam prediction project\mail_data.csv"
raw_mail_data = pd.read_csv(MAIL_DATA_PATH)
raw_mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [47]:
# Build a cleaned copy of the raw frame in which every missing cell is
# replaced by an empty string; non-missing cells are kept as they are.
not_missing = raw_mail_data.notna()
mail_data = raw_mail_data.where(not_missing, "")
mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [48]:
# Sanity check: display the dimensions of the cleaned frame as (rows, columns)
mail_data.shape

(5572, 2)

In [49]:
# Label encoding, applied in place on the Category column:
#   spam -> 0
#   ham  -> 1
# Rows whose Category is neither value are left untouched.

# every row currently labelled 'spam' becomes 0
is_spam = mail_data["Category"] == 'spam'
mail_data.loc[is_spam, "Category"] = 0

# every row currently labelled 'ham' becomes 1
is_ham = mail_data["Category"] == 'ham'
mail_data.loc[is_ham, "Category"] = 1

In [50]:
# Separate the frame into model inputs (message text) and targets (encoded category)
x, y = mail_data['Message'], mail_data['Category']

In [51]:
# Hold out 20% of the messages for testing.
# A fixed random_state makes the split come out the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

# Show the size of the full set, then of the train and test partitions (one per line)
print(x.shape, x_train.shape, x_test.shape, sep="\n")

(5572,)
(4457,)
(1115,)


In [52]:
# Transform the message text into TF-IDF feature vectors that can be used as
# input to the logistic regression.

# `lowercase` must be a real boolean. The string 'True' only worked because any
# non-empty string is truthy; newer scikit-learn releases validate parameter
# types and reject it.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit on the training messages only, then reuse the fitted vectorizer for the
# test messages (the test data is never used for fitting).
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# Persist the *fitted* vectorizer. Dumping a brand-new TfidfVectorizer (as was
# done before) stores an unfitted object that cannot transform text, so the
# saved file would be useless together with the saved model.
with open("feature_extraction_SM", "wb") as f:
    pickle.dump(feature_extraction, f)

# Round-trip check of the saved file.
# NOTE: pickle.load can execute arbitrary code — only load files you created.
with open("feature_extraction_SM", "rb") as f:
    fe = pickle.load(f)

# The label column holds Python objects after the in-place encoding;
# convert y train and test to integers.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Print the TF-IDF matrix for the training messages
print(x_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [53]:
# Train the logistic regression classifier.
# Inputs are the TF-IDF features of the training messages; targets are the
# integer-encoded labels.
model = LogisticRegression()
model.fit(X=x_train_features, y=y_train)

LogisticRegression()

In [54]:
# Model evaluation, part 1: accuracy on the data the model was trained on
predict_training_data = model.predict(x_train_features)
accuracy = accuracy_score(y_true=y_train, y_pred=predict_training_data)
accuracy

0.9670181736594121

In [55]:
# Prediction on the held-out test data.
# The test predictions get their own name so they neither overwrite
# `predict_training_data` nor get mistaken for training predictions.
predict_test_data = model.predict(x_test_features)
accuracy = accuracy_score(y_test, predict_test_data)
accuracy

0.9659192825112107

In [56]:
# Classify a single example message with the trained model.
mail = ["TODAY is Sorry day.! If ever i was angry with you, if ever i misbehaved or hurt you? plz plz JUST SLAP URSELF Bcoz, Its ur fault, I'm basically GOOD"]

# Convert the text to a feature vector using the already-fitted vectorizer
mail = feature_extraction.transform(mail)

# Exactly one message was passed in, so look at the single prediction: 1 means ham
print("HAM" if model.predict(mail)[0] == 1 else "SPAM")

HAM


In [57]:
# Classify a second example message with the trained model.
mail = ["Here is your discount code RP176781. To stop further messages reply stop. www.regalportfolio.co.uk. Customer Services 08717205546"]

# Convert the text to a feature vector using the already-fitted vectorizer
mail = feature_extraction.transform(mail)

# Exactly one message was passed in, so look at the single prediction: 1 means ham
predicted_label = model.predict(mail)[0]
if predicted_label != 1:
    print("SPAM")
else:
    print("HAM")

SPAM


In [58]:
# Serialize the trained classifier and write it to disk
# (pickle is already imported at the top of the notebook)
with open("spam_mail_prediction_model", "wb") as model_file:
    model_file.write(pickle.dumps(model))

In [59]:
# Read the saved classifier back from disk and run it on the most recent
# feature vector held in `mail`.
# NOTE: unpickling can execute arbitrary code — only load files you created.
with open("spam_mail_prediction_model", "rb") as model_file:
    mp = pickle.loads(model_file.read())
mp.predict(mail)

array([0])