In [47]:
# Import Laibraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [48]:
# Loading Data

raw_mail_data = pd.read_csv("/home/ihsan/Documents/GitHub/ML-AI-ICT-course/ML Projects/Spam Mail Prediction/mail_data.csv")
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
#replace all null values with a null string 
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [50]:
raw_mail_data.shape

(5572, 2)

Label Encoding

In [51]:
# label spam mail:0
# ham mail:1
raw_mail_data.loc[mail_data['Category'] == 'spam', 'Catagory'] = 0
raw_mail_data.loc[mail_data['Category'] == 'ham', 'Catagory'] = 1

In [52]:
raw_mail_data.head()

Unnamed: 0,Category,Message,Catagory
0,ham,"Go until jurong point, crazy.. Available only ...",1.0
1,ham,Ok lar... Joking wif u oni...,1.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0.0
3,ham,U dun say so early hor... U c already then say...,1.0
4,ham,"Nah I don't think he goes to usf, he lives aro...",1.0


In [53]:
# seperating the data as texts and label

x = mail_data['Message']
y = mail_data['Category']

Splitting train and test data

In [54]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [55]:
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4457,) (1115,)


Feature Extraction

In [56]:
# transform the test data to feature vectors that can be used as input to the logistic regression 

feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase='True')

In [57]:
y_train.head()

3890     ham
5553     ham
4366     ham
3968    spam
3771     ham
Name: Category, dtype: object

In [58]:
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# # convert y_train and y_test views as integers

# y_train = y_train.astype('int')
# y_test = y_test.astype('int')

In [59]:
print(x_train)

3890                    Unlimited texts. Limited minutes.
5553                          Hahaha..use your brain dear
4366    Ujhhhhhhh computer shipped out with address to...
3968    YOU HAVE WON! As a valued Vodafone customer ou...
3771    Love it! The girls at the office may wonder wh...
                              ...                        
3335    That's fine, have him give me a call if he kno...
1099    NO GIFTS!! You trying to get me to throw mysel...
2514    U have won a nokia 6230 plus a free digital ca...
3606                      Jordan got voted out last nite!
2575    Your next amazing xxx PICSFREE1 video will be ...
Name: Message, Length: 4457, dtype: object


In [60]:
print(x_train_features)

  (0, 4334)	0.42941702167641554
  (0, 3958)	0.6161071828926097
  (0, 6586)	0.44333254982109394
  (0, 6927)	0.48935591439341625
  (1, 2121)	0.3573617143022146
  (1, 1428)	0.5869421390016223
  (1, 6971)	0.42812434651556874
  (1, 3168)	0.5869421390016223
  (2, 5115)	0.3408491178137899
  (2, 7353)	0.31988118061968496
  (2, 3852)	0.3408491178137899
  (2, 4884)	0.35749230587184955
  (2, 5695)	0.35749230587184955
  (2, 806)	0.26730249393705324
  (2, 5894)	0.35749230587184955
  (2, 1876)	0.28751725124107325
  (2, 6878)	0.35749230587184955
  (3, 197)	0.36522237107066735
  (3, 3723)	0.16297045459835785
  (3, 2435)	0.26698378141852
  (3, 1825)	0.26858331513730566
  (3, 5231)	0.2266831802864503
  (3, 300)	0.2915969875465198
  (3, 7248)	0.23571908490908416
  (3, 5005)	0.3169028431039865
  :	:
  (4454, 2244)	0.2526916142542512
  (4454, 666)	0.28653660324238944
  (4454, 1575)	0.20946314330145205
  (4454, 1094)	0.24862733340971144
  (4454, 5068)	0.22284357632450164
  (4454, 311)	0.19547195974237946
  

In [61]:
print(y_train)

3890     ham
5553     ham
4366     ham
3968    spam
3771     ham
        ... 
3335     ham
1099     ham
2514    spam
3606     ham
2575    spam
Name: Category, Length: 4457, dtype: object


In [62]:
model = LogisticRegression()

In [63]:
model.fit(x_train_features, y_train)

In [64]:
# prediction on training data

prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

In [65]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9683643706529056


In [67]:
# prediction on test data

prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

In [68]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9524663677130045


Building a Predictive System

In [69]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

['ham']
Spam mail
