# Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data Collection and Pre-Processing

In [2]:
# Read the e-mail corpus from the CSV file into a pandas DataFrame, then preview it.
dataset = pd.read_csv(filepath_or_buffer='email_data.csv')
dataset

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# Replacing null values with an empty string

In [5]:
# Keep every non-null cell unchanged and substitute '' wherever a value is missing.
dataset = dataset.where(dataset.notnull(), other='')
dataset

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# checking number of rows and columns in dataset

In [6]:
# Shape of the DataFrame as a (rows, columns) tuple.
dataset.shape

(5572, 2)

# Label encoding: label spam mail as 0 and ham mail as 1

In [7]:
# Encode the target column in place: 'spam' rows become 0, 'ham' rows become 1.
dataset.loc[dataset['Category'].eq('spam'), 'Category'] = 0
dataset.loc[dataset['Category'].eq('ham'), 'Category'] = 1

# separating the data as texts and labels

In [9]:
# Features (x) are the message texts; targets (y) are the Category column.
x, y = dataset['Message'], dataset['Category']

In [10]:
# Preview the message texts selected as model input.
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [11]:
# Preview the Category labels selected as the target (0 = spam, 1 = ham).
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

# Splitting the data into training data and testing data: 80% of the data is used for training and the remaining 20% for testing.

In [14]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [15]:
# Preview the training messages produced by the split.
x_train

1114    No I'm good for the movie, is it ok if I leave...
3589    If you were/are free i can give. Otherwise nal...
3095    Have you emigrated or something? Ok maybe 5.30...
1012          I just got home babe, are you still awake ?
3320                      Kay... Since we are out already
                              ...                        
4931    Hi, the SEXYCHAT girls are waiting for you to ...
3264                              So u gonna get deus ex?
1653    For ur chance to win a £250 cash every wk TXT:...
2607    R U &SAM P IN EACHOTHER. IF WE MEET WE CAN GO ...
2732    Mm feeling sleepy. today itself i shall get th...
Name: Message, Length: 4457, dtype: object

# printing shape of x, x_train, x_test

In [16]:
# Report the sizes of the full set, the training split and the test split, one per line.
print(x.shape, x_train.shape, x_test.shape, sep='\n')

(5572,)
(4457,)
(1115,)


# Feature extraction: converting the text data into numerical data that can be used as input to Logistic Regression

In [17]:
# We are using TfidfVectorizer: it scores each word by how often it appears in a mail, down-weighted by how many mails contain it.
# min_df=1: keep every word that appears in at least one mail (no rare-word filtering).
# stop_words='english': common English words are ignored.
# lowercase=True: text is converted to lowercase before tokenizing, so matching is case-insensitive.

In [26]:
# Configure the TF-IDF vectorizer used to turn message text into numeric features.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the training messages only ...
x_train_feature = feature_extraction.fit_transform(x_train)
# ... and reuse that fitted vocabulary (no refit) for the test messages.
x_test_feature = feature_extraction.transform(x_test)

# converting y_train and y_test values as integers

In [27]:
# Cast both label Series to an integer dtype.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [28]:
# Preview the raw training messages again (x_train itself is still text; the features live in x_train_feature).
x_train

1114    No I'm good for the movie, is it ok if I leave...
3589    If you were/are free i can give. Otherwise nal...
3095    Have you emigrated or something? Ok maybe 5.30...
1012          I just got home babe, are you still awake ?
3320                      Kay... Since we are out already
                              ...                        
4931    Hi, the SEXYCHAT girls are waiting for you to ...
3264                              So u gonna get deus ex?
1653    For ur chance to win a £250 cash every wk TXT:...
2607    R U &SAM P IN EACHOTHER. IF WE MEET WE CAN GO ...
2732    Mm feeling sleepy. today itself i shall get th...
Name: Message, Length: 4457, dtype: object

In [30]:
# Print the training feature matrix; each stored entry is shown as (row, column) followed by its weight.
print(x_train_feature)

  (0, 3422)	0.6418008618863358
  (0, 3960)	0.40459749284424307
  (0, 4776)	0.2937599690543961
  (0, 4486)	0.4933198981059812
  (0, 3101)	0.30778739607068667
  (1, 3855)	0.4410710256765374
  (1, 4574)	0.4410710256765374
  (1, 2534)	0.4410710256765374
  (1, 814)	0.4410710256765374
  (1, 4555)	0.4205367990464199
  (1, 2902)	0.2120712188920981
  (2, 3398)	0.5133141633463273
  (2, 1317)	0.34462014146959175
  (2, 432)	0.4077104256374456
  (2, 4294)	0.36445133334144264
  (2, 2503)	0.5133141633463273
  (2, 4776)	0.2349500626979615
  (3, 1138)	0.6489221209014988
  (3, 1160)	0.44843330753299465
  (3, 3378)	0.38536596088088965
  (3, 3118)	0.3618113574629584
  (3, 3778)	0.31367701143832527
  (4, 3805)	1.0
  (5, 3731)	0.6020708068994186
  (5, 7381)	0.7984426989330436
  :	:
  (4454, 348)	0.2816333253882664
  (4454, 110)	0.3000941484572203
  (4454, 2067)	0.25658354936739225
  (4454, 4488)	0.3000941484572203
  (4454, 651)	0.3000941484572203
  (4454, 373)	0.23959800001827322
  (4454, 796)	0.28163332538

# training the model.... logistic regression

In [31]:
# Instantiate the classifier with scikit-learn's default hyper-parameters.
model = LogisticRegression()

In [32]:
# Fit the logistic regression classifier on the TF-IDF training features and their labels.
model.fit(X=x_train_feature, y=y_train)

# Evaluating the trained model

In [35]:
# Predict labels for the samples the model was trained on, then score them against the true labels.
prediction_on_training_data = model.predict(X=x_train_feature)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [36]:
# Report the fraction of training samples that were classified correctly.
print("Accuracy on training data is : ",accuracy_on_training_data)

Accuracy on training data is :  0.9679156383217411


# prediction on test data

In [37]:
# Predict labels for the held-out test features, then score them against the true test labels.
prediction_on_testing_data = model.predict(X=x_test_feature)
accuracy_on_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_testing_data,
)

In [38]:
# Report the fraction of held-out test samples that were classified correctly.
print("Accuracy on testing data is : ",accuracy_on_testing_data)

Accuracy on testing data is :  0.9668161434977578


# Building a predictive system

In [43]:
# Sample message to classify; it is wrapped in a list because the vectorizer takes a collection of documents.
mail = ["FREE FREE FREE ! By clicking on the folloiwng link you will get FREE laptop and FREE samsung Tablet. Now do not wait and just click on link and get your GIFT for FREE"]

# Vectorize with the already-fitted TF-IDF vocabulary (transform only, no refit).
mail_feature = feature_extraction.transform(mail)

# The model returns one label per input document: 0 is the spam label, any other label is reported as ham.
prediction = model.predict(mail_feature)
print("This is a spam mail" if prediction[0] == 0 else "This is a ham mail")

[0]
This is a spam mail
