### Importing The Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Data Collection And Preprocessing

In [2]:
# Load the data from the CSV file into a pandas DataFrame.
# The location lives in one named constant so it can be changed in a single place
# when the notebook is run somewhere other than this Colab Drive path.
MAIL_DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Datasets/mail_data.csv'
raw_mail_data = pd.read_csv(MAIL_DATA_PATH)

In [3]:
# Preview the top five rows of the raw data
raw_mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Count how many entries in the Message column are missing (NaN)
raw_mail_data['Message'].isna().sum()

0

In [5]:
# Replace any missing values with an empty string so every cell holds text.
# fillna returns a new DataFrame, so raw_mail_data itself is left unchanged.
mail_data = raw_mail_data.fillna('')

In [6]:
# Preview the top five rows of the cleaned data
mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Dimensions of the dataset as (rows, columns)
mail_data.shape

(5572, 2)

In [8]:
# Count the messages per class to see how balanced the dataset is
mail_data['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


### Label Encoding  
* Spam mail --> 1
* Ham mail --> 0

In [9]:
# Create the encoder that will turn the string class labels into integer codes
label_encoder = LabelEncoder()

In [10]:
# Fit the encoder on the Category column and encode it in one step.
# LabelEncoder numbers the classes in sorted order, so 'ham' becomes 0 and 'spam' becomes 1.
labels = label_encoder.fit_transform(mail_data['Category'])

In [11]:
# Overwrite the string labels in the Category column with their integer codes
mail_data['Category'] = labels

In [12]:
# Class counts after encoding: 0 = ham, 1 = spam
mail_data['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
0,4825
1,747


In [13]:
# Preview the top five rows now that Category holds integer codes
mail_data.head(5)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Separate the frame into model input (message text) and target (encoded label)
x, y = mail_data['Message'], mail_data['Category']

In [15]:
# Inspect the message texts that serve as model input
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [16]:
# Inspect the encoded labels that serve as the prediction target
print(y)

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: int64


### Train Test Split

In [17]:
# Hold out 20% of the messages for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=3,
)

In [18]:
# Compare the sizes of the full set, the training split and the test split
print(x.shape, x_train.shape, x_test.shape)

(5572,) (4457,) (1115,)


### Feature Extraction

In [19]:
# Build the vectorizer that turns the text data into TF-IDF feature vectors usable as
# input to the Logistic Regression model.
# min_df = 1: a term must appear in at least 1 document to be kept in the vocabulary
#             (it is a document-frequency cut-off, not a score threshold).
# stop_words = 'english': common English stop words are dropped.
# lowercase = True: text is lower-cased before tokenizing.
feature_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)

In [20]:
# Convert the text data of x_train and x_test into numerical TF-IDF features.
# The vectorizer is fitted on the training split only; the test split is transformed
# with that same fitted vocabulary.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# y_train and y_test are already of int64 dtype, so they need no conversion.

In [21]:
# Optional inspection of the sparse TF-IDF training matrix: uncomment to print its
# non-zero entries as "(row, column)  weight".
# NOTE(review): the output kept under this cell presumably comes from a run with the
# print enabled; confirm before clearing it.
# print(x_train_features)

  (0, 7329)	0.39151450331197035
  (0, 2596)	0.5157331716075019
  (0, 4795)	0.6459507464707183
  (0, 6736)	0.40433070936297943
  (1, 1793)	0.43486660333673016
  (1, 4861)	0.596185515774092
  (1, 3112)	0.31103507183699425
  (1, 3758)	0.2826422333927384
  (1, 6887)	0.528038275197618
  (2, 2903)	1.0
  (3, 5081)	0.4169087023760639
  (3, 7198)	0.3971508483254661
  (3, 3373)	0.26859638268284747
  (3, 4040)	0.24099748417300504
  (3, 4692)	0.43001182720880177
  (3, 3909)	0.3260348921371232
  (3, 758)	0.37620667903348365
  (3, 3092)	0.32479862316475455
  (4, 3911)	0.2511783165875194
  (4, 3082)	0.4766800108257892
  (4, 5766)	0.6833422922401592
  (4, 6339)	0.37251069778964124
  (4, 3373)	0.23999265394731062
  (4, 4040)	0.21533285461106833
  (5, 4413)	0.4460096390714086
  :	:
  (4452, 1180)	0.8777703340143531
  (4452, 4770)	0.4790816639408472
  (4453, 1853)	0.5659242420057378
  (4453, 5704)	0.5659242420057378
  (4453, 1891)	0.4268643677817285
  (4453, 5098)	0.31370317391845537
  (4453, 5831)	0.280

In [22]:
# Optional inspection of the sparse TF-IDF test matrix: uncomment to print its
# non-zero entries as "(row, column)  weight".
# NOTE(review): the output kept under this cell presumably comes from a run with the
# print enabled; confirm before clearing it.
# print(x_test_features)

  (0, 6069)	0.17898756886133724
  (0, 5592)	0.27040646184211636
  (0, 4334)	0.27040646184211636
  (0, 4333)	0.4043741386311273
  (0, 4258)	0.48391496454894295
  (0, 4199)	0.27040646184211636
  (0, 3326)	0.15053858929369243
  (0, 2885)	0.2315106978077796
  (0, 900)	0.25781758783662617
  (0, 356)	0.27040646184211636
  (0, 355)	0.2488856395843301
  (0, 170)	0.27040646184211636
  (1, 7381)	0.195283181583766
  (1, 7046)	0.24076699187385958
  (1, 6893)	0.41865401615879083
  (1, 4589)	0.17770109251757912
  (1, 3826)	0.2982254780251209
  (1, 3563)	0.26502407200930533
  (1, 3306)	0.22105285481388076
  (1, 2541)	0.23025505509842506
  (1, 2034)	0.2921850764758712
  (1, 1822)	0.5489480368150532
  (1, 312)	0.22197489844787852
  (2, 7156)	0.33671672463654934
  (2, 7140)	0.41087739018331193
  :	:
  (1112, 4597)	0.23602727834779066
  (1112, 4129)	0.5588600780072547
  (1112, 3173)	0.5581704944150844
  (1112, 2167)	0.173625894465186
  (1112, 2033)	0.2250389400930859
  (1112, 1292)	0.23602727834779066
  

### Model Training: Logistic Regression Model

In [23]:
# Logistic Regression classifier created with no arguments, i.e. the library's default settings
model = LogisticRegression()

In [24]:
# Train the Logistic Regression model on the TF-IDF training features and their encoded labels
model.fit(x_train_features, y_train)

### Model Evaluation

In [25]:
# Predict on the training data and score the predictions.
# accuracy_score takes the ground-truth labels first and the predictions second.
training_data_prediction = model.predict(x_train_features)
training_data_accuracy = accuracy_score(y_train, training_data_prediction)

In [26]:
# Report the fraction of training messages the model classifies correctly
print("Accuracy on training data:", training_data_accuracy)

Accuracy on training data: 0.9681400044873233


In [27]:
# Predict on the held-out test data and score the predictions.
# accuracy_score takes the ground-truth labels first and the predictions second.
testing_data_prediction = model.predict(x_test_features)
testing_data_accuracy = accuracy_score(y_test, testing_data_prediction)

In [28]:
# Report the fraction of held-out test messages the model classifies correctly
print("Accuracy on testing data:", testing_data_accuracy)

Accuracy on testing data: 0.9641255605381166


### Predictive System

In [29]:
# Read one message from the user and wrap it in a list, because the vectorizer
# works on a collection of documents rather than on a bare string.
mail = input("Enter the mail: ")
input_mail = [mail]

# Convert the text to a feature vector with the already-fitted vectorizer
input_data_features = feature_extraction.transform(input_mail)

# Predict the class of the single message: 1 means spam, anything else is reported as ham
prediction = model.predict(input_data_features)

print("The mail is a spam mail" if prediction[0] == 1 else "The mail is a ham mail")

Enter the mail: The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free
The mail is a ham mail
