Importing Libraries for Model

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

Loading the data into a pandas DataFrame

In [2]:
# Location of the CSV file. Set the MAIL_DATA_PATH environment variable to point
# at your own copy of the dataset; when it is unset, the original local path is
# used so existing setups keep working.
MAIL_DATA_PATH = os.environ.get(
    'MAIL_DATA_PATH',
    '/home/haroonwaheed-19/Downloads/DataSets for ML/mail_data.csv',
)

# Later cells read the 'Category' and 'Message' columns of this frame.
mail_data = pd.read_csv(MAIL_DATA_PATH)

Exploratory analysis of the dataset

In [3]:
# Preview the first five rows to check that the file was parsed as expected.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Count the missing values in each column (isna is the same check as isnull).
mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [5]:
# Class balance: number of messages per category.
mail_data.loc[:, 'Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

Label Encoding of the Category Column

In [6]:

# Encode the text labels as numbers directly in the 'Category' column:
# spam -> 0, ham -> 1. The prediction cell at the end relies on this mapping.
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

Separating the features and labels from the data

In [23]:
# Shuffle the rows of the whole frame once, then pick the columns. Shuffling the
# frame (rather than shuffling the two columns separately with matching seeds)
# guarantees that every message stays paired with its own label.
shuffled_mail_data = mail_data.sample(frac=1, random_state=42)

x = shuffled_mail_data['Message']    # feature: raw message text
y = shuffled_mail_data['Category']   # label: 0 = spam, 1 = ham

# Sanity check: features and labels must refer to the same rows in the same order.
assert x.index.equals(y.index)

Testing and Training Data Separation

In [24]:
# Hold out 20% of the messages for testing. Stratifying on the labels keeps the
# spam/ham proportions the same in both partitions; the fixed seed makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)

Feature extraction: converting the text data into numerical form (TF-IDF vectors)

In [25]:
# TF-IDF vectorizer: lower-cases the text, drops English stop words and keeps
# every term that occurs in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [26]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer on the test messages so no test information
# leaks into the features.
x_train_feature = feature_extraction.fit_transform(raw_documents=x_train)
x_test_feature = feature_extraction.transform(raw_documents=x_test)

In [27]:
# The encoded labels were written into the existing 'Category' column, so cast
# them to integers before handing them to the classifier.
y_train, y_test = y_train.astype(int), y_test.astype(int)

In [16]:
# Show the sparse TF-IDF matrices: test features first, then training features.
print(x_test_feature, x_train_feature, sep="\n")

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 7643 stored elements and shape (1115, 7496)>
  Coords	Values
  (0, 45)	0.23885705786351533
  (0, 398)	0.2831628958086886
  (0, 1283)	0.19658332365071185
  (0, 1585)	0.2137047703002642
  (0, 2046)	0.1984270278883612
  (0, 2375)	0.466320953046431
  (0, 2906)	0.28559070500052114
  (0, 3154)	0.17077412764771363
  (0, 3912)	0.20823705036803863
  (0, 4365)	0.2003761424782757
  (0, 4419)	0.2358899506086862
  (0, 4625)	0.1629132197579507
  (0, 4755)	0.2137047703002642
  (0, 4827)	0.21680692811499552
  (0, 5030)	0.24210715613503428
  (0, 6513)	0.2657436287350355
  (0, 6885)	0.15260537497993798
  (1, 4768)	0.5087543968611168
  (1, 6271)	0.681815528764269
  (1, 7196)	0.5256391808173945
  (2, 1866)	0.24268764995095132
  (2, 2062)	0.44326719810193344
  (2, 3154)	0.2848556310871354
  (2, 3339)	0.27401894009219313
  (2, 3539)	0.47232298297806397
  :	:
  (1111, 6001)	0.2671794976463982
  (1111, 6591)	0.29456140032492945
  (1111, 7141)	0.190

Logistic Regression model creation and training

In [28]:
# Create a logistic-regression classifier with default settings and fit it on
# the TF-IDF features of the training messages.
model = LogisticRegression()
model.fit(X=x_train_feature, y=y_train)

Evaluation on Training Data

In [29]:
# Predict labels for the messages the model was trained on.
x_train_predictions = model.predict(x_train_feature)

# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
trainning_prediction_accuracy = accuracy_score(y_train, x_train_predictions)

In [30]:
# Report the training accuracy as a percentage.
print("Accuracy on training data is : ", 100 * trainning_prediction_accuracy)

Accuracy on training data is :  96.76912721561588


Evaluation of the model on test data

In [31]:
# Predict labels for the held-out messages the model has not seen.
x_test_predictions = model.predict(x_test_feature)

# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
test_prediction_accuracy = accuracy_score(y_test, x_test_predictions)

In [32]:
# Report the held-out (test) accuracy as a percentage.
print("Accuracy on test data is : ", 100 * test_prediction_accuracy)

Accuracy on test data is :  96.68161434977578


Model Predictor System

In [34]:
# Message to classify; the vectorizer expects a list of documents.
input_data = ["Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged"]

# Reuse the vectorizer fitted on the training data so the features line up with
# what the model was trained on.
input_data_feature = feature_extraction.transform(input_data)
prediction = model.predict(input_data_feature)

# Label encoding used earlier in the notebook: 0 = spam, anything else = ham.
print("This is a Spam Email" if prediction[0] == 0 else "This is a Ham Email")

This is a Spam Email
