# **Email_Classifier**

In [None]:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Install the Kaggle API credentials, then download the dataset archive.
# NOTE(review): kaggle.json must already be present in the working directory;
# otherwise the cp and chmod steps fail with "No such file or directory".
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle
!kaggle datasets download -d purusinghvi/email-spam-classification-dataset

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/purusinghvi/email-spam-classification-dataset
License(s): MIT
Downloading email-spam-classification-dataset.zip to /content
 95% 41.0M/43.0M [00:00<00:00, 73.5MB/s]
100% 43.0M/43.0M [00:00<00:00, 66.8MB/s]


## **Create a Pandas DataFrame**

In [None]:
# Load the downloaded archive into a DataFrame; read_csv is given the .zip
# path directly, so no separate unzip step is performed.
df = pd.read_csv('/content/email-spam-classification-dataset.zip')
# Display (rows, columns) as a quick sanity check on the load.
df.shape

(83448, 2)

## **Separating Data for Analysis**

In [None]:
# Count the null values in each column.
df.isnull().sum()

label    0
text     0
dtype: int64

In [None]:
# Partition the emails by class to see how much data is available for each:
# rows with label 0 go to `ham`, rows with label 1 go to `spam`.
ham = df.loc[df['label'] == 0]
spam = df.loc[df['label'] == 1]
ham.shape

(39538, 2)

In [None]:
spam.shape

(43910, 2)

## **Undersampling Spam Data for Better Predictions**

In [None]:
# Undersample the spam rows so both classes contribute the same number of rows.
# The sample size is derived from the data instead of being hard-coded, so it
# stays correct if the dataset changes; min() guards against asking for more
# rows than `spam` contains. random_state pins the draw so that a fresh
# Restart & Run All selects the same rows every time.
spam_sample = spam.sample(n=min(len(ham), len(spam)), random_state=2)
spam_sample

Unnamed: 0,label,text
43188,1,vip repl ai ica wat pw ches if you are looking...
29600,1,strong pleasesisal velasquez felicitousneater\...
68843,1,we don  t need to praise that we sell !\nwe h...
7586,1,using the speed and user - friendliness of the...
64389,1,when i am heard and what i say is solelyclear ...
...,...,...
48356,1,slowly the change of thanks having lit she had...
62256,1,thinking of breathing new life into your busin...
51516,1,"over 300 , 000 males in the world used our pro..."
68077,1,dear sirs aegis capital group llc aegis is a...


In [None]:
# Stack the ham rows and the sampled spam rows on top of each other (row-wise)
# to build the class-balanced DataFrame used for the rest of the notebook.
new_df = pd.concat([ham, spam_sample], axis='index')
new_df

Unnamed: 0,label,text
2,0,computer connection from cnn com wednesday es...
4,0,thanks for all your answers guys i know i shou...
5,0,larry king live at escapenumber escapenumber p...
6,0,michael pobega wrote i'm not sure if it's the ...
7,0,hi i have this error tr sample escapenumber es...
...,...,...
48356,1,slowly the change of thanks having lit she had...
62256,1,thinking of breathing new life into your busin...
51516,1,"over 300 , 000 males in the world used our pro..."
68077,1,dear sirs aegis capital group llc aegis is a...


In [None]:
new_df.shape

(79075, 2)

## **Splitting the data into features and targets**

In [None]:
# Of the two columns, `text` supplies the features (X) and `label` the target (Y).
X, Y = new_df['text'], new_df['label']

In [None]:
X.head()

2     computer connection from cnn com wednesday es...
4    thanks for all your answers guys i know i shou...
5    larry king live at escapenumber escapenumber p...
6    michael pobega wrote i'm not sure if it's the ...
7    hi i have this error tr sample escapenumber es...
Name: text, dtype: object

In [None]:
Y.head()

2    0
4    0
5    0
6    0
7    0
Name: label, dtype: int64

## **Splitting the data into training and testing data**

In [None]:
# Hold out 20% of the rows for testing with sklearn's train_test_split.
# stratify=Y keeps the class proportions the same in both splits, and
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
X_train.shape

(63260,)

In [None]:
X_test.shape

(15815,)

In [None]:
Y_train.shape

(63260,)

In [None]:
Y_test.shape

(15815,)

## **Convert the text data into numerical data**

In [None]:
# TF-IDF vectorizer: lower-cases the text, removes English stop words, and
# ignores terms that appear in fewer than two documents.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=2,
)

In [None]:
# Learn the vocabulary and IDF weights from the training text only, then
# reuse that fitted vectorizer on the test text so no test information
# leaks into the features.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [None]:
# Make sure both label vectors are stored as integers before model fitting.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# Inspect the sparse TF-IDF matrix: each printed entry is a
# (row index, column index) pair followed by its weight.
print(X_train_features)

  (0, 102253)	0.03568222596411488
  (0, 79858)	0.041799804993315266
  (0, 1747)	0.06130607350972955
  (0, 172)	0.059947824368614636
  (0, 1878)	0.053134608386872784
  (0, 98920)	0.07460120052173388
  (0, 19989)	0.08995711487864677
  (0, 1390)	0.05932686120125304
  (0, 296)	0.06442425212029491
  (0, 59568)	0.34245581972628536
  (0, 103494)	0.4138728730799779
  (0, 2611)	0.07466895319581561
  (0, 342)	0.06158676200402207
  (0, 1300)	0.15132193050662865
  (0, 1933)	0.18832635601282738
  (0, 380)	0.18861539024027865
  (0, 36557)	0.30742911555978486
  (0, 38993)	0.24940249952578422
  (0, 32820)	0.3142091440549812
  (0, 61921)	0.3104046548099834
  (0, 43344)	0.05346032298346717
  (0, 0)	0.10553265881244119
  (0, 102219)	0.05280775429114744
  (0, 1037)	0.06057505901006655
  (0, 72532)	0.06709568735521936
  :	:
  (63258, 73761)	0.1797251491813461
  (63258, 96525)	0.16391106246396897
  (63258, 90972)	0.08252739159220897
  (63258, 103514)	0.07474353133933621
  (63258, 74347)	0.07809735082118013


## **Model Learning**

### *Logistic Regression*

In [None]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()


In [None]:
# Train the logistic regression model on the TF-IDF training features and
# their integer labels.
model.fit(X_train_features, Y_train)

### *Multinomial NB*

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Create a MultinomialNB classifier with default hyperparameters.
mnb = MultinomialNB()

In [None]:
# Train the naive Bayes model on the same TF-IDF training features and labels
# used for the logistic regression model.
mnb.fit(X_train_features, Y_train)

## **Model Evaluation**

### *Logistic Regression*

In [None]:
# Accuracy of the logistic regression model on the training data.
# Despite its X_ prefix, X_train_prediction holds the predicted labels
# for the training rows, not features.
X_train_prediction = model.predict(X_train_features)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
training_data_accuracy

0.9901201391084413

In [None]:
# Accuracy of the logistic regression model on the held-out test data.
# Despite its X_ prefix, X_test_prediction holds the predicted labels
# for the test rows, not features.
X_test_prediction = model.predict(X_test_features)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
test_data_accuracy

0.9838128359152704

### *MultinomialNB Classifier*

In [None]:
# Predict on the training data and compute MultinomialNB's training accuracy.
# NOTE(review): `mnb_predicton` is spelled without the second "i"; it is a
# separate variable from any `mnb_prediction`.
mnb_predicton = mnb.predict(X_train_features)
mnb_train_accuracy = accuracy_score(Y_train, mnb_predicton)

In [None]:
mnb_train_accuracy

0.9804457793234271

In [None]:
# Predict on the held-out test data and compute MultinomialNB's test accuracy.
mnb_prediction = mnb.predict(X_test_features)
mnb_test_accuracy = accuracy_score(Y_test, mnb_prediction)

In [None]:
mnb_test_accuracy

0.9712930761934871

## **Prediction of email**

In [None]:
# Hand-written example emails used to spot-check both trained classifiers.
input_mail = ["Greetings! Your ticket for 'Emily In Paris' has been booked for 9PM",
              "StarsPwn has lunched a new game and we think you can be our testr. Click to dwnload th file",
              "We recently suspected a malicious activity from this computer. Please clck on the below link to verify your authorization!"]
# Vectorize the examples with the already-fitted TF-IDF vectorizer
# (transform only, so the vocabulary learned from the training data is reused).
input_data_features = feature_extraction.transform(input_mail)

# Predict with the logistic regression model.
logistic_prediction = model.predict(input_data_features)

# Predict with the MultinomialNB classifier.
multinomial_prediction = mnb.predict(input_data_features)

In [None]:
logistic_prediction

array([1, 0, 1])

In [None]:
multinomial_prediction

array([0, 0, 1])

In [None]:
# Print one verdict per input email for each classifier:
# a predicted label of 0 is reported as Ham, any other label as Spam.
for predicted_label in logistic_prediction:
  print('Logistic Regression says: Ham' if predicted_label == 0 else 'Logistic Regression says: Spam')

# Blank line separating the two classifiers' reports.
print()

for predicted_label in multinomial_prediction:
  print('MultinomialNB says: Ham' if predicted_label == 0 else 'MultinomialNB says: Spam')

Logistic Regression says: Spam
Logistic Regression says: Ham
Logistic Regression says: Spam

MultinomialNB says: Ham
MultinomialNB says: Ham
MultinomialNB says: Spam
