# OASIS TASK - 4
# EMAIL SPAM DETECTION WITH MACHINE LEARNING
### We’ve all been the recipient of spam emails before. Spam mail, or junk mail, is a type of email
### that is sent to a massive number of users at one time, frequently containing cryptic
### messages, scams, or most dangerously, phishing content.

### In this Project, use Python to build an email spam detector. Then, use machine learning to
### train the spam detector to recognize and classify emails into spam and non-spam. Let’s get
### started!



## IMPORTING THE LIBRARIES

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## IMPORTING THE DATASET

In [4]:
data=pd.read_csv("spam.csv",encoding='ISO-8859-1')

In [5]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
data.shape

(5572, 5)

In [7]:
data_set=data.copy()

In [8]:
data_set.shape

(5572, 5)

## REMOVING THE UN-WANTED COLUMNS

In [9]:
data_set.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)

In [10]:
data_set.shape

(5572, 2)

## TRANSFORMED DATASET

In [11]:
data_set.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
data_set.columns=["label","message"]

In [13]:
data_set.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
y=data_set[['label']]

In [15]:
y.shape

(5572, 1)

## CONVERTING TARGET COLUMN INTO NUMERICAL COLUMN

In [16]:
y=pd.get_dummies(y,drop_first=True)

In [17]:
y

Unnamed: 0,label_spam
0,0
1,0
2,1
3,0
4,0
...,...
5567,1
5568,0
5569,0
5570,0


In [18]:
y.shape

(5572, 1)

## IMPORTING THE LIBRARIES REQUIRED FOR NLP TASKS

In [19]:
import nltk
from nltk.corpus import stopwords

In [20]:
stop_words=stopwords.words('english')

In [21]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
from nltk.stem import WordNetLemmatizer

In [23]:
stemming=WordNetLemmatizer()

In [24]:
import re 
corpus=[]
for i in range(len(data_set['message'])):
    words=re.sub('[^a-zA-Z0-9]',' ',data_set['message'][i])
    words=words.lower()
    words=words.split()
    words=[stemming.lemmatize(word) for word in words if not word in stop_words]
    words=' '.join(words)
    corpus.append(words)
    

In [25]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate c apply 08452810075over18',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling 3 week word back like fun still tb ok xxx std chgs send 1 50 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash 100 20 000 pound txt csh11 send 87575 cost 150p day 6days 16 tsandcs apply reply

## SPLITTING THE DATA INTO TRAINING AND TESTING

In [26]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(corpus,y,test_size=0.2)

In [27]:
y_train.shape

(4457, 1)

In [28]:
y_test.shape

(1115, 1)

## CREATING THE BAG OF WORDS FOR TEXTUAL DATA

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500,ngram_range=(1,2))

In [31]:
x_train=cv.fit_transform(x_train)

In [32]:
x_test=cv.transform(x_test)

In [33]:
x_train=x_train.toarray()

In [34]:
x_test=x_test.toarray()

In [35]:
x_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
x_train.shape

(4457, 2500)

In [37]:
cv.vocabulary_

{'go': 913,
 'ok': 1539,
 'na': 1463,
 'thanks': 2130,
 'ringtone': 1794,
 'order': 1568,
 'reference': 1756,
 'number': 1521,
 'mobile': 1406,
 'charged': 464,
 '50': 110,
 'tone': 2187,
 'arrive': 245,
 'please': 1635,
 'call': 391,
 'customer': 581,
 'service': 1888,
 'thanks ringtone': 2132,
 'ringtone order': 1795,
 'reference number': 1757,
 'mobile charged': 1410,
 'charged 50': 465,
 '50 tone': 113,
 'tone arrive': 2190,
 'arrive please': 246,
 'please call': 1636,
 'call customer': 398,
 'customer service': 582,
 'much': 1451,
 'though': 2147,
 'shd': 1901,
 'fun': 875,
 'town': 2207,
 'something': 1977,
 'sound': 1992,
 'moment': 1420,
 'day': 602,
 'morning': 1430,
 'brings': 367,
 'hope': 1067,
 'afternoon': 181,
 'evening': 740,
 'luv': 1308,
 'night': 1497,
 'rest': 1782,
 'wish': 2424,
 'find': 801,
 'today': 2177,
 'good': 935,
 'good morning': 942,
 'yes': 2482,
 'saw': 1835,
 'message': 1368,
 'tv': 2231,
 'ring': 1793,
 'return': 1784,
 'king': 1170,
 'store': 2045,


## TRANING THE MODEL

In [38]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()

In [40]:
model.fit(x_train,y_train)

  model.fit(x_train,y_train)


RandomForestClassifier()

## PREDICTING THE MODEL

In [41]:
y_pred=model.predict(x_test)

## ACCURACY OF THE MODEL

In [42]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
accuracy=accuracy_score(y_test,y_pred)
print(f"the accuracy of the model to detect the spam or ham classification is {accuracy}")

the accuracy of the model to detect the spam or ham classification is 0.9811659192825112


In [43]:
confusion_matrix(y_test,y_pred)

array([[970,   2],
       [ 19, 124]], dtype=int64)

## CLASSIFICATION REPORT

In [44]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       972
           1       0.98      0.87      0.92       143

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

