# Task 4 - Email Spam detection model

### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np 
# Raise pandas' display limits so wide/long frames are not truncated as early
# when rendered in the notebook.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

### Load the data

In [2]:
# Read the raw CSV from the working directory, decoding it as ISO-8859-1,
# then preview the first ten rows.
df = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)
df.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


### Preprocessing the Dataset

In [3]:
# Keep only the first two columns (v1 = label, v2 = message text); every
# column from position 2 onward is dropped. Those trailing "Unnamed" columns
# appear empty in the preview above.
df = df.drop(columns=df.columns[2:])
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Give the two remaining columns descriptive names: v1 -> spam (label), v2 -> text (message).
df.rename(
    columns={
        'v1': 'spam',
        'v2': 'text',
    },
    inplace=True,
)

In [5]:
# Encode the label column as integers: 1 for 'spam', 0 for anything else.
# The dtype guard makes this cell idempotent: once the column is numeric,
# re-running the cell is a no-op. Without it, a second run would compare the
# integers 0/1 against the string 'spam' and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['spam']):
    df['spam'] = (df['spam'] == 'spam').astype('int64')

df.head(10)

Unnamed: 0,spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [6]:
# Per-class summary of the remaining columns (count, unique, top, freq for the text).
df.groupby(by='spam').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
spam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,4825,4516,"Sorry, I'll call later",30
1,747,653,Please call our customer service representativ...,4


In [7]:
# Feature series (raw message text) and target series (0/1 spam label).
X, y = df['text'], df['spam']

print("shape: ", X.shape, y.shape)

shape:  (5572,) (5572,)


As we can see, the classes are imbalanced: 4825 ham messages versus 747 spam messages.

### Importing necessary Machine learning Libraries

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

In [9]:
# Build a TF-IDF vectorizer that ignores English stop words, then learn the
# vocabulary/IDF weights from the messages and convert them to a feature matrix.
# NOTE(review): the vectorizer is fitted on all of X, before any train/test
# split, so vocabulary and IDF statistics are learned from rows that later end
# up in the test set. Consider fitting on the training portion only.
tf_vector = TfidfVectorizer(
    stop_words='english',
)

X_tfid = tf_vector.fit_transform(raw_documents=X)

### To handle class imbalance

In [11]:
from imblearn.over_sampling import SMOTE

In [12]:
# Oversample the minority class with SMOTE until both classes are the same size.
# random_state is fixed (same seed as the train_test_split call in this notebook)
# so the synthetic samples, and every metric computed downstream, are
# reproducible on Restart & Run All.
# NOTE(review): resampling is done on the full dataset before the train/test
# split, so synthetic points interpolated from rows that land in the test set
# can end up in the training set (and vice versa). Consider splitting first and
# resampling only the training portion.
smote = SMOTE(sampling_strategy='minority', random_state=4)

X_sm, y_sm = smote.fit_resample(X_tfid, y)

# Confirm the class counts after resampling.
y_sm.value_counts()

spam
0    4825
1    4825
Name: count, dtype: int64

*As we can see, the classes are now balanced.*

### Splitting the data into training and test sets

In [13]:
# Hold out 25% of the resampled data for testing. The split is stratified on
# the label so both sets keep the same class ratio, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.25,
    random_state=4,
    stratify=y_sm,
)

print("Shapes: ", X_train.shape, y_train.shape)

Shapes:  (7237, 8404) (7237,)


### Building the Machine Learning Model

In [14]:
# Train a logistic-regression classifier with default hyperparameters on the
# training split (fit returns the estimator itself), then predict labels for
# the held-out test split.
clf = LogisticRegression().fit(X_train, y_train)

pred = clf.predict(X_test)

### Evaluating the Model

In [15]:
# Only the last expression of a cell is displayed, so the accuracy has to be
# printed explicitly; left as a bare expression it is computed and discarded.
print("Accuracy:", accuracy_score(y_test, pred))

# Confusion matrix: rows are the true labels, columns the predicted labels.
confusion_matrix(y_test, pred)

array([[1195,   12],
       [  14, 1192]], dtype=int64)

### Testing the model on new text

In [16]:
# Wrap a single ad-hoc message in a Series so it can be vectorized like the training data.
text = pd.Series(["That is some Machine learning model"])

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# feature columns line up with those the classifier was trained on.
text_tfid = tf_vector.transform(raw_documents=text)

clf.predict(X=text_tfid)

array([0], dtype=int64)