# Spam Mail Prediction Using Logistic Regression

In [1]:
import os
from urllib import request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer # transform raw text into numberical using TF-IDF score
# TF-IDF = Term Frequency–Inverse Document Frequency

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to subsequent plots.
sns.set()

$\text{TF-IDF}(w, d) = t_f \cdot \left(\log\dfrac{N + 1}{N_w + 1} + 1\right)$  
• Where:

- __$t_f$__ (term frequency) is the number of times the word w appears in the query document d.
- __N__ is the total number of documents in the training set.
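- __$N_w$__ is the number of documents in the training set in which the word w appears.

By default, scikit-learn's `TfidfVectorizer` then L2-normalizes each document vector, so the stored values lie between 0 and 1.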

### Load data

In [2]:
# Source of the dataset: a CSV shared through a Google Drive direct-download link.
url = "https://drive.google.com/uc?export=download&id=12VQYaUscWYUBakKG1WI9Ayob3NzYKAuc"
root = os.getcwd()
path = os.path.join(root, "mail_data.csv")

# Download only when no local copy exists, so re-running the notebook does not
# hit the network again. Delete mail_data.csv to force a fresh download.
if not os.path.exists(path):
    # Fetch into a temporary file first and rename on success, so that an
    # interrupted download is never mistaken for a valid cached copy.
    tmp_path = path + ".part"
    request.urlretrieve(url, tmp_path)
    os.replace(tmp_path, path)

raw_mail = pd.read_csv(path)

In [3]:
# Preview the first rows of the loaded data.
raw_mail.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (number of rows, number of columns) of the loaded data.
raw_mail.shape

(5572, 2)

In [5]:
# Count the missing values in each column.
raw_mail.isna().sum()

Unnamed: 0,0
Category,0
Message,0


In [6]:
# Replace every missing value with an empty string; non-null cells are kept as they are.
raw_mail = raw_mail.where(cond=raw_mail.notnull(), other='')

In [7]:
# Preview the data again after replacing missing values.
raw_mail.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Label encoding

In [8]:
# Encode the labels numerically: "spam" -> 0, "ham" -> 1.
# The whole column is mapped in one step instead of assigning integers into the
# text column cell by cell: this yields a numeric column when every label is
# known, and does not rely on the string column accepting integer values.
# Values that are not in the mapping are passed through unchanged, so re-running
# this cell on an already-encoded column is a no-op.
label_codes = {'spam': 0, 'ham': 1}
raw_mail['Category'] = raw_mail['Category'].map(
    lambda label: label_codes.get(label, label)
)

In [9]:
# Preview the data with the encoded Category column.
raw_mail.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


### Separate `Feature` and `Target` columns

In [10]:
# Feature: the raw message text. Target: the encoded category label.
X, y = raw_mail['Message'], raw_mail['Category']

In [11]:
# Hold out 20% of the messages for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)
# Show the shape of every part of the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4457,), (1115,), (4457,), (1115,))

In [15]:
# Vectorizer that turns raw message text into TF-IDF feature vectors.
feature_extraction = TfidfVectorizer(
    lowercase=True,        # fold all text to lower case before tokenizing
    stop_words='english',  # drop common English words such as "the", "is", "and"
    min_df=1,              # minimum document frequency: keep terms found in at least 1 document
)

In [28]:
# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer for the test messages. Both results are sparse matrices.
X_train_features = feature_extraction.fit_transform(raw_documents=X_train)
X_test_features = feature_extraction.transform(raw_documents=X_test)

# Make sure the target labels are stored as integers.
y_train = y_train.astype(dtype=int)
y_test = y_test.astype(dtype=int)

In [22]:
# Inspect the sparse TF-IDF row of the first training message.
print(X_train_features[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5 stored elements and shape (1, 7431)>
  Coords	Values
  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347


### Train model

In [24]:
# Logistic regression classifier created with the library's default settings.
model = LogisticRegression()

In [29]:
# Train the classifier on the TF-IDF training features and their labels.
model.fit(X_train_features, y_train)

In [32]:
# Predictions on both the training data and the held-out test data
ypred_train = model.predict(X_train_features)
ypred_test = model.predict(X_test_features)

In [33]:
# Accuracy of the predictions against the true labels, for each split.
accuracy_train = accuracy_score(y_train, ypred_train)
accuracy_test = accuracy_score(y_test, ypred_test)

In [35]:
# Report both scores side by side; a large gap between them would suggest overfitting.
print("Accuracy score on training data:", accuracy_train)
print("Accuracy score on testing data:", accuracy_test)

Accuracy score on training data: 0.9676912721561588
Accuracy score on testing data: 0.9668161434977578


### Build a predictive system

In [37]:
# Sample message that is expected to be classified as ham
mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# Vectorize the message with the already-fitted TF-IDF vectorizer, then classify it
new_mail_features = feature_extraction.transform(mail)
n_pred = model.predict(new_mail_features)

# A predicted label of 1 means ham; any other label is reported as spam
print("ham mail" if n_pred[0] == 1 else "spam mail")

ham mail
