# Project - Spam mail detection

## 1. Import libraries

In [1]:
import pandas as pd                                       # to load data and many other operations
from sklearn.feature_extraction.text import TfidfVectorizer     # text data into numbers , numerical form
from sklearn.model_selection import train_test_split    # to split data into 4 parts , 2 train and 2 test
from sklearn.linear_model import LogisticRegression    # our problem is classification problem
from sklearn.metrics import accuracy_score            # to check accuracy of our model

In [2]:
# Load the raw dataset from spam.csv (path is relative to the working directory).

data = pd.read_csv('spam.csv', encoding='latin-1')     # decode as latin-1; the default UTF-8 decoding raised an error on this file

# Display the first five rows to check the column layout.

data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [20]:
# replace null values with an empty string

# mail_data = data.where((pd.notnull(data)),'')

In [4]:
#mail_data.head()

In [24]:
# Replace missing values with empty strings. This step defines `mail_data`;
# without it the drop below fails with a NameError on a fresh kernel.
mail_data = data.where(pd.notnull(data), '')

# Drop the three 'Unnamed' columns: they are blank and carry nothing useful for
# this task. errors='ignore' keeps the cell safe to re-run once they are gone.
data = mail_data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [25]:
# Show the first five rows of the frame after the column drop.
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
# Dimensions of the frame as a (rows, columns) tuple.

data.shape

(5572, 2)

## 2. Data preprocessing

#### Label encoding

In [28]:
# encode the v1 column into two categories:
#   0 : spam
#   1 : ham

In [30]:
# Build both masks from the text labels, then overwrite the matching rows in place.
spam_rows = data['v1'] == 'spam'
ham_rows = data['v1'] == 'ham'

data.loc[spam_rows, 'v1'] = 0     # spam -> 0
data.loc[ham_rows, 'v1'] = 1      # ham  -> 1

In [31]:
data.head()    # v1 now holds 0 (spam) / 1 (ham) instead of the text labels

Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [32]:
# Separate the frame into the model input (message text) and the target (encoded label).

X, y = data['v2'], data['v1']    # X: independent variable, y: dependent (target) variable

In [33]:
# Preview the message texts held in X.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [34]:
# Preview the encoded labels held in y (0 = spam, 1 = ham).
print(y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: object

In [36]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [37]:
# Preview the training texts; the index shows the rows were shuffled by the split.
print(Xtrain)

3890    No problem. We will be spending a lot of quali...
5553                          Hahaha..use your brain dear
4366    I like dis sweater fr mango but no more my siz...
3968    Thanks for your message. I really appreciate y...
3771                Does uncle timi help in clearing cars
                              ...                        
3335                         Then u go back urself lor...
1099    Been up to ne thing interesting. Did you have ...
2514        Ok ill send you with in  &lt;DECIMAL&gt;  ok.
3606                         I have no idea where you are
2575                 Now thats going to ruin your thesis!
Name: v2, Length: 4457, dtype: object


In [39]:
# Compare the size of the full data with the train and test parts.

print(X.shape, Xtrain.shape, Xtest.shape, sep='\n')

(5572,)
(4457,)
(1115,)


#### Convert text data into meaningful numerical features
#### Feature extraction

In [41]:
# lowercase must be the boolean True: the string 'True' only worked by being truthy
# and is rejected by scikit-learn versions that validate constructor parameters.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# min_df is a document-frequency cut-off: terms appearing in fewer than min_df
# documents are dropped from the vocabulary (1 keeps every term).

In [44]:
# Fit the TfidfVectorizer on the training texts only, then reuse that fitted
# vectorizer to transform the test texts, so the test split does not influence it.

Xtrain_features = feature_extraction.fit_transform(Xtrain)

Xtest_features = feature_extraction.transform(Xtest)

In [42]:
# The encoded labels are still stored with object dtype; cast both splits to integers.

ytrain, ytest = ytrain.astype('int'), ytest.astype('int')

In [50]:
print(Xtrain_features)     # converted to numerical form; shown as (row, column) entries with their weights

  (0, 6566)	0.29071829686789585
  (0, 5260)	0.5087678982336444
  (0, 4010)	0.4050685955975014
  (0, 6073)	0.5665799184805557
  (0, 5169)	0.4141795823412651
  (1, 2116)	0.3523355719340784
  (1, 1423)	0.5940444343697558
  (1, 6859)	0.4124100686677272
  (1, 3141)	0.5940444343697558
  (2, 3565)	0.40508027557608417
  (2, 5897)	0.440107047813195
  (2, 4140)	0.440107047813195
  (2, 2845)	0.3382880530026162
  (2, 6332)	0.440107047813195
  (2, 2258)	0.3104925886287087
  (2, 3910)	0.20972094634778068
  (3, 2105)	0.17893490557083572
  (3, 7191)	0.2812743655569581
  (3, 1763)	0.2431679156261037
  (3, 6620)	0.2114722422733664
  (3, 6461)	0.2757210982911898
  (3, 7045)	0.20776132213481782
  (3, 4865)	0.26062868324883703
  (3, 2251)	0.2952674814882996
  (3, 5175)	0.3101007821209402
  :	:
  (4451, 4355)	0.5653315538937795
  (4452, 6856)	0.8442476281269602
  (4452, 4003)	0.5359533024452802
  (4453, 7234)	0.3927126429320925
  (4453, 4634)	0.3513960490954625
  (4453, 4498)	0.36145784624804855
  (4453, 35

## 3. Train Model - Logistic Regression

In [51]:


# Create a logistic-regression classifier with default settings and fit it on
# the vectorized training texts and their integer labels.
model = LogisticRegression()
model.fit(Xtrain_features, ytrain)

## 4. Model evaluation

In [53]:
# Score the model on the same rows it was trained on.

train_pred = model.predict(Xtrain_features)
train_accuracy = accuracy_score(y_true=ytrain, y_pred=train_pred)

print('accuracy on train data is :', train_accuracy)

accuracy on train data is : 0.971729863136639


In [54]:
# Score the model on the held-out test rows.

test_pred = model.predict(Xtest_features)
test_accuracy = accuracy_score(ytest, test_pred)

# The label names the test split, since that is what is scored here.
print('accuracy on test data is :', test_accuracy)

accuracy on train data is : 0.9560538116591928


## 5. Building a predictive system

In [56]:
# Example message to classify, wrapped in a list because it is passed to the vectorizer's transform().
input_mail = ["Oh k...i'm watching here:)"]

In [59]:
# Turn the raw text into features using the already-fitted vectorizer.
input_data_features = feature_extraction.transform(input_mail)

# Predict the label for the message: 1 means ham, 0 means spam.
predict = model.predict(input_data_features)
print(predict)

# Report the verdict for the single input message.
verdict = 'It is an Ham mail' if predict[0] == 1 else 'It is an spam mail'
print(verdict)

[1]
It is an Ham mail
