In [23]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

**Data Collection and Preprocessing**

In [24]:
dataset=pd.read_csv("mail_data.csv")

In [47]:
dataset

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [48]:
dataset.isnull().sum()

Category    0
Message     0
dtype: int64

In [49]:
dataset.shape

(5572, 2)

**Label encodeing**

In [50]:
dataset.loc[dataset['Category'] == 'spam', 'Category',] = 0
dataset.loc[dataset['Category'] == 'ham', 'Category',] = 1

In [51]:
dataset

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [52]:
x=dataset["Message"]
y=dataset["Category"]

In [53]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [54]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

**Splitting the data into training data & test dat**

In [55]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [56]:
x_train.shape

(4457,)

In [57]:
y_train.shape

(4457,)

In [58]:
x_test.shape

(1115,)

In [59]:
y_test.shape

(1115,)

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [61]:
x_train

5303    I can. But it will tell quite long, cos i have...
681        What is this 'hex' place you talk of? Explain!
3516                     I'm job profile seems like bpo..
4610                               Y de asking like this.
4705                     Yar but they say got some error.
                              ...                        
2206    Haha, my legs and neck are killing me and my a...
2496    WINNER! As a valued network customer you hvae ...
3128    Thats cool. i liked your photos. You are very ...
692     Sorry to trouble u again. Can buy 4d for my da...
1740                   U guys never invite me anywhere :(
Name: Message, Length: 4457, dtype: object

In [62]:
x_train_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34601 stored elements and shape (4457, 7447)>

**Training the Model**

In [63]:
model = LogisticRegression()

In [64]:
# training the Logistic Regression model with the training data
model.fit(x_train_features,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


**Evaluating the trained model**

In [68]:
# prediction on training data

prediction_on_training_data = model.predict(x_train_features)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)*100

In [69]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  96.70181736594121


In [70]:
# prediction on test data

prediction_on_test_data = model.predict(x_test_features)
accuracy_on_test_data = accuracy_score(y_test, prediction_on_test_data)

In [71]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9641255605381166


**Building a Predictive System**

In [77]:
input_mail = ["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')

[0]
Spam mail
