- Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

- Data preprocessing

In [2]:
# Loading the data from csv to pandas dataframe 

In [5]:
# Raw string keeps the Windows backslashes literal: '\P' and '\m' are not valid
# escape sequences, so a plain literal triggers a SyntaxWarning on recent Python.
# NOTE(review): absolute local path; consider a configurable data directory.
data = pd.read_csv(r'C:\Py\mail_data.csv')

In [6]:
# Preview the first rows of the freshly loaded frame
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Keep every present value and substitute an empty string wherever a value is missing

mail_data = data.where(
    cond=pd.notnull(data),
    other='',
)

In [8]:
# Preview the first rows after the missing-value replacement
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# (rows, columns) of the cleaned frame
mail_data.shape

(5572, 2)

- Label encoding: convert the `Category` column's text labels into numerical values
 - spam mail - 0 
 - non-spam (ham) mail - 1

In [None]:
# Overwrite the text labels in place with their integer codes: 'spam' -> 0, 'ham' -> 1
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

- Separate the data into texts (features) and labels (targets)

In [17]:
# x holds the message texts, y holds the matching Category values
x, y = mail_data['Message'], mail_data['Category']

- Split the data into training and test sets

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=3,
)

In [26]:
# Report the size of the full message set, then of the train and test partitions
for messages in (x, x_train, x_test):
    print(messages.shape)

(5572,)
(4457,)
(1115,)


- Feature Extraction

#### Transform the text data into feature vectors that can be used as input to logistic regression

In [29]:
# TF-IDF vectorizer: keeps terms seen in at least one document (min_df=1), drops
# English stop words and lowercases the text before tokenizing.
# `lowercase` must be a real boolean: the string 'True' is rejected by
# scikit-learn's parameter validation in recent releases.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [33]:
# Fit the vectorizer on the training texts only, then reuse that fitted
# vectorizer to transform the test texts (the test set is never fitted on).
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [34]:
# Cast both label series to an integer dtype

y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [37]:
# Inspect the training labels after the integer cast
y_train

3075    1
1787    1
1614    1
4304    1
3266    0
       ..
789     0
968     1
1667    1
3321    1
1688    0
Name: Category, Length: 4457, dtype: int32

- Training the model

In [40]:
# Logistic regression classifier constructed with no arguments (library defaults)
model = LogisticRegression()

In [41]:
# Train the classifier on the TF-IDF training features and their integer labels
model.fit(x_train_features, y_train)

LogisticRegression()

- Evaluate the model

In [45]:
# Predict on the training features and compare against the training labels

train_data = model.predict(X=x_train_features)
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_data)

In [48]:
# Report the accuracy measured on the training data
print('Accuracy on traning data is :' , train_accuracy)

Accuracy on traning data is : 0.9670181736594121


In [49]:
# Predict on the held-out test features and compare against the test labels

test_data = model.predict(X=x_test_features)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_data)

In [50]:
# Report the accuracy measured on the held-out test data
print('Accuracy on test data is :' , test_accuracy)

Accuracy on test data is : 0.9659192825112107


- Building a predictive system

In [51]:
# A single sample message to classify, held in a one-element list
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

In [52]:
# Convert the sample text to feature vectors with the same vectorizer used for the train/test texts

input_features = feature_extraction.transform(input_mail)

In [53]:
# Run the trained model on the sample's feature vectors

prediction = model.predict(input_features)

In [55]:
# Show the predicted label array (per the label encoding in this notebook: 0 = spam, 1 = ham)
print(prediction)

[1]
