In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder  # Encodes the 'Category' target labels as integers

# Load the labelled messages. The first CSV column has no header (pandas would
# otherwise surface it as a stray 'Unnamed: 0' column) and only holds the row
# numbers, so it is read back as the index instead of as a data column.
data = pd.read_csv('spam.csv', index_col=0)

In [7]:
# Preview the first rows to check which columns were loaded and what the raw
# labels and message texts look like.
data.head()

Unnamed: 0,Category,Message
0,ham,Go until jurong point
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,Nah I don't think he goes to usf
4,spam,Had your mobile 11 months or more? U R entitle...


In [26]:
# Count how often each distinct message text occurs. Counts well above 1 mean
# the dataset contains exact duplicate messages.
data['Message'].value_counts()

Message
Congratulations! You've been selected to receive a FREE $100 Gift Card to use at any high street store. Text YES to 80880 to claim! T&C apply.                67
Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030    66
You have won $1000 cash prize! Call 09061701323 now to claim!                                                                                                 66
URGENT! Your mobile number has won a $1000 cash prize! Call 09061701323 now to claim!                                                                         66
Hi Babe                                                                                                                                                       66
                                                                                                                                                              ..
I'm going to be more grate

In [9]:
# Turn the text labels in 'Category' into integer class ids.
# LabelEncoder numbers the classes in sorted order, so 'ham' -> 0 and 'spam' -> 1.
encoder = LabelEncoder().fit(data['Category'])
data['Category'] = encoder.transform(data['Category'])

In [11]:
# Logistic Regression works on numbers, so each message is converted into a
# TF-IDF weighted bag-of-words vector.
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing messages with an empty string *before* casting to str.
# Without this, a missing value is either turned into the literal text 'nan'
# (and tokenised as if it were a real word) or passed through as NaN, which the
# vectorizer rejects.
data['Message'] = data['Message'].fillna('').astype(str)

# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split further down, so the vocabulary and IDF weights also see the
# rows that later end up in the test set. Fitting on the training rows only
# (and just transforming the test rows) would avoid that leak.
feature_extractor = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_features = feature_extractor.fit_transform(data['Message'])

In [12]:
# Target vector: the 'Category' column, integer-encoded by the LabelEncoder
# cell above.
y = data['Category']

In [19]:
# Feature matrix: the TF-IDF matrix returned by feature_extractor.fit_transform.
X = X_features

In [20]:
# Display the target Series to check the encoded labels and the row count.
y

0       0
1       0
2       1
3       0
4       1
       ..
1398    0
1399    0
1400    1
1401    0
1402    0
Name: Category, Length: 1403, dtype: int64

In [21]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# the same in both parts, and random_state pins the shuffle so the split is
# reproducible.
# NOTE(review): value_counts() above lists several messages that occur 60+
# times; a random row split will presumably place identical texts in both
# parts and inflate the test score -- consider deduplicating or a group-aware
# split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [22]:
# Sanity check: shapes of the full feature matrix and of the train / test parts.
print(*(matrix.shape for matrix in (X, X_train, X_test)))


(1403, 241) (1122, 241) (281, 241)


In [27]:
# Logistic-regression classifier with the solver's iteration cap set to 1000;
# every other hyper-parameter is left at its default.
model = LogisticRegression(max_iter=1000)

# Train the model using the training data only. The call is left as the last
# expression of the cell so the fitted estimator is displayed.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [24]:
# Accuracy on Training Data: predict the labels of the rows the model was
# fitted on and compare them with the true training labels.
X_train_prediction = model.predict(X_train)
trained_data_accuracy = accuracy_score(y_true=y_train, y_pred=X_train_prediction)

# Report as a percentage rounded to two decimals.
print('Accuracy on training data: {} %'.format(round(100 * trained_data_accuracy, 2)))

Accuracy on training data: 100.0 %


In [25]:
# Accuracy on Test Data: predict the labels of the held-out rows and compare
# them with the true test labels.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_true=y_test, y_pred=X_test_prediction)

# Report as a percentage rounded to two decimals.
print('Accuracy on test data: {} %'.format(round(100 * test_data_accuracy, 2)))

Accuracy on test data: 99.29 %
