In [5]:
import numpy as np # used to create numpy arrays
import pandas as pd # used to create data frames
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Pre-Processing

In [7]:
# Read the raw SMS dataset (decoded as latin1).
raw_sms_df = pd.read_csv('spam.csv', encoding='latin1')

# Replace missing cells with empty strings and give the label/text columns
# readable names; both steps return new frames, so raw_sms_df stays untouched.
sms_df = (
    raw_sms_df
    .where(raw_sms_df.notnull(), '')
    .rename(columns={'v1': 'category', 'v2': 'message'})
)

# labels: ham = 0, spam = 1 (LabelEncoder assigns codes in sorted label order)
encoder = LabelEncoder()
sms_df['category'] = encoder.fit_transform(sms_df['category'])

# Features (message text) and target (encoded category).
x, y = sms_df['message'], sms_df['category']

Splitting Data Into Training and Testing Sets

In [8]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

Feature Extraction

In [9]:
# Convert the raw message text into TF-IDF feature vectors.

# min_df=1 keeps every term that appears in at least one document (so no
# rare-term filtering happens); stop_words='english' drops common English
# words (e.g. "is"); lowercase=True lowercases the text before tokenizing.
# NOTE: lowercase must be the boolean True -- the string 'True' is not a valid
# value and is rejected by scikit-learn's parameter validation in recent releases.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then
# apply the same fitted transform to the test messages.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# Ensure the labels are integer-typed before fitting the classifier.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

Training the Logistic Regression Model 

In [10]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

# Fit on the TF-IDF training features; fit() returns the estimator, so leaving
# it as the last expression displays the fitted model below the cell.
model.fit(X=x_train_features, y=y_train)

LogisticRegression()

Evaluating the Trained Model

In [11]:
# Predict labels for both splits with the fitted model.
prediction_on_training_data = model.predict(x_train_features)
prediction_on_testing_data = model.predict(x_test_features)

# Accuracy: the fraction of messages whose predicted label equals the true label.
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)
accuracy_on_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_testing_data,
)

In [12]:
# Show training accuracy first, then testing accuracy, one value per line.
print(accuracy_on_training_data, accuracy_on_testing_data, sep='\n')

0.971729863136639
0.9560538116591928


Building a Prediction System for New Messages

In [14]:
# Example message to classify, wrapped in a list because the vectorizer
# expects an iterable of documents.
# Named `input_mail` rather than `input` so the built-in input() function is
# not shadowed for the rest of the kernel session.
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

# convert input text into feature vectors using the already-fitted vectorizer
input_data_features = feature_extraction.transform(input_mail)

# making prediction (one predicted label per input message)
prediction = model.predict(input_data_features)

# Label 0 is reported as ham; any other label is reported as spam.
if prediction[0] == 0:
    print('Ham mail')
else:
    print("SPAM MAIL")

SPAM MAIL
