# Import libraries

In [272]:
%matplotlib inline 
import pandas as pd
import numpy as np
import scipy as sp
import warnings
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report

warnings.filterwarnings("ignore")

# Load Singapore NUS SMS spam dataset from Kaggle, read into Pandas dataframe
https://www.kaggle.com/uciml/sms-spam-collection-dataset/version/1

In [273]:
data = pd.read_csv('spam.csv', encoding='cp1252', header = 0)

# Describe the read data

In [274]:
data.shape

(5572, 5)

In [275]:
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [276]:
data.dtypes

v1            object
v2            object
Unnamed: 2    object
Unnamed: 3    object
Unnamed: 4    object
dtype: object

In [277]:
data.head

<bound method NDFrame.head of         v1                                                 v2 Unnamed: 2  \
0      ham  Go until jurong point, crazy.. Available only ...        NaN   
1      ham                      Ok lar... Joking wif u oni...        NaN   
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3      ham  U dun say so early hor... U c already then say...        NaN   
4      ham  Nah I don't think he goes to usf, he lives aro...        NaN   
5     spam  FreeMsg Hey there darling it's been 3 week's n...        NaN   
6      ham  Even my brother is not like to speak with me. ...        NaN   
7      ham  As per your request 'Melle Melle (Oru Minnamin...        NaN   
8     spam  WINNER!! As a valued network customer you have...        NaN   
9     spam  Had your mobile 11 months or more? U R entitle...        NaN   
10     ham  I'm gonna be home soon and i don't want to tal...        NaN   
11    spam  SIX chances to win CASH! From 100 to 20,000 po

In [278]:
dataset = data[['v2', 'v1']]
X = data['v2']
y = data['v1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
#X_test.shape
#y_test.shape

# Convert to matrix of TF IDF features

In [279]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_X_train = vectorizer.fit_transform(X_train)
tfidf_X_test = vectorizer.transform(X_test)

print(tfidf_X_test.shape)
print(tfidf_X_train.shape)
print(y_test.shape)
print(y_train.shape)

(1115, 7441)
(4457, 7441)
(1115,)
(4457,)


# Fit log reg classifier to the TF-IDF transformed data and measure accuracy

In [280]:
print('Classification report with logistic regression')
logisticreg = LogisticRegression()
logisticreg.fit(tfidf_X_train, y_train)

y_pred = logisticreg.predict(tfidf_X_test)
print(classification_report(y_test, y_pred))

scores = cross_val_score(logisticreg, tfidf_X_test, y_test, cv=5)
acc = scores.mean()
print('5 fold Cross validation Accuracy: %0.2f percent' % (acc *100))

Classification report with logistic regression
             precision    recall  f1-score   support

        ham       0.97      1.00      0.98       955
       spam       0.98      0.80      0.88       160

avg / total       0.97      0.97      0.97      1115

5 fold Cross validation Accuracy: 88.52 percent


# Fit SVM classifier and measure classification accuracy 

In [281]:
print('Classification report with linear SVM')
linearsvm = SVC()
linearsvm.fit(tfidf_X_train, y_train)
y_pred = linearsvm.predict(tfidf_X_test)
print(classification_report(y_test, y_pred))

scores = cross_val_score(linearsvm, tfidf_X_test, y_test, cv=5)
acc = scores.mean()
print('5 fold cross validation Accuracy: %0.2f percent' % (acc *100))

Classification report with linear SVM
             precision    recall  f1-score   support

        ham       0.86      1.00      0.92       955
       spam       0.00      0.00      0.00       160

avg / total       0.73      0.86      0.79      1115

5 fold cross validation Accuracy: 85.65 percent


# Output (spam or not) with some random sample data

In [282]:
sample_SMS = ['Lets meet up today at 9', 
              'Save more! Flat Rs 100 cashback on bill of Rs 300',
              'How are you doing', 
              'FREE mobile phone contract with 300 minutes FREE', 
              'SIX chances to win CASH! From 100 to 20,000 points']
output = logisticreg.predict(vectorizer.transform(sample_SMS))

for i ,m in enumerate(sample_SMS):
    print('Logistic regression', m, ' ==> ', output[i])
    
output = linearsvm.predict(vectorizer.transform(sample_SMS))

for i ,m in enumerate(sample_SMS):
    print('Linear SVM ', m, ' ==> ', output[i])

Logistic regression Lets meet up today at 9  ==>  ham
Logistic regression Save more! Flat Rs 100 cashback on bill of Rs 300  ==>  ham
Logistic regression How are you doing  ==>  ham
Logistic regression FREE mobile phone contract with 300 minutes FREE  ==>  spam
Logistic regression SIX chances to win CASH! From 100 to 20,000 points  ==>  spam
Linear SVM  Lets meet up today at 9  ==>  ham
Linear SVM  Save more! Flat Rs 100 cashback on bill of Rs 300  ==>  ham
Linear SVM  How are you doing  ==>  ham
Linear SVM  FREE mobile phone contract with 300 minutes FREE  ==>  ham
Linear SVM  SIX chances to win CASH! From 100 to 20,000 points  ==>  ham
