In [1]:
# Importing all necessary libraries

import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated SMS dataset into a DataFrame
# (the path is relative to the notebook's working directory)

df_sms_data = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [3]:
# Preview the first five rows (head() defaults to 5) to see the columns
# and the kind of values each one holds

df_sms_data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Count the missing entries in each column

df_sms_data.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
# Dimensions of the dataframe as (rows, columns)

df_sms_data.shape

(5572, 4)

In [7]:
# Row count, taken from the first entry of the shape tuple

df_sms_data.shape[0]

5572

In [18]:
# Look at ten randomly chosen labels. The fixed seed makes the same rows
# come back on every run, so the cell is reproducible.

df_sms_data['label'].sample(10, random_state=42)

4211    ham
2288    ham
4509    ham
2457    ham
1315    ham
1416    ham
2458    ham
4481    ham
3520    ham
4795    ham
Name: label, dtype: object

In [9]:
# Distinct values in the label column, i.e. the classes to be predicted

df_sms_data['label'].unique()

array(['ham', 'spam'], dtype=object)

In [11]:
# Number of rows per label value; use this to check how balanced the classes are

df_sms_data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [12]:
# Training a classification model
# that predicts ham or spam from the length and punct columns

from sklearn.model_selection import train_test_split

In [13]:
# X - feature matrix built from the length and punct columns

X = df_sms_data[['length', 'punct']]

# y - target labels ('ham' or 'spam')

y = df_sms_data['label']

In [15]:
# Split features and labels into training and test sets.
# train_test_split is a plain function: test_size=0.3 holds out 30% of the
# rows for testing and random_state=42 makes the shuffle repeatable.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
# Report the dimensions of each split: features first, then labels

for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(3900, 2)
(1672, 2)
(3900,)
(1672,)


In [20]:
# Using Logistic Regression

from sklearn.linear_model import LogisticRegression

In [21]:
# Logistic regression classifier; the solver is named explicitly rather than
# relying on the library default
lr_model = LogisticRegression(solver='lbfgs')

In [22]:
# Train on the training split; the fitted estimator is the cell's output
lr_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
from sklearn import metrics

In [24]:
# Predicted labels for the held-out test features
predictions = lr_model.predict(X_test)

In [25]:
# Display the predicted labels (long arrays are abbreviated in the output)
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [26]:
# First five true labels of the test split, for comparison with the predictions
y_test.head()

3245    ham
944     ham
1044    ham
2484    ham
812     ham
Name: label, dtype: object

In [27]:
# Confusion matrix of true labels (rows) against predicted labels (columns)
metrics.confusion_matrix(y_test, predictions)

array([[1404,   44],
       [ 219,    5]])

In [28]:
# Wrap the confusion matrix in a DataFrame with readable class names.
# The same label list is passed to confusion_matrix and used for the
# index/columns, so the displayed names always match the matrix ordering
# (rows = true class, columns = predicted class).
class_labels = ['ham', 'spam']
df_confusion_matrix = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)

In [29]:
# Display the labelled confusion matrix
df_confusion_matrix

Unnamed: 0,ham,spam
ham,1404,44
spam,219,5


In [30]:
# Per-class precision, recall and F1; print() is needed because the report
# is returned as a multi-line string
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [31]:
# Fraction of test messages whose predicted label matches the true label.
# NOTE: with 1448 of the 1672 test rows being 'ham', always predicting 'ham'
# would score about 0.866, so accuracy alone says little about spam detection.
print(metrics.accuracy_score(y_test, predictions))

0.8427033492822966


In [33]:
# Multinomial Naive Bayes trained on the same two features

from sklearn.naive_bayes import MultinomialNB

# fit() hands back the estimator itself, so construction and training chain
nb_model = MultinomialNB().fit(X_train, y_train)

# Overwrites the earlier predictions with the Naive Bayes output
predictions = nb_model.predict(X_test)

print(metrics.confusion_matrix(y_test, predictions))

[[1438   10]
 [ 224    0]]


In [34]:
# Per-class precision, recall and F1 for whatever `predictions` currently holds
# (the Naive Bayes output when the cells are run top to bottom)
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.87      0.99      0.92      1448
        spam       0.00      0.00      0.00       224

   micro avg       0.86      0.86      0.86      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.86      0.80      1672



In [36]:
# Using Support Vector Machines
# SVC - Support Vector Classification

from sklearn.svm import SVC

In [38]:
# Build the support vector classifier and train it in one step;
# fit() returns the estimator, which is what gets bound to svm_model
svm_model = SVC(gamma='auto').fit(X_train, y_train)

# Bare expression so the fitted estimator is shown as the cell output
svm_model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
# Overwrites the earlier predictions with the SVM output for the test features
predictions = svm_model.predict(X_test)

In [41]:
# Confusion matrix for the SVM: true labels (rows) against predictions (columns)
print(metrics.confusion_matrix(y_test, predictions))

[[1373   75]
 [ 121  103]]
