In [94]:
# Installing and importing all the required libraries

!pip install scikit-learn --upgrade --quiet
!pip install pandas --upgrade --quiet
!pip install opendatasets --quiet

In [95]:
import opendatasets as od
import pandas as pd
import os

Downloading the dataset from [Kaggle](https://www.kaggle.com/uciml/sms-spam-collection-dataset/)

In [None]:
# Fetch the SMS spam collection dataset from Kaggle via opendatasets
# NOTE(review): presumably this prompts for Kaggle API credentials on first use — confirm
od.download('https://www.kaggle.com/uciml/sms-spam-collection-dataset/')

In [4]:
# List the files inside the dataset folder
os.listdir('sms-spam-collection-dataset')

['spam.csv']

Reading the downloaded `csv` file into a pandas DataFrame

In [12]:
# Load the CSV into a DataFrame, decoding it as latin-1
# NOTE(review): presumably the file is not valid UTF-8, hence the explicit encoding — confirm
raw_df = pd.read_csv(
    './sms-spam-collection-dataset/spam.csv',
    sep=',',
    encoding='latin-1',
)

In [13]:
# Preview the first 10 rows to see which columns the raw file contains
raw_df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [14]:
# (number of rows, number of columns) of the raw data
raw_df.shape

(5572, 5)

As we can see above, there are 5572 rows and 5 columns in the dataset

In [15]:
# Summary of each column: non-null count and dtype
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


The columns named 'Unnamed: 2', 'Unnamed: 3' and 'Unnamed: 4' consist almost entirely of null values (only 50, 12 and 6 non-null entries out of 5572, respectively). Therefore, it is better to drop those columns from the dataset.

In [16]:
# Remove the three sparsely populated extra columns, keeping only 'v1' and 'v2'
raw_df = raw_df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])

In [17]:
# Preview the first 10 rows after dropping the unwanted columns
raw_df.head(10)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


We can see that the column names are 'v1' and 'v2', which are not descriptive. Let us rename the columns.

In [18]:
# Give the two remaining columns descriptive names (assigned by position: label, then text)
raw_df = raw_df.set_axis(['IsSpam', 'message'], axis='columns')

The first column, `IsSpam`, is our target column; it specifies whether the corresponding message is spam or not.

In [11]:
# Encode the target as integers: spam -> 1, ham -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep this cell idempotent: with only the
# string keys, running the cell a second time would map the already-encoded
# integers to NaN and silently wipe out the whole target column.
label_to_int = {'spam': 1, 'ham': 0, 1: 1, 0: 0}
raw_df['IsSpam'] = raw_df['IsSpam'].map(label_to_int)

# Fail loudly on any unexpected label instead of passing NaN targets downstream
assert raw_df['IsSpam'].notna().all(), "Unexpected label found in 'IsSpam'"

In [19]:
# Preview the first 10 rows with the renamed columns
raw_df.head(10)

Unnamed: 0,IsSpam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


## Preparing the Data for Training

### Create train/validation split

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 15% of the rows for validation; the fixed seed makes the split reproducible
train_df, val_df = train_test_split(
    raw_df,
    test_size=0.15,
    random_state=42,
)

In [22]:
# Report the size of the full data set and of each split
for label, frame in (
    ('raw_df shape: ', raw_df),
    ('train_df shape: ', train_df),
    ('validation_df.shape: ', val_df),
):
    print(label, frame.shape)

raw_df shape:  (5572, 2)
train_df shape:  (4736, 2)
validation_df.shape:  (836, 2)


### Identifying Input and Target Columns

There are exactly 2 columns in the dataset.

In [23]:
# Each is a single column name (not a list), so indexing a frame with it yields a Series
input_cols, target_col = 'message', 'IsSpam'

In [24]:
# Training input (X_train) and training output (Y_train), copied out of train_df
X_train, Y_train = train_df[input_cols].copy(), train_df[target_col].copy()

In [25]:
# Validation input (X_val) and validation output (Y_val), copied out of val_df
X_val, Y_val = val_df[input_cols].copy(), val_df[target_col].copy()

### Bag of words

The input we have is in the form of text, but machine learning algorithms mostly rely on numeric input.

So, we will take all the data in the message column and count the frequency of each word in that text. This is the main idea behind the 'Bag of words' concept.

To do this, we can use the `CountVectorizer` class in sklearn. Using this, we can tokenize each message into individual words and assign an integer id to each token. The frequency of each word is found by counting the occurrences of each token.

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer created with default settings; it is not fitted to any data yet
count_vector = CountVectorizer()

We have to fit the vectorizer on the training data (X_train) to learn the vocabulary and then transform that data into a document-term matrix.

For the validation data, we do not fit the vectorizer again, because this data is used to test our model and must not influence the vocabulary. So, we only transform the validation input into a document-term matrix.

In [42]:
# Learn the vocabulary from the training messages and build their document-term matrix in one step
train_data = count_vector.fit_transform(X_train)

# Encode the validation messages with the vocabulary learned above (no re-fitting)
val_data = count_vector.transform(X_val)

This ends our data preprocessing; now we have to build our models, train them and evaluate them.

## Model Building

### 1. Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression

In [54]:
# Logistic regression using the 'liblinear' solver; no other hyperparameters are set here
model = LogisticRegression(solver='liblinear')

In [55]:
# Train the classifier on the document-term matrix and the training labels
model.fit(X=train_data, y=Y_train)

LogisticRegression(solver='liblinear')

**Evaluating the model**

In [56]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [57]:
# Checking the accuracy score on training data
train_pred = model.predict(train_data)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is symmetric,
# so the value is unchanged, but the conventional order keeps this cell correct if
# it is later adapted to an asymmetric metric such as precision or recall.
accuracy_score(Y_train, train_pred)

0.9983108108108109

In [58]:
# Checking the accuracy score on validation data
val_pred = model.predict(val_data)
# Arguments follow the scikit-learn convention (y_true, y_pred); the accuracy value
# is the same either way, but asymmetric metrics would not be.
accuracy_score(Y_val, val_pred)

0.9844497607655502

The training accuracy is 99.83% and the validation accuracy is 98.44%, which is a good score, so we can say that the model performs well.

Let us now see if we can improve the accuracy using other models

### 2. Naive Bayes

In [59]:
from sklearn.naive_bayes import MultinomialNB

In [60]:
# Build a multinomial naive Bayes classifier with default settings and train it
# on the word-count matrix; `fit` returns the fitted estimator
model = MultinomialNB().fit(X=train_data, y=Y_train)

**Evaluating the model**

In [61]:
# Checking the accuracy score on training data
train_pred = model.predict(train_data)
# Arguments follow the scikit-learn convention (y_true, y_pred); the accuracy value
# is the same either way, but asymmetric metrics would not be.
accuracy_score(Y_train, train_pred)

0.9932432432432432

In [62]:
# Checking the accuracy on validation data
val_pred = model.predict(val_data)
# Arguments follow the scikit-learn convention (y_true, y_pred); the accuracy value
# is the same either way, but asymmetric metrics would not be.
accuracy_score(Y_val, val_pred)

0.9856459330143541

The training accuracy is 99.32% and the validation accuracy is 98.56%, which is a good score, so we can say that the model performs well. The naive Bayes model does slightly better than logistic regression on the validation data.

### 3. Decision Tree

In [64]:
from sklearn.tree import DecisionTreeClassifier

In [65]:
# Build a decision tree with a fixed seed (all other settings left at their defaults)
# and train it on the word-count matrix; `fit` returns the fitted estimator
model = DecisionTreeClassifier(random_state=42).fit(X=train_data, y=Y_train)

**Evaluating the model**

In [66]:
# Checking the accuracy score on training data
train_pred = model.predict(train_data)
# Arguments follow the scikit-learn convention (y_true, y_pred); the accuracy value
# is the same either way, but asymmetric metrics would not be.
accuracy_score(Y_train, train_pred)

1.0

In [67]:
# Checking the accuracy on validation data
val_pred = model.predict(val_data)
# Arguments follow the scikit-learn convention (y_true, y_pred); the accuracy value
# is the same either way, but asymmetric metrics would not be.
accuracy_score(Y_val, val_pred)

0.9772727272727273

The training accuracy is 100% and the validation accuracy is 97.72%, which indicates that the model is overfitting the training data and that we need to regularize it for better validation accuracy.

### 4. Random Forest

In [89]:
from sklearn.ensemble import RandomForestClassifier

In [90]:
# Build a random forest with a fixed seed and n_jobs=-1, then train it on the
# word-count matrix; `fit` returns the fitted estimator
model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
).fit(X=train_data, y=Y_train)

In [91]:
# Checking the accuracy score on training data
train_pred = model.predict(train_data)
# Arguments follow the scikit-learn convention (y_true, y_pred); the accuracy value
# is the same either way, but asymmetric metrics would not be.
accuracy_score(Y_train, train_pred)

1.0

In [92]:
# Checking the accuracy on validation data
val_pred = model.predict(val_data)
# Arguments follow the scikit-learn convention (y_true, y_pred); the accuracy value
# is the same either way, but asymmetric metrics would not be.
accuracy_score(Y_val, val_pred)

0.9796650717703349

The training accuracy is 100% and the validation accuracy is 97.96%, which indicates that the model is overfitting the training data and that we need to regularize it for better validation accuracy.

The naive Bayes model seems to do best in this particular scenario of detecting whether a message is spam or not. This is likely because naive Bayes performs well when there is a large number of features and treats each feature as independent.