<b>Let's import the required packages first.</b>

In [1]:
import numpy as np 
import pandas as pd


from sklearn.pipeline import Pipeline

import string

import nltk
from nltk.stem import SnowballStemmer
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAHUL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
sms = pd.read_csv(r'C:\Users\RAHUL\Downloads\Capstone 2\SMSSpamCollection', sep='\t', names=['label', 'message'])

In [3]:
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


<h1>Data Preprocessing</h1>

Here we follow the bag of words approach and tokenize, vectorize, provide Tf-idf weights, and stem the data to prepare it for modelling.

In [4]:
stemmer = SnowballStemmer('english')

<b>Creating a function with all the steps of preprocessing.</b>

In [5]:
def remove_punctuation_and_stopwords(mess):
    
    sms_no_punct = [ch for ch in mess if ch not in string.punctuation]
    sms_no_punct = "".join(sms_no_punct).split()
    
    sms_no_punct_no_stopw = \
        [word.lower() for word in sms_no_punct if word.lower() not in stopwords.words("english")]
    msg = " ".join([stemmer.stem(word) for word in sms_no_punct_no_stopw])
        
    return msg

<b>Applying the function created for preprocessing</b>

In [6]:
sms['message'].apply(remove_punctuation_and_stopwords).head()

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri 2 wkli comp win fa cup final tkts 2...
3                  u dun say earli hor u c alreadi say
4            nah dont think goe usf live around though
Name: message, dtype: object

<b>Vectorization</b>

In [7]:
sms['message'].head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

In [8]:
count = CountVectorizer(remove_punctuation_and_stopwords).fit(sms['message'])
print(len(count.vocabulary_))

8713


In [9]:
count_tran = count.transform(sms['message'])

<b>Providing weights to frequently occuring words.</b>

In [10]:
tfidf =TfidfTransformer().fit(count_tran)

In [11]:
data_tfidf = tfidf.transform(count_tran)

<h1>Modelling</h1>

<b>Splitting the dataset into train and test data. </b>

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data_tfidf, sms['label'], test_size=0.3, random_state=5)

<b>Making a function to apply the classification model and to calculate its score.</b>

In [13]:
def classification(clf):
    fit = clf.fit(X_train, y_train)
    predict = fit.predict(X_test)
    print('accuracy:', accuracy_score(y_test, predict))
    print('confusion matrix\n', confusion_matrix(y_test, predict))
    print(classification_report(y_test, predict))

<b>Support Vector Machine Classification. </b>

In [14]:
classification(SVC())



accuracy: 0.8666267942583732
confusion matrix
 [[1449    0]
 [ 223    0]]
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1449
        spam       0.00      0.00      0.00       223

   micro avg       0.87      0.87      0.87      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.87      0.80      1672



  'precision', 'predicted', average, warn_for)


<b>Naive Bayes classifier</b>

In [15]:
classification(MultinomialNB())

accuracy: 0.9551435406698564
confusion matrix
 [[1449    0]
 [  75  148]]
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97      1449
        spam       1.00      0.66      0.80       223

   micro avg       0.96      0.96      0.96      1672
   macro avg       0.98      0.83      0.89      1672
weighted avg       0.96      0.96      0.95      1672



<b>Logistic Regression classifier. </b>

In [16]:
classification(LogisticRegression(solver='liblinear', penalty='l1'))

accuracy: 0.9569377990430622
confusion matrix
 [[1439   10]
 [  62  161]]
              precision    recall  f1-score   support

         ham       0.96      0.99      0.98      1449
        spam       0.94      0.72      0.82       223

   micro avg       0.96      0.96      0.96      1672
   macro avg       0.95      0.86      0.90      1672
weighted avg       0.96      0.96      0.95      1672



<b>K Nearest Neighbors Classifier </b>

In [17]:
classification(KNeighborsClassifier(n_neighbors=49))

accuracy: 0.9569377990430622
confusion matrix
 [[1449    0]
 [  72  151]]
              precision    recall  f1-score   support

         ham       0.95      1.00      0.98      1449
        spam       1.00      0.68      0.81       223

   micro avg       0.96      0.96      0.96      1672
   macro avg       0.98      0.84      0.89      1672
weighted avg       0.96      0.96      0.95      1672



<b>Decision Tree Classifier </b>

In [18]:
classification(DecisionTreeClassifier(min_samples_split=7, random_state=111))

accuracy: 0.9671052631578947
confusion matrix
 [[1420   29]
 [  26  197]]
              precision    recall  f1-score   support

         ham       0.98      0.98      0.98      1449
        spam       0.87      0.88      0.88       223

   micro avg       0.97      0.97      0.97      1672
   macro avg       0.93      0.93      0.93      1672
weighted avg       0.97      0.97      0.97      1672



<b>Random Forest Classifier </b>

In [19]:
classification(RandomForestClassifier(n_estimators=31, random_state=111))

accuracy: 0.9778708133971292
confusion matrix
 [[1449    0]
 [  37  186]]
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1449
        spam       1.00      0.83      0.91       223

   micro avg       0.98      0.98      0.98      1672
   macro avg       0.99      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672



<b>Cross Validation<b>

In [20]:
cv_results = cross_val_score(RandomForestClassifier(n_estimators=31, random_state=111), data_tfidf,sms['label'], cv=5)

In [21]:
print(cv_results)

[0.97847534 0.97668161 0.97666068 0.97127469 0.97396768]


In [22]:
print(np.mean(cv_results))

0.9754120005474556


In [23]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

<b>Hyperparameter Tuning <b>

In [24]:
param_grid={
     'n_estimators': (3,5,10,31),
'random_state': [5,10,35,51,111]}
model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
model.fit(data_tfidf,sms['label'])

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': (3, 5, 10, 31), 'random_state': [5, 10, 35, 51, 111]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
model.best_params_

{'n_estimators': 31, 'random_state': 51}