Import Dependencies

In [2]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Data Loading and Cleansing

In [3]:
# Load the ham/spam message dataset; the path is relative to the notebook's
# working directory. The preview below shows its Category and Message columns.
csv_path = '../spam.csv'
df = pd.read_csv(csv_path)
print(df.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


Label Encoder

In [4]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels in 'Category' with integer codes. As the preview
# printed by this cell shows, 'ham' becomes 0 and 'spam' becomes 1.
# The original string column is overwritten; use encoder.inverse_transform to
# recover the names.
encoder = LabelEncoder()
encoder.fit(df['Category'])
df['Category'] = encoder.transform(df['Category'])
print(df.head())

   Category                                            Message
0         0  Go until jurong point, crazy.. Available only ...
1         0                      Ok lar... Joking wif u oni...
2         1  Free entry in 2 a wkly comp to win FA Cup fina...
3         0  U dun say so early hor... U c already then say...
4         0  Nah I don't think he goes to usf, he lives aro...


Check Duplicates

In [5]:
# Number of rows that exactly repeat an earlier row (the first occurrence of
# each group is not counted as a duplicate).
df.duplicated(keep='first').sum()

415

In [6]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# `df` is rebound to the de-duplicated frame rather than mutated with
# inplace=True, so the frame object displayed by earlier cells is left intact.
# The original index labels are kept (no reset), as before.
df = df.drop_duplicates(keep='first')

# Sanity check: no duplicates should remain.
df.duplicated().sum()

0

Data Preprocessing

In [7]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Built once when the cell runs. Previously the stop-word set and a fresh
# lemmatizer were recreated on every call, i.e. once per row when the function
# is applied to a whole column.
_STOP_WORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()


def preprocess(text):
    """Normalise one message into a space-joined string of lemmatized tokens.

    Pipeline:
      1. tokenize with ``nltk.word_tokenize``;
      2. keep only purely alphabetic tokens (``str.isalpha``), which drops
         numbers, punctuation and mixed tokens;
      3. lowercase;
      4. remove English stop words;
      5. lemmatize each remaining token with ``WordNetLemmatizer`` (default
         arguments).

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when
        nothing survives the filters.

    Notes
    -----
    Assumes the NLTK tokenizer, stop-word and WordNet resources are already
    installed; this notebook does not call ``nltk.download``.
    """
    # Lowercase each alphabetic token exactly once, then filter on that form.
    lowered = (token.lower() for token in nltk.word_tokenize(text) if token.isalpha())
    return " ".join(
        _LEMMATIZER.lemmatize(token) for token in lowered if token not in _STOP_WORDS
    )

# Run the cleaning function over every message and store the result as a
# string column next to the raw text.
df['preprocessed_text'] = df['Message'].map(preprocess).astype(str)

print(df.head())

   Category                                            Message  \
0         0  Go until jurong point, crazy.. Available only ...   
1         0                      Ok lar... Joking wif u oni...   
2         1  Free entry in 2 a wkly comp to win FA Cup fina...   
3         0  U dun say so early hor... U c already then say...   
4         0  Nah I don't think he goes to usf, he lives aro...   

                                   preprocessed_text  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts may...  
3                u dun say early hor u c already say  
4                nah think go usf life around though  


Train and Test split

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is shuffled with a fixed
# seed so it is repeatable; no `stratify` argument is passed, so the class
# ratio in each split is whatever the shuffle produces.
x_train, x_test, y_train, y_test = train_test_split(
    df['preprocessed_text'],
    df['Category'],
    test_size=0.2,
    random_state=0,
    shuffle=True,
)


TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split, so nothing is learned from test data.
train_sparse = tfidf_vectorizer.fit_transform(x_train)
test_sparse = tfidf_vectorizer.transform(x_test)

# Dense NumPy copies of the document-term matrices.
# NOTE(review): dense size grows with n_documents x vocabulary size.
x_train_tfidf = train_sparse.toarray()
x_test_tfidf = test_sparse.toarray()

print(x_train_tfidf)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


Evaluation Function

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

def evaluate_model(true, predicted):
    """Score a set of predictions against the ground-truth labels.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    predicted : array-like
        Labels predicted by a classifier, aligned with ``true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, confusion_matrix)`` in that order. All three
        come from scikit-learn with default arguments, so precision is the
        binary precision for the positive label 1.
    """
    return (
        accuracy_score(true, predicted),
        precision_score(true, predicted),
        confusion_matrix(true, predicted),
    )

Model Selection

In [11]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Candidate classifiers, keyed by the display name used in the printed report.
# All use default hyper-parameters. The two ensemble models draw random numbers
# while fitting, so they are seeded: without a fixed random_state their scores
# change from one run to the next.
# NOTE(review): scores printed by an earlier unseeded run may differ slightly.
models = {
    "SVC": SVC(),
    "GaussianNB": GaussianNB(),
    "MultinomialNB": MultinomialNB(),
    "BernoulliNB": BernoulliNB(),
    "Random Forest": RandomForestClassifier(random_state=0),
    "AdaBoost": AdaBoostClassifier(random_state=0),
}
# Parallel accumulators: one entry per model, in the order models are trained.
# They are reset here so re-running the cell does not append duplicates.
model_list = []
model_precision = []
model_accuracy = []

# Iterate over (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) and indexing on every pass.
for model_name, model in models.items():
    # Train on the TF-IDF features of the training split.
    model.fit(x_train_tfidf, y_train)

    # Make predictions
    y_test_pred = model.predict(x_test_tfidf)

    # Evaluate
    model_ac, model_pr, model_cm = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)
    model_precision.append(model_pr)
    model_accuracy.append(model_ac)

    print('Model performance')
    print('- Accuracy Score: {:.4f}'.format(model_ac))
    print('- Precision Score: {:.4f}'.format(model_pr))
    print('- Confusion Matrix:\n{}'.format(model_cm))

SVC
Model performance
- Accuracy Score: 0.9738
- Precision Score: 0.9821
- Confusion Matrix:
[[895   2]
 [ 25 110]]
GaussianNB
Model performance
- Accuracy Score: 0.8605
- Precision Score: 0.4800
- Confusion Matrix:
[[780 117]
 [ 27 108]]
MultinomialNB
Model performance
- Accuracy Score: 0.9641
- Precision Score: 1.0000
- Confusion Matrix:
[[897   0]
 [ 37  98]]
BernoulliNB
Model performance
- Accuracy Score: 0.9680
- Precision Score: 1.0000
- Confusion Matrix:
[[897   0]
 [ 33 102]]
Random Forest
Model performance
- Accuracy Score: 0.9748
- Precision Score: 1.0000
- Confusion Matrix:
[[897   0]
 [ 26 109]]




AdaBoost
Model performance
- Accuracy Score: 0.9632
- Precision Score: 0.8819
- Confusion Matrix:
[[882  15]
 [ 23 112]]


Results

In [12]:
# One row per trained model, in training order, with its test-set scores.
pd.DataFrame(
    {
        'Model Name': model_list,
        'Accuracy': model_accuracy,
        'Precision': model_precision,
    }
)

Unnamed: 0,Model Name,Accuracy,Precision
0,SVC,0.973837,0.982143
1,GaussianNB,0.860465,0.48
2,MultinomialNB,0.964147,1.0
3,BernoulliNB,0.968023,1.0
4,Random Forest,0.974806,1.0
5,AdaBoost,0.963178,0.88189
