# Chapter 1: Build a spam classifier

# 1. Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Show (almost) the full email text in DataFrame output instead of truncating it.
# The fully qualified option key is used because abbreviated keys are resolved by
# pattern matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 5000)

In [None]:
!pip install lime

In [None]:
from lime.lime_tabular import LimeTabularExplainer

# 2. Load and inspect the data

In [None]:
# load the CSV data into a pandas DataFrame
df = pd.read_csv('spam_ham_dataset.csv')
# show the first two rows of the DataFrame
display(df.head(2))
# show column names, data types and non-null counts; info() prints its report
# itself and returns None, so wrapping it in print() would add a stray "None"
df.info()
# 'label' holds the class (spam/ham): show a bar chart of the class frequencies
df['label'].value_counts().plot(kind='bar')

## 2.1 Show examples of spam and ham emails

In [None]:
# Show the second spam email with every run of whitespace collapsed to one space.
# The pattern is a raw string: '\s' in a normal string literal is an invalid escape.
df.query('label=="spam"')['text'].replace(r'\s+', ' ', regex=True).iloc[1]

In [None]:
# Show the first ham email with every run of whitespace collapsed to one space.
# The pattern is a raw string: '\s' in a normal string literal is an invalid escape.
df.query('label=="ham"')['text'].replace(r'\s+', ' ', regex=True).iloc[0]

# 3. Build the model

## 3.1 Training/Test split

In [None]:
# Hold out 20% of the emails for testing. Stratifying on the class label keeps
# the class proportions the same in both parts; the fixed seed makes the split
# repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[['email_id', 'text']],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

print('Size of Training Data ', len(X_train))
print('Size of Test Data ', len(X_test))

## 3.2 Generate features
Convert the text of each email into numeric TF-IDF features.

In [None]:
# TF-IDF over unigrams and bigrams, with English stop words removed and
# min_df=10 as the minimum document frequency for a term to be kept
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=10,
)
# fit the vectorizer on the training text only ...
X_train_tf = tfidf.fit_transform(X_train['text'])
# ... and apply the already-fitted vectorizer to the test text
X_test_tf = tfidf.transform(X_test['text'])

## 3.3 Train the model

In [None]:
# Multinomial Naive Bayes classifier, created with its default hyper-parameters
model = MultinomialNB()

# learn from the TF-IDF features and labels of the training emails
model.fit(X=X_train_tf, y=Y_train)

## 3.4 Train a baseline
Use the "uniform" strategy (each class is predicted at random with equal probability) as a reference point for the real model.

In [None]:
# Random-guess baseline: the "uniform" strategy draws each prediction at random
# from the classes seen during fit. Seeding it (same seed as the train/test
# split) makes the baseline predictions, and its confusion matrix, repeatable
# across notebook runs.
baseline = DummyClassifier(strategy="uniform", random_state=42)
baseline.fit(X_train_tf, Y_train)

## 3.5 Generate predictions for the test set

In [None]:
# Predict the labels of the held-out test emails with both classifiers:
# first the random baseline, then the trained model.
Y_pred_baseline = baseline.predict(X_test_tf)
Y_pred = model.predict(X_test_tf)

# 4. Evaluate the predictions

## 4.1 Evaluation with Confusion Matrix

In [None]:
def plot_confusion_matrix(confusion_matrix, class_labels):
    """Draw a confusion matrix as an annotated blue heatmap.

    Args:
        confusion_matrix: 2-D matrix of counts to plot. Rows are labelled
            "Actual" and columns "Predicted". Note that inside this function
            the parameter shadows the imported ``confusion_matrix`` function.
        class_labels: Tick labels applied to both axes, in matrix order.
    """
    axes = plt.subplot()

    # fmt='' prints each cell value as-is rather than in seaborn's default
    # compact number format.
    sns.heatmap(confusion_matrix, annot=True, fmt='', cmap='Blues', ax=axes)
    axes.set(xlabel='Predicted', ylabel='Actual')
    axes.xaxis.set_ticklabels(class_labels)
    axes.yaxis.set_ticklabels(class_labels)

In [None]:
# Confusion matrix of the trained model on the test set. Passing labels= pins
# the row/column order to model.classes_, so the matrix always has one row and
# column per class and lines up with the tick labels given to the plot (even if
# a class happened to be missing from the test labels and predictions).
cf_matrix = confusion_matrix(Y_test, Y_pred, labels=model.classes_)
plot_confusion_matrix(cf_matrix, list(model.classes_))

In [None]:
# Confusion matrix of the random baseline on the test set. Passing labels= pins
# the row/column order to baseline.classes_, so the matrix always has one row
# and column per class and lines up with the tick labels given to the plot
# (even if the random predictions happened to miss a class).
cf_matrix_baseline = confusion_matrix(Y_test, Y_pred_baseline, labels=baseline.classes_)
plot_confusion_matrix(cf_matrix_baseline, list(baseline.classes_))

# 5. Understanding the predictions

In [None]:
# LIME explainer built on the TF-IDF training matrix. The vectorizer's feature
# names are passed in so explanations show the actual terms instead of indices.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement and returns an array,
# which is converted to a plain list here.
explainer = LimeTabularExplainer(X_train_tf,
                                 mode='classification',
                                 class_names=list(model.classes_),
                                 feature_names=list(tfidf.get_feature_names_out()))

In [None]:
# Display the test email at position 100 among those labelled spam
X_test.loc[Y_test == 'spam'].iloc[100]

In [None]:
# Select the TF-IDF rows of the spam test emails, then explain the one at
# position 100 using the model's class probabilities and the 10 most
# influential features.
spam_rows = X_test_tf[(Y_test == 'spam').to_numpy()]
explanation = explainer.explain_instance(
    spam_rows[100, :],
    model.predict_proba,
    num_features=10,
)
explanation.show_in_notebook()