In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import CountVectorizer

### Import Data

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/bigmlcom/python/master/data/spam.csv')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Data Preprocess

In [3]:
#remove the stop words and transform the texts into the vectorized input variables X
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df["v2"])

#transform the values of the output variable into 0 and 1
y = df["v1"].map({'spam':1,'ham':0})

#split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Multinomial Naive Bayes Classifier

The multinomial NB classifier has a hyperparameter called **`alpha`**. It is the **smoothing parameter** to avoid **zero counts** when calculating the frequencies. 

For example, if we are now classifying a new SMS with a word "ryan" which never exist in the spam emails within our training dataset, the **likelihood** for this word will be zero. This will casue the **overall likelihood** to be zero (because we take the product of all **individual likelihoods**) for no matter what class of output variable we have.

Therefore, we need to add **additional counts** to each word when calculating the frequencies to avoid have a zero likelihood value. **Alpha** indicates how many **additional counts** we add.

In [4]:
#train and evaluate models with different alpha values
alpha_values = np.arange(0.01, 10, 0.01)
accuracy_scores, recall_scores, precision_scores = [], [], []

for alpha in alpha_values:
    NB = MultinomialNB(alpha=alpha)
    NB.fit(X_train, y_train)
    accuracy_scores.append(metrics.accuracy_score(y_test, NB.predict(X_test)))
    recall_scores.append(metrics.recall_score(y_test, NB.predict(X_test)))
    precision_scores.append(metrics.precision_score(y_test, NB.predict(X_test)))

performance_NB = pd.DataFrame(columns=['alpha', 'accuracy', 'recall', 'precision'])
performance_NB['alpha'] = alpha_values
performance_NB['accuracy'] = accuracy_scores
performance_NB['recall'] = recall_scores
performance_NB['precision'] = precision_scores
performance_NB.head()

Unnamed: 0,alpha,accuracy,recall,precision
0,0.01,0.980263,0.953782,0.911647
1,0.02,0.980263,0.962185,0.905138
2,0.03,0.980861,0.962185,0.90873
3,0.04,0.980861,0.962185,0.90873
4,0.05,0.980861,0.966387,0.905512


In [5]:
#finds the best alpha value
best_index = performance_NB['accuracy'].idxmax()
performance_NB.iloc[best_index, :]

alpha        3.800000
accuracy     0.983852
recall       0.911765
precision    0.973094
Name: 379, dtype: float64

## <a id="4"></a>

# <center>Example: Titanic Survival Prediction</center>

### Import Data

In [6]:
df_train = pd.read_csv('https://github.com/pcsanwald/kaggle-titanic/blob/master/train.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/test.csv')

### Data Preprocess

In [7]:
#missing values
df_train = df_train.dropna(subset=['Embarked'])
df_train['Age'].fillna(round(df_train['Age'].dropna().mean()), inplace=True)
df_test['Age'].fillna(round(df_test['Age'].dropna().mean()), inplace=True)

#feature engineering: family size
df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch'] + 1
df_train['Alone'] = df_train['FamilySize'].map({1:1})
df_train['Alone'].fillna(0, inplace=True)
df_test['Alone'] = df_test['FamilySize'].map({1:1})
df_test['Alone'].fillna(0, inplace=True)

#feature engineering: age band
bins = [0, 15, 30, 60, 81]
labels = [0, 1, 2, 3]
df_train['AgeBand'] = pd.cut(df_train['Age'], bins=bins, labels=labels, right=False)
df_test['AgeBand'] = pd.cut(df_test['Age'], bins=bins, labels=labels, right=False)

#feature engineering: fare band
bins = [0, 8, 14, 31, 513]
labels = [0, 1, 2, 3]
df_train['FareBand'] = pd.cut(df_train['Fare'], bins=bins, labels=labels, right=False)
df_test['FareBand'] = pd.cut(df_test['Fare'], bins=bins, labels=labels, right=False)

#encode categorical variables
df_train['Sex'] = df_train['Sex'].map({'male':1, 'female':0})
df_test['Sex'] = df_test['Sex'].map({'male':1, 'female':0})
df_train = pd.get_dummies(df_train, columns= ['Embarked'])
df_test = pd.get_dummies(df_test, columns= ['Embarked'])
df_train.drop(columns=['Embarked_S'], inplace=True)
df_test.drop(columns=['Embarked_S'], inplace=True)

#create datasets for modelling
X_train = df_train.drop(columns=['Survived','Name','Age','SibSp','Parch','Fare','PassengerId','Ticket','Cabin'])
y_train = df_train['Survived']
X_test = df_test.drop(columns=['Name','Age','SibSp','Parch','Fare','Ticket','Cabin']).copy()

#create the validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

### Gaussian Naive Bayes Classifier

There is one hyperparameter we need to tune: **`var_smoothing`**. This is the **portion of the largest variance** of all features that is added to variances for **calculation stability**.

In [8]:
#train models with different smoothing values
smooth_values = np.linspace(1e-9, 1e-2, 1000)
accuracy_scores, recall_scores, precision_scores = [], [], []

for smooth in smooth_values:
    GNB = GaussianNB(var_smoothing=smooth)
    GNB.fit(X_train, y_train)
    accuracy_scores.append(metrics.accuracy_score(y_valid, GNB.predict(X_valid)))
    recall_scores.append(metrics.recall_score(y_valid, GNB.predict(X_valid)))
    precision_scores.append(metrics.precision_score(y_valid, GNB.predict(X_valid)))

performance_GNB = pd.DataFrame(columns=['smooth', 'accuracy', 'recall', 'precision'])
performance_GNB['smooth'] = smooth_values
performance_GNB['accuracy'] = accuracy_scores
performance_GNB['recall'] = recall_scores
performance_GNB['precision'] = precision_scores
performance_GNB.head()

Unnamed: 0,smooth,accuracy,recall,precision
0,1e-09,0.737828,0.7,0.675439
1,1.001101e-05,0.737828,0.7,0.675439
2,2.002102e-05,0.737828,0.7,0.675439
3,3.003103e-05,0.737828,0.7,0.675439
4,4.004104e-05,0.737828,0.7,0.675439


In [9]:
#finds the best smoothing value
best_index = performance_GNB['accuracy'].idxmax()
performance_GNB.iloc[best_index, :]

smooth       0.006096
accuracy     0.752809
recall       0.700000
precision    0.700000
Name: 609, dtype: float64