In [None]:
# data processed below and stored 

In [1]:
import kaggle
import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
from zipfile import ZipFile
import cv2

In [2]:
%store -r X_train
%store -r X_cv
%store -r y_train
%store -r y_cv
%store -r mmscaler
%store -r pca
%store -r X_train_pca
%store -r X_train_scaled

In [3]:
# predict function 
def predict_data(classifier, scale=True, pca=True):
    if scale and pca:
        classifier.fit(X_train_pca, y_train)
        X_cv_scaled = mmscaler.transform(X_cv)
        X_cv_pca = pca.transform(X_cv_scaled)
        X_cv_pred = classifier.predict(X_cv_pca)
    elif scale:
        classifier.fit(X_train_scaled, y_train)
        X_cv_scaled = mmscaler.transform(X_cv)
        X_cv_pred = classifier.predict(X_cv_scaled)
    elif scale==False and pca==False:
        classifier.fit(X_train, y_train)
        X_cv_pred = classifier.predict(X_cv)
    return X_cv_pred

In [4]:
# make data from for evaluation metrics
columns = ['Accuracy', 'TP', 'FP', 'TN', 'FN', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'Area Under RoC Curve']
evaluation_metrics = pd.DataFrame(columns = columns)
evaluation_metrics

Unnamed: 0,Accuracy,TP,FP,TN,FN,Sensitivity,Specificity,Precision,Recall,Area Under RoC Curve


In [5]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score

# fill evaluation metrics table
def evaluate_classifier(dataframe, classifier, scale=True, pca=True):
    index = str(classifier)
    pred = predict_data(classifier, scale=scale, pca=pca)
    accuracy = sum(pred==y_cv)/len(y_cv)
    cm = confusion_matrix(y_cv, pred)
    tp, fp, tn, fn = cm[1,1], cm[0,1], cm[0,0], cm[1,0]
    sensitivity = tp / (tp+fn)
    specificity = tn / (tn+fp)
    precision = precision_score(y_cv, pred)
    recall = recall_score(y_cv, pred)
    auc = roc_auc_score(pred, y_cv)
    dataframe.loc[index] = [accuracy, tp, fp, tn, fn, sensitivity, specificity, precision, recall, auc]

In [6]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB, BernoulliNB, CategoricalNB
multinomialNB = MultinomialNB()
gaussianNB = GaussianNB()
complementNB = ComplementNB()
bernoulliNB = BernoulliNB()
categoricalNB = CategoricalNB()

# used normalized, but not feature extracted (not pca dimentionality reduction processed) data set 
# because most Naive Bayes classifiers do not accept negative values on training data
# also, Naive Bayes learners and classifiers can be extremely fast despite of high dimensions 

evaluate_classifier(evaluation_metrics, multinomialNB, pca=False)
evaluate_classifier(evaluation_metrics, gaussianNB, pca=False)
evaluate_classifier(evaluation_metrics, complementNB, pca=False)
evaluate_classifier(evaluation_metrics, bernoulliNB, pca=False)
evaluate_classifier(evaluation_metrics, categoricalNB, pca=False)
evaluation_metrics 

Unnamed: 0,Accuracy,TP,FP,TN,FN,Sensitivity,Specificity,Precision,Recall,Area Under RoC Curve
MultinomialNB(),0.765123,1597.0,697.0,1856.0,363.0,0.814796,0.726988,0.696164,0.814796,0.766288
GaussianNB(),0.6694,1333.0,865.0,1688.0,627.0,0.680102,0.661183,0.60646,0.680102,0.667809
ComplementNB(),0.763572,1598.0,705.0,1848.0,362.0,0.815306,0.723854,0.693878,0.815306,0.765038
BernoulliNB(),0.728784,1673.0,937.0,1616.0,287.0,0.853571,0.632981,0.640996,0.853571,0.745091
CategoricalNB(),0.629958,855.0,565.0,1988.0,1105.0,0.436224,0.778692,0.602113,0.436224,0.622427


## Naive Bayes
- Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes' theorem with the 'naive' assumption of conditional independence between every pair of features given the value of the class variable. 
- The different naive Bayes classifiers differ mainly by the assumptions they make regarding the distribution of $P(x_i|y)$.
- The decoupling of the class conditional feature distributions means that each distsribution can be independently estimated as a one dimensional distribution. 
- It is knwon to be good at claassification, but bad at estimation.

### 1. MultinomialNB
- Mutilnomail Naive Bayes classifier is suitable for classification with discrete features (e.g. word counts for text classification). The numltinomial distribution normally requiures integer feature counts. However, in practice, fractional counts such as tf-idf may also work.
    - tf-idf, short for **term frequency-inverse document frequency**, is numerical statistic that is intended to relfect how important a word is to a document in a collection or corpus. Ofted used as a weighting factor in searches of information retrieval, text mining, and user modeling.
- The parameters $\theta_y$ is estimated by a smoothed version of maximum likelihood, i.e. relative frequency counting:
    - $\hat{\theta_{yi}} = $ $\large {{N_{yi} + \alpha} \over {N_y +  \alpha n}}$    
    - Setting $\alpha=1$ is called Laplace smoothing (Default), while $\alpha<1$ is called Lidstone smoothing.

#### Exploring Attributes and Methods

In [7]:
print("The number of samples encountered for each class during fitting: ", multinomialNB.class_count_)
print("The number of samples encountered for each class during fitting: ", sum(np.array(y_train)==0), sum(np.array(y_train)==1))

The number of samples encountered for each class during fitting:  [10012.  8039.]
The number of samples encountered for each class during fitting:  10012 8039


In [8]:
import math
print("The Smoothed probability for each class: ", np.exp(multinomialNB.class_log_prior_))
print("The smoothed probability for each class: ", sum(np.array(y_train)==0)/len(y_train), sum(np.array(y_train)==1)/len(y_train))

The Smoothed probability for each class:  [0.55465071 0.44534929]
The smoothed probability for each class:  0.5546507118719185 0.44534928812808156


In [9]:
print("The empirical log probability of features given a class, p(x_i|y): \n", multinomialNB.feature_log_prob_)
print("with shape: ", multinomialNB.feature_log_prob_.shape)
print("The probability of features given a class, p(x_i|y): \n",  np.exp(multinomialNB.feature_log_prob_))
print("with shape: ", np.exp(multinomialNB.feature_log_prob_).shape)
print("The sum of the probability of features given a class, p(x_i|y): \n",  np.exp(multinomialNB.feature_log_prob_).sum(axis=1))

The empirical log probability of features given a class, p(x_i|y): 
 [[-9.25276143 -9.28468328 -9.37842624 ... -9.2950064  -9.33821184
  -9.44233523]
 [-9.31141794 -9.32124067 -9.33223266 ... -9.32227054 -9.33741926
  -9.35005999]]
with shape:  (2, 12288)
The probability of features given a class, p(x_i|y): 
 [[9.58466127e-05 9.28353303e-05 8.45281251e-05 ... 9.18819091e-05
  8.79966480e-05 7.92950199e-05]
 [9.03862915e-05 8.95027978e-05 8.85243718e-05 ... 8.94106691e-05
  8.80664197e-05 8.69602022e-05]]
with shape:  (2, 12288)
The sum of the probability of features given a class, p(x_i|y): 
 [1. 1.]


In [10]:
print("Parameteres (default setting): ", multinomialNB.get_params())

Parameteres (default setting):  {'alpha': 1.0, 'class_prior': None, 'fit_prior': True}


#### Testing different alpha

In [11]:
columns = ['Accuracy', 'TP', 'FP', 'TN', 'FN', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'Area Under RoC Curve']
different_alphas = pd.DataFrame(columns = columns)
# from 0.1 to 1
alphas = np.linspace(0, 1, 11)
alphas = alphas[1:]
for alpha in alphas:
    multinomialNB = MultinomialNB(alpha=alpha)
    evaluate_classifier(different_alphas, multinomialNB, pca=False)
# from 1 to 10
alphas = np.linspace(1, 10, 10)
for alpha in alphas:
    multinomialNB = MultinomialNB(alpha=alpha)
    evaluate_classifier(different_alphas, multinomialNB, pca=False)
# from 10 to 100
alphas = np.linspace(10, 100, 10)
for alpha in alphas:
    multinomialNB = MultinomialNB(alpha=alpha)
    evaluate_classifier(different_alphas, multinomialNB, pca=False)
different_alphas

Unnamed: 0,Accuracy,TP,FP,TN,FN,Sensitivity,Specificity,Precision,Recall,Area Under RoC Curve
MultinomialNB(alpha=0.1),0.764901,1597.0,698.0,1855.0,363.0,0.814796,0.726596,0.695861,0.814796,0.7661
MultinomialNB(alpha=0.2),0.764901,1597.0,698.0,1855.0,363.0,0.814796,0.726596,0.695861,0.814796,0.7661
MultinomialNB(alpha=0.30000000000000004),0.764901,1597.0,698.0,1855.0,363.0,0.814796,0.726596,0.695861,0.814796,0.7661
MultinomialNB(alpha=0.4),0.764901,1597.0,698.0,1855.0,363.0,0.814796,0.726596,0.695861,0.814796,0.7661
MultinomialNB(alpha=0.5),0.765123,1597.0,697.0,1856.0,363.0,0.814796,0.726988,0.696164,0.814796,0.766288
MultinomialNB(alpha=0.6000000000000001),0.765123,1597.0,697.0,1856.0,363.0,0.814796,0.726988,0.696164,0.814796,0.766288
MultinomialNB(alpha=0.7000000000000001),0.765123,1597.0,697.0,1856.0,363.0,0.814796,0.726988,0.696164,0.814796,0.766288
MultinomialNB(alpha=0.8),0.765123,1597.0,697.0,1856.0,363.0,0.814796,0.726988,0.696164,0.814796,0.766288
MultinomialNB(alpha=0.9),0.765123,1597.0,697.0,1856.0,363.0,0.814796,0.726988,0.696164,0.814796,0.766288
MultinomialNB(),0.765123,1597.0,697.0,1856.0,363.0,0.814796,0.726988,0.696164,0.814796,0.766288


### 2. GausianNB
- The likelihood of the features is assumed to be Gaussian: $P(x_i|y) =$ $1 \over \sqrt{2 \pi \sigma_y^2}$ $exp\Big(-$ ${(x_i - \mu_y)^2} \over 2\sigma_y^2 $ $\Big)$
    - The parameter $\sigma_y$ and $\mu_y$ are estimated using maximum likelihood

#### Exploring Attributes and Methods

In [12]:
print("The number of samples encountered for each class during fitting: ", gaussianNB.class_count_)
print("The number of samples encountered for each class during fitting: ", sum(np.array(y_train)==0), sum(np.array(y_train)==1))

The number of samples encountered for each class during fitting:  [10012.  8039.]
The number of samples encountered for each class during fitting:  10012 8039


In [13]:
import math
print("The Smoothed probability for each class: ", gaussianNB.class_prior_)
print("The smoothed probability for each class: ", sum(np.array(y_train)==0)/len(y_train), sum(np.array(y_train)==1)/len(y_train))

The Smoothed probability for each class:  [0.55465071 0.44534929]
The smoothed probability for each class:  0.5546507118719185 0.44534928812808156


In [14]:
print("Mean of each feature per class: ", gaussianNB.theta_)
print("Check if theta_ same as mean(): ")
print(all(gaussianNB.theta_[0] == X_train_scaled[np.array(y_train)==0].mean(axis=0)))
print(all(gaussianNB.theta_[1] == X_train_scaled[np.array(y_train)==1].mean(axis=0)))

Mean of each feature per class:  [[0.67283495 0.65169287 0.59336835 ... 0.64499894 0.6177207  0.55662695]
 [0.78620207 0.77851601 0.77000407 ... 0.77771452 0.76602006 0.75639639]]
Check if theta_ same as mean(): 
True
True


In [15]:
print("Variance of each feature per class: ", gaussianNB.var_)
print("Check if var_ same as var(): ")
print(all(gaussianNB.var_[0] == X_train_scaled[np.array(y_train)==0].var(axis=0)))
print(all(gaussianNB.var_[1] == X_train_scaled[np.array(y_train)==1].var(axis=0)))
print("Check if two arrays are element-wise equal within epsilon_ tolerance: ")
print(np.allclose(gaussianNB.var_[0], X_train_scaled[np.array(y_train)==0].var(axis=0), atol=gaussianNB.epsilon_))
print(np.allclose(gaussianNB.var_[1], X_train_scaled[np.array(y_train)==1].var(axis=0), atol=gaussianNB.epsilon_))

Variance of each feature per class:  [[0.10666819 0.10874434 0.13554036 ... 0.1096265  0.11297788 0.13918359]
 [0.09416442 0.09513553 0.10221959 ... 0.09533106 0.0985865  0.1055053 ]]
Check if var_ same as var(): 
False
False
Check if two arrays are element-wise equal within epsilon_ tolerance: 
True
True


### 3. ComplementNB
- It is an adaptation of the standard multinomial naive Bayes algorithm that is particularly suited for imbalanced data sets.
- It was designed to correct the "severe assumptions" made by the standard multinomial naive Bayes classifier. 

In [16]:
complementNB.feature_count_

array([[6736.42352941, 6524.74901961, 5940.80392157, ..., 6457.72941176,
        6184.61960784, 5572.94901961],
       [6320.27843137, 6258.49019608, 6190.0627451 , ..., 6252.04705882,
        6158.03529412, 6080.67058824]])

### 4. BernoulliNB
- It implements the naive Bayes training and classification algorithms for data that is distributed according to multivariate Bernoulli distributions **(each feature is assumed to be a binary-valued variable)**.
- If any kind of data other than binary-value is provided, the `BernoulliNB` instance binarize its input (depending on `binarize` parameter).
- This might perform better on some datasets, especially those with shorter documents. 

In [17]:
bernoulliNB.feature_count_

array([[9896., 9888., 9809., ..., 9890., 9882., 9804.],
       [7886., 7896., 7892., ..., 7892., 7898., 7901.]])

### 5. Categorical Naive Bayes
- It implements the categorical naive Bayes algorithm for categorically distributed data. 
- It assumes that each feature has its own categorical distribution.
- It assumes that the sample matrix $X$ is encoded such that all categories for each feature $i$ are represented with numbers $0, \cdots, n_i-1$ where $n_i$ is the number of available categories of feature $i$.

In [18]:
print("The number of categories for each feature is all 2: ", all(categoricalNB.n_categories_==2))

The number of categories for each feature is all 2:  True


### Compare to using non-scaled dataset

In [19]:
# evaluation metrics for non-scaled data
columns = ['Accuracy', 'TP', 'FP', 'TN', 'FN', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'Area Under RoC Curve']
evaluation_metrics_nn = pd.DataFrame(columns = columns)
evaluation_metrics_nn

Unnamed: 0,Accuracy,TP,FP,TN,FN,Sensitivity,Specificity,Precision,Recall,Area Under RoC Curve


In [20]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, ComplementNB, BernoulliNB, CategoricalNB
multinomialNB = MultinomialNB()
gaussianNB = GaussianNB()
complementNB = ComplementNB()
bernoulliNB = BernoulliNB()
categoricalNB = CategoricalNB()

# used normalized, but not feature extracted (not pca dimentionality reduction processed) data set 
# because most Naive Bayes classifiers do not accept negative values on training data
# also, Naive Bayes learners and classifiers can be extremely fast despite of high dimensions 

evaluate_classifier(evaluation_metrics_nn, multinomialNB, scale=False, pca=False)
evaluate_classifier(evaluation_metrics_nn, gaussianNB, scale=False, pca=False)
evaluate_classifier(evaluation_metrics_nn, complementNB, scale=False, pca=False)
evaluate_classifier(evaluation_metrics_nn, bernoulliNB, scale=False, pca=False)
evaluate_classifier(evaluation_metrics_nn, categoricalNB, scale=False, pca=False)
evaluation_metrics_nn

Unnamed: 0,Accuracy,TP,FP,TN,FN,Sensitivity,Specificity,Precision,Recall,Area Under RoC Curve
MultinomialNB(),0.763572,1598.0,705.0,1848.0,362.0,0.815306,0.723854,0.693878,0.815306,0.765038
GaussianNB(),0.6694,1333.0,865.0,1688.0,627.0,0.680102,0.661183,0.60646,0.680102,0.667809
ComplementNB(),0.763572,1598.0,705.0,1848.0,362.0,0.815306,0.723854,0.693878,0.815306,0.765038
BernoulliNB(),0.728784,1673.0,937.0,1616.0,287.0,0.853571,0.632981,0.640996,0.853571,0.745091
CategoricalNB(),0.677376,1245.0,741.0,1812.0,715.0,0.635204,0.709753,0.626888,0.635204,0.671972


In [21]:
evaluation_metrics

Unnamed: 0,Accuracy,TP,FP,TN,FN,Sensitivity,Specificity,Precision,Recall,Area Under RoC Curve
MultinomialNB(),0.765123,1597.0,697.0,1856.0,363.0,0.814796,0.726988,0.696164,0.814796,0.766288
GaussianNB(),0.6694,1333.0,865.0,1688.0,627.0,0.680102,0.661183,0.60646,0.680102,0.667809
ComplementNB(),0.763572,1598.0,705.0,1848.0,362.0,0.815306,0.723854,0.693878,0.815306,0.765038
BernoulliNB(),0.728784,1673.0,937.0,1616.0,287.0,0.853571,0.632981,0.640996,0.853571,0.745091
CategoricalNB(),0.629958,855.0,565.0,1988.0,1105.0,0.436224,0.778692,0.602113,0.436224,0.622427


- Because some Naive Bayes models assume input feature represent count of appearance, we tried to use integer values.
    - The original data range from 0 to 255 discretely, and normalized data range from 0 to 1. 
- When comparing results between using original data and normalized data, 
    - GaussianNB, ComplementNB, BernoulliNB has same result
    - MultinomailNB works slightly (0.1%) better in normalized data
    - CategoricalNB works better (approximately 5%) in non-normalized data 

## Discussions

In [22]:
# Final evaluation table
def highlight_max(x):
    return ['font-weight: bold' if v==x.iloc[evaluation_metrics['Accuracy'].argmax()] else '' for v in x]
evaluation_metrics.style.apply(highlight_max)

Unnamed: 0,Accuracy,TP,FP,TN,FN,Sensitivity,Specificity,Precision,Recall,Area Under RoC Curve
MultinomialNB(),0.765123,1597.0,697.0,1856.0,363.0,0.814796,0.726988,0.696164,0.814796,0.766288
GaussianNB(),0.6694,1333.0,865.0,1688.0,627.0,0.680102,0.661183,0.60646,0.680102,0.667809
ComplementNB(),0.763572,1598.0,705.0,1848.0,362.0,0.815306,0.723854,0.693878,0.815306,0.765038
BernoulliNB(),0.728784,1673.0,937.0,1616.0,287.0,0.853571,0.632981,0.640996,0.853571,0.745091
CategoricalNB(),0.629958,855.0,565.0,1988.0,1105.0,0.436224,0.778692,0.602113,0.436224,0.622427


- Why MultinomailNB works best?
    - It is known to be good at text analysis, spam filtering, etc. Our data is originally image data and mapped into pixel data, where multiple columns represent whether shape exists in a specific location in the image or not. This representation cound be considered similar to the one used in spam filetring, where the existence of specific words decide the class. 
- Why GaussinNB and CategoricalNB show lower accuracy?
    - Our pixel data range from 0 to 255, discretely. 
    - GaussianNB assumes that data is continuous, and CategoricalNB assumes that data is categorical. We guess the reason why these two classifier shows lower accuracy is that these assumes different data types from our data. 
    - This also explains why ComplementNB and BernoulliNB work quite well. ComplementNB is based on MultinomialNB and BernoulliNB see data as binary, which is enough to explain the shape and color (if RGB).



## Bayes Nets
Using `bnlearn` library

In [23]:
pip install bnlearn

Note: you may need to restart the kernel to use updated packages.


In [None]:
import bnlearn as bn
X_train_df = pd.DataFrame(X_train_scaled)
model = bn.structure_learning.fit(X_train_df)
G = bn.plot(model)

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]


In [None]:
%store model

## Data preparation 
1. process image to 64 * 64 * 3 (RGB) numeric data
2. train - cross validation split (0.8:0.2)
3. normalization (using min-max scaler)
4. feature extraction (using PCA)

In [1]:
import kaggle
import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
from zipfile import ZipFile
import cv2

In [2]:
!kaggle datasets download -d techsash/waste-classification-data

waste-classification-data.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
# unzip file (downloaded file located in the same folder as the notebook) 
zip_file_name = 'waste-classification-data'
with ZipFile(zip_file_name+'.zip', 'r') as files:
    files.extractall(zip_file_name)

In [4]:
# Process image
# get list of image file names 
train_image_dir = os.path.join('waste-classification-data', 'DATASET', 'TRAIN')
organic_train_dir = os.path.join(train_image_dir, 'O')
recyclable_train_dir = os.path.join(train_image_dir, 'R')
organic_train = os.listdir(organic_train_dir)
recyclable_train = os.listdir(recyclable_train_dir)
print("Number of organic (O) images in training set: ", len(organic_train))
print("Number of recyclable (R) images in training set: ", len(recyclable_train))

# convert list to pd.DataFrame 
file_names = []
file_names.extend(organic_train)
file_names.extend(recyclable_train)

# define function 
def get_fullpath(image_name):
    if image_name.startswith('O'):
        full_path = os.path.join(organic_train_dir, image_name)
    elif image_name.startswith('R'):
        full_path = os.path.join(recyclable_train_dir, image_name)
    return full_path

# use (64 * 64) as a new shape
new_shape = (64, 64)

images = []
# takes file_name with full path and desired shape as parameter
for file in file_names:
    full_path = get_fullpath(file)    
    image_wc = cv2.imread(full_path, cv2.IMREAD_COLOR)
    image_tc = cv2.cvtColor(src=image_wc, code=cv2.COLOR_BGR2RGB)
    # resize image 
    image_tc_rs = cv2.resize(src=image_tc, dsize=new_shape)
    # store image
    images.append(image_tc_rs)

# stack images to a single array
train_X = np.stack(images)
train_X = train_X.reshape(len(train_X), -1)
print("train_X: ", train_X.shape)
     
# define labels for images (0 for organic, 1 for recyclable)
train_y = [0] * len(organic_train)
train_y.extend([1] * len(recyclable_train))

Number of organic (O) images in training set:  12565
Number of recyclable (R) images in training set:  9999
train_X:  (22564, 12288)


In [5]:
# train-test (train-cross validatain) split
from sklearn.model_selection import train_test_split
X_train, X_cv, y_train, y_cv = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

In [6]:
# min-man scale
from sklearn.preprocessing import MinMaxScaler
mmscaler = MinMaxScaler()
X_train_scaled = mmscaler.fit_transform(X_train)

In [7]:
# pca
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
print(X_train_pca.shape)

(18051, 1317)


In [8]:
%store X_train
%store X_cv
%store y_train
%store y_cv
%store mmscaler
%store pca
%store X_train_scaled
%store X_train_pca

Stored 'X_train' (ndarray)
Stored 'X_cv' (ndarray)
Stored 'y_train' (list)
Stored 'y_cv' (list)
Stored 'mmscaler' (MinMaxScaler)
Stored 'pca' (PCA)
Stored 'X_train_scaled' (ndarray)
Stored 'X_train_pca' (ndarray)
