<a href="https://www.kaggle.com/code/larsmagnusson/workshop-15-september-bayes-on-heart-disease?scriptVersionId=143534772" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input directory,
# then print the paths one per line
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load dataset. No need to specify column names since they are part of the data
dataset = pd.read_csv('/kaggle/input/heart-disease-cleveland-uci/heart_cleveland_upload.csv')

# The names of all the features containing categorical data
cat_names = ['sex', 'cp', 'fbs', 'restecg', 'slope', 'thal', 'exang']
# The categorical features plus the target column, which is categorical as well
all_cat_names = cat_names + ['condition']

# Convert all categorical columns to the appropriate type
dataset[all_cat_names] = dataset[all_cat_names].astype('category')

# Split into 75/25 train/test (the training data will be used for cross validation).
# Sampling 75% within each 'condition' group keeps the class balance of the full data
# in the training partition; the rows that were not sampled form the test partition.
# NOTE: sample() is called without random_state, so the split differs between runs.
train_data = dataset.groupby('condition', group_keys=False).apply(lambda x: x.sample(frac=0.75))
test_data = dataset.drop(train_data.index)

# Select all the features by dropping the target column in both training and test partition.
# The test targets/features must come from test_data; taking them from train_data would
# make every "test" score an evaluation on the rows the model was trained on.
train_targets = train_data['condition']
train_features = train_data.drop('condition', axis=1)
test_targets = test_data['condition']
test_features = test_data.drop('condition', axis=1)

# Separate categorical features from numerical features in both training and test data.
# This is used when we try to manually solve the issues with different data types, but we 
# don't need it for our custom estimator (which handles this internally)
cat_train_features = train_features[cat_names]
num_train_features = train_features.drop(cat_names, axis=1)
cat_test_features = test_features[cat_names]
num_test_features = test_features.drop(cat_names, axis=1)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import CategoricalNB, GaussianNB

# Categorical branch: a pipeline that ordinal-encodes the features and feeds them to a
# categorical naive bayes classifier. fit() returns the fitted estimator, so creating
# and training can be chained.
cat_classifier = make_pipeline(OrdinalEncoder(), CategoricalNB()).fit(
    cat_train_features, train_targets
)

# Numerical branch: a gaussian naive bayes classifier trained on the numerical features
num_classifier = GaussianNB().fit(num_train_features, train_targets)

# Score/accuracy of each classifier on its own part of the test features,
# displayed as a (categorical, numerical) pair
cat_score = cat_classifier.score(cat_test_features, test_targets)
num_score = num_classifier.score(num_test_features, test_targets)
(cat_score, num_score)

In [None]:
# This shows how we can combine the output from two classifiers

# Fetch the class probabilities
cat_proba = cat_classifier.predict_proba(cat_test_features)
num_proba = num_classifier.predict_proba(num_test_features)

# Combine by elementwise multiplication. The raw product of two probability vectors
# does not sum to one, so every row is rescaled by its own total afterwards. Rows whose
# total is zero are left as zeros instead of being divided by zero.
proba = cat_proba * num_proba
row_totals = proba.sum(axis=1, keepdims=True)
proba = np.divide(proba, row_totals, out=np.zeros_like(proba), where=row_totals > 0)

# Print the combined probabilities (one element per class per instance)
print(proba)

# Find and print the index of the maximum probability (this should be used to access the 
# class labels). Rescaling a row by a positive constant does not change which class wins.
predictions = np.argmax(proba,axis=1)
predictions

In [None]:
# This shows how to do a simple exhaustive grid search

from sklearn.model_selection import GridSearchCV

# To see the tunable parameters of our chosen algorithm, uncomment:
#test_classifier = GaussianNB()
#test_classifier.get_params()

# The search space: one entry per tuned parameter, mapping the parameter name to the
# list of values to try. These values are not necessarily ideal, but show how to
# specify a list of values
params = {
    'var_smoothing': [0.0001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
}

# The grid search object: a gaussian naive bayes classifier with one single tuned
# parameter, evaluated using 5-fold cross validation
grid = GridSearchCV(estimator=GaussianNB(), param_grid=params, cv=5)

# Run the grid search and retrain using optimal parameter values
grid.fit(train_features, train_targets)

# Show the result with statistics for each individual fold
grid.cv_results_
# To test the accuracy of the optimized model on test data, uncomment:
#grid.score(test_features, test_targets)


In [None]:
# Shows how to create a custom classifier that combines two classifiers, in our case
# CategoricalNB and GaussianNB, but the setup will work for others

from sklearn.base import BaseEstimator, ClassifierMixin

# Define our estimator/classifier, inherits from BaseEstimator and ClassifierMixin
class OurNaivety(BaseEstimator, ClassifierMixin):
    """Classifier that combines one model for categorical and one for numerical features.

    The columns named in the module-level list ``cat_names`` are handed to
    ``cat_classifier``; all remaining columns are handed to ``num_classifier``.
    The class probabilities of the two models are multiplied elementwise and
    rescaled so that every row sums to one.

    Parameters
    ----------
    cat_classifier : estimator, default=CategoricalNB()
        Classifier trained on the categorical columns.
    num_classifier : estimator, default=GaussianNB()
        Classifier trained on the remaining (numerical) columns.

    Attributes
    ----------
    classes_ : class labels found during ``fit``; the columns of the
        ``predict_proba`` output follow this order.
    """

    # The constructor specifies all tunable parameters (accessible through get_params()).
    # NOTE: the default estimator instances are created once, when the class is defined,
    # so every OurNaivety() built without arguments shares the same two objects.
    def __init__(self, cat_classifier=CategoricalNB(),num_classifier=GaussianNB()):
        self.cat_classifier= cat_classifier
        self.num_classifier= num_classifier

    def fit(self,X,y):
        """Train both classifiers on their part of the data frame ``X``.

        Raises ValueError if the two classifiers did not learn the same class
        labels in the same order. Returns ``self``.
        """
        # Separate our data into numerical and categorical data
        cat_X = X[cat_names]
        num_X = X.drop(cat_names, axis=1)

        # Delegate the training to our two classifiers
        self.cat_classifier.fit(cat_X,y)
        self.num_classifier.fit(num_X,y)

        # The probabilities are multiplied column by column, which only makes sense when
        # column i refers to the same class in both classifiers. Check it instead of
        # silently assuming it.
        if not np.array_equal(self.cat_classifier.classes_, self.num_classifier.classes_):
            raise ValueError(
                "cat_classifier and num_classifier found different class labels; "
                "their probabilities cannot be combined"
            )

        # Store a local copy of the class labels found
        self.classes_ = self.cat_classifier.classes_

        # This is to follow the convention
        return self

    def predict_proba(self,data):
        """Return the combined class probabilities, one row per instance in ``data``."""
        # Separate our data into numerical and categorical data
        cat_data = data[cat_names]
        num_data = data.drop(cat_names, axis=1)

        # Multiply the class probabilities of the two classifiers elementwise
        cat_proba = self.cat_classifier.predict_proba(cat_data)
        num_proba = self.num_classifier.predict_proba(num_data)
        proba = cat_proba * num_proba

        # The raw product does not sum to one, so rescale every row by its own total.
        # Rows whose total is zero are left as zeros instead of being divided by zero.
        # Rescaling by a positive constant does not change which class has the maximum.
        row_totals = proba.sum(axis=1, keepdims=True)
        return np.divide(proba, row_totals, out=np.zeros_like(proba), where=row_totals > 0)

    def predict(self,data):
        """Return the predicted class label for every instance in ``data``."""
        # Use the index of the maximum probability to access the class labels
        return self.classes_[np.argmax(self.predict_proba(data), axis=1)]

In [None]:
# Shows how to run a grid search using our combined classifier. 

# Test our classifier (not needed). The score is printed explicitly because a notebook
# cell only echoes the value of its last expression; a bare expression in the middle
# of the cell is computed and then discarded.
model = OurNaivety()
model.fit(train_features,train_targets)
print(model.score(test_features, test_targets))

# Get available parameters (printed for the same reason as above)
print(model.get_params(deep=True))

# Set the grid search space. The '<parameter>__<name>' form addresses a parameter
# of the nested classifier stored under <parameter>.
params2 = {
    'cat_classifier__alpha':[0.5, 1.0],
    'num_classifier__var_smoothing': [1e-09, 1e-05]
          }
# Create the grid search, using 5-fold cross validation
grid2 = GridSearchCV(OurNaivety(), params2, cv=5)

# Train the model and tune the parameters, and test the best estimator on test data
grid2.fit(train_features, train_targets)
grid2.score(test_features, test_targets)
#grid2.cv_results_