In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
import time
import matplotlib.pyplot as plt
import math
from os import listdir
from os.path import isfile, join
from collections import Counter
import operator
import numpy as np
from sklearn.inspection import permutation_importance
import csv
import itertools
import random
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import warnings as warnings
import matplotlib.pyplot as plt

In [6]:
class TimerError(Exception):
    """Raised when a Timer is started or stopped in the wrong state."""


# Backward-compatible alias: the exception was originally declared under the
# name ``TimeError`` while ``Timer`` raised the (then undefined) ``TimerError``,
# which turned every misuse into a NameError.  Both names now refer to the
# same class.
TimeError = TimerError


class Timer:
    """Minimal stopwatch built on ``time.perf_counter``.

    Call ``start()`` and then ``stop()``; ``stop()`` returns the elapsed time
    in seconds and resets the timer so it can be started again.
    """

    def __init__(self):
        # None means "not running"; otherwise the perf_counter value at start.
        self._start_time = None

    def start(self):
        """Start a new timer.

        Raises:
            TimerError: if the timer is already running.
        """
        if self._start_time is not None:
            raise TimerError("Timer is running. Use .stop() to stop it")

        self._start_time = time.perf_counter()

    def stop(self):
        """Stop the timer and return the elapsed time in seconds.

        Raises:
            TimerError: if the timer has not been started.
        """
        if self._start_time is None:
            raise TimerError("Timer is not running. Use .start() to start it")

        elapsed_time = time.perf_counter() - self._start_time
        self._start_time = None
        return elapsed_time


In [7]:
def load_data(data_file, summary=None):
    """Load a header-less CSV data set and return it fully numeric.

    Processing steps:
      1. Read ``data_file`` with no header row.
      2. Drop every row containing the UCI missing-value marker ``'?'`` and
         every column/row that is entirely empty.
      3. Round-trip the frame through ``cleaned_frame.csv`` (written to the
         current working directory) so pandas re-infers column dtypes now
         that the ``'?'`` rows are gone.  Column names become the strings
         ``'0'``, ``'1'``, ...
      4. Encode the last column (the class label) as 0-based category codes.
      5. Encode every remaining text column as integers ``1..n`` following
         the sorted order of its whitespace-stripped values.

    Args:
        data_file: path of the CSV file to load.
        summary: optional dict of per-file result dicts.  The dtype histogram
            of the cleaned frame is stored under
            ``summary[data_file]["data_type"]``.  When omitted, a module-level
            ``result_summary`` dict is used if one exists (the original
            behaviour); if there is none, nothing is recorded.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index; last column is the label.
    """
    raw = pd.read_csv(data_file, header=None)

    # UCI has ? as missing data.  ``axis`` must be passed by keyword: the
    # positional form ``.any(1)`` is rejected by pandas 2.x.
    raw = raw[~raw.eq('?').any(axis=1)]
    raw = raw.dropna(axis=1, how='all').dropna(axis=0, how='all')
    raw.to_csv('cleaned_frame.csv', index=False)

    # reread for appropriate column dtypes
    df = pd.read_csv('cleaned_frame.csv', header=0)

    if summary is None:
        summary = globals().get('result_summary')
    if summary is not None:
        summary.setdefault(data_file, {})["data_type"] = Counter(df.dtypes.tolist())

    # convert Y to integer type
    target = df.columns[-1]
    df[target] = (
        df[target].astype('str').str.strip().astype('category').cat.codes
    )

    # Label-encode the remaining text columns.  Assigning the whole column
    # back (instead of ``df.iloc[:, i].replace(..., inplace=True)``) makes
    # sure the encoding really lands in ``df`` rather than in a temporary.
    for col in df.columns[:-1]:
        col_dtype = df[col].dtype
        is_text = (pd.api.types.is_object_dtype(col_dtype)
                   or pd.api.types.is_string_dtype(col_dtype))
        if not is_text:
            continue
        categories = df[col].astype('str').str.strip().astype('category')
        # codes are 0-based; the original mapping numbered categories from 1
        df[col] = categories.cat.codes.astype('int64') + 1

    return df.reset_index(drop=True)



In [8]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances


class MNBClassifier(ClassifierMixin, BaseEstimator):
    """Instance-based classifier using range-normalised distance weights.

    ``fit`` only memorises the training data.  For each query point,
    ``predict`` gives every training sample the weight
    ``1 / (1 + d) ** kappa``, where ``d`` is the Euclidean distance after
    dividing each feature by its range in the training data.  The score of a
    class is its prior (class frequency) times the mean weight of its
    samples; the class with the highest score is predicted.

    Parameters
    ----------
    kappa : int or float, default=20
        Exponent of the distance weight.  ``kappa=0`` makes every weight 1,
        so the prediction falls back to the most frequent class.
    """

    def __init__(self, kappa=20):
        self.kappa = kappa

    def fit(self, X, y):
        """Validate and store the training data; returns ``self``."""
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit (sorted unique labels)
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y
        return self

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Predicted labels, taken from ``self.classes_``.
        """
        # Check that fit has been called
        check_is_fitted(self, ['X_', 'y_'])

        # Input validation
        X = check_array(X)

        # Position of each training label inside classes_.  Indexing the
        # per-class arrays with the raw label value (as done previously) is
        # only correct when the labels are exactly 0..n_classes-1, which
        # breaks e.g. when a class is missing from a cross-validation fold.
        class_idx = np.searchsorted(self.classes_, self.y_)
        n_classes = len(self.classes_)

        counts = np.bincount(class_idx, minlength=n_classes).astype(float)
        prior = counts / counts.sum()

        # Per-feature range of the training data; a constant feature has
        # range 0 and would cause a division by zero, so it is scaled by 1.
        spans = (self.X_.max(axis=0) - self.X_.min(axis=0)).astype(float)
        spans[spans == 0] = 1.0

        train_scaled = self.X_ / spans
        query_scaled = X / spans

        winners = np.empty(len(X), dtype=np.intp)
        for i, row in enumerate(query_scaled):
            # Distances from this query to all training samples at once.
            dist = np.sqrt(((train_scaled - row) ** 2).sum(axis=1))
            weights = 1.0 / (1.0 + dist) ** self.kappa
            class_weight = np.bincount(class_idx, weights=weights,
                                       minlength=n_classes)
            # Every class in classes_ occurs in y_, so counts is never 0.
            score = prior * class_weight / counts
            winners[i] = np.argmax(score)

        # Translate argmax positions back to the actual class labels.
        return self.classes_[winners]

    def set_params(self, **parameters):
        """Set attributes from keyword arguments without validation."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self


In [9]:

def feature_selection(model,X,y,kf,num_CV):
    """Greedy forward feature selection driven by permutation importance.

    Stage 1: fit ``model`` on every training fold of ``kf`` and add up the
    mean permutation importance (accuracy drop) of each feature on the
    matching test fold.

    Stage 2: walk through the features with a positive total importance, most
    important first.  A feature is kept only if adding it to the features
    kept so far strictly improves the cross-validated accuracy.

    Args:
        model: estimator with ``fit``/``predict``; it is refitted in place.
        X: pandas.DataFrame of features.
        y: pandas.Series of labels aligned with ``X``.
        kf: cross-validation splitter providing ``split(X, y)``.
        num_CV: number of folds, used to average the fold accuracies.

    Returns:
        list[int]: positional column indices of the selected features.  If no
        feature qualifies, all column positions are returned so callers never
        receive an empty feature set.
    """
    n_features = X.shape[1]

    # Importances are accumulated by column *position*.  Keying them by
    # ``str(position)`` (as before) only worked when the column names
    # happened to be the strings "0", "1", ...
    totals = np.zeros(n_features)

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = model.fit(X_train, y_train)
        r = permutation_importance(model, X_test, y_test,
                                   n_repeats=30,
                                   random_state=0,
                                   scoring='accuracy')
        totals += r.importances_mean

    # Most important first; the stable sort keeps column order among ties.
    ranking = np.argsort(-totals, kind='stable')

    final_features = []
    best_score = 0
    for position in ranking:
        if not totals[position] > 0:
            # Sorted in descending order: nothing useful is left.
            break

        candidate = final_features + [int(position)]
        X_candidate = X.iloc[:, candidate]

        score = 0
        for train_index, test_index in kf.split(X_candidate, y):
            X_train = X_candidate.iloc[train_index, :]
            X_test = X_candidate.iloc[test_index, :]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            y_pred_t = model.fit(X_train, y_train).predict(X_test)
            score += (1 - ((y_test != y_pred_t).sum() / X_test.shape[0])) / num_CV

        # Keep the feature only when it strictly improves the best accuracy.
        if score > best_score:
            final_features = candidate
            best_score = score

    if not final_features:
        # No feature showed a positive importance: there is no basis for
        # dropping anything, and an empty selection would break the caller.
        final_features = list(range(n_features))

    print("final features {}".format(final_features))
    return final_features

In [12]:

import random

def _fold_accuracy(y_true, y_pred):
    """Return the fraction of entries of ``y_pred`` that equal ``y_true``."""
    return 1 - ((y_true != np.asarray(y_pred)).sum() / y_true.shape[0])


def _laplace_nb_predict(X_train, y_train, X_test):
    """Categorical naive Bayes with add-one (Laplace) smoothing.

    Every feature value is treated as a discrete symbol.  For each class the
    smoothed likelihood of a value is
    ``(count + 1) / (class_size + n_distinct_values_in_column)``.

    Returns a list with one predicted label per row of ``X_test``.
    """
    y_values = y_train.values
    # Only classes present in this training fold are considered: an absent
    # class has prior 0 and math.log(0) raises ValueError.
    labels = pd.unique(y_values)
    n_train = y_values.shape[0]

    class_size = {label: int((y_values == label).sum()) for label in labels}
    log_prior = {label: math.log(class_size[label] / n_train) for label in labels}

    # Count each (column, class, value) once up front instead of rescanning
    # the training data for every test row.
    column_tables = []
    for k in range(X_train.shape[1]):
        column = X_train.iloc[:, k].values
        n_distinct = len(pd.unique(column))
        per_class = {label: Counter(column[y_values == label].tolist())
                     for label in labels}
        column_tables.append((n_distinct, per_class))

    predictions = []
    for row in X_test.itertuples(index=False, name=None):
        log_prob = {label: 0.0 for label in labels}
        for k, (n_distinct, per_class) in enumerate(column_tables):
            for label in labels:
                smoothed = per_class[label][row[k]] + 1
                log_prob[label] += math.log(
                    smoothed / (class_size[label] + n_distinct))
        for label in labels:
            log_prob[label] += log_prior[label]
        predictions.append(max(log_prob.items(), key=operator.itemgetter(1))[0])
    return predictions


def naive_bayes_analysis(data_file,result_summary=None):
    """Benchmark several classifiers on one data set.

    For each of 10 seeds a shuffled 10-fold split is created, features are
    selected with ``feature_selection`` (Gaussian NB), ``kappa`` and
    ``n_neighbors`` are tuned with ``GridSearchCV``, and the following models
    are evaluated on every fold: tuned kNN, kNN with 20 neighbours, Gaussian
    NB, ``MNBClassifier`` with the tuned kappa / kappa=20 / kappa=60, and a
    Laplace-smoothed categorical naive Bayes.

    Accuracies stored in the summary are means over all folds and seeds.
    ``MNB_time`` and ``Laplace_time`` are the mean wall-clock seconds per
    fold of the kappa=60 MNB run and of the Laplacian classifier.

    Args:
        data_file: path to the CSV file, passed to ``load_data``.
        result_summary: dict collecting one result dict per data file.  A new
            dict is created when omitted.  NOTE(review): ``load_data`` as
            originally written records column dtypes into the module-level
            ``result_summary``, so that should be the dict passed in here.

    Returns:
        ``result_summary`` with ``result_summary[data_file]`` filled in.
        ``'Winner'`` is the index of the best of
        [Gaussian, Laplacian, kNN, MNB_optimal].
    """
    # A mutable default argument would be shared between calls.
    if result_summary is None:
        result_summary = {}

    # set up data collection
    summary = {"data_file": data_file}
    for key in ("Gaussian", "Laplacian", "kNN", "kNN_20", "MNB_optimal",
                "MNB_20", "MNB_60", "MNB_time", "Laplace_time"):
        summary[key] = 0
    result_summary[data_file] = summary

    seed_iter = list(range(0, 10, 1))
    num_CV = 10
    n_runs = num_CV * len(seed_iter)

    # The cleaned data does not depend on the seed, so load it once.
    df = load_data(data_file)
    summary["data_size"] = df.shape

    X_all = df[df.columns[0:len(df.columns) - 1]]
    y = df[df.columns[-1]]

    # kNN cannot use more neighbours than there are training samples; the
    # smallest training fold has n - ceil(n / num_CV) rows.
    min_train = max(1, len(y) - math.ceil(len(y) / num_CV))
    max_neighbors = min(99, min_train)
    fixed_neighbors = min(20, min_train)

    for seed in seed_iter:
        # Seed the split with the loop variable so runs are reproducible
        # (it used to be a fresh random.randint on every iteration).
        kf = KFold(num_CV, shuffle=True, random_state=seed)
        t = Timer()

        final_features = feature_selection(GaussianNB(), X_all, y, kf, num_CV)
        summary["selected_features"] = final_features
        X = X_all.iloc[:, final_features]

        # determine hyperparameters
        search_MNB = GridSearchCV(MNBClassifier(),
                                  param_grid={'kappa': range(0, 100, 1)},
                                  n_jobs=-1, cv=kf, scoring='accuracy').fit(X, y)
        search_KNN = GridSearchCV(KNeighborsClassifier(),
                                  param_grid={'n_neighbors': range(1, max_neighbors + 1)},
                                  n_jobs=-1, cv=kf, scoring='accuracy').fit(X, y)
        kappa = search_MNB.best_params_['kappa']
        best_k = search_KNN.best_params_['n_neighbors']

        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # KNN with the tuned number of neighbours
            y_pred = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train).predict(X_test)
            summary["kNN"] += _fold_accuracy(y_test, y_pred) / n_runs

            # KNN_20
            y_pred = KNeighborsClassifier(fixed_neighbors).fit(X_train, y_train).predict(X_test)
            summary["kNN_20"] += _fold_accuracy(y_test, y_pred) / n_runs

            # Gaussian
            y_pred = GaussianNB().fit(X_train, y_train).predict(X_test)
            summary["Gaussian"] += _fold_accuracy(y_test, y_pred) / n_runs

            # modified naive Bayes: tuned kappa, then the fixed 20 and 60
            y_pred = MNBClassifier(kappa).fit(X_train, y_train).predict(X_test)
            summary["MNB_optimal"] += _fold_accuracy(y_test, y_pred) / n_runs

            y_pred = MNBClassifier(20).fit(X_train, y_train).predict(X_test)
            summary["MNB_20"] += _fold_accuracy(y_test, y_pred) / n_runs

            t.start()
            y_pred = MNBClassifier(60).fit(X_train, y_train).predict(X_test)
            summary["MNB_60"] += _fold_accuracy(y_test, y_pred) / n_runs
            # Accumulate; plain assignment kept only the last fold's time.
            summary["MNB_time"] += t.stop() / n_runs

            # Laplacian
            t.start()
            y_pred = _laplace_nb_predict(X_train, y_train, X_test)
            summary["Laplacian"] += _fold_accuracy(y_test, y_pred) / n_runs
            summary["Laplace_time"] += t.stop() / n_runs

    winner = [summary["Gaussian"],
              summary["Laplacian"],
              summary["kNN"],
              summary["MNB_optimal"]]

    summary['Winner'] = winner.index(max(winner))
    print(summary['Winner'])
    print(summary)
    return result_summary
    


# Driver: run the benchmark on every file in ``location`` and write one CSV
# row per data set to results.csv.
result_summary={}
location='datasets3'

# Silences every warning raised from here on (library warnings included).
warnings.filterwarnings('ignore')
files = [f for f in listdir(location) if isfile(join(location, f))]
print(files)

for data_file in files:
    print(data_file)
    naive_bayes_analysis('{}/{}'.format(location,data_file),result_summary)

# Column names: union of the keys of all results in first-seen order, so the
# header no longer depends on which data set was processed last and is still
# defined (empty) when no file was found.
keys = list(dict.fromkeys(
    key for result in result_summary.values() for key in result))

# TODO (carried over from the original): rename these results as corr > 0.XX
# newline='' is required by the csv module; without it extra blank rows
# appear on platforms that translate line endings.
with open('results.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, keys)
    w.writeheader()
    for result in result_summary:
        w.writerow(result_summary[result])



['iris.data.csv', 'Dry_Bean_Dataset.csv', 'BreastTissue.csv', 'Algerian_forest_fires_dataset_UPDATE_modified.csv', 'sobar-72.csv', 'credi_data.data.csv', 'heart.dat.csv', 'wine.data.csv', 'breast-cancer.data.csv', 'balance-scale.data2.csv', 'winequality-red.csv', 'tic-tac-toe.data.csv', 'australian.dat.csv', 'yeast.data.csv', 'Raisin_Dataset.csv', 'winequality-white.csv', 'data_banknote_authentication.csv', 'balance-scale.data.csv', 'abalone.data.csv', 'heart_failure_clinical_records_dataset.csv', 'bcdata.csv', 'glass.data.csv', 'leaf.data.csv']
iris.data.csv
final features [3, 2]


KeyboardInterrupt: 