In [1]:
import copy
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

import comonotonic as cm
import utils

In [2]:
# Load the raw dataset and give its columns positional names.
# Assumption: the last column is the label. If the first column holds IDs,
# remove it before this renaming step.
filename = "adult.csv"
df = pd.read_csv("Datasets/" + filename).drop(columns=['education'])
# Features become X0..X(k-1); the final column becomes Y.
colnames = ['X' + str(i) for i in range(df.shape[1] - 1)] + ['Y']
df.columns = colnames

In [3]:
# Data cleaning for adult.csv: keep only rows where neither X1 nor X6 is '?'.
df = df[(df.X1 != '?') & (df.X6 != '?')]
# Encode the unrankable features (column positions handed to utils.encode_df),
# then cast the whole frame to int32.
encoded_df = utils.encode_df(df, [1,4,5,6,7,8,12,13]).astype('int32')
encoded_df

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,Y
1,82,0,132870,9,0,0,0,0,0,0,4356,18,0,0
3,54,0,140359,4,1,1,1,0,0,0,3900,40,0,0
4,41,0,264663,10,2,2,2,0,0,0,3900,40,0,0
5,34,0,216864,9,1,3,1,0,0,0,3770,45,0,0
6,38,0,150601,6,2,4,1,0,1,0,3770,40,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,0,310152,10,3,11,0,0,1,0,0,40,0,0
32557,27,0,257302,12,4,10,5,0,0,0,0,38,0,0
32558,40,0,154374,9,4,1,4,0,1,0,0,40,0,1
32559,58,0,151910,9,0,4,1,0,0,0,0,40,0,0


In [10]:
# Use the original (non-discretized) data for Naive Bayes.
original_X = encoded_df[colnames[:-1]].to_numpy()
original_Y = encoded_df[colnames[-1]].to_numpy()
original_X_train, original_X_test, original_Y_train, original_Y_test = train_test_split(original_X, original_Y, test_size=0.2, random_state=10)

# Discretize real-valued data on a copy so encoded_df stays untouched.
df_copy = encoded_df.copy()
# continuous variables; categorical rankable features are [3]
cont_col = [0,2,9,10,11]
num_categories_list = [3,4,5,6,7,8,9,10]

# Choosing a bin count per continuous column is a combinatorial optimization problem:
# if cont_col has size m and num_categories_list has size n, there are n^m combinations,
# so some combinatorial optimization method is needed to find the optimal one.
# TODO: implement that search; until then one bin count is applied to every column.
# NOTE(review): num_categories was not defined in any visible cell, so this cell raised
# NameError on a fresh kernel. The value behind the recorded accuracy is unknown;
# the first candidate is used here -- confirm the intended value.
num_categories = num_categories_list[0]

# Bin labels are the integers 0..num_categories-1 (same for every column).
bin_labels = list(range(num_categories))
for col_idx in cont_col:
    # Equal-width binning of the column at position col_idx.
    discretized_col = pd.cut(df_copy.iloc[:, col_idx], num_categories, labels=bin_labels)
    df_copy['X'+str(col_idx)] = discretized_col
# For pure comonotonic classifier
X = df_copy[colnames[:-1]].to_numpy()
Y = df_copy[colnames[-1]].to_numpy()
# Same test_size and random_state as the split of the original data above.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=10)

In [11]:
# Fit the pure comonotonic classifier on the discretized training split and
# report its accuracy on the held-out split.
# Column positions flagged as unrankable: the list given to utils.encode_df
# earlier, without 13 (the label column Y).
unrankable_cols = [1,4,5,6,7,8,12]
pure_como_classifier = cm.pure_comonotonic(X_train, Y_train, unrankable=unrankable_cols)
pure_como_classifier.run()
Y_predict = pure_como_classifier.predict(X_test)
cm.accuracy(Y_predict, Y_test)

0.4699755899104963

In [6]:
# Comparison with Naive Bayes: fit a Gaussian NB baseline on the original
# (non-discretized) training split.
model = GaussianNB()
model.fit(original_X_train, original_Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [7]:
# Predict on the held-out split of the original features and score the
# predictions with the same cm.accuracy helper used for the comonotonic classifier.
pred = model.predict(original_X_test)
cm.accuracy(pred, original_Y_test)

0.7843775427176566