In [1]:
import numpy as np
import pandas as pd
import comonotonic as cm
import os
from sklearn.model_selection import train_test_split
import copy
import utils
import matplotlib.pyplot as plt

In [2]:
# Load the raw dataset and give its columns positional names.
# Convention used throughout: the LAST column is the label.
# If a dataset's first column is an ID column, drop it before this point.
filename = "adult.csv"
random_state = 42
df = pd.read_csv("Datasets/" + filename).drop(columns=['education'])
# Features are renamed X0..X{n-1}; the final column becomes Y.
colnames = ['X' + str(i) for i in range(df.shape[1] - 1)] + ['Y']
df.columns = colnames

In [3]:
# Cleaning specific to adult.csv: discard every row whose X1 or X6 value is '?'.
has_known_values = (df.X1 != '?') & (df.X6 != '?')
df = df[has_known_values]
# Encode the columns the original author lists as "unrankable features", then
# store the whole frame as int32.
# NOTE(review): index 13 appears in this list but not in the `unrankable` list
# defined in a later cell — presumably it is the label column; confirm.
encoded_df = utils.encode_df(df, [1, 4, 5, 6, 7, 8, 12, 13]).astype('int32')

In [4]:
# Settings for discretizing the real-valued columns.
# Work on a copy so that encoded_df itself is left untouched.
df_copy = encoded_df.copy()
# Indices of the unrankable columns.
unrankable = [1, 4, 5, 6, 7, 8, 12]
# Indices of the continuous columns (the categorical rankable feature is [3]).
cont_col = [0, 2, 9, 10, 11]
# Candidate numbers of categories for each continuous column: 3 through 10.
num_categories_list = list(range(3, 11))

In [5]:
# Choosing a category count per continuous column is a combinatorial
# optimization problem: with m columns in cont_col and n candidates in
# num_categories_list there are n^m possible allocations, so a search
# heuristic is used instead of trying every combination.
# NOTE(review): the recorded run of this cell ended with
# "ZeroDivisionError: float division by zero" — the cause is not visible from
# this notebook; investigate utils.determine_allocation before relying on it.
accuracy_history, allocation_history = utils.determine_allocation(
    cont_col,
    num_categories_list,
    df_copy,
    unrankable,
    colnames,
    max_itr=100,
    temp=2,
    anneal_schedule=5,
    use_cluster=False,
    use_mistaken_accuracy_test=False,
    random_state=random_state,
)
# Plot the accuracy values in the order they were recorded.
plt.plot(list(accuracy_history.values()))

ZeroDivisionError: float division by zero

In [None]:
# Same allocation search as the non-clustered run, but with
# use_mistaken_accuracy_test switched on. This overwrites accuracy_history and
# allocation_history.
accuracy_history, allocation_history = utils.determine_allocation(
    cont_col,
    num_categories_list,
    df_copy,
    unrankable,
    colnames,
    max_itr=100,
    temp=2,
    anneal_schedule=5,
    use_cluster=False,
    use_mistaken_accuracy_test=True,
    random_state=random_state,
)
# Plot the accuracy values in the order they were recorded.
plt.plot(list(accuracy_history.values()))

In [None]:
# Position (in recording order) of the first occurrence of the highest accuracy.
acc_history = [*accuracy_history.values()]
best_accuracy = max(acc_history)
acc_history.index(best_accuracy)

In [None]:
# Re-evaluate one allocation taken from the search history.
# NOTE(review): key 49 is hard-coded — presumably copied from an earlier run's
# best index; confirm it still matches the current allocation_history.
utils.accuracy_test(
    allocation_history[49],
    df_copy,
    unrankable,
    colnames,
    0.5,    # NOTE(review): meaning of this positional argument is not visible here
    False,  # NOTE(review): meaning of this positional flag is not visible here
    random_state,
)

In [5]:
# Comparison baseline: Naive Bayes.
# Naive Bayes is trained on the encoded data as-is (no discretization applied).
original_X = encoded_df[colnames[:-1]].to_numpy()
original_Y = encoded_df[colnames[-1]].to_numpy()
# Hold out 20% of the rows for testing, using the notebook-wide seed.
(original_X_train, original_X_test,
 original_Y_train, original_Y_test) = train_test_split(
    original_X, original_Y, test_size=0.2, random_state=random_state
)

In [6]:
from sklearn.naive_bayes import GaussianNB

# scikit-learn Gaussian Naive Bayes baseline; fit() returns the estimator
# itself, so construction and fitting can be chained.
model = GaussianNB().fit(original_X_train, original_Y_train)
pred = model.predict(original_X_test)
# Accuracy on the held-out split (displayed as the cell output).
utils.get_accuracy(pred, original_Y_test)

0.7895850284784377

In [7]:
# Naive Bayes implemented from scratch (project module `cm`), trained on the
# same split as the scikit-learn baseline. cont_col is passed as the third
# argument — presumably to mark which columns are continuous; confirm in cm.
nb_classifier = cm.naive_bayes(original_X_train, original_Y_train, cont_col)
# run() must be called before predict(); presumably it performs the fitting.
nb_classifier.run()
nb_y_predict = nb_classifier.predict(original_X_test)
# Accuracy on the held-out split (displayed as the cell output).
utils.get_accuracy(nb_y_predict, original_Y_test)

0.8174125305126119

In [9]:
# Clustered comonotonic classifier: search for the category allocation of the
# continuous columns, this time with use_cluster switched on and min_corr=0.6.
# The search works on its own copy of the encoded data.
df_for_cluster = encoded_df.copy()
cluster_accuracy_history, cluster_allocation_history = utils.determine_allocation(
    cont_col,
    num_categories_list,
    df_for_cluster,
    unrankable,
    colnames,
    max_itr=30,
    temp=2,
    anneal_schedule=2,
    use_cluster=True,
    use_mistaken_accuracy_test=False,
    random_state=random_state,
    min_corr=0.6,
)

In [10]:
# Position (in recording order) of the first occurrence of the highest accuracy
# seen during the clustered search.
cluster_acc_history = [*cluster_accuracy_history.values()]
best_cluster_accuracy = max(cluster_acc_history)
cluster_acc_history.index(best_cluster_accuracy)

9

In [11]:
# Show the allocation stored under key 24 of the clustered search history.
# NOTE(review): 24 is hard-coded, while the recorded output of the best-index
# cell is 9 — confirm which entry is actually intended here.
cluster_allocation_history[24]

{0: 10, 2: 10, 9: 6, 10: 10, 11: 10}

In [8]:
# Validation of the clustered comonotonic classifier with a fixed allocation
# (column index -> number of categories).
# NOTE(review): this allocation is hard-coded rather than read from
# cluster_allocation_history — confirm it is the intended one.
allocation_book_cluster = {0: 10, 2: 7, 9: 8, 10: 6, 11: 6}
uncategorized_df = encoded_df.copy()  # un-discretized copy handed to the classifier
como_cluster_df = encoded_df.copy()   # copy whose continuous columns get binned

# Replace each listed column by its bin label 0..n_bins-1 (pd.cut with an
# integer bin count).
for col_idx, n_bins in allocation_book_cluster.items():
    como_cluster_df['X' + str(col_idx)] = pd.cut(
        como_cluster_df.iloc[:, col_idx], n_bins, labels=list(range(n_bins))
    )

X_cluster = como_cluster_df[colnames[:-1]].to_numpy()
Y_cluster = como_cluster_df[colnames[-1]].to_numpy()
# Same test_size and seed as the Naive Bayes split.
(X_train_cluster, X_test_cluster,
 Y_train_cluster, Y_test_cluster) = train_test_split(
    X_cluster, Y_cluster, test_size=0.2, random_state=random_state
)

como_cluster_classifier = cm.clustered_comonotonic(
    X_train_cluster,
    Y_train_cluster,
    unrankable,
    uncategorized_df,
    colnames,
    min_corr=0.6,
    random_state=random_state,
)
como_cluster_classifier.run()
Y_cluster_pred = como_cluster_classifier.predict(X_test_cluster)
# Accuracy on the held-out split (displayed as the cell output).
utils.get_accuracy(Y_cluster_pred, Y_test_cluster)

0.8084621643612693

In [17]:
# Ensemble: combine, per test row, the probability distribution from the
# from-scratch Naive Bayes with the one from the clustered comonotonic
# classifier via utils.weighted_avg (third argument 0.6).
# The two test arrays are walked in lockstep: row i of X_test_cluster is paired
# with row i of original_X_test.
weighted_predict_y = []
for cluster_row, original_row in zip(X_test_cluster, original_X_test):
    como_dist = como_cluster_classifier.get_prob_dist_single(cluster_row)
    # predict_single returns (predicted class, distribution); only the
    # distribution is needed here.
    _, nb_dist = nb_classifier.predict_single(original_row)
    weighted_predict_y.append(utils.weighted_avg(nb_dist, como_dist, 0.6))
print(utils.get_accuracy(weighted_predict_y, Y_test_cluster))

0.8314076484947112
