In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.insert(1, '/home/r7user5/Desktop/STAT')
import comonotonic as cm
import ensemble_ciber as ec
import random
from scipy.stats import gamma
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import utils
from sklearn import preprocessing
from sklearn.metrics import classification_report

In [2]:
# Hyper-parameters
n_basic_var = 10 # num of basic variables
max_var = 20 # maximum variance of basic variable
n_class = 5 # num of classes
max_slope = 10 # maximum slope of X' = k*X + b
variance = 100 # variance of intersection i.e. X' = k*X + b, Var(b) = variance
var_noise = 20 # variance of the noise term
n_como_var = 10 # num of como variables generated by each basic variable
instance_per_class = 10000 # how large is the simulated dataset

In [3]:
# real comonotonic relation
real_cluster = list()
for i in range(n_basic_var):
    cluster = [i]
    for j in range(n_como_var):
        cluster.append(n_basic_var+i*n_como_var+j)
    real_cluster.append(cluster)

In [4]:
# construct n_class centers representing the classes
centers = list()
for i in range(n_class):
    center = list()
    for j in range(n_basic_var):
        rv = gamma.rvs(np.random.uniform(0,100))
        center.append(rv)
    centers.append(center)
centers = np.array(centers)

In [5]:
# construct simulated data for basic variables
simulated_data = list()
cov = np.zeros((n_basic_var, n_basic_var), dtype = float)
for j in range(n_basic_var):
    cov[j][j] = np.random.uniform(max_var)
for i in range(n_class):
    class_col = np.array([i for itr in range(instance_per_class)]).reshape(-1,1)
    class_data = np.random.multivariate_normal(centers[i], cov, instance_per_class)
    class_data = np.concatenate((class_col, class_data), axis = 1)
    simulated_data.append(class_data)
simulated_data = np.array(simulated_data).reshape(n_class*instance_per_class, -1)

In [6]:
# construct simulated data for comonotonic variables
# for different classes the intersection varies
for i in range(n_basic_var):
    for j in range(n_como_var):
        slope = np.random.uniform(max_slope)
        como_var = np.array([])
        for k in range(n_class):
            class_rows = [i for i in range(k*instance_per_class,(k+1)*instance_per_class)]
            intersection = np.random.normal(0, variance)
            class_como_var = (simulated_data[class_rows,i+1]*slope + intersection)
            como_var = np.concatenate((como_var, class_como_var),axis=0)
        como_var = como_var.reshape(-1,1)
        simulated_data = np.concatenate((simulated_data, como_var), axis = 1)

In [13]:
X = simulated_data[:,1:]
Y = simulated_data[:,0]

In [14]:
cont_col = [i for i in range(X.shape[1])]
categorical = []
discrete_feature_val = None

In [15]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2)

In [17]:
ciber = cm.clustered_comonotonic(X_train,Y_train,discrete_feature_val,cont_col,categorical,
                                0.9,None,corrtype = 'pearson',discrete_method = "auto")
ciber.run()
ciber_predict = ciber.predict(X_test)
print(classification_report(Y_test, ciber_predict))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2011
         1.0       1.00      1.00      1.00      2018
         2.0       1.00      1.00      1.00      2046
         3.0       1.00      1.00      1.00      1991
         4.0       1.00      1.00      1.00      1934

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



In [18]:
ciber.print_cluster()

[[0, 10, 16, 109], [1, 27], [2, 20, 29, 55], [3, 40, 44], [4, 58, 73], [5, 23, 42, 64, 67, 69, 93, 98], [6, 71], [7, 87], [8, 68, 90, 92, 95, 99], [9, 105], [11, 104], [12], [13, 78], [14, 15], [17, 19, 24, 80], [18, 32, 60], [21], [22, 25, 46, 75, 102], [26, 59, 72], [28, 34, 35, 37], [30, 51], [31], [33, 86, 96], [36, 53], [38], [39, 50, 63, 65, 94], [41, 76], [43, 45, 52, 70, 108], [47, 62, 74], [48, 79, 89], [49, 91, 100], [54, 56], [57, 85], [61, 66], [77], [81], [82], [83, 101, 107], [84, 88, 103], [97], [106]]


In [19]:
real_cluster

[[0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [1, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
 [2, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
 [3, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
 [4, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
 [5, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
 [6, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
 [7, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
 [8, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
 [9, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109]]

In [20]:
ciber_nb = cm.clustered_comonotonic(X_train,Y_train,discrete_feature_val,
                                    cont_col,categorical,1,None,
                                    corrtype = 'pearson',discrete_method = "mdlp")
ciber_nb.run()
ciber_nb_predict = ciber_nb.predict(X_test)
print(classification_report(Y_test, ciber_nb_predict))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2011
         1.0       1.00      1.00      1.00      2018
         2.0       1.00      1.00      1.00      2046
         3.0       1.00      1.00      1.00      1991
         4.0       1.00      1.00      1.00      1934

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

