In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.insert(1, '/Users/pengcheng/Desktop/CIBer')
import comonotonic as cm
import ensemble_ciber as ec
import random
from scipy.stats import gamma
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import utils
from sklearn import preprocessing
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Hyper-parameters
# Everything a reader might want to tune lives in this cell.
seed = 42                   # seed for NumPy's global RNG
n_basic_var = 10            # number of basic (independent) variables
max_var = 10                # maximum variance of a basic variable
n_class = 3                 # number of classes
max_slope = 10              # maximum slope k in X' = k*X + b
variance = 100              # variance of the intercept b in X' = k*X + b, i.e. Var(b) = variance
var_noise = 10              # variance of the noise term added to each comonotonic variable
n_como_var = 5              # number of comonotonic variables generated from each basic variable
instance_per_class = 10000  # number of simulated rows per class

# Seed the global NumPy generator once, before any sampling happens, so that
# "Restart Kernel -> Run All" reproduces the same simulated dataset. Code that
# draws from np.random's global state (np.random.* calls, and SciPy / scikit-learn
# routines left at their default random_state) is affected by this seed.
np.random.seed(seed)

In [3]:
# Build one center per class: an (n_class, n_basic_var) array in which every
# coordinate is a gamma draw whose shape parameter is itself drawn from U(0, 100).
# The uniform and gamma draws stay interleaved element by element, so the
# sequence of random numbers consumed matches a plain nested loop.
centers = np.array([
    [gamma.rvs(np.random.uniform(0, 100)) for _ in range(n_basic_var)]
    for _ in range(n_class)
])

In [4]:
# construct simulated data for basic variables
# For each class, draw `instance_per_class` rows from a multivariate normal
# centred on that class's center, with independent features (diagonal
# covariance). Column 0 of the result holds the class label; columns
# 1..n_basic_var hold the basic variables.
class_blocks = []
for label in range(n_class):
    label_col = np.full((instance_per_class, 1), label, dtype=float)
    # Per-feature variance ~ U(0, max_var). Both bounds are passed explicitly:
    # np.random.uniform(max_var) would bind max_var to `low` and leave
    # high=1.0, which NumPy documents as undefined behaviour (high < low).
    feature_variances = np.random.uniform(0, max_var, size=n_basic_var)
    cov = np.diag(feature_variances)
    samples = np.random.multivariate_normal(centers[label], cov, instance_per_class)
    class_blocks.append(np.hstack((label_col, samples)))
# Stack the per-class blocks row-wise: (n_class * instance_per_class, 1 + n_basic_var).
simulated_data = np.vstack(class_blocks)

In [5]:
# construct simulated data for comonotonic variables
# Each basic variable X spawns `n_como_var` derived columns X' = k*X + b + noise.
#
# Always start from the label + basic-variable columns, so re-running this cell
# rebuilds the derived columns instead of appending another batch on top of the
# ones that are already there.
base_data = simulated_data[:, :n_basic_var + 1]
n_rows = base_data.shape[0]
como_columns = []
for basic_idx in range(n_basic_var):
    basic_col = base_data[:, basic_idx + 1]  # column 0 holds the class label
    for _ in range(n_como_var):
        # Slope k ~ U(0, max_slope); both bounds given explicitly because
        # np.random.uniform(max_slope) would set low=max_slope, high=1.0.
        slope = np.random.uniform(0, max_slope)
        # np.random.normal takes a standard deviation, so convert the
        # configured variances with a square root.
        intercept = np.random.normal(0, np.sqrt(variance))
        noise = np.random.normal(0, np.sqrt(var_noise), n_rows)
        como_columns.append(basic_col * slope + intercept + noise)
# Single concatenation at the end instead of growing the array inside the loop.
simulated_data = np.column_stack([base_data] + como_columns)

In [6]:
# Column 0 of the simulated matrix is the class label; every other column is a feature.
Y = simulated_data[:, 0]
X = simulated_data[:, 1:]

In [7]:
# Column bookkeeping passed to cm.clustered_comonotonic in later cells:
# cont_col lists every column index of X, categorical is empty, and no
# discrete feature values are supplied.
cont_col = list(range(X.shape[1]))
categorical = list()
discrete_feature_val = None

In [8]:
# Hold out 20% of the rows for evaluation. Fixing random_state makes the split,
# and therefore every metric reported for the models below, reproducible across
# kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [9]:
# Build the clustered-comonotonic model on the training split, run it, then
# print per-class precision/recall/F1 for its predictions on the test split.
ciber = cm.clustered_comonotonic(
    X_train,
    Y_train,
    discrete_feature_val,
    cont_col,
    categorical,
    0.9,   # NOTE(review): presumably the correlation threshold for clustering — confirm in comonotonic.py
    None,  # NOTE(review): meaning of this positional argument is not visible here — confirm
    corrtype = 'pearson',
    discrete_method = "mdlp",
)
ciber.run()
ciber_predict = ciber.predict(X_test)
ciber_report = classification_report(Y_test, ciber_predict)
print(ciber_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2010
         1.0       1.00      1.00      1.00      1961
         2.0       1.00      1.00      1.00      2029

    accuracy                           1.00      6000
   macro avg       1.00      1.00      1.00      6000
weighted avg       1.00      1.00      1.00      6000



In [10]:
# Show the feature clusters of the model built above; in the recorded run this
# prints a list of lists of column indices.
ciber.print_cluster()

[[0, 3, 10, 11, 12, 13, 14, 25, 26, 28, 29], [1, 8, 15, 16, 17, 18, 19, 50, 51, 52, 53, 54], [2, 21, 22, 24], [4, 6, 30, 31, 32, 33, 34, 41, 42, 43, 44], [5, 7, 35, 36, 37, 38, 39, 45, 46, 47, 48, 49], [9, 55, 56, 57, 59], [20], [23], [27], [40], [58]]


In [11]:
# Second clustered-comonotonic configuration: same inputs as the first model,
# but with 1 as the sixth positional argument and mutual information as the
# correlation type.
# NOTE(review): the "_nb" suffix suggests this is meant as a naive-Bayes-like
# baseline (no features clustered together) — confirm against comonotonic.py.
ciber_nb = cm.clustered_comonotonic(
    X_train,
    Y_train,
    discrete_feature_val,
    cont_col,
    categorical,
    1,
    None,
    corrtype = 'mutual_info',
    discrete_method = "mdlp",
)
ciber_nb.run()
ciber_nb_predict = ciber_nb.predict(X_test)
ciber_nb_report = classification_report(Y_test, ciber_nb_predict)
print(ciber_nb_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2010
         1.0       1.00      1.00      1.00      1961
         2.0       1.00      1.00      1.00      2029

    accuracy                           1.00      6000
   macro avg       1.00      1.00      1.00      6000
weighted avg       1.00      1.00      1.00      6000



In [18]:
# Baseline: XGBoost classifier with library-default settings, trained on the
# training split and scored on the test split.
xgb_clf = xgb.XGBClassifier().fit(X_train, Y_train)
xgb_predict = xgb_clf.predict(X_test)
xgb_report = classification_report(Y_test, xgb_predict)
print(xgb_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       194
         1.0       1.00      1.00      1.00       206

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [19]:
# Baseline: random forest with scikit-learn defaults, trained on the training
# split and scored on the test split.
rf_clf = RandomForestClassifier().fit(X_train, Y_train)
rf_predict = rf_clf.predict(X_test)
rf_report = classification_report(Y_test, rf_predict)
print(rf_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       194
         1.0       1.00      1.00      1.00       206

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [20]:
# Baseline: AdaBoost with scikit-learn defaults, trained on the training split
# and scored on the test split.
ada_clf = AdaBoostClassifier().fit(X_train, Y_train)
ada_predict = ada_clf.predict(X_test)
ada_report = classification_report(Y_test, ada_predict)
print(ada_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       194
         1.0       1.00      1.00      1.00       206

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [21]:
# Baseline: LightGBM classifier with library-default settings, trained on the
# training split and scored on the test split.
lgb_clf = lgb.LGBMClassifier().fit(X_train, Y_train)
lgb_predict = lgb_clf.predict(X_test)
lgb_report = classification_report(Y_test, lgb_predict)
print(lgb_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       194
         1.0       1.00      1.00      1.00       206

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [22]:
# Baseline: single decision tree with scikit-learn defaults, trained on the
# training split and scored on the test split.
dt_clf = DecisionTreeClassifier().fit(X_train, Y_train)
dt_predict = dt_clf.predict(X_test)
dt_report = classification_report(Y_test, dt_predict)
print(dt_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       194
         1.0       1.00      1.00      1.00       206

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [23]:
# Baseline: support-vector classifier with scikit-learn defaults, trained on
# the training split and scored on the test split.
svm_clf = SVC().fit(X_train, Y_train)
svm_predict = svm_clf.predict(X_test)
svm_report = classification_report(Y_test, svm_predict)
print(svm_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       194
         1.0       1.00      1.00      1.00       206

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [24]:
# Baseline: logistic regression with scikit-learn defaults, trained on the
# training split and scored on the test split.
lr_clf = LogisticRegression().fit(X_train, Y_train)
lr_predict = lr_clf.predict(X_test)
lr_report = classification_report(Y_test, lr_predict)
print(lr_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       194
         1.0       1.00      1.00      1.00       206

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

