In [37]:
import pycaret
import pandas as pd
from pycaret.classification import *
import numpy as np
import random
from pycaret.classification import ClassificationExperiment
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn import preprocessing

In [38]:
def generating_test_data(how_many_times_repeat, iterations, mu1, sigma1, mu2, 
                         sigma2, plot_classes = False):

    dim = len(mu1)
    testing_data=[]
    belonging_classes=[]

    for repeat in range(how_many_times_repeat):

        random_simulation = np.zeros((iterations,dim))
        which_class_list = np.zeros((iterations,))
        
        for itera in range(iterations):

            which_normal = random.randint(0,1)
            if dim == 1:
                if which_normal == 0:
                    random_simulation[itera,] = np.random.normal(mu1, sigma1)
                else:
                    random_simulation[itera,] = np.random.normal(mu2, sigma2)
            else:
                if which_normal == 0:
                    random_simulation[itera,] = np.random.multivariate_normal(mu1, sigma1)
                else:
                    random_simulation[itera,] = np.random.multivariate_normal(mu2, sigma2)
            which_class_list[itera,] = which_normal
        
        testing_data.append(random_simulation)
        belonging_classes.append(which_class_list)
      
    
    return testing_data, belonging_classes


def put_in_bins(data, how_many_bins, how_to_bin):
    mi = min(data)
    ma = max(data)
    if how_to_bin =='cons_std':
        bin_size = how_many_bins * np.std(data)
        start = (mi + ma)/2

        bins_right = [start]
        current_right = 1.0 * start
        while current_right  < ma:
            current_right += bin_size
            bins_right.append(current_right)

        bins_left = []
        current_left = 1.0 * start
        while current_left  > mi:
            current_left -= bin_size
            bins_left.append(current_left)

        bins = np.concatenate((bins_left[::-1],bins_right))
    elif how_to_bin =='fixed_number':
        bins = np.linspace(mi-0.00000001, ma+0.00000001,how_many_bins)
    else:
        assert('Way of binning unknown')
    digitized = np.digitize(data,bins)
    midpoints_bins = (bins[:len(bins)-1] + bins[1:])/2
    new_data = midpoints_bins[digitized-1]
    return new_data


def standardise(X):
    return (X - np.mean(X)) / np.std(X)

In [39]:
df = pd.read_csv('simulated_dataset.csv')
continuous_variables=['feature1', 'feature2']
target_variable = 'target'
categorical_variables = []
df.head()


df.head()

Unnamed: 0,feature1,feature2,target
0,-0.630297,0.09752,0.0
1,0.537032,0.075847,0.0
2,-0.890937,-2.596034,0.0
3,-0.943496,-0.684865,0.0
4,0.811247,1.602051,1.0


In [40]:
classifier = 'kNN' 
how_to_bin = 'fixed_number'
nr_bins = 100

In [41]:
exp = ClassificationExperiment()
setup = exp.setup(df, target = 'target',train_size = 0.8, normalize =True, fold = 20,verbose=False)

In [55]:
exp.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.755,0.8348,0.7479,0.7636,0.7508,0.5103,0.5164,0.0575
nb,Naive Bayes,0.7538,0.846,0.7553,0.7551,0.7507,0.5076,0.514,0.03
qda,Quadratic Discriminant Analysis,0.7538,0.8459,0.7578,0.7532,0.751,0.5076,0.514,0.029
lr,Logistic Regression,0.7525,0.847,0.7553,0.7528,0.7497,0.5051,0.5113,0.1655
ridge,Ridge Classifier,0.7525,0.0,0.7553,0.7528,0.7497,0.5051,0.5113,0.0255
lda,Linear Discriminant Analysis,0.7525,0.8472,0.7553,0.7528,0.7497,0.5051,0.5113,0.031
knn,K Neighbors Classifier,0.7425,0.7955,0.7628,0.7331,0.7434,0.4852,0.4913,0.03
svm,SVM - Linear Kernel,0.7238,0.0,0.6899,0.7391,0.696,0.4472,0.4634,0.0255
rf,Random Forest Classifier,0.7238,0.7926,0.7172,0.7255,0.719,0.4476,0.4504,0.1195
lightgbm,Light Gradient Boosting Machine,0.72,0.7958,0.7222,0.7207,0.7185,0.4403,0.4437,0.094


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

In [42]:
if classifier == 'logistic':
    lr = exp.create_model('lr', penalty = 'none',verbose=False,fold = 20)
elif classifier == 'kNN':
    kNN = exp.create_model('knn',verbose=False,fold = 20)
result_base_case = exp.pull().loc['Mean']

In [43]:
binned_df = df.copy()
for col in continuous_variables:
    if how_to_bin =='fixed_number':
        binned_df[col] = standardise(put_in_bins(df[col], nr_bins, how_to_bin))
    elif how_to_bin =='cons_std':
        binned_df[col] = standardise(put_in_bins(df[col], nr_bins, how_to_bin))
binned_df.head()

Unnamed: 0,feature1,feature2,target
0,-0.983176,-0.365785,0.0
1,0.074605,-0.365785,0.0
2,-1.232066,-2.633292,0.0
3,-1.232066,-1.022168,0.0
4,0.323494,0.946983,1.0


In [44]:
exp_binned = ClassificationExperiment()
setup_bin = exp_binned.setup(binned_df, target = 'target',train_size = 0.8, normalize =True, fold = 20,verbose = False)

In [45]:
if classifier == 'logistic':
    lr_binned = exp_binned.create_model('lr', penalty = 'none',verbose=False,fold = 20)
elif classifier == 'kNN':
    kNN_binned = exp_binned.create_model('knn',verbose=False,fold = 20)
result_binned = exp_binned.pull().loc['Mean']

In [46]:
labelencoder = ce.OrdinalEncoder(cols=continuous_variables)
simple_df = labelencoder.fit_transform(binned_df)
exp_simple = ClassificationExperiment()
setup_simple = exp_simple.setup(simple_df, target = 'target',train_size = 0.8, normalize =True, fold = 20,verbose = False)

In [47]:
if classifier == 'logistic':
    lr_simple = exp_simple.create_model('lr', penalty = 'none',verbose=False,fold = 20)
elif classifier == 'kNN':
    kNN_simple = exp_simple.create_model('knn',verbose=False,fold = 20)
result_simple = exp_simple.pull().loc['Mean']

In [48]:
encoder = ce.OneHotEncoder(cols=continuous_variables,use_cat_names=True)
onehot_df = encoder.fit_transform(binned_df)
exp_onehot = ClassificationExperiment()
setup_onehot = exp_onehot.setup(onehot_df, target = 'target',train_size = 0.8, normalize =True, fold = 20,verbose = False)

In [49]:
if classifier == 'logistic':
    lr_onehot = exp_onehot.create_model('lr', penalty = 'none',verbose=False,fold = 20)
elif classifier == 'kNN':
    kNN_onehot = exp_onehot.create_model('knn',verbose=False,fold = 20)
result_onehot = exp_onehot.pull().loc['Mean']

In [50]:
encoder = ce.sum_coding.SumEncoder(cols=continuous_variables,verbose=False)
effect_df = encoder.fit_transform(binned_df)
exp_effect = ClassificationExperiment()
setup_effect = exp_effect.setup(effect_df, target = 'target',train_size = 0.8, normalize =True, fold = 20,verbose = False)


In [51]:
if classifier == 'logistic':
    lr_effect  = exp_effect .create_model('lr', penalty = 'none',verbose=False,fold = 20)
elif classifier == 'kNN':
    kNN_effect  = exp_effect.create_model('knn',verbose=False,fold = 20)
result_effect = exp_effect.pull().loc['Mean']


In [52]:
TE_encoder = ce.TargetEncoder(cols=continuous_variables)
te_df = TE_encoder.fit_transform(binned_df, binned_df[target_variable])
exp_te = ClassificationExperiment()
setup_te = exp_te.setup(te_df, target = 'target',train_size = 0.8, normalize =True, fold = 20,verbose = False)

    
    

In [53]:
if classifier == 'logistic':
    lr_te  = exp_te.create_model('lr', penalty = 'none',verbose=False,fold = 20)
elif classifier == 'kNN':
    kNN_te  = exp_te.create_model('knn',verbose=False,fold = 20)
result_te = exp_te.pull().loc['Mean']


In [54]:
results_df=pd.concat({'Base Case': result_base_case,
            'Binned': result_binned,
            'Simple':result_simple,
            'One-Hot':result_onehot,
            'Target':result_te,
            'Effect':result_effect
             },axis=1)

print('Classifier: ' + classifier + ', h = ' + str(nr_bins))
print(results_df)

Classifier: kNN, h = 100
          Base Case  Binned  Simple  One-Hot  Target  Effect
Accuracy     0.7425  0.7275  0.5325   0.6925  0.7700  0.6900
AUC          0.7955  0.7754  0.5381   0.7496  0.8176  0.7382
Recall       0.7628  0.7424  0.5455   0.7016  0.7897  0.7020
Prec.        0.7331  0.7226  0.5291   0.6900  0.7578  0.6887
F1           0.7434  0.7283  0.5352   0.6912  0.7708  0.6914
Kappa        0.4852  0.4550  0.0652   0.3844  0.5398  0.3804
MCC          0.4913  0.4610  0.0661   0.3902  0.5446  0.3845
