In [11]:
import pycaret
import pandas as pd
from pycaret.classification import *
import numpy as np
import random

In [20]:
def generating_test_data(how_many_times_repeat, iterations, mu1, sigma1, mu2, 
                         sigma2, plot_classes = False):

    dim = len(mu1)
    testing_data=[]
    belonging_classes=[]

    for repeat in range(how_many_times_repeat):

        random_simulation = np.zeros((iterations,dim))
        which_class_list = np.zeros((iterations,))
        
        for itera in range(iterations):

            which_normal = random.randint(0,1)
            if dim == 1:
                if which_normal == 0:
                    random_simulation[itera,] = np.random.normal(mu1, sigma1)
                else:
                    random_simulation[itera,] = np.random.normal(mu2, sigma2)
            else:
                if which_normal == 0:
                    random_simulation[itera,] = np.random.multivariate_normal(mu1, sigma1)
                else:
                    random_simulation[itera,] = np.random.multivariate_normal(mu2, sigma2)
            which_class_list[itera,] = which_normal
        
        testing_data.append(random_simulation)
        belonging_classes.append(which_class_list)
      
    
    return testing_data, belonging_classes


def put_in_bins(data, how_many_bins, how_to_bin):
    mi = min(data)
    ma = max(data)
    if how_to_bin =='cons_std':
        bin_size = how_many_bins * np.std(data)
        start = (mi + ma)/2

        bins_right = [start]
        current_right = 1.0 * start
        while current_right  < ma:
            current_right += bin_size
            bins_right.append(current_right)

        bins_left = []
        current_left = 1.0 * start
        while current_left  > mi:
            current_left -= bin_size
            bins_left.append(current_left)

        bins = np.concatenate((bins_left[::-1],bins_right))
    elif how_to_bin =='fixed_number':
        bins = np.linspace(mi-0.00000001, ma+0.00000001,how_many_bins)
    else:
        assert('Way of binning unknown')
    digitized = np.digitize(data,bins)
    midpoints_bins = (bins[:len(bins)-1] + bins[1:])/2
    new_data = midpoints_bins[digitized-1]
    return new_data


def standardise(X):
    return (X - np.mean(X)) / np.std(X)

In [21]:
which_dataset = 'Simulated Dataset'

e1 = [0,0]
e2 = [1,1]
std1 = np.array(([1,0],[0,1]))
std2 = np.array(([1,0],[0,1]))
how_many_rows = 100

testing_data, belonging_classes = generating_test_data(1, how_many_rows, e1, std1,e2, std2)
d = {'feature1':testing_data[0][:,0], 'feature2':testing_data[0][:,1],'target':belonging_classes[0]}
df = pd.DataFrame(data=d)

continuous_variables=['feature1', 'feature2']
target_variable = 'target'
categorical_variables = []
df.head()

Unnamed: 0,feature1,feature2,target
0,0.299878,0.121614,1.0
1,2.273181,-0.531156,1.0
2,2.363569,1.900524,1.0
3,-1.798084,0.86898,0.0
4,0.627284,1.256685,0.0


In [22]:
from pycaret.classification import ClassificationExperiment
exp = ClassificationExperiment()
type(exp)

pycaret.classification.oop.ClassificationExperiment

In [23]:
exp.setup(df, target = 'target')

Unnamed: 0,Description,Value
0,Session id,3912
1,Target,target
2,Target type,Binary
3,Original data shape,"(100, 3)"
4,Transformed data shape,"(100, 3)"
5,Transformed train set shape,"(70, 3)"
6,Transformed test set shape,"(30, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7f7a1c080f70>

In [24]:
best = exp.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7714,0.0,0.8,0.7783,0.7828,0.5257,0.5366,0.016
lda,Linear Discriminant Analysis,0.7714,0.8167,0.8,0.7783,0.7828,0.5257,0.5366,0.016
lr,Logistic Regression,0.7429,0.8167,0.75,0.7683,0.755,0.4699,0.4738,0.296
knn,K Neighbors Classifier,0.7429,0.7958,0.775,0.765,0.7606,0.4768,0.4885,0.019
nb,Naive Bayes,0.7429,0.7833,0.8,0.75,0.7656,0.4596,0.4781,0.016
qda,Quadratic Discriminant Analysis,0.7429,0.7833,0.8,0.75,0.7656,0.4596,0.4781,0.017
dt,Decision Tree Classifier,0.6857,0.6833,0.725,0.7138,0.6807,0.3621,0.3908,0.015
svm,SVM - Linear Kernel,0.6714,0.0,0.6583,0.6567,0.6479,0.3582,0.3639,0.014
ada,Ada Boost Classifier,0.6714,0.8042,0.7583,0.665,0.6872,0.3377,0.3777,0.045
gbc,Gradient Boosting Classifier,0.6714,0.8083,0.7583,0.6683,0.7005,0.3264,0.3534,0.045


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

In [25]:
exp.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [26]:
classifier = 'logistic' 
how_to_bin = 'fixed_number'
nr_bins = 100


binned_df = df.copy()
for col in continuous_variables:
    if how_to_bin =='fixed_number':
        binned_df[col] = standardise(put_in_bins(df[col], nr_bins, how_to_bin))
    elif how_to_bin =='cons_std':
        binned_df[col] = standardise(put_in_bins(df[col], nr_bins, how_to_bin))
binned_df.head()

Unnamed: 0,feature1,feature2,target
0,-0.131915,-0.253307,1.0
1,1.883455,-0.77238,1.0
2,1.975063,1.252002,1.0
3,-2.284697,0.421487,0.0
4,0.188712,0.73293,0.0


In [27]:
exp_binned = ClassificationExperiment()
exp_binned.setup(binned_df, target = 'target')
best_binned =exp_binned.compare_models()

Unnamed: 0,Description,Value
0,Session id,3442
1,Target,target
2,Target type,Binary
3,Original data shape,"(100, 3)"
4,Transformed data shape,"(100, 3)"
5,Transformed train set shape,"(70, 3)"
6,Transformed test set shape,"(30, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7714,0.8875,0.7833,0.785,0.7613,0.5488,0.5731,0.117
et,Extra Trees Classifier,0.7714,0.8958,0.8167,0.7983,0.782,0.5462,0.588,0.11
lr,Logistic Regression,0.7429,0.825,0.7833,0.77,0.7576,0.4761,0.5091,0.017
ridge,Ridge Classifier,0.7429,0.0,0.7833,0.77,0.7576,0.4761,0.5091,0.026
qda,Quadratic Discriminant Analysis,0.7429,0.825,0.8167,0.745,0.7633,0.4785,0.511,0.019
ada,Ada Boost Classifier,0.7429,0.8375,0.7833,0.7483,0.7429,0.49,0.5222,0.055
lda,Linear Discriminant Analysis,0.7429,0.825,0.7833,0.77,0.7576,0.4761,0.5091,0.019
knn,K Neighbors Classifier,0.7286,0.8542,0.725,0.805,0.7239,0.4612,0.5179,0.025
nb,Naive Bayes,0.7286,0.8083,0.8,0.7367,0.7524,0.4558,0.4806,0.019
dt,Decision Tree Classifier,0.7286,0.7375,0.7917,0.7383,0.7429,0.4602,0.4901,0.023


Processing:   0%|          | 0/61 [00:00<?, ?it/s]