In [15]:
## importing all the libraries
import numpy as np
import pandas as pd
import random
from sklearn.tree import DecisionTreeClassifier

In [9]:
df = pd.read_csv("UCI_Credit_Card.csv")
data = df.to_numpy()
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [13]:
X = data[:,:-1]
Y = data[:,-1]
n_features = X.shape[1]

In [14]:
def normalize_data(X):
    features = X.shape[1]
    new_data = np.zeros((X.shape))
    for i in range(features):
        new_data[:,i] = X[:,i]/abs(max(X[:,i],key=abs))
    return new_data
print(normalize_data(X)[:2])

[[ 2.00000000e-02  1.00000000e+00  3.33333333e-01  3.33333333e-01
   3.03797468e-01  2.50000000e-01  2.50000000e-01 -1.25000000e-01
  -1.25000000e-01 -2.50000000e-01 -2.50000000e-01  4.05697810e-03
   3.15266010e-03  4.14040355e-04  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  4.09081976e-04  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 1.20000000e-01  1.00000000e+00  3.33333333e-01  6.66666667e-01
   3.29113924e-01 -1.25000000e-01  2.50000000e-01  0.00000000e+00
   0.00000000e+00  0.00000000e+00  2.50000000e-01  2.78068368e-03
   1.75317172e-03  1.61169264e-03  3.66986471e-03  3.72638920e-03
   3.39099727e-03  0.00000000e+00  5.93732912e-04  1.11602161e-03
   1.61030596e-03  0.00000000e+00  3.78310691e-03]]


In [5]:
def stratified_k_fold_sampling(data, n_fold):
    Y = data[:,-1]
    classes = np.unique(Y)
    n_class = len(classes)
    class_samp = []
    n_class_samp_perfold = []
    
    for i in classes:
        samples_in_class = data[np.where(Y==i)]
        random.shuffle(samples_in_class)
        class_samp.append(samples_in_class)
        samples_perfold = int(len(samples_in_class)/n_fold)
        n_class_samp_perfold.append(samples_perfold)
    dataset_split = []
    
    for i in range(n_fold):
        fold = []
        for j in range(n_class):
            n_samp = n_class_samp_perfold[j]
            if(i!=n_fold-1):
                fold.extend(class_samp[j][i*n_samp:(i+1)*n_samp])
            else:
                fold.extend(class_samp[j][i*n_samp:])
        random.shuffle(fold)
        dataset_split.append(fold)
    return dataset_split

In [6]:
def data_preparation(data_splits, k, n_folds):
    test = data_splits[k]
    arr = np.arange(n_folds)
    arr = arr.tolist()
    arr.remove(k)
    train = data_splits[arr[0]]
    for i in range(1,len(arr)):
        train = np.concatenate((train, data_splits[arr[i]]))
    test = np.array(test)
    X_train = train[:,:-1]
    X_test = test[:,:-1]
    Y_train = train[:,-1]
    Y_test = test[:,-1]
    X_train = normalize_data(X_train)
    X_test = normalize_data(X_test)
    return([X_train, X_test, Y_train, Y_test])

In [7]:
def fit(data_splits, n_folds):
    data_folds = []
    for i in range(n_folds):
        data_folds.append(data_preparation(data_splits, i, n_folds))
    
    feature = []
    
    score = []        
    for i in range(n_features):
        accuracy = []
        for j in range(n_folds):
            [X_train, X_test, Y_train, Y_test] = data_folds[j]
            clf = DecisionTreeClassifier(max_depth = 2).fit(X_train[:,i].reshape(-1,1), Y_train)
            accuracy.append(clf.score(X_test[:,i].reshape(-1,1), Y_test))
        score.append(np.mean(accuracy))
    feature.append(np.argmax(score))
    
    acc_scores = []
    while(len(feature) < 0.75*n_features):
        score = []
        for i in range(n_features):
            if(i in feature):
                score.append(0)
                continue
            accuracy = []
            feat_list = feature.copy()
            feat_list.append(i)
            for j in range(n_folds):
                [X_train, X_test, Y_train, Y_test] = data_folds[j]
                clf = DecisionTreeClassifier().fit(X_train[:,feat_list], Y_train)
                accuracy.append(clf.score(X_test[:,feat_list], Y_test))
            score.append(np.mean(accuracy))
        print(np.max(score))
        print(feature)
        acc_scores.append(np.max(score))
        feature.append(np.argmax(score))
    indx = np.argmax(acc_scores)
    
    return(feature[:indx+1])

In [8]:
data_splits = stratified_k_fold(data, 5)
fit(data_splits, 5)

0.8171653692226519
[5]
0.8186654249170331
[5, 6]
0.8211655362502677
[5, 6, 10]
0.8226653143056598
[5, 6, 10, 9]
0.8237661917670266
[5, 6, 10, 9, 1]
0.8273327033519579
[5, 6, 10, 9, 1, 3]
0.8329668924389907
[5, 6, 10, 9, 1, 3, 7]
0.8353675701446578
[5, 6, 10, 9, 1, 3, 7, 2]
0.8237640539468103
[5, 6, 10, 9, 1, 3, 7, 2, 8]
0.8136069646962035
[5, 6, 10, 9, 1, 3, 7, 2, 8, 20]
0.801339474855847
[5, 6, 10, 9, 1, 3, 7, 2, 8, 20, 18]
0.7935704307369017
[5, 6, 10, 9, 1, 3, 7, 2, 8, 20, 18, 22]
0.7901399955733239
[5, 6, 10, 9, 1, 3, 7, 2, 8, 20, 18, 22, 14]
0.7889079281368125
[5, 6, 10, 9, 1, 3, 7, 2, 8, 20, 18, 22, 14, 19]
0.785275849328208
[5, 6, 10, 9, 1, 3, 7, 2, 8, 20, 18, 22, 14, 19, 12]
0.7813751159281599
[5, 6, 10, 9, 1, 3, 7, 2, 8, 20, 18, 22, 14, 19, 12, 15]
0.779009998224221
[5, 6, 10, 9, 1, 3, 7, 2, 8, 20, 18, 22, 14, 19, 12, 15, 11]


[5, 6, 10, 9, 1, 3, 7, 2]