In [100]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import csv
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn import svm

In [88]:
#read flash.dat to a list of lists
datContent = [i.strip().split() for i in open("./kredit.dat").readlines()]

#write it as a new CSV file
with open("./kredit.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(datContent)

#naming the labels of the columns
columns = ['Status of existing checking account','Duration in month','Credit history','Purpose','Credit amount','Savings account/bonds','Present employment since','Installment rate in percentage of disposable income',
'Personal status and sex','Other debtors/guarantors','Present residence since','Porperty','Age in years','Other installment plans','Housing','Number of existing credits at this bank','Job','Number of people being liable to provide maintenance for',
'Telephone','Foreign worker','Creditworthy']

#creating the dataframe
df = pd.read_csv('./kredit.csv',names=columns)
df.head()

Unnamed: 0,Status of existing checking account,Duration in month,Credit history,Purpose,Credit amount,Savings account/bonds,Present employment since,Installment rate in percentage of disposable income,Personal status and sex,Other debtors/guarantors,...,Porperty,Age in years,Other installment plans,Housing,Number of existing credits at this bank,Job,Number of people being liable to provide maintenance for,Telephone,Foreign worker,Creditworthy
0,A14,36,A32,?,2299,A63,?,4,A93,A101,...,A123,39,A143,A152,1,A173,1,A191,?,1
1,A12,18,A32,A46,1239,A65,A73,4,A93,A101,...,A124,61,A143,A153,1,?,1,A191,A201,1
2,A13,24,A32,A40,947,A61,A74,4,A93,A101,...,A124,38,A141,A153,1,?,2,A191,?,2
3,A14,15,A33,A43,1478,A61,A73,4,A94,A101,...,A121,33,A141,A152,2,A173,1,A191,A201,1
4,A14,24,A32,A40,1525,A64,A74,4,A92,A101,...,A123,34,A143,A152,1,A173,2,A192,A201,1


In [89]:
print(df.dtypes) #display the data type of each column

Status of existing checking account                         object
Duration in month                                            int64
Credit history                                              object
Purpose                                                     object
Credit amount                                                int64
Savings account/bonds                                       object
Present employment since                                    object
Installment rate in percentage of disposable income          int64
Personal status and sex                                     object
Other debtors/guarantors                                    object
Present residence since                                      int64
Porperty                                                    object
Age in years                                                 int64
Other installment plans                                     object
Housing                                                     ob

In [90]:
style = OneHotEncoder()

df.loc[df['Purpose'] == '?', 'Purpose'] = 'Purpose ?'
df.loc[df['Present employment since'] == '?', 'Present employment since'] = 'Present employment since ?'
df.loc[df['Job'] == '?', 'Job'] = 'Job ?'
df.loc[df['Foreign worker'] == '?', 'Foreign worker'] = 'Foreign worker ?'

non_numerics = df.select_dtypes(include='object')
non_numerics = non_numerics.drop('Telephone',axis=1) #only has two classes, which means we can transform it within the column to 0 for A191 and 1 for A192
df.loc[df['Telephone'] == 'A191', 'Telephone'] = 0.0
df.loc[df['Telephone'] == 'A192', 'Telephone'] = 1.0
df['Telephone'] = df['Telephone'].astype('int64')
for i in non_numerics.columns.tolist():
    transformation = style.fit_transform(df[[i]]) #transform column i
    df = df.join(pd.DataFrame(transformation.toarray(), columns=style.categories_[0])) #add new categories (of transformation) to our dataframe
    for col in style.categories_[0]:
        df[col] = df[col].astype('int64')
    df = df.drop(i, axis=1) #dropping old column since we transformed its information

df.loc[df['Creditworthy'] == 1, 'Creditworthy'] = 1.0
df.loc[df['Creditworthy'] == 2, 'Creditworthy'] = -1.0

df = df.astype('float64')

df.head()

Unnamed: 0,Duration in month,Credit amount,Installment rate in percentage of disposable income,Present residence since,Age in years,Number of existing credits at this bank,Number of people being liable to provide maintenance for,Telephone,Creditworthy,A11,...,A152,A153,A171,A172,A173,A174,Job ?,A201,A202,Foreign worker ?
0,36.0,2299.0,4.0,4.0,39.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,18.0,1239.0,4.0,4.0,61.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,24.0,947.0,4.0,3.0,38.0,1.0,2.0,0.0,-1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,15.0,1478.0,4.0,3.0,33.0,2.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,24.0,1525.0,4.0,3.0,34.0,1.0,2.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [91]:
def z_score_normalize(X, indices):
    for i in indices:
        column = np.array([v[i] for v in X])
        mean = np.mean(column)
        std = np.std(column)
        for j, v in enumerate(column):
            X[j][i] = (v - mean) / std
    for j in range(0, len(X)):
        X[j] = np.array(X[j])
    return X

numerical_non_missing = [
    'Duration in month', 
    'Credit amount', 
    'Age in years', 
    'Installment rate in percentage of disposable income', 
    'Present residence since', 
    'Number of existing credits at this bank', 
    'Number of people being liable to provide maintenance for'
]

for v in numerical_non_missing:
    df[v] = df[v].astype('float64')

indices = [df.columns.get_loc(v) for v in numerical_non_missing]

$$\mathcal{l}(x^T \theta, y) = \begin{cases} |x^T \theta - y| \cdot 5, & \text{if $x^T \theta - y>0$}.\\ |x^T \theta - y|, & \text{otherwise}.\end{cases}$$

$$\Omega_2(\theta) = \theta^T\theta$$

$$\frac{\partial}{\partial \theta_i} \Omega_2(\theta) = 2 \cdot \theta_i$$

$$L(\theta) = \sum^n_{i = 1} \left[\mathcal{l}(x_i^T \theta, y_i) + \frac{\lambda}{n} \Omega_2(\theta)\right]$$

In [92]:
def loss_function(y_pred, y_true):
    if y_pred == y_true:
        return 0
    if y_pred == 1:
        return 5
    return 1

def imperical_risk(y_pred, y_true):
    loss = 0
    for i in range(0, len(y_pred)):
        loss += loss_function(y_pred[i], y_true[i])
    return loss / len(y_pred)

In [93]:
def svm_classifier():
    splits = 5
    kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)

    X = df[[i for i in df.columns.tolist() if i != 'Creditworthy']].values
    y = df['Creditworthy'].values

    params = {
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    }

    test_split = [v for v in kf.split(X, y)]

    params_star = {
        'C': 0.1,
        'gamma': 1
    }

    risk = 0
    min_risk_Si = np.inf

    for i, (rest_index, test_index) in enumerate(test_split):
        X_test = X[test_index]
        X_test = z_score_normalize(X_test, indices)
        y_test = y[test_index]

        X_rest = X[rest_index]
        y_rest = y[rest_index]

        tune_split = [v for v in kf.split(X_rest, y_rest)]
        min_risk_without_Si = np.inf
        params_star_i = {
            'C': 0.1,
            'gamma': 1
        }

        for C in params['C']:
            for gamma in params['gamma']:

                risk_without_Si = 0

                for j, (train_index, tune_index) in enumerate(tune_split):
                    X_train = X[train_index]
                    X_train = z_score_normalize(X_train, indices)
                    y_train = y[train_index]

                    X_tune = X[tune_index]
                    X_tune = z_score_normalize(X_tune, indices)
                    y_tune = y[tune_index]

                    model_ij = svm.SVC(C=C, gamma=gamma, kernel='rbf', random_state=0, class_weight='balanced',)
                    model_ij.fit(X_train, y_train)

                    y_pred = model_ij.predict(X_tune)
                    risk_Sj = imperical_risk(y_pred, y_tune)
                    risk_without_Si += risk_Sj

                risk_without_Si = risk_without_Si / splits

                if min_risk_without_Si > risk_without_Si:
                    params_star_i = {
                        'C': C,
                        'gamma': gamma
                    }
                    min_risk_without_Si = risk_without_Si
            
        X_rest = z_score_normalize(X_rest, indices)

        model_i = svm.SVC(C=params_star_i['C'], gamma=params_star_i['gamma'], kernel='rbf', random_state=0, class_weight='balanced',)
        model_i.fit(X_rest, y_rest)
        y_pred = model_i.predict(X_test)

        risk_Si = imperical_risk(y_pred, y_test)
        #print(f"Recall: {recall_score(y_test, y_pred)}")
        #print(f"Precision: {precision_score(y_test, y_pred)}")
        risk += risk_Si
        if min_risk_Si > risk_Si:
            min_risk_Si = risk_Si
            params_star = params_star_i

    risk = risk / len(test_split)

    X = z_score_normalize(X, indices)
    model = svm.SVC(C=params_star['C'], gamma=params_star['gamma'], kernel='rbf', random_state=0, class_weight='balanced',)
    print(risk)
    print(params_star)

svm_classifier()


0.587
{'C': 100, 'gamma': 0.0001}


In [94]:
def decision_tree_classifier():
    splits = 5
    kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)

    X = df[[i for i in df.columns.tolist() if i != 'Creditworthy']].values
    y = df['Creditworthy'].values

    params = {
        'max_depth': [3, 5, 6, 7, 8, 9, 10, 12, 15, 20, 30, None],
        'min_samples_split': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9],
        'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9]
    }

    test_split = [v for v in kf.split(X, y)]

    params_star = {
        'max_depth': 3,
        'min_samples_split': 0.1,
        'min_samples_leaf': 0.1
    }

    risk = 0
    min_risk_Si = np.inf

    for i, (rest_index, test_index) in enumerate(test_split):
        X_test = X[test_index]
        X_test = z_score_normalize(X_test, indices)
        y_test = y[test_index]

        X_rest = X[rest_index]
        y_rest = y[rest_index]

        tune_split = [v for v in kf.split(X_rest, y_rest)]
        min_risk_without_Si = np.inf
        params_star_i = {
            'max_depth': 3,
            'min_samples_split': 0.1,
            'min_samples_leaf': 0.1
        }

        for max_depth in params['max_depth']:
            for min_samples_split in params['min_samples_split']:
                for min_samples_leaf in params['min_samples_leaf']:

                    risk_without_Si = 0

                    for j, (train_index, tune_index) in enumerate(tune_split):
                        X_train = X[train_index]
                        X_train = z_score_normalize(X_train, indices)
                        y_train = y[train_index]

                        X_tune = X[tune_index]
                        X_tune = z_score_normalize(X_tune, indices)
                        y_tune = y[tune_index]

                        model_ij = tree.DecisionTreeClassifier(
                            max_depth=max_depth, 
                            min_samples_split=min_samples_split, 
                            min_samples_leaf=min_samples_leaf,
                            criterion='entropy',
                            class_weight='balanced',
                            random_state=0
                        )
                        model_ij.fit(X_train, y_train)

                        y_pred = model_ij.predict(X_tune)
                        risk_Sj = imperical_risk(y_pred, y_tune)
                        risk_without_Si += risk_Sj

                    risk_without_Si = risk_without_Si / splits

                    if min_risk_without_Si > risk_without_Si:
                        params_star_i = {
                            'max_depth': max_depth,
                            'min_samples_split': min_samples_split,
                            'min_samples_leaf': min_samples_leaf
                        }
                        min_risk_without_Si = risk_without_Si
            
        X_rest = z_score_normalize(X_rest, indices)

        model_i = tree.DecisionTreeClassifier(
            max_depth=params_star_i['max_depth'], 
            min_samples_split=params_star_i['min_samples_split'], 
            min_samples_leaf=params_star_i['min_samples_leaf'],
            criterion='entropy',
            class_weight='balanced',
            random_state=0
        )
        model_i.fit(X_rest, y_rest)
        y_pred = model_i.predict(X_test)

        risk_Si = imperical_risk(y_pred, y_test)
        #print(f"Recall: {recall_score(y_test, y_pred)}")
        #print(f"Precision: {precision_score(y_test, y_pred)}")
        risk += risk_Si
        if min_risk_Si > risk_Si:
            min_risk_Si = risk_Si
            params_star = params_star_i

    risk = risk / len(test_split)

    X = z_score_normalize(X, indices)
    model = tree.DecisionTreeClassifier(
        max_depth=params_star['max_depth'], 
        min_samples_split=params_star['min_samples_split'], 
        min_samples_leaf=params_star['min_samples_leaf'],
        criterion='entropy',
        class_weight='balanced',
        random_state=0
    )
    print(risk)
    print(params_star)

decision_tree_classifier()

0.5820000000000001
{'max_depth': 3, 'min_samples_split': 0.1, 'min_samples_leaf': 0.3}


In [95]:
def random_forest_classifier():
    splits = 5
    kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)

    X = df[[i for i in df.columns.tolist() if i != 'Creditworthy']].values
    y = df['Creditworthy'].values

    params = {
        'n_estimators': [5, 7, 10, 15, 30, 50, 75, 100],
        'max_depth': [3, 5, 6, 7, 8, 9, 10, 12, 15, 20, 30, None],
        'min_samples_split': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9],
        'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9]
    }

    test_split = [v for v in kf.split(X, y)]

    params_star = {
        'n_estimators': 5,
        'max_depth': 3,
        'min_samples_split': 0.1,
        'min_samples_leaf': 0.1
    }

    risk = 0
    min_risk_Si = np.inf

    for i, (rest_index, test_index) in enumerate(test_split):
        X_test = X[test_index]
        X_test = z_score_normalize(X_test, indices)
        y_test = y[test_index]

        X_rest = X[rest_index]
        y_rest = y[rest_index]

        tune_split = [v for v in kf.split(X_rest, y_rest)]
        min_risk_without_Si = np.inf
        params_star_i = {
            'n_estimators': 5,
            'max_depth': 3,
            'min_samples_split': 0.1,
            'min_samples_leaf': 0.1
        }

        for n_estimators in params['n_estimators']:
            for max_depth in params['max_depth']:
                for min_samples_split in params['min_samples_split']:
                    for min_samples_leaf in params['min_samples_leaf']:

                        risk_without_Si = 0

                        for j, (train_index, tune_index) in enumerate(tune_split):
                            X_train = X[train_index]
                            X_train = z_score_normalize(X_train, indices)
                            y_train = y[train_index]

                            X_tune = X[tune_index]
                            X_tune = z_score_normalize(X_tune, indices)
                            y_tune = y[tune_index]

                            model_ij = RandomForestClassifier(
                                n_estimators=n_estimators,
                                max_depth=max_depth, 
                                min_samples_split=min_samples_split, 
                                min_samples_leaf=min_samples_leaf,
                                criterion='entropy',
                                class_weight='balanced',
                                random_state=0
                            )
                            model_ij.fit(X_train, y_train)

                            y_pred = model_ij.predict(X_tune)
                            risk_Sj = imperical_risk(y_pred, y_tune)
                            risk_without_Si += risk_Sj

                        risk_without_Si = risk_without_Si / splits

                        if min_risk_without_Si > risk_without_Si:
                            params_star_i = {
                                'n_estimators': n_estimators,
                                'max_depth': max_depth,
                                'min_samples_split': min_samples_split,
                                'min_samples_leaf': min_samples_leaf
                            }
                            min_risk_without_Si = risk_without_Si
            
        X_rest = z_score_normalize(X_rest, indices)

        model_i = RandomForestClassifier(
            n_estimators=params_star_i['n_estimators'],
            max_depth=params_star_i['max_depth'], 
            min_samples_split=params_star_i['min_samples_split'], 
            min_samples_leaf=params_star_i['min_samples_leaf'],
            criterion='entropy',
            class_weight='balanced',
            random_state=0
        )
        model_i.fit(X_rest, y_rest)
        y_pred = model_i.predict(X_test)

        risk_Si = imperical_risk(y_pred, y_test)
        #print(f"Recall: {recall_score(y_test, y_pred)}")
        #print(f"Precision: {precision_score(y_test, y_pred)}")
        risk += risk_Si
        if min_risk_Si > risk_Si:
            min_risk_Si = risk_Si
            params_star = params_star_i

    risk = risk / len(test_split)

    X = z_score_normalize(X, indices)
    model = RandomForestClassifier(
        n_estimators=params_star['n_estimators'],
        max_depth=params_star['max_depth'], 
        min_samples_split=params_star['min_samples_split'], 
        min_samples_leaf=params_star['min_samples_leaf'],
        class_weight='balanced',
        random_state=0
    )
    print(risk)
    print(params_star)

#random_forest_classifier()

In [96]:
def svm_poly_classifier():
    splits = 5
    kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)

    X = df[[i for i in df.columns.tolist() if i != 'Creditworthy']].values
    y = df['Creditworthy'].values

    params = {
        'C': [0.1, 1, 10, 100, 1000],
        'degree': [2, 3, 4, 5, 7, 9, 10],
    }

    test_split = [v for v in kf.split(X, y)]

    params_star = {
        'C': 0.1,
        'degree': 2
    }

    risk = 0
    min_risk_Si = np.inf

    for i, (rest_index, test_index) in enumerate(test_split):
        X_test = X[test_index]
        X_test = z_score_normalize(X_test, indices)
        y_test = y[test_index]

        X_rest = X[rest_index]
        y_rest = y[rest_index]

        tune_split = [v for v in kf.split(X_rest, y_rest)]
        min_risk_without_Si = np.inf
        params_star_i = {
            'C': 0.1,
            'degree': 2
        }

        for C in params['C']:
            for degree in params['degree']:

                risk_without_Si = 0

                for j, (train_index, tune_index) in enumerate(tune_split):
                    X_train = X[train_index]
                    X_train = z_score_normalize(X_train, indices)
                    y_train = y[train_index]

                    X_tune = X[tune_index]
                    X_tune = z_score_normalize(X_tune, indices)
                    y_tune = y[tune_index]

                    model_ij = svm.SVC(C=C, degree=degree, kernel='poly', random_state=0, class_weight='balanced')
                    model_ij.fit(X_train, y_train)

                    y_pred = model_ij.predict(X_tune)
                    risk_Sj = imperical_risk(y_pred, y_tune)
                    risk_without_Si += risk_Sj

                risk_without_Si = risk_without_Si / splits

                if min_risk_without_Si > risk_without_Si:
                    params_star_i = {
                        'C': C,
                        'degree': degree
                    }
                    min_risk_without_Si = risk_without_Si
            
        X_rest = z_score_normalize(X_rest, indices)

        model_i = svm.SVC(C=params_star_i['C'], degree=params_star_i['degree'], kernel='poly', random_state=0, class_weight='balanced',)
        model_i.fit(X_rest, y_rest)
        y_pred = model_i.predict(X_test)

        risk_Si = imperical_risk(y_pred, y_test)
        #print(f"Recall: {recall_score(y_test, y_pred)}")
        #print(f"Precision: {precision_score(y_test, y_pred)}")
        risk += risk_Si
        if min_risk_Si > risk_Si:
            min_risk_Si = risk_Si
            params_star = params_star_i

    risk = risk / len(test_split)

    X = z_score_normalize(X, indices)
    model = svm.SVC(C=params_star['C'], degree=params_star['degree'], kernel='poly', random_state=0, class_weight='balanced',)
    print(risk)
    print(params_star)

svm_poly_classifier()


0.6559999999999999
{'C': 0.1, 'degree': 2}


In [104]:
def decision(threshold, y):
    for i, v in enumerate(y):
        if y < threshold:
            y[i] = -1
        else:
            y[i] = 1
    return y

def neural_network_classifier():
    splits = 5
    kf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)

    X = df[[i for i in df.columns.tolist() if i != 'Creditworthy']].values
    y = df['Creditworthy'].values

    params = {
        'dropout': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    }

    test_split = [v for v in kf.split(X, y)]

    params_star = {
        'dropout': 0.0,
    }

    risk = 0
    min_risk_Si = np.inf

    for i, (rest_index, test_index) in enumerate(test_split):
        X_test = X[test_index]
        X_test = z_score_normalize(X_test, indices)
        y_test = y[test_index]

        X_rest = X[rest_index]
        y_rest = y[rest_index]

        tune_split = [v for v in kf.split(X_rest, y_rest)]
        min_risk_without_Si = np.inf
        params_star_i = {
            'dropout': 0.0,
        }

        for dropout in params['dropout']:

            risk_without_Si = 0

            for j, (train_index, tune_index) in enumerate(tune_split):
                X_train = X[train_index]
                X_train = z_score_normalize(X_train, indices)
                y_train = y[train_index]

                X_tune = X[tune_index]
                X_tune = z_score_normalize(X_tune, indices)
                y_tune = y[tune_index]

                model_ij = tf.keras.models.Sequential([
                    tf.keras.layers.Input(shape=(64,)),
                    tf.keras.layers.Dropout(dropout),
                    tf.keras.layers.Dense(128, activation='relu'),
                    tf.keras.layers.Dropout(dropout),
                    tf.keras.layers.Dense(1, activation='sigmoid')
                ])
                model_ij.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
                model_ij.fit(x=X_train, y=y_train, epochs=2000)

                y_pred = decision(0.5, model_ij.predict(X_tune))
                risk_Sj = imperical_risk(y_pred, y_tune)
                risk_without_Si += risk_Sj

            risk_without_Si = risk_without_Si / splits

            if min_risk_without_Si > risk_without_Si:
                params_star_i = {
                    'dropout': dropout,
                }
                min_risk_without_Si = risk_without_Si
            
        X_rest = z_score_normalize(X_rest, indices)

        model_i = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(64,)),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])
        model_i.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
        model_i.fit(x=X_rest, y=y_rest, epochs=2000)
        y_pred = decision(0.5, model_i.predict(X_test))

        risk_Si = imperical_risk(y_pred, y_test)
        #print(f"Recall: {recall_score(y_test, y_pred)}")
        #print(f"Precision: {precision_score(y_test, y_pred)}")
        risk += risk_Si
        if min_risk_Si > risk_Si:
            min_risk_Si = risk_Si
            params_star = params_star_i

    risk = risk / len(test_split)

    X = z_score_normalize(X, indices)
    # TODO model = svm.SVC(C=params_star['C'], degree=params_star['degree'], kernel='poly', random_state=0, class_weight='balanced',)
    print(risk)
    print(params_star)

neural_network_classifier() 

ValueError: object __array__ method not producing an array

In [None]:
# risk = 0

# for index, row in df.iterrows():
#     rest = df
#     rest = rest.drop(index)
#     for label in numerical_non_missing:
#         rest[label] = z_score(rest[label])   

#     X_test = df_norm.loc[index][[i for i in df_norm.columns.tolist() if i != 'Creditworthy']].values.reshape(1,-1)
#     y_test = row['Creditworthy'].values

#     kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

#     X_rest = rest[[i for i in rest.columns.tolist() if i != 'Creditworthy']].values
#     y_rest = rest['Creditworthy'].values

#     for train_index, tune_index in kf.split(X_rest, y_rest):
#         X_train = X_rest[train_index]
#         y_train = y_rest[train_index]

#         X_tune = X_rest[tune_index]
#         y_tune = y_rest[tune_index]

#         i = 0.1
#         while i <= 100.0:
#             clf = svm.SVC(C=i, random_state=0, kernel='rbf')
#             clf.fit(X_train, y_train)

#             y_pred = clf.predict(X_test)
#             i += 0.1


In [None]:
# df.loc[df['Foreign worker'] == 'A201', 'Foreign worker'] = 1.0
# df.loc[df['Foreign worker'] == 'A202', 'Foreign worker'] = 0.0

# fw_data = df[df.columns.tolist()[3:]]
# fw_data.loc[fw_data['Creditworthy'] == 1, 'Creditworthy'] = 0.0
# fw_data.loc[fw_data['Creditworthy'] == 2, 'Creditworthy'] = 1.0

# missing = fw_data.loc[fw_data['Foreign worker'] == '?']
# fw_data = fw_data.loc[fw_data['Foreign worker'] != '?']
# fw_data['Foreign worker'] = fw_data['Foreign worker'].astype('int64')

# fw_data_norm = fw_data
# for label in numerical_non_missing:
#     fw_data_norm[label] = z_score(fw_data_norm[label])

# risk = 0

# for index, row in fw_data.iterrows():
#     rest = fw_data
#     rest = rest.drop(index)
#     for label in numerical_non_missing:
#         rest[label] = z_score(rest[label])

#     X_test = fw_data_norm.loc[index][[i for i in fw_data_norm.columns.tolist() if i != 'Foreign worker']].values.reshape(1,-1)
#     y_test = row['Foreign worker']

#     X_train = rest[[i for i in rest.columns.tolist() if i != 'Foreign worker']]
#     y_train = rest['Foreign worker']

#     clf = RandomForestClassifier(max_depth=10, n_estimators=1000, random_state=0, criterion='entropy', class_weight='balanced')
#     clf.fit(X_train, y_train)

#     y_pred = clf.predict(X_test)

#     if y_pred != y_test:
#         risk += 1

# print(risk)

    

In [None]:
# risk = 0

# for index, row in df.iterrows():
#     rest = df
#     rest = rest.drop(index)
#     for label in numerical_non_missing:
#         rest[label] = z_score(rest[label])

#     X_test = df_norm.loc[index][[i for i in df_norm.columns.tolist() if i != 'Creditworthy']].values.reshape(1,-1)
#     y_test = row['Creditworthy']

#     X_train = rest[[i for i in rest.columns.tolist() if i != 'Creditworthy']]
#     y_train = rest['Creditworthy']

#     clf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=0, criterion='entropy')
#     clf.fit(X_train, y_train)

#     y_pred = clf.predict(X_test)

#     if y_pred != y_test:
#         risk += 1

# print(risk)