In [264]:
### Credit Risks 
    # Multiple accuracy measures resembling different k-neighbors used for training your KNN classifier.
    # One printed confusion matrix for the best model.
    
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest ## categorical feature selection
from sklearn.feature_selection import mutual_info_classif ## categorical feature selection

from sklearn.model_selection import train_test_split ## split data

from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score


from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Ridge, LogisticRegression

import seaborn as sns


In [48]:
## load data
X,y = fetch_openml(name='credit-g', as_frame=True, return_X_y =True)
df=X

## remove na
df.dropna()
df


  " {version}.".format(name=name, version=res[0]['version']))


Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12.0,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3.0,female div/dep/mar,none,4.0,real estate,31.0,none,own,1.0,unskilled resident,1.0,none,yes
996,<0,30.0,existing paid,used car,3857.0,<100,1<=X<4,4.0,male div/sep,none,4.0,life insurance,40.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes
997,no checking,12.0,existing paid,radio/tv,804.0,<100,>=7,4.0,male single,none,4.0,car,38.0,none,own,1.0,skilled,1.0,none,yes
998,<0,45.0,existing paid,radio/tv,1845.0,<100,1<=X<4,4.0,male single,none,4.0,no known property,23.0,none,for free,1.0,skilled,1.0,yes,yes


### Step3: Select Features:Min. of 4 Numerical and 3 Nominal
    #Identify Numerical cols -> Correlation of Numerical Features : select if correlation < 0.8
    #Get catrgorical cols. -> select features

In [49]:
 ## Numerical data feature selection:
numerical= df._get_numeric_data()
numerical


Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents
0,6.0,1169.0,4.0,4.0,67.0,2.0,1.0
1,48.0,5951.0,2.0,2.0,22.0,1.0,1.0
2,12.0,2096.0,2.0,3.0,49.0,1.0,2.0
3,42.0,7882.0,2.0,4.0,45.0,1.0,2.0
4,24.0,4870.0,3.0,4.0,53.0,2.0,2.0
...,...,...,...,...,...,...,...
995,12.0,1736.0,3.0,4.0,31.0,1.0,1.0
996,30.0,3857.0,4.0,4.0,40.0,1.0,1.0
997,12.0,804.0,4.0,4.0,38.0,1.0,1.0
998,45.0,1845.0,4.0,4.0,23.0,1.0,1.0


In [44]:
corr_matrix = numerical.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper
## take all numerical

Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents
duration,,0.624984,0.074749,0.034067,0.036136,0.011284,0.023834
credit_amount,,,0.271316,0.028926,0.032716,0.020795,0.017142
installment_commitment,,,,0.049302,0.058266,0.021669,0.071207
residence_since,,,,,0.266419,0.089625,0.042643
age,,,,,,0.149254,0.118201
existing_credits,,,,,,,0.109667
num_dependents,,,,,,,


In [104]:
## Categorical data feature selection 
    ## https://machinelearningmastery.com/feature-selection-with-categorical-data/
    ## 2 Feature selection methods when the target variable is also categorical:chi-squared statistic and mutual information statistic.

## Encode labels
le = LabelEncoder()
y_enc = le.fit_transform(y)
    
categorical_cols = list(set(df.columns) - set(numerical.columns))
cat_df = df[df.columns.intersection(categorical_cols)]

## Encode Categorical features: Ordinal encoder
Oencoder = OrdinalEncoder()
cat_df_enc = pd.DataFrame(Oencoder.fit_transform(cat_df))
cat_df_enc.columns = cat_df.columns
cat_df_enc

Unnamed: 0,checking_status,credit_history,purpose,savings_status,employment,personal_status,other_parties,property_magnitude,other_payment_plans,housing,job,own_telephone,foreign_worker
0,1.0,1.0,6.0,4.0,3.0,3.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0
1,0.0,3.0,6.0,2.0,0.0,0.0,2.0,3.0,1.0,1.0,1.0,0.0,1.0
2,3.0,1.0,2.0,2.0,1.0,3.0,2.0,3.0,1.0,1.0,3.0,0.0,1.0
3,1.0,3.0,3.0,2.0,1.0,3.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
4,1.0,2.0,4.0,2.0,0.0,3.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,3.0,3.0,3.0,2.0,1.0,0.0,2.0,3.0,1.0,1.0,3.0,0.0,1.0
996,1.0,3.0,9.0,2.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0
997,3.0,3.0,6.0,2.0,3.0,3.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0
998,1.0,3.0,6.0,2.0,0.0,3.0,2.0,2.0,1.0,0.0,1.0,1.0,1.0


In [141]:
#### Mutual Information for  Feature Selection -> the mutual_info_classif() function -> SelectKBest 
fs = SelectKBest(score_func=mutual_info_classif, k=5)
#fs.fit_transform(cat_df_enc, y_enc)
cat_df_reduced = fs.fit_transform(cat_df_enc, y_enc)

#cat_df_reduced.shape()
cols = fs.get_support(indices=True)
selected_columns = cat_df_enc.iloc[:,cols].columns.tolist()
print(selected_columns)

## Selected Nominal features    
Nominal_df = cat_df_enc[selected_columns]


['checking_status', 'credit_history', 'purpose', 'property_magnitude', 'job']


In [160]:

Nominal_scale = pd.DataFrame(scaler.fit_transform(Nominal_df))
Nominal_scale.columns = Nominal_df.columns
Nominal_scale[]

Unnamed: 0,checking_status,credit_history,purpose,property_magnitude,job
0,0.333333,0.25,0.666667,1.000000,0.333333
1,0.000000,0.75,0.666667,1.000000,0.333333
2,1.000000,0.25,0.222222,1.000000,1.000000
3,0.333333,0.75,0.333333,0.333333,0.333333
4,0.333333,0.50,0.444444,0.666667,0.333333
...,...,...,...,...,...
995,1.000000,0.75,0.333333,1.000000,1.000000
996,0.333333,0.75,1.000000,0.333333,0.000000
997,1.000000,0.75,0.666667,0.000000,0.333333
998,0.333333,0.75,0.666667,0.666667,0.333333


### Step4: Preprocessing 
    ## scale Numerical Col.
    
    

In [161]:
### encode Label data

Lencoder = LabelEncoder()
label = Lencoder.fit_transform(y)

### Scale Numerical features
scaler = MinMaxScaler()

numerical_scale = pd.DataFrame(scaler.fit_transform(numerical))
numerical_scale.columns = numerical.columns
numerical_scale 

## scale Nominal features
Nominal_scale = pd.DataFrame(scaler.fit_transform(Nominal_df))
Nominal_scale.columns = Nominal_df.columns

## combine Selected Nominal and scaled Numerical features
df_processed = pd.concat([numerical_scale, Nominal_scale], axis=1)
df_processed


Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status,credit_history,purpose,property_magnitude,job
0,0.029412,0.050567,1.000000,1.000000,0.857143,0.333333,0.0,0.333333,0.25,0.666667,1.000000,0.333333
1,0.647059,0.313690,0.333333,0.333333,0.053571,0.000000,0.0,0.000000,0.75,0.666667,1.000000,0.333333
2,0.117647,0.101574,0.333333,0.666667,0.535714,0.000000,1.0,1.000000,0.25,0.222222,1.000000,1.000000
3,0.558824,0.419941,0.333333,1.000000,0.464286,0.000000,1.0,0.333333,0.75,0.333333,0.333333,0.333333
4,0.294118,0.254209,0.666667,1.000000,0.607143,0.333333,1.0,0.333333,0.50,0.444444,0.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.117647,0.081765,0.666667,1.000000,0.214286,0.000000,0.0,1.000000,0.75,0.333333,1.000000,1.000000
996,0.382353,0.198470,1.000000,1.000000,0.375000,0.000000,0.0,0.333333,0.75,1.000000,0.333333,0.000000
997,0.117647,0.030483,1.000000,1.000000,0.339286,0.000000,0.0,1.000000,0.75,0.666667,0.000000,0.333333
998,0.602941,0.087763,1.000000,1.000000,0.071429,0.000000,0.0,0.333333,0.75,0.666667,0.666667,0.333333


### Step5: Splitting the Data 80% training, 10% test, 10% val

In [259]:
## https://datascience.stackexchange.com/questions/15135/train-test-validation-set-splitting-in-sklearn
train_ratio = 0.80
validation_ratio = 0.10
test_ratio = 0.10

X_train, X_test, y_train, y_test = train_test_split(df_processed, label, test_size=1 - train_ratio)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio))

y_test

array([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1])

### Step6: Train Classifiers
    KNN -> select best k (trying 1,2,5,10,20,40,50,60,70,80,90,100)

In [261]:
### Iterate ove Ks to get best k
accuracy_k_df =[]
for k in [1,2,3,4,5,10, 15,20,25,30,40,50,60,70,80,90,100]:
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train)
    
    y_pred = knn_model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1score = f1_score(y_test, y_pred)
    
    accuracy_k_df.append((k, accuracy, precision, recall, f1score))

scores = pd.DataFrame(accuracy_k_df, columns=("k", "accuracy", "precision","recall", "F1score"))
scores

Unnamed: 0,k,accuracy,precision,recall,F1score
0,1,0.71,0.76,0.838235,0.797203
1,2,0.67,0.786885,0.705882,0.744186
2,3,0.75,0.759036,0.926471,0.834437
3,4,0.73,0.773333,0.852941,0.811189
4,5,0.69,0.707865,0.926471,0.802548
5,10,0.7,0.731707,0.882353,0.8
6,15,0.7,0.706522,0.955882,0.8125
7,20,0.71,0.724138,0.926471,0.812903
8,25,0.7,0.702128,0.970588,0.814815
9,30,0.71,0.709677,0.970588,0.819876


In [263]:
## Validation testing
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy

0.77

### Step7: Other Classifiers
    SVM, Logistic Regression, Decision Trees

In [266]:
## SVM
svm = SVC(kernel="rbf")
svm.fit(X_train, y_train)
y_pred = svm.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy

0.73

In [265]:
#### Logistic Regression
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy

0.75

In [268]:
## decision tree
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
y_pred = DT.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
accuracy

0.71