In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
# Load the raw table; the 'DEFAULT' column is the classification target.
data = pd.read_csv('data/data.csv')
y = np.array(data['DEFAULT'])

# Keep the target out of the feature frame. Previously the whole `data` frame
# (target included) was passed as X, which only stayed leak-free because the
# downstream column selection happens not to pick 'DEFAULT'.
features = data.drop(columns=['DEFAULT'])

# Stratified 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.3, stratify=y, random_state=42
)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


# Numeric input columns, in the order they are fed to the scaler.
# Naming as used below: T_<CATEGORY>_12 columns and R_<A>_<B> columns.
selected_numeric_features = [
    # Base financials and the ratios between them
    'INCOME', 'SAVINGS', 'DEBT',
    'R_SAVINGS_INCOME', 'R_DEBT_INCOME', 'R_DEBT_SAVINGS',
    # Clothing
    'T_CLOTHING_12', 'R_CLOTHING_INCOME', 'R_CLOTHING_SAVINGS', 'R_CLOTHING_DEBT',
    # Education
    'T_EDUCATION_12', 'R_EDUCATION_INCOME', 'R_EDUCATION_SAVINGS', 'R_EDUCATION_DEBT',
    # Entertainment
    'T_ENTERTAINMENT_12', 'R_ENTERTAINMENT_INCOME', 'R_ENTERTAINMENT_SAVINGS', 'R_ENTERTAINMENT_DEBT',
    # Fines
    'T_FINES_12', 'R_FINES_INCOME', 'R_FINES_SAVINGS', 'R_FINES_DEBT',
    # Gambling
    'T_GAMBLING_12', 'R_GAMBLING_INCOME', 'R_GAMBLING_SAVINGS', 'R_GAMBLING_DEBT',
    # Groceries
    'T_GROCERIES_12', 'R_GROCERIES_INCOME', 'R_GROCERIES_SAVINGS', 'R_GROCERIES_DEBT',
    # Health
    'T_HEALTH_12', 'R_HEALTH_INCOME', 'R_HEALTH_SAVINGS', 'R_HEALTH_DEBT',
    # Housing
    'T_HOUSING_12', 'R_HOUSING_INCOME', 'R_HOUSING_SAVINGS', 'R_HOUSING_DEBT',
    # Tax
    'T_TAX_12', 'R_TAX_INCOME', 'R_TAX_SAVINGS', 'R_TAX_DEBT',
    # Travel
    'T_TRAVEL_12', 'R_TRAVEL_INCOME', 'R_TRAVEL_SAVINGS', 'R_TRAVEL_DEBT',
    # Utilities and total expenditure (both T_ columns come first, then the ratios)
    'T_UTILITIES_12', 'T_EXPENDITURE_12',
    'R_UTILITIES_INCOME', 'R_UTILITIES_SAVINGS', 'R_UTILITIES_DEBT',
    'R_EXPENDITURE_INCOME', 'R_EXPENDITURE_SAVINGS', 'R_EXPENDITURE_DEBT',
]


# Column-wise preprocessing:
#   'num' - standardize every selected numeric column
#   'cat' - one-hot encode CAT_GAMBLING; handle_unknown='ignore' keeps transform
#           from raising on categories that were not present at fit time
# No `remainder` is given, so ColumnTransformer's default applies to all other columns.
numeric_step = ('num', StandardScaler(), selected_numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), ['CAT_GAMBLING'])

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])



In [16]:
# Model pipeline: preprocessing -> PCA down to 15 components -> random forest.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # PCA's default svd_solver='auto' can select the randomized solver depending
    # on the data shape; seed it so repeated runs give the same components.
    ('pca', PCA(n_components=15, random_state=0)),
    ('classifier', RandomForestClassifier(
        n_estimators=500,
        min_samples_split=39,
        min_samples_leaf=18,
        max_features='sqrt',
        max_depth=11,
        bootstrap=False,
        random_state=0,
    )),
])
# Author's note: max_features='log2' (with min_samples_leaf=18, max_depth=11,
# bootstrap=False, random_state=0) was recorded as giving "almost same" results.



In [17]:
# Fit the whole pipeline on the training split, then label the hold-out rows.
# Pipeline.fit returns the fitted pipeline itself, so the two calls chain.
predictions = clf.fit(X_train, y_train).predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score, classification_report

# Hold-out evaluation: per-class precision/recall/F1 table, then overall accuracy.
holdout_report = classification_report(y_test, predictions)
holdout_accuracy = accuracy_score(y_test, predictions)

print(holdout_report)
print("Accuracy:", holdout_accuracy)

              precision    recall  f1-score   support

           0       0.74      0.98      0.84       172
           1       0.71      0.15      0.24        68

    accuracy                           0.74       240
   macro avg       0.73      0.56      0.54       240
weighted avg       0.74      0.74      0.67       240

Accuracy: 0.7416666666666667


In [19]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# Stratified 10-fold CV repeated 3 times on the training split (seeded splitter).
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)

# Accuracy per fold; error_score='raise' surfaces a failing fit instead of
# recording a placeholder score, and n_jobs=-1 uses all available cores.
cv_scores = cross_val_score(
    clf,
    X_train,
    y_train,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    error_score='raise',
)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.75       0.73214286 0.75       0.71428571 0.75       0.75
 0.75       0.75       0.75       0.73214286 0.75       0.73214286
 0.73214286 0.73214286 0.75       0.67857143 0.75       0.73214286
 0.75       0.75       0.78571429 0.69642857 0.75       0.78571429
 0.73214286 0.69642857 0.75       0.73214286 0.73214286 0.73214286]
Mean cross-validation score: 0.7392857142857143


In [48]:
# Total number of train/test splits the CV splitter will generate.
cv.get_n_splits(X=X_train, y=y_train)

30