In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
# TODO: calculate the F1 score on the test data after the grid search.
# TODO: create a script with a function for the train/test split and a function
# for the grid search, then print the metrics and save the best model.

In [2]:
def split_data(file_name, test_size=0.25, random_state=42, stratify=False):
    """Read the Excel file and split it into train and test sets.

    The sheet is read with its second row as the header (``header=1``), the
    ``ID`` column becomes the index, and a fixed list of columns is dropped
    before the remaining columns are used as features.

    Args:
        file_name (str): path of the Excel file to read.
        test_size (float): fraction of the rows held out for the test set.
        random_state (int): seed forwarded to ``train_test_split``. The
            default of 42 reproduces the split this function always produced.
        stratify (bool): when True, the split is stratified on the target so
            both partitions keep the same class ratio. Defaults to False,
            which is the previous (unstratified) behaviour.

    Returns:
        X_train, X_test, y_train, y_test (np.arrays)
    """
    target = 'default payment next month'
    # Columns excluded from the feature matrix.
    # NOTE(review): presumably the outcome of the variable selection done in
    # another notebook -- confirm before changing this list.
    drop = ['PAY_AMT5', 'BILL_AMT5', 'BILL_AMT4', 'PAY_3', 'PAY_4',
            'EDUCATION', 'PAY_6', 'SEX', 'MARRIAGE', 'PAY_5']

    # Use ID as the DataFrame index; set_index returns a new frame, so nothing
    # is mutated in place.
    df = pd.read_excel(file_name, header=1).set_index('ID')

    X = df.drop(columns=[target] + drop).values
    y = df[target].values
    return train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        # stratify=None is train_test_split's own default (plain shuffle).
        stratify=y if stratify else None,
    )

In [3]:
# Excel workbook with the raw data; the path is relative to the working directory.
file_name = "default of credit card clients.xls"
# Split with split_data's default test_size (0.25).
X_train, X_test, y_train, y_test = split_data(file_name)

In [14]:
# Candidate values the randomized search samples from, one list per hyperparameter.
dist = {
    'n_estimators': [200, 300, 500],
    'max_depth': [7, 13, 15],
    'min_samples_leaf': [5, 10, 20],
    'max_samples': [0.6, 0.8],
    'min_samples_split': [10, 6, 20],
    'max_features': [0.7, 0.8],
}

# Base estimator: bootstrapped random forest with balanced-subsample class weights.
rfc = RandomForestClassifier(
    bootstrap=True,
    class_weight='balanced_subsample',
    random_state=42,
)

# Evaluate 20 sampled configurations, rank them by F1 and refit the best one.
clf = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=dist,
    n_iter=20,
    scoring='f1',
    n_jobs=8,
    refit=True,
    random_state=42,
)

# The fitted search is kept under `search`, the name the following cells use.
search = clf.fit(X_train, y_train)

In [15]:
# Score of the best configuration found by the search (scoring='f1').
search.best_score_

0.545871347291232

0    5580
1    1920
dtype: int64

In [16]:
# Estimator carrying the best hyperparameters (available because refit=True).
search.best_estimator_

RandomForestClassifier(class_weight='balanced_subsample', max_depth=13,
                       max_features=0.7, max_samples=0.6, min_samples_leaf=20,
                       min_samples_split=20, n_estimators=200, random_state=42)

In [17]:
# Predict on the held-out test split with the best estimator from the search,
# then print the per-class precision / recall / F1 report.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      5873
           1       0.50      0.55      0.52      1627

    accuracy                           0.78      7500
   macro avg       0.68      0.70      0.69      7500
weighted avg       0.79      0.78      0.79      7500



# Project Description
This repository aims to create a model for default payment prediction from the [default of credit card clients dataset](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients).

## Environment setup
With the following commands you will create the environment, install the main packages, and make the environment visible as a kernel in Jupyter notebooks.
```terminal
$ conda create -n credit_card python=3.6
$ conda activate credit_card
$ pip install pandas
$ conda install -c conda-forge fdasrsf
$ pip install scikit-fda
$ pip install statsmodels
$ pip install ipykernel
$ python -m ipykernel install --user --name=credit_card
$ pip install xlrd
$ pip install imblearn
```
# How to read this project
1. Start with `default_payment.ipynb`, where you can find a full exploratory data analysis, variable selection, and the training of two models with hand-made hyperparameter tuning.
1. Continue with `GridSearch.ipynb`.

In [18]:
# List the files in the current working directory.
ls

'default of credit card clients.xls'   GridSearch.ipynb   yeh2009.pdf
 default_payment.ipynb                 README.md
