In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data_all = pd.read_csv('../data/data.csv')

train, test = train_test_split(data_all, test_size=0.2, random_state=42)

X_train = train.drop(['CREDIT_SCORE','DEFAULT'], axis=1)
y_train = train['DEFAULT']

X_test = test.drop(['CREDIT_SCORE','DEFAULT'], axis=1)
y_test = test['DEFAULT']

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class BaseColumnsSelector(BaseEstimator, TransformerMixin):
    def __init__(self, subset):
        self.subset = subset
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.loc[:, self.subset]
    
baseColumns = ['INCOME', 'SAVINGS', 'DEBT', 'T_CLOTHING_12', 'T_CLOTHING_6', 
       'T_EDUCATION_12', 'T_EDUCATION_6', 'T_ENTERTAINMENT_12',
       'T_ENTERTAINMENT_6', 'T_FINES_12',
       'T_FINES_6', 'T_GAMBLING_12', 'T_GAMBLING_6', 
       'T_GROCERIES_12', 'T_GROCERIES_6', 'T_HEALTH_12', 'T_HEALTH_6',
       'T_HOUSING_12', 'T_HOUSING_6', 'T_TAX_12', 'T_TAX_6', 'T_TRAVEL_12',
       'T_TRAVEL_6', 'R_TRAVEL', 'T_UTILITIES_12', 'T_UTILITIES_6', 
       'T_EXPENDITURE_12', 'T_EXPENDITURE_6', 
       'CAT_GAMBLING', 'CAT_CREDIT_CARD', 'CAT_MORTGAGE',
       'CAT_SAVINGS_ACCOUNT', 'CAT_DEPENDENTS']

groups = ['CLOTHING', 'EDUCATION', 'ENTERTAINMENT', 'FINES', 'GAMBLING', 'GROCERIES', 'HEALTH', 'HOUSING', 'TAX', 'TRAVEL', 'UTILITIES']
no_sense_in_division =['T_EDUCATION_12', 'T_FINES_12', 'T_GAMBLING_12', 'T_HOUSING_12', 'T_TAX_12', 'T_TRAVEL_12']
division_groups = ['CLOTHING',  'ENTERTAINMENT',  'GROCERIES', 'HEALTH',  'UTILITIES']


In [4]:
#być może zrobić fines_6 i fines_12 jako kategoryczne???? i gambling_6 i gambling_12 też
class OutliersReplacer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for column in X.columns:
            if column == 'SAVINGS':
                X.loc[X[column] > 2500000, column] = 2500000
            elif column == 'DEBT':
                X.loc[X[column] > 4000000, column] = 4000000
            elif column == 'T_CLOTHING_12':
                X.loc[X[column] > 32000, column] = 32000
            elif column == 'T_CLOTHING_6':
                X.loc[X[column] > 25000, column] = 25000
            elif column == 'T_HEALTH_12':
                X.loc[X[column] > 25000, column] = 25000
            elif column == 'T_HEALTH_6':
                X.loc[X[column] > 18000, column] = 18000
            elif column == 'T_TRAVEL_12':
                X.loc[X[column] > 150000, column] = 150000
            elif column == 'T_TRAVEL_6':
                X.loc[X[column] > 110000, column] = 110000
        return X

In [5]:
class RatioColumnsAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for idx, row in X.iterrows():
            #DEBT/INCOME
            if row['INCOME'] == 0:
                X.at[idx, 'R_DEBT_INCOME'] = None
            else:
                X.at[idx, 'R_DEBT_INCOME'] = row['DEBT'] / row['INCOME']
            #SAVINGS/INCOME
            if row['INCOME'] == 0:
                X.at[idx, 'R_SAVINGS_INCOME'] = None
            else:
                X.at[idx, 'R_SAVINGS_INCOME'] = row['SAVINGS'] / row['INCOME']
            #DEBT/SAVINGS
            if row['SAVINGS'] == 0:
                X.at[idx, 'R_DEBT_SAVINGS'] = None
            else:
                X.at[idx, 'R_DEBT_SAVINGS'] = row['DEBT'] / row['SAVINGS']
            for group in division_groups:
                if row['T_' + group + '_12'] == 0:
                    X.at[idx, 'R_' + group] = None
                else:
                    X.at[idx, 'R_' + group] = row['T_' + group + '_6'] / row['T_' + group + '_12']
            for group in groups:
                if row['INCOME'] == 0:
                    X.at[idx, 'R_'+group+'_INCOME'] = None
                else:
                    X.at[idx, 'R_'+group+'_INCOME'] = row['T_'+group+'_6'] / row['INCOME']
                    
                if row['SAVINGS'] == 0:
                    X.at[idx, 'R_'+group+'_SAVINGS'] = None
                else:
                    X.at[idx, 'R_'+group+'_SAVINGS'] = row['T_'+group+'_6'] / row['SAVINGS']

                if row['DEBT'] == 0:
                    X.at[idx, 'R_'+group+'_DEBT'] = None
                else:
                    X.at[idx, 'R_'+group+'_DEBT'] = row['T_'+group+'_6'] / row['DEBT']
        return X

In [6]:
class DropColumns(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        potentialColumnsToDrop = ['T_EDUCATION_12', 'T_FINES_12', 'T_GAMBLING_12', 'T_HOUSING_12', 'T_TAX_12', 'T_TRAVEL_12', 'T_EDUCATION_6','T_ENTERTAINMENT_6','T_GAMBLING_6','T_GROCERIES_6','T_HOUSING_6','T_EXPENDITURE_12', 'T_EXPENDITURE_6','R_GROCERIES_DEBT', 'INCOME', 'T_UTILITIES_6', 'R_EDUCATION_DEBT', 'T_UTILITIES_12', 'R_CLOTHING_DEBT',
        'CAT_DEPENDENTS', 'R_ENTERTAINMENT_SAVINGS', 'R_FINES_INCOME',
        'R_FINES_SAVINGS', 'R_FINES_DEBT', 'R_GROCERIES_SAVINGS',
        'CAT_SAVINGS_ACCOUNT', 'R_HOUSING_INCOME', 'R_TAX_INCOME',
        'R_TAX_SAVINGS', 'R_TRAVEL_DEBT', 'R_UTILITIES_DEBT', 'CAT_GAMBLING',
        'CAT_DEBT', 'CAT_MORTGAGE', 'SAVINGS', 'R_UTILITIES_SAVINGS', 'R_EDUCATION', 'R_FINES', 'R_GAMBLING', 'R_HOUSING', 'R_GROCERIES_INCOME', 'T_ENTERTAINMENT_12', 'R_ENTERTAINMENT',
       'R_TRAVEL_SAVINGS', 'R_GAMBLING_SAVINGS', 'T_CLOTHING_6']
        for column in potentialColumnsToDrop:
            if column in X.columns:
                X.drop(column, axis=1, inplace=True)
        return X

In [7]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

class MissingValuesFiller(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        numerical_columns = X.select_dtypes(include=[np.number]).columns
        imputer = KNNImputer(n_neighbors=7)
        X[numerical_columns] = imputer.fit_transform(X[numerical_columns])
        return X

In [8]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#Logistic Regression
from sklearn.linear_model import LogisticRegression
#standart scaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

for solver in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
    for c in [0.1, 1, 10, 100]:
        pipeline = Pipeline([
            ('selector', BaseColumnsSelector(baseColumns)),
            ('outliers_replacer', OutliersReplacer()),
            ('ratio_columns_adder', RatioColumnsAdder()),
            ('drop_columns', DropColumns()),
            ('missing_values_filler', MissingValuesFiller()),
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegression(solver=solver, C=c))
        ])
        kfold = KFold(n_splits=5, random_state=42, shuffle=True)
        score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
        score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
        score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
        score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
        print(f'{solver} {c} : {score_acc.mean()} {score_recall.mean()} {score_precision.mean()} {score_f1.mean()}')

newton-cg 0.1 : 0.734375 0.17498501498501498 0.6230252100840337 0.26622656018929314
newton-cg 1 : 0.7375 0.22593406593406593 0.6122344322344322 0.31996779388083735
newton-cg 10 : 0.734375 0.23619047619047615 0.5923095035115495 0.3286734693877551
newton-cg 100 : 0.734375 0.23619047619047615 0.5923095035115495 0.3286734693877551
lbfgs 0.1 : 0.734375 0.17498501498501498 0.6230252100840337 0.26622656018929314
lbfgs 1 : 0.7375 0.22593406593406593 0.6122344322344322 0.31996779388083735
lbfgs 10 : 0.734375 0.23619047619047615 0.5923095035115495 0.3286734693877551
lbfgs 100 : 0.734375 0.23619047619047615 0.5923095035115495 0.3286734693877551
liblinear 0.1 : 0.7265625 0.19247419247419245 0.5731601731601732 0.2777563610200058
liblinear 1 : 0.734375 0.22593406593406593 0.5936008918617615 0.3177991669481031
liblinear 10 : 0.734375 0.23619047619047615 0.5923095035115495 0.3286734693877551
liblinear 100 : 0.734375 0.23619047619047615 0.5923095035115495 0.3286734693877551
sag 0.1 : 0.734375 0.1749850



sag 1 : 0.7359375 0.2150915750915751 0.608095238095238 0.3084974636549872




sag 10 : 0.7359375 0.23106227106227104 0.5958730158730159 0.3248110355253212




sag 100 : 0.7375 0.23619047619047615 0.6025396825396825 0.3311410018552876




saga 0.1 : 0.734375 0.17498501498501498 0.6230252100840337 0.26622656018929314




saga 1 : 0.734375 0.2150915750915751 0.593992673992674 0.30654710904505816




saga 10 : 0.734375 0.22021978021978023 0.5908180708180708 0.3134311670160727




saga 100 : 0.734375 0.22021978021978023 0.5908180708180708 0.31194968553459124




In [9]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

pipeline = Pipeline([
        ('selector', BaseColumnsSelector(baseColumns)),
        ('outliers_replacer', OutliersReplacer()),
        ('ratio_columns_adder', RatioColumnsAdder()),
        ('missing_values_filler', MissingValuesFiller()),
        ('drop_columns', DropColumns()),
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(solver='liblinear', C=10))
    ])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.75
[[108   6]
 [ 34  12]]
              precision    recall  f1-score   support

           0       0.76      0.95      0.84       114
           1       0.67      0.26      0.38        46

    accuracy                           0.75       160
   macro avg       0.71      0.60      0.61       160
weighted avg       0.73      0.75      0.71       160



In [12]:
from joblib import dump


dump(pipeline, '../models/lr_pipe.joblib')

['../models/lr_pipe.joblib']

<h1>Jezeli mamy CREDIT SCORE, czy staje lepiej??</h1>

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data_all = pd.read_csv('../data/data.csv')

train, test = train_test_split(data_all, test_size=0.2, random_state=42)

X_train = train.drop(['DEFAULT'], axis=1)
y_train = train['DEFAULT']

X_test = test.drop(['DEFAULT'], axis=1)
y_test = test['DEFAULT']

In [11]:
#best parameters :
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#Logistic Regression
from sklearn.linear_model import LogisticRegression
#standart scaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

for solver in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
    for c in [0.1, 1, 10, 100]:
        pipeline = Pipeline([
            ('selector', BaseColumnsSelector(baseColumns)),
            ('outliers_replacer', OutliersReplacer()),
            ('ratio_columns_adder', RatioColumnsAdder()),
            ('drop_columns', DropColumns()),
            ('missing_values_filler', MissingValuesFiller()),
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegression(solver=solver, C=c))
        ])
        kfold = KFold(n_splits=5, random_state=42, shuffle=True)
        score_acc = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')
        score_recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall')
        score_precision = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='precision')
        score_f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1')
        print(f'{solver} {c} : {score_acc.mean()} {score_recall.mean()} {score_precision.mean()} {score_f1.mean()}')

newton-cg 0.1 : 0.734375 0.17498501498501498 0.6230252100840337 0.26622656018929314
newton-cg 1 : 0.7375 0.22593406593406593 0.6122344322344322 0.31996779388083735


KeyboardInterrupt: 

<h1>Raczej CREDIT_SCORE nic nie daje juz</h1>