In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

In [2]:
df = pd.read_csv('../raw_data/Klarna_dataset.csv',delimiter=';')

In [3]:
df.shape

(99976, 43)

In [4]:
df.default.isnull().sum()

10000

In [5]:
# seperate the dataframe useful for model training

data = df.head(89976)

## data preprocessing

In [6]:
data = data.drop_duplicates()

In [7]:
data.drop(columns='uuid',inplace=True)

In [8]:
corr_df = data.corr()
corr_df = corr_df.unstack().reset_index()
corr_df.rename(columns = {'level_0':'feature_1','level_1':'feature_2',0:'correlation'},inplace=True)
corr_df.sort_values(by='correlation',ascending=False,inplace=True)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89976 entries, 0 to 89975
Data columns (total 42 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   default                              89976 non-null  float64
 1   account_amount_added_12_24m          89976 non-null  int64  
 2   account_days_in_dc_12_24m            79293 non-null  float64
 3   account_days_in_rem_12_24m           79293 non-null  float64
 4   account_days_in_term_12_24m          79293 non-null  float64
 5   account_incoming_debt_vs_paid_0_24m  36619 non-null  float64
 6   account_status                       41042 non-null  float64
 7   account_worst_status_0_3m            41042 non-null  float64
 8   account_worst_status_12_24m          29921 non-null  float64
 9   account_worst_status_3_6m            38038 non-null  float64
 10  account_worst_status_6_12m           35663 non-null  float64
 11  age                         

## feature scaling and encoding

In [10]:
# null values
# drop columns (null values > 50%)
data.drop(columns = ['worst_status_active_inv','account_worst_status_12_24m',\
                     'account_worst_status_6_12m','account_incoming_debt_vs_paid_0_24m',\
                     'account_worst_status_3_6m','account_worst_status_0_3m',\
                     'account_status','avg_payment_span_0_3m'],inplace=True)

In [11]:
# null values
# impute most frequent
imputer = SimpleImputer(strategy = 'most_frequent')
imputer.fit(data[['avg_payment_span_0_12m','num_active_div_by_paid_inv_0_12m',\
                  'num_arch_written_off_12_24m','num_arch_written_off_0_12m']])
data[['avg_payment_span_0_12m','num_active_div_by_paid_inv_0_12m',\
                  'num_arch_written_off_12_24m','num_arch_written_off_0_12m']] = imputer.transform(data[['avg_payment_span_0_12m','num_active_div_by_paid_inv_0_12m',\
                  'num_arch_written_off_12_24m','num_arch_written_off_0_12m']])

In [12]:
# numerical features scaling
num_features = data.select_dtypes(include=['float64', 'int64']).drop(columns='default').columns
scaler = StandardScaler()
data[num_features] = scaler.fit_transform(data[num_features])

In [13]:
# manual ordinal encoding
data['has_paid'] = data['has_paid'].apply(lambda x: 1 if x == 'True' else 0)

In [14]:
# categorical features encoding
data = pd.get_dummies(data)

In [15]:
data.dropna(inplace=True)

In [16]:
# X,y

X = data.drop(columns= 'default')
y = data['default']


In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

## model training

In [18]:
# baseline model

model = LogisticRegression(solver='liblinear') # add liblinear to solve attribute error
model.fit(X_train,y_train)
model.score(X_train,y_train)

0.9850463922169174

In [19]:
scores = cross_val_score(model, X_train, y_train, cv=5,scoring='accuracy')
scores

array([0.98504639, 0.98432574, 0.98486623, 0.98522656, 0.98513647])

In [20]:
feature_im = permutation_importance(model,X_train,y_train)

In [21]:
df_im = pd.DataFrame(np.vstack((X_train.columns,feature_im.importances_mean)).T)
df_im.rename(columns = {0:'features',1:'feature_im'},inplace=True)
df_im.sort_values(by='feature_im',ascending=False)

Unnamed: 0,features,feature_im
10,num_active_inv,0.002551
23,status_max_archived_0_12_months,0.000717
14,num_arch_ok_12_24m,0.000202
18,num_unpaid_bills,0.000119
21,status_3rd_last_archived_0_24m,0.000086
...,...,...
91,merchant_group_Entertainment,-0.00013
2,account_days_in_rem_12_24m,-0.000148
11,num_arch_dc_0_12m,-0.000259
24,status_max_archived_0_24_months,-0.000288


In [22]:
# model training

In [23]:
# model scoring

## model prediction

In [24]:
# predict results