In [18]:
import mysql.connector
import pandas as pd
import getpass
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
password = getpass.getpass()


In [19]:
cnx = mysql.connector.connect(user = 'root', password = password,
                              host = '127.0.0.1', database = 'bank')


In [20]:
cnx.is_connected()


True

In [21]:
cursor = cnx.cursor()


In [23]:
query = '''
        with client_summary as (
select
        account_id,
        avg(amount) as avg_trans_amount,
        count(trans_id) as trans_count,
        avg(balance) as avg_balance
        from trans
        group by 1
)
select 
    c.client_id,
    left(c.birth_number, 2) as age,
    c.district_id,
    a.frequency,
    convert(a.date,date) as account_start_date,
    convert(left(cd.issued,6),date) as card_issued_date,
    convert(l.date,date) as loan_start_date,
    datediff(convert(l.date,date),convert(a.date,date)) as days_between,
    cs.avg_trans_amount,
    cs.avg_balance,
    cs.trans_count,
    cd.type as card_type,
    l.amount as loan_amount,
    l.duration as loan_duration,
    l.payments as loan_payments,
    l.status as loan_status
    from client c
    join disp d on c.client_id = d.client_id
    join account a on d.account_id = a.account_id
    join loan l on l.account_id = a.account_id
    left join card cd on cd.disp_id = d.disp_id
    join client_summary cs on cs.account_id = a.account_id
        '''

In [24]:
cursor.execute(query)


In [25]:
data = pd.DataFrame(cursor.fetchall())
data.columns = [head[0] for head in cursor.description]


In [26]:
data.head()

Unnamed: 0,client_id,age,district_id,frequency,account_start_date,card_issued_date,loan_start_date,days_between,avg_trans_amount,avg_balance,trans_count,card_type,loan_amount,loan_duration,loan_payments,loan_status
0,2,45,1,POPLATEK MESICNE,1993-02-26,,1994-01-05,313,5459.547324,36313.029626,374,,80952,24,3373.0,A
1,3,40,1,POPLATEK MESICNE,1993-02-26,,1994-01-05,313,5459.547324,36313.029626,374,,80952,24,3373.0,A
2,25,39,21,POPLATEK MESICNE,1995-04-07,,1996-04-29,388,4165.924709,15464.59187,259,,30276,12,2523.0,B
3,31,62,68,POPLATEK MESICNE,1996-07-28,,1997-12-08,498,11278.724888,55738.666866,225,,30276,12,2523.0,A
4,45,52,20,POPLATEK MESICNE,1997-08-18,,1998-10-14,422,7871.525263,37845.293508,95,,318480,60,5308.0,D


In [27]:
data['loan_status'].value_counts()


C    493
A    258
D     45
B     31
Name: loan_status, dtype: int64

In [28]:
data.shape


(827, 16)

In [29]:
data.dtypes

client_id               int64
age                    object
district_id             int64
frequency              object
account_start_date     object
card_issued_date       object
loan_start_date        object
days_between            int64
avg_trans_amount      float64
avg_balance           float64
trans_count             int64
card_type              object
loan_amount             int64
loan_duration           int64
loan_payments         float64
loan_status            object
dtype: object

In [30]:
data.isna().sum()

client_id               0
age                     0
district_id             0
frequency               0
account_start_date      0
card_issued_date      657
loan_start_date         0
days_between            0
avg_trans_amount        0
avg_balance             0
trans_count             0
card_type             657
loan_amount             0
loan_duration           0
loan_payments           0
loan_status             0
dtype: int64

In [31]:
data = data.drop(['card_issued_date', 'card_type'], axis = 1)

In [32]:
data.dtypes
data['age'] = data['age'].astype('int')
data['account_start_date'] = pd.to_datetime(data['account_start_date'])
data['loan_start_date'] = pd.to_datetime(data['loan_start_date'])


In [33]:
data['account_start_date'] = data['account_start_date'].map(dt.datetime.toordinal)
data['loan_start_date'] = data['loan_start_date'].map(dt.datetime.toordinal)

In [34]:
data.head()

Unnamed: 0,client_id,age,district_id,frequency,account_start_date,loan_start_date,days_between,avg_trans_amount,avg_balance,trans_count,loan_amount,loan_duration,loan_payments,loan_status
0,2,45,1,POPLATEK MESICNE,727620,727933,313,5459.547324,36313.029626,374,80952,24,3373.0,A
1,3,40,1,POPLATEK MESICNE,727620,727933,313,5459.547324,36313.029626,374,80952,24,3373.0,A
2,25,39,21,POPLATEK MESICNE,728390,728778,388,4165.924709,15464.59187,259,30276,12,2523.0,B
3,31,62,68,POPLATEK MESICNE,728868,729366,498,11278.724888,55738.666866,225,30276,12,2523.0,A
4,45,52,20,POPLATEK MESICNE,729254,729676,422,7871.525263,37845.293508,95,318480,60,5308.0,D


In [35]:
data['district_id'] = data ['district_id'].astype('object')
data = data.drop('client_id', axis = 1)


In [36]:
data.isna().sum().sum() 


0

In [41]:
import numpy as np
from sklearn.preprocessing import Normalizer
x = data.select_dtypes(include = np.number)
x.head()

Unnamed: 0,age,account_start_date,loan_start_date,days_between,avg_trans_amount,avg_balance,trans_count,loan_amount,loan_duration,loan_payments
0,45,727620,727933,313,5459.547324,36313.029626,374,80952,24,3373.0
1,40,727620,727933,313,5459.547324,36313.029626,374,80952,24,3373.0
2,39,728390,728778,388,4165.924709,15464.59187,259,30276,12,2523.0
3,62,728868,729366,498,11278.724888,55738.666866,225,30276,12,2523.0
4,52,729254,729676,422,7871.525263,37845.293508,95,318480,60,5308.0


In [39]:
transformer = Normalizer().fit(x)
x_normalized = transformer.transform(x)
x = pd.DataFrame(x_normalized)
x.columns = x.columns
x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,4.4e-05,0.704329,0.704632,0.000303,0.005285,0.035151,0.000362,0.078361,2.3e-05,0.003265
1,3.9e-05,0.704329,0.704632,0.000303,0.005285,0.035151,0.000362,0.078361,2.3e-05,0.003265
2,3.8e-05,0.706526,0.706902,0.000376,0.004041,0.015,0.000251,0.029367,1.2e-05,0.002447
3,6e-05,0.705487,0.705969,0.000482,0.010917,0.053951,0.000218,0.029305,1.2e-05,0.002442
4,4.8e-05,0.675006,0.675397,0.000391,0.007286,0.03503,8.8e-05,0.294789,5.6e-05,0.004913


In [44]:
cat = data.select_dtypes(include=object)
cat = cat.drop(['loan_status'], axis=1)

cat

Unnamed: 0,district_id,frequency
0,1,POPLATEK MESICNE
1,1,POPLATEK MESICNE
2,21,POPLATEK MESICNE
3,68,POPLATEK MESICNE
4,20,POPLATEK MESICNE
...,...,...
822,54,POPLATEK MESICNE
823,1,POPLATEK TYDNE
824,1,POPLATEK TYDNE
825,61,POPLATEK MESICNE


In [45]:
categorical = pd.get_dummies(cat,columns=['district_id','frequency'])
categorical

  categorical = pd.get_dummies(cat,columns=['district_id','frequency'])


Unnamed: 0,district_id_1,district_id_2,district_id_3,district_id_4,district_id_5,district_id_6,district_id_7,district_id_8,district_id_9,district_id_10,...,district_id_71,district_id_72,district_id_73,district_id_74,district_id_75,district_id_76,district_id_77,frequency_POPLATEK MESICNE,frequency_POPLATEK PO OBRATU,frequency_POPLATEK TYDNE
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
823,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
824,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
825,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [46]:
x = np.concatenate((x,categorical),axis=1)

In [47]:
y = data['loan_status']

In [48]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.4)

In [49]:
from sklearn.linear_model import LogisticRegression
classification = LogisticRegression().fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
predictions = classification.predict(x_test)
predictions

array(['C', 'C', 'C', 'C', 'C', 'C', 'A', 'A', 'C', 'C', 'D', 'A', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'A', 'C', 'C', 'A', 'A', 'C', 'A',
       'C', 'A', 'C', 'A', 'C', 'A', 'C', 'A', 'C', 'C', 'C', 'C', 'C',
       'C', 'A', 'C', 'C', 'C', 'C', 'A', 'C', 'C', 'C', 'C', 'A', 'C',
       'C', 'A', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'A', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'A', 'A', 'A', 'C', 'A',
       'C', 'C', 'A', 'C', 'C', 'C', 'C', 'C', 'A', 'C', 'C', 'C', 'C',
       'A', 'A', 'D', 'C', 'C', 'A', 'A', 'A', 'C', 'C', 'C', 'C', 'C',
       'C', 'A', 'C', 'A', 'C', 'C', 'C', 'D', 'C', 'A', 'C', 'A', 'C',
       'C', 'C', 'C', 'A', 'C', 'C', 'C', 'C', 'A', 'C', 'C', 'C', 'C',
       'C', 'A', 'C', 'A', 'A', 'A', 'C', 'C', 'C', 'C', 'C', 'C', 'C',
       'C', 'A', 'C', 'A', 'A', 'A', 'A', 'C', 'A', 'C', 'C', 'A', 'A',
       'A', 'C', 'C', 'C', 'A', 'C', 'C', 'A', 'C', 'C', 'A', 'C', 'A',
       'C', 'C', 'A', 'A', 'C', 'C', 'C', 'C', 'C', 'C', 'A', 'C

In [51]:
classification.score(x_test,y_test)

0.676737160120846

In [52]:
y_test.value_counts

<bound method IndexOpsMixin.value_counts of 321    C
780    C
185    C
579    A
559    C
      ..
288    B
36     D
601    A
820    C
651    C
Name: loan_status, Length: 331, dtype: object>

In [53]:
pd.Series(predictions).value_counts()

C    226
A     98
D      7
dtype: int64

In [54]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[ 63,   0,  52,   0],
       [  7,   0,   9,   2],
       [ 26,   0, 158,   2],
       [  2,   0,   7,   3]], dtype=int64)