In [1]:
# libraries
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# load data
df = pd.read_csv('local_training_data/credit_score/credit_score.csv')
df.head()

Unnamed: 0,Status,Seniority,Home,Time,Age,Marital,Records,Job,Expenses,Income,Assets,Debt,Amount,Price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [None]:
# exploratory data

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4455 entries, 0 to 4454
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Status     4455 non-null   int64
 1   Seniority  4455 non-null   int64
 2   Home       4455 non-null   int64
 3   Time       4455 non-null   int64
 4   Age        4455 non-null   int64
 5   Marital    4455 non-null   int64
 6   Records    4455 non-null   int64
 7   Job        4455 non-null   int64
 8   Expenses   4455 non-null   int64
 9   Income     4455 non-null   int64
 10  Assets     4455 non-null   int64
 11  Debt       4455 non-null   int64
 12  Amount     4455 non-null   int64
 13  Price      4455 non-null   int64
dtypes: int64(14)
memory usage: 487.4 KB


In [None]:
# data preparation

In [4]:
# normalize the columns
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,1,9,1,60,30,2,1,3,73,129,0,0,800,846
1,1,17,1,60,58,3,1,1,48,131,0,0,1000,1658
2,2,10,2,36,46,2,2,3,90,200,3000,0,2000,2985
3,1,0,1,60,24,1,1,1,63,182,2500,0,900,1325
4,1,0,1,36,26,1,1,1,46,107,0,0,310,910


In [5]:
# categorize the columns
categorical = ['home', 'marital', 'records', 'job']
numerical = ['seniority', 'time', 'age', 'expenses', 'income', 'assets', 'debt', 'amount', 'price']
target = ['status']

In [6]:
# see prediction distribution
df['status'].value_counts()

status
1    3200
2    1254
0       1
Name: count, dtype: int64

In [7]:
# filter out rows whose status is 0 (the target is built from the values 1 and 2 only).
# .copy() makes the result an independent frame, so the column assignments in the
# following cells do not write into a slice of the original DataFrame.
df = df[df['status'] != 0].copy()

In [8]:
df['status'].value_counts()

status
1    3200
2    1254
Name: count, dtype: int64

In [11]:
# normalize prediction values: binary target, 1 where status == 2, otherwise 0
# NOTE: not idempotent -- once status holds only 0/1, re-running this cell maps every row to 0
df['status'] = (df['status']==2).astype(int)

In [12]:
df['status'].value_counts()

status
0    3200
1    1254
Name: count, dtype: int64

In [13]:
# normalize categorical features: decode the integer `home` codes into labels
# (codes missing from the mapping become NaN after .map)
home_values = {
    0: 'unk',
    1: 'rent',
    2: 'owner',
    3: 'private',
    4: 'ignore',
    5: 'parents',
    6: 'other',
}
df['home'] = df['home'].map(home_values)

In [14]:
# decode the integer `marital` codes into labels
marital_values = {
    0: 'unk',
    1: 'single',
    2: 'married',
    3: 'widow',
    4: 'separated',
    5: 'divorced',
}
df['marital'] = df['marital'].map(marital_values)

In [15]:
# decode the integer `records` codes into labels
records_values = {
    0: 'unk',
    1: 'no',
    2: 'yes',
}
df['records'] = df['records'].map(records_values)

In [16]:
# decode the integer `job` codes into labels
job_values = {
    0: 'unk',
    1: 'fixed',
    2: 'parttime',
    3: 'freelance',
    4: 'others',
}
df['job'] = df['job'].map(job_values)

In [17]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,0,9,rent,60,30,married,no,freelance,73,129,0,0,800,846
1,0,17,rent,60,58,widow,no,fixed,48,131,0,0,1000,1658
2,1,10,owner,36,46,married,yes,freelance,90,200,3000,0,2000,2985
3,0,0,rent,60,24,single,no,fixed,63,182,2500,0,900,1325
4,0,0,rent,36,26,single,no,fixed,46,107,0,0,310,910


In [19]:
# describe numerical features
df.describe().round()

Unnamed: 0,status,seniority,time,age,expenses,income,assets,debt,amount,price
count,4454.0,4454.0,4454.0,4454.0,4454.0,4454.0,4454.0,4454.0,4454.0,4454.0
mean,0.0,8.0,46.0,37.0,56.0,763488.0,1060578.0,404473.0,1039.0,1463.0
std,0.0,8.0,15.0,11.0,20.0,8704595.0,10218704.0,6344963.0,475.0,628.0
min,0.0,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,0.0,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1117.0
50%,0.0,5.0,48.0,36.0,51.0,120.0,3500.0,0.0,1000.0,1400.0
75%,1.0,12.0,60.0,45.0,72.0,166.0,6000.0,0.0,1300.0,1692.0
max,1.0,48.0,72.0,68.0,180.0,99999999.0,99999999.0,99999999.0,5000.0,11140.0


In [20]:
# looks like 99999999.0 is placeholder for nan
def replace_column_value(df: pd.DataFrame, column: str, old_value: float, new_value: float = np.nan) -> pd.DataFrame:
    """Return a copy of ``df`` in which ``old_value`` in ``column`` is replaced by ``new_value``.

    The input frame is left untouched: the replacement is applied to a copy, so the
    function is safe to call on a filtered slice of another DataFrame and can be
    re-run without side effects on the caller's data.

    Args:
        df: source DataFrame.
        column: name of the column to clean.
        old_value: value to look for (e.g. a sentinel such as 99999999).
        new_value: replacement value; defaults to NaN.

    Returns:
        A new DataFrame with the replacement applied to ``column``.

    Raises:
        KeyError: if ``column`` is not present in ``df``.
    """
    result = df.copy()
    result[column] = result[column].replace(to_replace=old_value, value=new_value)
    return result

In [23]:
# 99999999 shows up as the max of these three columns in describe(); turn it into NaN
for column in ['income', 'assets', 'debt']:
    df = replace_column_value(df, column, 99999999)

In [24]:
df.describe().round()

Unnamed: 0,status,seniority,time,age,expenses,income,assets,debt,amount,price
count,4454.0,4454.0,4454.0,4454.0,4454.0,4420.0,4407.0,4436.0,4454.0,4454.0
mean,0.0,8.0,46.0,37.0,56.0,131.0,5404.0,343.0,1039.0,1463.0
std,0.0,8.0,15.0,11.0,20.0,86.0,11574.0,1246.0,475.0,628.0
min,0.0,0.0,6.0,18.0,35.0,0.0,0.0,0.0,100.0,105.0
25%,0.0,2.0,36.0,28.0,35.0,80.0,0.0,0.0,700.0,1117.0
50%,0.0,5.0,48.0,36.0,51.0,120.0,3000.0,0.0,1000.0,1400.0
75%,1.0,12.0,60.0,45.0,72.0,165.0,6000.0,0.0,1300.0,1692.0
max,1.0,48.0,72.0,68.0,180.0,959.0,300000.0,30000.0,5000.0,11140.0


In [25]:
def df_fill_missing_values(df: pd.DataFrame, value: float = 0) -> pd.DataFrame:
    """Return a new DataFrame in which every missing entry of ``df`` is set to ``value``.

    Args:
        df: frame that may contain NaN entries.
        value: fill value used for all missing entries (default 0).

    Returns:
        The filled DataFrame produced by ``DataFrame.fillna``; ``df`` is not modified.
    """
    filled = df.fillna(value)
    return filled

In [26]:
df = df_fill_missing_values(df)

In [27]:
df.head()

Unnamed: 0,status,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
0,0,9,rent,60,30,married,no,freelance,73,129.0,0.0,0.0,800,846
1,0,17,rent,60,58,widow,no,fixed,48,131.0,0.0,0.0,1000,1658
2,1,10,owner,36,46,married,yes,freelance,90,200.0,3000.0,0.0,2000,2985
3,0,0,rent,60,24,single,no,fixed,63,182.0,2500.0,0.0,900,1325
4,0,0,rent,36,26,single,no,fixed,46,107.0,0.0,0.0,310,910


In [29]:
# data preparation / feature engineering
from sklearn.model_selection import train_test_split

# hold out 20% of the rows as the test set ...
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)
# ... then 25% of the remaining rows as validation (0.25 * 0.8 = 20% of the total),
# giving a 60/20/20 train/val/test split
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=11)

In [30]:
len(df_train), len(df_val), len(df_test)

(2672, 891, 891)

In [31]:
# let's analyze the data distribution using the df_train_full
from sklearn.metrics import mutual_info_score

# data preparation / mutual information
def print_mutual_information(df: pd.DataFrame, categorical_columns: list[str], target_column: str):
    """Print the mutual information between each listed column and the target.

    Each column in ``categorical_columns`` is scored against ``df[target_column]``
    with ``mutual_info_score``; the scores are printed as a one-column table
    named ``MI``, sorted from highest to lowest.

    Args:
        df: frame holding both the feature columns and the target column.
        categorical_columns: names of the columns to score.
        target_column: name of the target column.

    Returns:
        None; the table is written to stdout.
    """
    scores = df[categorical_columns].apply(
        lambda feature: mutual_info_score(feature, df[target_column])
    )
    ranking = scores.sort_values(ascending=False).to_frame(name='MI')
    print(ranking)

In [33]:
print_mutual_information(df_train_full, categorical, 'status')

               MI
records  0.038154
job      0.036067
home     0.024469
marital  0.005283


In [34]:
# data preparation / correlation
def compute_correlation(df: pd.DataFrame, numerical_columns: list[str], target_column: str):
    """Correlate each listed column with the target and rank the results.

    Args:
        df: frame holding both the feature columns and the target column.
        numerical_columns: names of the columns to correlate with the target.
        target_column: name of the target column.

    Returns:
        A one-column DataFrame named ``Correlation``, indexed by column name and
        sorted from the highest to the lowest coefficient.
    """
    return (
        df[numerical_columns]
        .corrwith(df[target_column])
        .sort_values(ascending=False)
        .to_frame(name='Correlation')
    )

In [35]:
compute_correlation(df_train_full, numerical, 'status')

Unnamed: 0,Correlation
amount,0.154918
time,0.107026
expenses,0.023566
price,0.008124
debt,0.004063
age,-0.084724
assets,-0.101276
income,-0.223512
seniority,-0.258317


In [36]:
# target values: pop() hands back the `status` column and removes it from the
# feature frame in the same step, so the label cannot leak into the features
y_train = df_train.pop('status').values
y_val = df_val.pop('status').values
y_test = df_test.pop('status').values

In [37]:
y_train[:20]

array([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [38]:
df_train.head()

Unnamed: 0,seniority,home,time,age,marital,records,job,expenses,income,assets,debt,amount,price
951,10,owner,36,36,married,no,freelance,75,0.0,10000.0,0.0,1000,1400
688,6,parents,48,32,single,yes,fixed,35,85.0,0.0,0.0,1100,1330
2233,1,parents,48,40,married,no,fixed,75,121.0,0.0,0.0,1320,1600
3304,1,parents,48,23,single,no,parttime,35,72.0,0.0,0.0,1078,1079
2271,5,owner,36,46,married,no,freelance,60,100.0,4000.0,0.0,1100,1897


In [39]:
# get dict representation
dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')
dict_test = df_test.to_dict(orient='records')

In [40]:
# train vectorizer
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(dict_train)
x_val = dv.transform(dict_val)

In [41]:
x_train[:2]

array([[3.60e+01, 1.00e+03, 1.00e+04, 0.00e+00, 7.50e+01, 0.00e+00,
        0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        1.40e+03, 1.00e+00, 0.00e+00, 1.00e+01, 3.60e+01],
       [3.20e+01, 1.10e+03, 0.00e+00, 0.00e+00, 3.50e+01, 0.00e+00,
        0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        8.50e+01, 1.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 1.00e+00, 0.00e+00, 0.00e+00,
        1.33e+03, 0.00e+00, 1.00e+00, 6.00e+00, 4.80e+01]])

In [None]:
# model selection

In [43]:
# train a decision tree
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

In [46]:
from sklearn.metrics import roc_auc_score

y_pred = dt.predict_proba(x_train)[:, 1]
train_auc = roc_auc_score(y_train, y_pred)
print('train_auc:', train_auc)

y_pred = dt.predict_proba(x_val)[:, 1]
val_auc = roc_auc_score(y_val, y_pred)
print('val_auc:', val_auc)

train_auc: 1.0
val_auc: 0.6602346758373496


In [48]:
dt = DecisionTreeClassifier(max_depth=2)
dt.fit(x_train, y_train)

In [49]:
y_pred = dt.predict_proba(x_train)[:, 1]
train_auc = roc_auc_score(y_train, y_pred)
print('train_auc:', train_auc)

y_pred = dt.predict_proba(x_val)[:, 1]
val_auc = roc_auc_score(y_val, y_pred)
print('val_auc:', val_auc)

train_auc: 0.7054989859726213
val_auc: 0.6685264343319367


In [50]:
from sklearn.tree import export_text
from sklearn.base import BaseEstimator

def explain_tree_regressor(model: BaseEstimator, features_names: list[str]):
    """Render the decision rules of a fitted tree as indented plain text.

    Thin wrapper around ``sklearn.tree.export_text``. Despite the name, the
    notebook calls it with a ``DecisionTreeClassifier``.

    Args:
        model: fitted decision-tree estimator.
        features_names: feature names, in the column order the model was trained on.

    Returns:
        The textual rule listing produced by ``export_text``.
    """
    return export_text(model, feature_names=features_names)

In [52]:
out = explain_tree_regressor(dt, dv.feature_names_)
print(out)

|--- records=no <= 0.50
|   |--- seniority <= 6.50
|   |   |--- class: 1
|   |--- seniority >  6.50
|   |   |--- class: 0
|--- records=no >  0.50
|   |--- job=parttime <= 0.50
|   |   |--- class: 0
|   |--- job=parttime >  0.50
|   |   |--- class: 1



In [64]:
def train_evaluate_model(model, x_train, y_train, x_val, y_val):
    """Fit ``model`` on the training split and print its ROC AUC on train and validation.

    The AUC is computed from the second column of ``predict_proba``.

    Args:
        model: estimator exposing ``fit`` and ``predict_proba``.
        x_train, y_train: training features and labels.
        x_val, y_val: validation features and labels.

    Returns:
        The same ``model`` object, now fitted.
    """
    model.fit(x_train, y_train)

    # same scoring recipe for both splits; only the report line differs
    reports = (
        ('train auc: %.3f', x_train, y_train),
        ('val auc: %.3f', x_val, y_val),
    )
    for template, features, labels in reports:
        positive_scores = model.predict_proba(features)[:, 1]
        print(template % roc_auc_score(labels, positive_scores))

    return model

In [65]:
# find the best hyper-parameter
for depth in [1, 2, 3, 4, 5, 6, 10, 15, 20, None]:
    dt = DecisionTreeClassifier(max_depth=depth)
    print('depth:', depth)
    train_evaluate_model(dt, x_train, y_train, x_val, y_val)

depth: 1
train auc: 0.628
val auc: 0.606
depth: 2
train auc: 0.705
val auc: 0.669
depth: 3
train auc: 0.776
val auc: 0.739
depth: 4
train auc: 0.816
val auc: 0.761
depth: 5
train auc: 0.843
val auc: 0.766
depth: 6
train auc: 0.875
val auc: 0.758
depth: 10
train auc: 0.964
val auc: 0.684
depth: 15
train auc: 0.999
val auc: 0.663
depth: 20
train auc: 1.000
val auc: 0.657
depth: None
train auc: 1.000
val auc: 0.654


In [66]:
# get the best depths
for m in [4, 5, 6]:
    # find the best min_samples_leaf
    for s in [1, 5, 10, 15, 20, 50, 100, 200]:
        dt = DecisionTreeClassifier(max_depth=m, min_samples_leaf=s)
        print('depth: %s - samples: %s ' % (m, s))
        train_evaluate_model(dt, x_train, y_train, x_val, y_val)

depth: 4 - samples: 1 
train auc: 0.816
val auc: 0.761
depth: 4 - samples: 5 
train auc: 0.816
val auc: 0.761
depth: 4 - samples: 10 
train auc: 0.816
val auc: 0.761
depth: 4 - samples: 15 
train auc: 0.817
val auc: 0.764
depth: 4 - samples: 20 
train auc: 0.817
val auc: 0.761
depth: 4 - samples: 50 
train auc: 0.811
val auc: 0.753
depth: 4 - samples: 100 
train auc: 0.803
val auc: 0.756
depth: 4 - samples: 200 
train auc: 0.794
val auc: 0.747
depth: 5 - samples: 1 
train auc: 0.843
val auc: 0.766
depth: 5 - samples: 5 
train auc: 0.843
val auc: 0.768
depth: 5 - samples: 10 
train auc: 0.843
val auc: 0.762
depth: 5 - samples: 15 
train auc: 0.840
val auc: 0.772
depth: 5 - samples: 20 
train auc: 0.837
val auc: 0.774
depth: 5 - samples: 50 
train auc: 0.829
val auc: 0.767
depth: 5 - samples: 100 
train auc: 0.819
val auc: 0.763
depth: 5 - samples: 200 
train auc: 0.805
val auc: 0.759
depth: 6 - samples: 1 
train auc: 0.875
val auc: 0.757
depth: 6 - samples: 5 
train auc: 0.873
val auc: 

In [67]:
# model training
dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=15)
dt = train_evaluate_model(dt, x_train, y_train, x_val, y_val)

train auc: 0.840
val auc: 0.773


In [None]:
# model evaluation

In [63]:
x_test = dv.transform(dict_test)

In [70]:
def evaluate(model, x_test, y_test):
    """Print the ROC AUC of an already fitted ``model`` on the test split.

    Args:
        model: fitted estimator exposing ``predict_proba``.
        x_test: test feature matrix.
        y_test: test labels.

    Returns:
        None; the score is written to stdout.
    """
    positive_scores = model.predict_proba(x_test)[:, 1]
    print('test auc: %.3f' % roc_auc_score(y_test, positive_scores))

In [71]:
evaluate(dt, x_test, y_test)

test auc: 0.774


In [72]:
# model selection -> try RandomForest
from sklearn.ensemble import RandomForestClassifier

In [73]:
rf = RandomForestClassifier(n_estimators=10)
train_evaluate_model(rf, x_train, y_train, x_val, y_val)

train auc: 1.000
val auc: 0.789


In [74]:
rf = RandomForestClassifier(n_estimators=10, random_state=3)
train_evaluate_model(rf, x_train, y_train, x_val, y_val)

train auc: 1.000
val auc: 0.781


In [77]:
# figure out the number of estimators
for i in range(10, 201, 10):
    rf = RandomForestClassifier(n_estimators=i, random_state=3)
    print('-- estimators: %s' % i)
    train_evaluate_model(rf, x_train, y_train, x_val, y_val)

-- estimators: 10
train auc: 1.000
val auc: 0.781
-- estimators: 20
train auc: 1.000
val auc: 0.801
-- estimators: 30
train auc: 1.000
val auc: 0.813
-- estimators: 40
train auc: 1.000
val auc: 0.816
-- estimators: 50
train auc: 1.000
val auc: 0.817
-- estimators: 60
train auc: 1.000
val auc: 0.818
-- estimators: 70
train auc: 1.000
val auc: 0.820
-- estimators: 80
train auc: 1.000
val auc: 0.819
-- estimators: 90
train auc: 1.000
val auc: 0.821
-- estimators: 100
train auc: 1.000
val auc: 0.820
-- estimators: 110
train auc: 1.000
val auc: 0.820
-- estimators: 120
train auc: 1.000
val auc: 0.821
-- estimators: 130
train auc: 1.000
val auc: 0.822
-- estimators: 140
train auc: 1.000
val auc: 0.822
-- estimators: 150
train auc: 1.000
val auc: 0.821
-- estimators: 160
train auc: 1.000
val auc: 0.820
-- estimators: 170
train auc: 1.000
val auc: 0.819
-- estimators: 180
train auc: 1.000
val auc: 0.819
-- estimators: 190
train auc: 1.000
val auc: 0.820
-- estimators: 200
train auc: 1.000
val 

In [78]:
for depth in [5, 10, 20]:
    print('depth: %s' % depth)
    for i in range(10, 201, 10):
        print('depth: %s - estimators: %s' % (depth, i))
        rf = RandomForestClassifier(n_estimators=i, max_depth=depth, random_state=1)
        train_evaluate_model(rf, x_train, y_train, x_val, y_val)

depth: 5
depth: 5 - estimators: 10
train auc: 0.863
val auc: 0.788
depth: 5 - estimators: 20
train auc: 0.874
val auc: 0.798
depth: 5 - estimators: 30
train auc: 0.875
val auc: 0.800
depth: 5 - estimators: 40
train auc: 0.874
val auc: 0.800
depth: 5 - estimators: 50
train auc: 0.875
val auc: 0.800
depth: 5 - estimators: 60
train auc: 0.875
val auc: 0.801
depth: 5 - estimators: 70
train auc: 0.874
val auc: 0.802
depth: 5 - estimators: 80
train auc: 0.875
val auc: 0.803
depth: 5 - estimators: 90
train auc: 0.876
val auc: 0.803
depth: 5 - estimators: 100
train auc: 0.875
val auc: 0.804
depth: 5 - estimators: 110
train auc: 0.876
val auc: 0.807
depth: 5 - estimators: 120
train auc: 0.876
val auc: 0.807
depth: 5 - estimators: 130
train auc: 0.877
val auc: 0.808
depth: 5 - estimators: 140
train auc: 0.876
val auc: 0.807
depth: 5 - estimators: 150
train auc: 0.876
val auc: 0.807
depth: 5 - estimators: 160
train auc: 0.876
val auc: 0.807
depth: 5 - estimators: 170
train auc: 0.876
val auc: 0.8

In [79]:
rf = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_leaf=5, random_state=1)
train_evaluate_model(rf, x_train, y_train, x_val, y_val)

train auc: 0.940
val auc: 0.825


In [80]:
evaluate(rf, x_test, y_test)

test auc: 0.824


In [83]:
# model selection -> try XGBoost
import xgboost as xgb

In [85]:
import xgboost as xgb
import numpy as np

def xgb_dmatrix(x: np.ndarray, y: np.ndarray, feature_names: list[str]):
    """Wrap a feature matrix and its labels in an ``xgb.DMatrix``.

    Args:
        x: feature matrix.
        y: labels, passed to the DMatrix as ``label``.
        feature_names: column names attached to the DMatrix.

    Returns:
        The constructed ``xgb.DMatrix``.
    """
    matrix = xgb.DMatrix(x, label=y, feature_names=feature_names)
    return matrix

In [86]:
dtrain = xgb_dmatrix(x_train, y_train, dv.feature_names_)
type(dtrain)

xgboost.core.DMatrix

In [87]:
dval = xgb_dmatrix(x_val, y_val, dv.feature_names_)

In [88]:
# baseline booster: 10 rounds, no evaluation log
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'nthread': 8,
    'seed': 1,
    # `silent` is not a recognised parameter any more (it only triggered a
    # "Parameters: { "silent" } are not used." warning); `verbosity` replaces it.
    # 1 = warnings only, which is the level that was effectively in use.
    'verbosity': 1
}
model = xgb.train(xgb_params, dtrain, num_boost_round=10)

Parameters: { "silent" } are not used.



In [90]:
y_pred = model.predict(dval)
y_pred[:10]

array([0.07751459, 0.07131889, 0.0675393 , 0.12096145, 0.05379206,
       0.06382526, 0.06665564, 0.4352498 , 0.41923273, 0.02550563],
      dtype=float32)

In [91]:
roc_auc_score(y_val, y_pred)

0.8065256351262986

In [93]:
# same settings, now tracking AUC on train and validation every 10 rounds
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    # `silent` is not a recognised parameter any more; `verbosity` replaces it
    # (1 = warnings only, the level that was effectively in use)
    'verbosity': 1
}
watchlist = [(dtrain, 'train'), (dval, 'val')]
model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=10)

[0]	train-auc:0.86730	val-auc:0.77938
[10]	train-auc:0.95447	val-auc:0.80851
[20]	train-auc:0.97464	val-auc:0.81729
[30]	train-auc:0.98579	val-auc:0.81543
[40]	train-auc:0.99421	val-auc:0.80922
[50]	train-auc:0.99653	val-auc:0.80918
[60]	train-auc:0.99817	val-auc:0.81172
[70]	train-auc:0.99934	val-auc:0.80870


Parameters: { "silent" } are not used.



[80]	train-auc:0.99979	val-auc:0.80549
[90]	train-auc:0.99993	val-auc:0.80409
[99]	train-auc:0.99999	val-auc:0.80560


In [95]:
# lower eta (0.3 -> 0.1) with more boosting rounds
xgb_params = {
    'eta': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    # `silent` is not a recognised parameter any more; `verbosity` replaces it
    # (1 = warnings only, the level that was effectively in use)
    'verbosity': 1
}
model = xgb.train(xgb_params, dtrain, num_boost_round=500, verbose_eval=10, evals=watchlist)
# best output: [60]	train-auc:0.97708	val-auc:0.82781

[0]	train-auc:0.86730	val-auc:0.77938
[10]	train-auc:0.91874	val-auc:0.80510
[20]	train-auc:0.93873	val-auc:0.81804
[30]	train-auc:0.95338	val-auc:0.82063
[40]	train-auc:0.96325	val-auc:0.82644
[50]	train-auc:0.97195	val-auc:0.82549
[60]	train-auc:0.97708	val-auc:0.82781


Parameters: { "silent" } are not used.



[70]	train-auc:0.98214	val-auc:0.82681
[80]	train-auc:0.98517	val-auc:0.82560
[90]	train-auc:0.98840	val-auc:0.82443
[100]	train-auc:0.99061	val-auc:0.82456
[110]	train-auc:0.99224	val-auc:0.82274
[120]	train-auc:0.99378	val-auc:0.82154
[130]	train-auc:0.99541	val-auc:0.82252
[140]	train-auc:0.99630	val-auc:0.82219
[150]	train-auc:0.99711	val-auc:0.82136
[160]	train-auc:0.99774	val-auc:0.82102
[170]	train-auc:0.99838	val-auc:0.82060
[180]	train-auc:0.99882	val-auc:0.82053
[190]	train-auc:0.99904	val-auc:0.81973
[200]	train-auc:0.99929	val-auc:0.81830
[210]	train-auc:0.99947	val-auc:0.81806
[220]	train-auc:0.99961	val-auc:0.81763
[230]	train-auc:0.99972	val-auc:0.81727
[240]	train-auc:0.99984	val-auc:0.81735
[250]	train-auc:0.99987	val-auc:0.81732
[260]	train-auc:0.99990	val-auc:0.81756
[270]	train-auc:0.99993	val-auc:0.81758
[280]	train-auc:0.99995	val-auc:0.81780
[290]	train-auc:0.99997	val-auc:0.81791
[300]	train-auc:0.99997	val-auc:0.81789
[310]	train-auc:0.99998	val-auc:0.81789
[32

In [96]:
# shallower trees (max_depth 6 -> 3) at eta 0.1
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    # `silent` is not a recognised parameter any more; `verbosity` replaces it
    # (1 = warnings only, the level that was effectively in use)
    'verbosity': 1
}
model = xgb.train(xgb_params, dtrain, num_boost_round=500, verbose_eval=10, evals=watchlist)

[0]	train-auc:0.77610	val-auc:0.73891
[10]	train-auc:0.84675	val-auc:0.78896
[20]	train-auc:0.86931	val-auc:0.80928
[30]	train-auc:0.88080	val-auc:0.81808
[40]	train-auc:0.89038	val-auc:0.82347
[50]	train-auc:0.89756	val-auc:0.82583
[60]	train-auc:0.90319	val-auc:0.82847
[70]	train-auc:0.90768	val-auc:0.83005
[80]	train-auc:0.91221	val-auc:0.83117
[90]	train-auc:0.91638	val-auc:0.83029
[100]	train-auc:0.91913	val-auc:0.83113
[110]	train-auc:0.92297	val-auc:0.83170


Parameters: { "silent" } are not used.



[120]	train-auc:0.92544	val-auc:0.83124
[130]	train-auc:0.92789	val-auc:0.83169
[140]	train-auc:0.92996	val-auc:0.83088
[150]	train-auc:0.93200	val-auc:0.83108
[160]	train-auc:0.93416	val-auc:0.83154
[170]	train-auc:0.93638	val-auc:0.83129
[180]	train-auc:0.93842	val-auc:0.83046
[190]	train-auc:0.94005	val-auc:0.83014
[200]	train-auc:0.94181	val-auc:0.82986
[210]	train-auc:0.94374	val-auc:0.82907
[220]	train-auc:0.94529	val-auc:0.82902
[230]	train-auc:0.94725	val-auc:0.82884
[240]	train-auc:0.94840	val-auc:0.82846
[250]	train-auc:0.94951	val-auc:0.82877
[260]	train-auc:0.95073	val-auc:0.82834
[270]	train-auc:0.95199	val-auc:0.82806
[280]	train-auc:0.95356	val-auc:0.82778
[290]	train-auc:0.95489	val-auc:0.82745
[300]	train-auc:0.95627	val-auc:0.82647
[310]	train-auc:0.95734	val-auc:0.82537
[320]	train-auc:0.95867	val-auc:0.82570
[330]	train-auc:0.96006	val-auc:0.82514
[340]	train-auc:0.96159	val-auc:0.82462
[350]	train-auc:0.96267	val-auc:0.82407
[360]	train-auc:0.96369	val-auc:0.82366


In [97]:
# final booster: the eta=0.1 / max_depth=3 configuration with a fixed number of rounds
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    # `silent` is not a recognised parameter any more; `verbosity` replaces it
    # (1 = warnings only, the level that was effectively in use)
    'verbosity': 1
}
num_trees = 160
model = xgb.train(xgb_params, dtrain, num_boost_round=num_trees)

Parameters: { "silent" } are not used.



In [98]:
dtest = xgb_dmatrix(x_test, y_test, dv.feature_names_)

In [102]:
y_pred = model.predict(dtest)
auc = roc_auc_score(y_test, y_pred)
print('auc:', auc)
print(y_test[:10])
print(y_pred[:10])
print(x_test.shape)
print(x_test[:10])

auc: 0.8243243243243242
[1 0 0 0 1 0 0 0 0 0]
[0.2691293  0.23095696 0.05407206 0.20785248 0.5611077  0.2912596
 0.00579472 0.03983681 0.34388596 0.22907211]
(891, 29)
[[2.600e+01 8.000e+02 6.000e+04 3.000e+03 3.500e+01 0.000e+00 0.000e+00
  1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  1.000e+00 0.000e+00 0.000e+00 1.000e+03 1.000e+00 0.000e+00 3.000e+00
  3.600e+01]
 [2.800e+01 2.250e+03 1.800e+01 3.000e+03 7.800e+01 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 3.250e+02 1.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 2.250e+03 1.000e+00 0.000e+00 1.000e+01
  6.000e+01]
 [4.100e+01 1.150e+03 0.000e+00 0.000e+00 5.300e+01 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 1.810e+02 1.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
  0.000e+00 0.