In [1]:
import pandas as pd
import numpy as np

# In this project we will build a model to estimate current credit status of a customer. 

In most applications, we would like to *predict* probability of default for a customer in the future, but we don't have data for that.

In [2]:
# this is to read data on Google Drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# You can find the data here: https://www.kaggle.com/wordsforthewise/lending-club
data=pd.read_csv("drive/My Drive/accepted.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


# Steps - compare the order of steps with those for XGBoost:
1. Data Exploration - understand the data
2. Preliminary feature exclusion - remove features that do not make sense, or can not be used
3. Observation exclusion - to creat an unbiased sample that represents the target population and serves model's goal
4. One-Hot Encoding
5. Feature Engineering (not needed for this model)
6. Test/Train split
7. Normalization (not needed for tree-based models)
8. Outlier Treatment (not needed for tree-based models)
9. Missing Value Imputation (not needed for the XGBoost package we use)
10. Feature reduction
11. Grid search, and Bias/Variance analysis - Choose the final model


## 1. Data Exploration - the goal here is to know the data better

**Note: This is a demo. Analysis has been done on only some of the attributes. In an actual project, all attributes, that make sense, should be analyzed.**

In [None]:
data.shape

(2260701, 151)

In [None]:
data.tail(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
2260696,88985880,,40000.0,40000.0,40000.0,60 months,10.49,859.56,B,B3,...,,,Cash,N,,,,,,
2260697,88224441,,24000.0,24000.0,24000.0,60 months,14.49,564.56,C,C4,...,,,Cash,Y,Mar-2019,ACTIVE,Mar-2019,10000.0,44.82,1.0
2260698,88215728,,14000.0,14000.0,14000.0,60 months,14.49,329.33,C,C4,...,,,Cash,N,,,,,,
2260699,Total amount funded in policy code 1: 1465324575,,,,,,,,,,...,,,,,,,,,,
2260700,Total amount funded in policy code 2: 521953170,,,,,,,,,,...,,,,,,,,,,


In [None]:
# remove invalid observations
data = data[0:2260699]

In [None]:
# frequently check your steps
print (data.shape[0])
data.tail(5)

2260699


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
2260694,89885898,,24000.0,24000.0,24000.0,60 months,12.79,543.5,C,C1,...,,,Cash,N,,,,,,
2260695,88977788,,24000.0,24000.0,24000.0,60 months,10.49,515.74,B,B3,...,,,Cash,N,,,,,,
2260696,88985880,,40000.0,40000.0,40000.0,60 months,10.49,859.56,B,B3,...,,,Cash,N,,,,,,
2260697,88224441,,24000.0,24000.0,24000.0,60 months,14.49,564.56,C,C4,...,,,Cash,Y,Mar-2019,ACTIVE,Mar-2019,10000.0,44.82,1.0
2260698,88215728,,14000.0,14000.0,14000.0,60 months,14.49,329.33,C,C4,...,,,Cash,N,,,,,,


In [None]:
# check data types - objects imply non-numeric
# note that sometimes numeric columns appear as Object, because of few non-numeric observations. Such as a character that may represent a special value.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  
    print(data.dtypes)

id                                             object
member_id                                     float64
loan_amnt                                     float64
funded_amnt                                   float64
funded_amnt_inv                               float64
term                                           object
int_rate                                      float64
installment                                   float64
grade                                          object
sub_grade                                      object
emp_title                                      object
emp_length                                     object
home_ownership                                 object
annual_inc                                    float64
verification_status                            object
issue_d                                        object
loan_status                                    object
pymnt_plan                                     object
url                         

## 2. Preliminary feature exclusion

Exclude features that do not make sense or can not be used. For example, some features such as Gender can not be used in a Credit Risk model (fair lending practices).  What other features you can think of that can not be used?

Here we will use a small subsample of features. In an actual project, more features would have been selected.

ID will be used for data merge (needed in an actual project), loan_status will be used to define dependent variable, pymnt_plan and hardship_flag will be used to define exclusions. The rest of the variables will be used as independent variables.

In [None]:
final_data = data[["id", "emp_length", "loan_status", "pymnt_plan", "dti", "delinq_2yrs"
,"fico_range_low", "fico_range_high", "inq_last_6mths", "mths_since_last_delinq", "revol_bal",
"revol_util", "total_acc", "avg_cur_bal", "chargeoff_within_12_mths", "hardship_flag"]].copy()

In [None]:
# check the data frequently
final_data.shape

(2260699, 16)

In [None]:
# check the data frequently
final_data.tail(5)

Unnamed: 0,id,emp_length,loan_status,pymnt_plan,dti,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,revol_bal,revol_util,total_acc,avg_cur_bal,chargeoff_within_12_mths,hardship_flag
2260694,89885898,7 years,Current,n,19.61,0.0,665.0,669.0,0.0,,49431.0,84.4,54.0,26106.0,0.0,N
2260695,88977788,10+ years,Current,n,34.94,0.0,695.0,699.0,1.0,60.0,21665.0,39.0,58.0,6369.0,0.0,N
2260696,88985880,9 years,Current,n,12.75,7.0,705.0,709.0,1.0,9.0,8633.0,64.9,37.0,5680.0,0.0,N
2260697,88224441,6 years,Charged Off,n,18.3,0.0,660.0,664.0,0.0,67.0,17641.0,68.1,31.0,6243.0,0.0,N
2260698,88215728,10+ years,Current,n,23.36,0.0,660.0,664.0,1.0,37.0,7662.0,54.0,22.0,17883.0,0.0,N


## 3. Observation Exclusion

In [None]:
final_data.dtypes

id                           object
emp_length                   object
loan_status                  object
pymnt_plan                   object
dti                         float64
delinq_2yrs                 float64
fico_range_low              float64
fico_range_high             float64
inq_last_6mths              float64
mths_since_last_delinq      float64
revol_bal                   float64
revol_util                  float64
total_acc                   float64
avg_cur_bal                 float64
chargeoff_within_12_mths    float64
hardship_flag                object
dtype: object

In [None]:
# Remove observations under payment plan. Cases that are under payment plan, do not follow normal delinquency process. They often have weak profiles but are not tagged as
# delinquent because they are under payment plan.

final_data[["pymnt_plan", "id"]].groupby(["pymnt_plan"]).count()

Unnamed: 0_level_0,id
pymnt_plan,Unnamed: 1_level_1
n,2260048
y,620


In [None]:
final_data = final_data[final_data.pymnt_plan != "y"]
final_data.shape[0]

2260079

In [None]:
# Remove observations under hardship flag, for the same reason as hardship flag.

final_data[["hardship_flag", "id"]].groupby(["hardship_flag"]).count()

Unnamed: 0_level_0,id
hardship_flag,Unnamed: 1_level_1
N,2259836
Y,212


In [None]:
final_data = final_data[final_data.hardship_flag != "Y"]
final_data.shape[0]

2259867

In [None]:
# Last exclusions are related to the target variable. We intend to analyze the current credit status of customer, so we exclude inactive accounts.
# One category of inactive is "charged off" accounts. These are customers who have defaulted previously, and so we have stopped tracking their credit
# status (target variable). Their profile (independent variables) may have improved since they have been charged off, but the target variable 
# shows "charged off". Therefore for these customers, dependent and independent variables do not show the correct relationship.

final_data[["loan_status", "id"]].groupby(["loan_status"]).count()

Unnamed: 0_level_0,id
loan_status,Unnamed: 1_level_1
Charged Off,268559
Current,878313
Default,40
Does not meet the credit policy. Status:Charged Off,761
Does not meet the credit policy. Status:Fully Paid,1988
Fully Paid,1076751
In Grace Period,8420
Late (16-30 days),4266
Late (31-120 days),20738


In [None]:
# Remove cases with missing loan status as well as inactive accounts. 
# Note that we often don't do missing imputation on Y variable. rather exclude those observations.
final_data = final_data[final_data.loan_status != "Charged Off"]
final_data = final_data[final_data.loan_status != "Default"]
final_data = final_data[final_data.loan_status != "Does not meet the credit policy. Status:Charged Off"]
final_data = final_data[final_data.loan_status != "Does not meet the credit policy. Status:Fully Paid"]
final_data = final_data[final_data.loan_status != "Fully Paid"]
final_data = final_data[final_data.loan_status.notnull()]

final_data.shape[0]

911737

In [None]:
# check
final_data[["loan_status", "id"]].groupby(["loan_status"]).count()

Unnamed: 0_level_0,id
loan_status,Unnamed: 1_level_1
Current,878313
In Grace Period,8420
Late (16-30 days),4266
Late (31-120 days),20738


In [None]:
# Define target variable based on "loan_status". We define everyone who is current or in grace priod as good (0), and others as bad (1).
final_data['30+ Delinquent'] = np.where((final_data.loan_status == "Current") | 
                                        (final_data.loan_status == "In Grace Period"),0, 1)

In [None]:
# check
final_data[["30+ Delinquent", "id"]].groupby(["30+ Delinquent"]).count()

Unnamed: 0_level_0,id
30+ Delinquent,Unnamed: 1_level_1
0,886733
1,25004


In [None]:
# remove attributes that are not neede anymore
final_data.drop(['loan_status', 'hardship_flag', 
                'pymnt_plan'], axis=1, inplace=True)

# Missing value imputation

Missing value imputation should be done after Normalization. We are going to replace missing values by 0 (This is an "ok" approach for Neural Networks). We will also need to Normalize the data (Normalization is a necessary step for Neural Networks). Replacing missings with 0, before normalization, affects the normalization process. We prefer missing value imputation impact "no other steps" as much as possible. So we will normalize the data, then impute missing values. Note that normalization process leave missing values unchanged.



# 4. One-Hot Encoding

In [None]:
# there is only one independent non-numerical variable we need to take care of: emp_length
final_data.dtypes

id                           object
emp_length                   object
dti                         float64
delinq_2yrs                 float64
fico_range_low              float64
fico_range_high             float64
inq_last_6mths              float64
mths_since_last_delinq      float64
revol_bal                   float64
revol_util                  float64
total_acc                   float64
avg_cur_bal                 float64
chargeoff_within_12_mths    float64
30+ Delinquent                int64
dtype: object

In [None]:
# check categories
final_data[["emp_length", "id"]].groupby(["emp_length"]).count()

Unnamed: 0_level_0,id
emp_length,Unnamed: 1_level_1
1 year,59515
10+ years,305044
2 years,81494
3 years,72808
4 years,55799
5 years,55320
6 years,39719
7 years,32943
8 years,31072
9 years,28349


In [None]:
# we can do one-hot encoding on "employment length", but it is an ordinal, not categorical, variable. So we can just convert it to ordinal numbers.
final_data['Employment_Length'] = np.nan
final_data['Employment_Length'] = np.where(final_data.emp_length == "< 1 year", 0, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "1 year", 1, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "2 years", 2, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "3 years", 3, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "4 years", 4, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "5 years", 5, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "6 years", 6, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "7 years", 7, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "8 years", 8, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "9 years", 9, final_data.Employment_Length)
final_data['Employment_Length'] = np.where(final_data.emp_length == "10+ years", 10, final_data.Employment_Length)

In [None]:
# check
final_data[["Employment_Length", "id"]].groupby(["Employment_Length"]).count()

Unnamed: 0_level_0,id
Employment_Length,Unnamed: 1_level_1
0.0,81368
1.0,59515
2.0,81494
3.0,72808
4.0,55799
5.0,55320
6.0,39719
7.0,32943
8.0,31072
9.0,28349


In [None]:
final_data.drop(['emp_length'], axis=1, inplace=True)

# Outlier treatment - should be done after test/train split, and should be done based on the train sample.




# 5. Feature Engineering - not needed fro this model




# 6. Test-Train split

In [None]:
# put 30% in test. This is a random split which is not ideal. Ideally we would like to split based on another variable, for example time. 
# Note that both test and train should be unbiased samples of the whole population.
from sklearn.model_selection import train_test_split
train, test = train_test_split(final_data, test_size=0.3)

In [None]:
# check
train.shape

(638215, 14)

In [None]:
# check - it is a good practice to compare test and train samples to make sure they are not fundamentally different. 
# If so, we will get high variance even with a non-overfitted model.
# Here we compare bad rate in both samples.
print (sum(train["30+ Delinquent"])/len(train["30+ Delinquent"]))
print (sum(test["30+ Delinquent"])/len(test["30+ Delinquent"]))

0.027390456194229217
0.027504186134936128


In [None]:
# Define X and Y varibales to build the ensemble model. 
X_train = train.drop(["id", '30+ Delinquent'], axis = 1)
Y_train = train['30+ Delinquent']

X_test = test.drop(["id", '30+ Delinquent'], axis = 1)
Y_test = test['30+ Delinquent']

# 7. Normalization

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)

StandardScaler()

In [None]:
X_train_normalized = sc.transform(X_train)
X_test_normalized = sc.transform(X_test)

In [None]:
# convert to Pandas DF
X_train_normalized = pd.DataFrame(X_train_normalized, columns=X_train.columns)
X_test_normalized = pd.DataFrame(X_test_normalized, columns=X_test.columns)

# 8. Outlier treatment.

In [None]:
# looking at the following table, seems like there are some outliers. One popular approach is to use 1 percentile as floor and 99 percentile as cap. 
# Howver it is not a written rule and depends on modeler's decision. Here we will cap "dti", "delinq_2yrs", "revol_bal", and "avg_cur_bal" at 99 percentile.
X_train_normalized.describe(percentiles=[0.01, 0.99]).transpose()

Unnamed: 0,count,mean,std,min,1%,50%,99%,max
dti,637308.0,4.95902e-16,1.000001,-1.115096,-1.021388,-0.081469,1.913051,55.620817
delinq_2yrs,638215.0,5.107393e-18,1.000001,-0.341512,-0.341512,-0.341512,4.348,67.656402
fico_range_low,638215.0,-8.587879e-16,1.000001,-1.227336,-1.227336,-0.208411,2.993924,4.15841
fico_range_high,638215.0,1.477019e-15,1.000001,-1.2273,-1.2273,-0.208412,2.993808,4.187363
inq_last_6mths,638215.0,-6.104935000000001e-17,1.000001,-0.604229,-0.604229,-0.604229,3.42001,7.444249
mths_since_last_delinq,303785.0,-7.257811000000001e-17,1.000002,-1.598176,-1.506624,-0.133337,2.109697,7.328185
revol_bal,638215.0,-6.590903000000001e-17,1.000001,-0.720532,-0.717143,-0.234234,3.519144,106.378455
revol_util,637567.0,2.2417360000000003e-17,1.000001,-1.940187,-1.916012,-0.03033,2.008463,5.755651
total_acc,638215.0,-8.634973000000001e-17,1.000001,-1.766196,-1.597637,-0.164886,3.037734,11.971358
avg_cur_bal,638180.0,3.38025e-17,1.000001,-0.813635,-0.790762,-0.381744,3.71485,41.610528


In [None]:
X_train_normalized['dti'] = np.where((X_train_normalized['dti'] > 2.294135), 2.294135, X_train_normalized['dti'])  
X_train_normalized['delinq_2yrs'] = np.where((X_train_normalized['delinq_2yrs'] > 4.390682), 4.390682, X_train_normalized['delinq_2yrs'])  
X_train_normalized['revol_bal'] = np.where((X_train_normalized['revol_bal'] > 3.583833), 3.583833	, X_train_normalized['revol_bal'])  
X_train_normalized['avg_cur_bal'] = np.where((X_train_normalized['avg_cur_bal'] > 3.710172), 3.710172, X_train_normalized['avg_cur_bal'])  


X_train_normalized.describe(percentiles=[0.01, 0.99]).transpose()

Unnamed: 0,count,mean,std,min,1%,50%,99%,max
dti,637271.0,-0.02642065,0.575787,-1.14072,-1.045423,-0.08316,1.965716,2.294135
delinq_2yrs,638215.0,-0.02183145,0.821573,-0.342521,-0.342521,-0.342521,4.370154,4.390682
fico_range_low,638215.0,-1.0732480000000001e-17,1.000001,-1.227461,-1.227461,-0.208764,2.992856,4.157081
fico_range_high,638215.0,-1.144947e-16,1.000001,-1.227424,-1.227424,-0.208765,2.992734,4.18602
inq_last_6mths,638215.0,2.8902e-17,1.000001,-0.604669,-0.604669,-0.604669,3.421447,7.447563
mths_since_last_delinq,303215.0,1.151177e-16,1.000002,-1.59725,-1.505738,-0.133061,2.108979,7.325152
revol_bal,638215.0,-0.03033107,0.722821,-0.74721,-0.743648,-0.241987,3.583833,3.583833
revol_util,637565.0,-3.599159e-16,1.000001,-1.941377,-1.917194,-0.030944,2.008463,5.466588
total_acc,638215.0,9.809534000000001e-17,1.000001,-1.767397,-1.598814,-0.165857,3.037223,11.972129
avg_cur_bal,638182.0,-0.01732229,0.887829,-0.815419,-0.792681,-0.381831,3.710172,3.710172


**Note: Any step you do during modeling process needs to be done on any future data to be passed to the model. This includes for example the above capping process. So for any future datasets, we will use the above tresholds to cap values.**

In [None]:
X_test_normalized['dti'] = np.where((X_test_normalized['dti'] > 2.294135), 2.294135, X_test_normalized['dti'])  
X_test_normalized['delinq_2yrs'] = np.where((X_test_normalized['delinq_2yrs'] > 4.390682), 4.390682, X_test_normalized['delinq_2yrs'])  
X_test_normalized['revol_bal'] = np.where((X_test_normalized['revol_bal'] > 3.583833), 3.583833	, X_test_normalized['revol_bal'])  
X_test_normalized['avg_cur_bal'] = np.where((X_test_normalized['avg_cur_bal'] > 3.710172), 3.710172, X_test_normalized['avg_cur_bal'])  


X_test_normalized.describe(percentiles=[0.01, 0.99]).transpose()

Unnamed: 0,count,mean,std,min,1%,50%,99%,max
dti,273130.0,-0.027918,0.575549,-1.14072,-1.044261,-0.084323,1.952595,2.294135
delinq_2yrs,273522.0,-0.021635,0.822142,-0.342521,-0.342521,-0.342521,4.370154,4.390682
fico_range_low,273522.0,0.000945,0.998643,-1.227461,-1.227461,-0.208764,2.992856,4.157081
fico_range_high,273522.0,0.000942,0.998631,-1.227424,-1.227424,-0.208765,2.992734,4.18602
inq_last_6mths,273522.0,9.8e-05,1.002052,-0.604669,-0.604669,-0.604669,3.421447,7.447563
mths_since_last_delinq,130324.0,0.002243,0.999599,-1.59725,-1.505738,-0.133061,2.108979,6.089742
revol_bal,273522.0,-0.030393,0.724496,-0.74721,-0.743735,-0.242247,3.583833,3.583833
revol_util,273268.0,-0.001703,1.001352,-1.941377,-1.917194,-0.034974,2.008463,5.756781
total_acc,273522.0,-0.002325,0.999303,-1.767397,-1.598814,-0.165857,3.037223,11.550671
avg_cur_bal,273507.0,-0.016832,0.889847,-0.815419,-0.792561,-0.383746,3.70949,3.710172


# 9. Missing Value Imputation

In [None]:
X_train_normalized.fillna(0,inplace=True)
X_test_normalized.fillna(0,inplace=True)

# 10. Feature Selection

Before grid search, we should choose only a sub-sample of features that have predictive power. This will significantly increase speed of grid search, while we don't lose a lot of information. 
An effect approach is to buid a simple Ensemble model, and choose only features with feature importance higher than say 1%. There is no written prescription here, and it is up to modeler to choose the treshold.

Note: Here we have few X variables. Feature selection is really not needed. It is done only for illustration.

Note: For linear models, there are automated feature selection techniques (forward, backward, and stepwise), But even for those, it is beneficial to remove non-important features first, using this approach.

Note: There are several techniques for feature selection (like for all other steps we discussed here). The discussion here is just an example.

In [None]:
import xgboost as xgb

In [None]:
# run XGBoost

xgb_instance = xgb.XGBClassifier(n_estimators=50) # nothing inside paranthesis, meaning we are using default parameters, with 100 trees.

model_for_feature_selection = xgb_instance.fit(X_train_normalized, Y_train)

In [None]:
# check the importances - you can also use SHAP values
feature_importance = {'Feature':X_train_normalized.columns,'Importance':model_for_feature_selection.feature_importances_}
feature_importance = pd.DataFrame(feature_importance)
feature_importance.sort_values("Importance", inplace=True,ascending=False)
feature_importance

Unnamed: 0,Feature,Importance
2,fico_range_low,0.344171
4,inq_last_6mths,0.175074
6,revol_bal,0.127624
9,avg_cur_bal,0.127321
0,dti,0.116045
8,total_acc,0.058663
7,revol_util,0.051102
1,delinq_2yrs,0.0
3,fico_range_high,0.0
5,mths_since_last_delinq,0.0


In [None]:
# choose featires with FI higher than 1%
final_features = feature_importance["Feature"][feature_importance.Importance > 0.01]

X_train_normalized = X_train_normalized[final_features]
X_test_normalized = X_test_normalized[final_features]

In [None]:
# check 
X_train_normalized.head(2)

Unnamed: 0,fico_range_low,inq_last_6mths,revol_bal,avg_cur_bal,dti,total_acc,revol_util
0,1.101635,-0.604229,0.126409,1.956926,-0.684608,0.003673,0.586143
1,-0.645093,-0.604229,-0.306204,-0.484163,0.041771,1.099306,-0.235821


In [None]:
# check
X_test_normalized.head(2)

Unnamed: 0,fico_range_low,inq_last_6mths,revol_bal,avg_cur_bal,dti,total_acc,revol_util
0,-0.06285,0.737184,0.007325,0.520974,0.753384,-0.080606,-0.44937
1,-0.645093,-0.604229,-0.480312,0.881559,-0.427337,-1.007681,0.574055


# Build the model

We wil build a sample NN, and will give a sample code for Grid Search. Figure out Grid Search and use it for your project.

In [None]:
pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install keras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow.keras as keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# We build a NN with two hidden layers, and 6 nodes in each hidden layer.

# first step: create a Sequential object, as a sequence of layers. B/C NN is a sequence of layers.
classifier = Sequential()

# add the first hidden layer
classifier.add(Dense(units=6,kernel_initializer='glorot_uniform',
                    activation = 'relu'))

# add the second hidden layer
classifier.add(Dense(units=6,kernel_initializer='glorot_uniform',
                activation = 'relu'))

# add the output layer
classifier.add(Dense(units=1,kernel_initializer='glorot_uniform',
                    activation = 'sigmoid'))

# add additional parameters
classifier.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy', 'FalseNegatives'])

# train the model
classifier.fit(X_train_normalized,Y_train,batch_size=1000,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f4ea3a31810>

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_test, classifier.predict(X_test_normalized))

0.6173390081151426

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(Y_train, classifier.predict(X_train_normalized))

0.6150265933492123

# Grid Search - Read this part, or look up grid search for NN on internet. Use it in your models.



In [None]:
# fine tuning with Grid Search
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def build_classifier(optimizer):
    # first step: create a Sequential object, as a sequence of layers. B/C NN is a sequence of layers.
    classifier = Sequential()
    # add the first hidden layer
    classifier.add(Dense(units=6,kernel_initializer='glorot_uniform',
                    activation = 'relu'))
    # add the second hidden layer
    classifier.add(Dense(units=6,kernel_initializer='glorot_uniform',
                    activation = 'relu'))
    # add the output layer
    classifier.add(Dense(units=1,kernel_initializer='glorot_uniform',
                    activation = 'sigmoid'))
    # compiling the NN
    classifier.compile(optimizer=optimizer,loss='binary_crossentropy',metrics=['accuracy'])
    return classifier

classifier = KerasClassifier(build_fn=build_classifier)

# create a dictionary of hyper-parameters to optimize
parameters = {'batch_size':[1000,2000], 'nb_epoch':[20,10],'optimizer':['adam']}
grid_search = GridSearchCV(estimator = classifier, param_grid = parameters, scoring = 'accuracy', cv=10)
grid_search = grid_search.fit(X_train_normalized,Y_train)

best_parameters = grid_search.best_params_ 
best_accuracy = grid_search.best_score_

means = grid_search.cv_results_['mean_test_score']
stds = grid_search.cv_results_['std_test_score']
params = grid_search.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))