In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
 
%matplotlib inline

In [78]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau

In [128]:
train_data = pd.read_csv("/content/sample_data/Hackathon/train_s3TEQDk.csv")
train_data.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0


In [129]:
test_data = pd.read_csv("/content/sample_data/Hackathon/test_mSzZ8RL.csv")
test_data.head()

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
0,VBENBARO,Male,29,RG254,Other,X1,25,Yes,742366,No
1,CCMEWNKY,Male,43,RG268,Other,X2,49,,925537,No
2,VK3KGA9M,Male,31,RG270,Salaried,X1,14,No,215949,No
3,TT8RPZVC,Male,29,RG272,Other,X1,33,No,868070,No
4,SHQZEYTZ,Female,29,RG270,Other,X1,19,No,657087,No


In [130]:
train_data.describe() 

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Is_Lead
count,245725.0,245725.0,245725.0,245725.0
mean,43.856307,46.959141,1128403.0,0.237208
std,14.828672,32.353136,852936.4,0.425372
min,23.0,7.0,20790.0,0.0
25%,30.0,20.0,604310.0,0.0
50%,43.0,32.0,894601.0,0.0
75%,54.0,73.0,1366666.0,0.0
max,85.0,135.0,10352010.0,1.0


In [131]:
train_data.Is_Lead.unique()

array([0, 1])

In [132]:
train_data.dtypes

ID                     object
Gender                 object
Age                     int64
Region_Code            object
Occupation             object
Channel_Code           object
Vintage                 int64
Credit_Product         object
Avg_Account_Balance     int64
Is_Active              object
Is_Lead                 int64
dtype: object

In [133]:
for col in ['Gender','Region_Code','Occupation','Channel_Code','Vintage','Credit_Product']:
  print(col, train_data[col].unique(),train_data[col].nunique())

Gender ['Female' 'Male'] 2
Region_Code ['RG268' 'RG277' 'RG270' 'RG282' 'RG261' 'RG265' 'RG283' 'RG254' 'RG269'
 'RG257' 'RG279' 'RG280' 'RG252' 'RG284' 'RG259' 'RG281' 'RG258' 'RG266'
 'RG260' 'RG274' 'RG256' 'RG275' 'RG273' 'RG267' 'RG272' 'RG251' 'RG262'
 'RG264' 'RG278' 'RG276' 'RG263' 'RG250' 'RG255' 'RG253' 'RG271'] 35
Occupation ['Other' 'Salaried' 'Self_Employed' 'Entrepreneur'] 4
Channel_Code ['X3' 'X1' 'X2' 'X4'] 4
Vintage [ 43  32  26  19  33  20  13  38  49 123  14  31  57  69  97  15  63  21
  99  56  87  62  55 103 104  93  61  91  27  39  50  45  25 117  80  81
  73  67 109  92 122  75  79 121 111 116  85 115  86  37  74 110  44   9
  68  51  98 105 127 129 133   8   7 128 135 134] 66
Credit_Product ['No' nan 'Yes'] 2


In [134]:
#df[df.isna().any(axis=1)].shape
train_data.columns[train_data.isnull().any()],test_data.columns[test_data.isnull().any()]

(Index(['Credit_Product'], dtype='object'),
 Index(['Credit_Product'], dtype='object'))

In [135]:
train_data['Credit_Product'].fillna(train_data['Credit_Product'].mode()[0], inplace =True)
test_data['Credit_Product'].fillna(test_data['Credit_Product'].mode()[0], inplace =True)
train_data.columns[train_data.isnull().any()],test_data.columns[test_data.isnull().any()]

(Index([], dtype='object'), Index([], dtype='object'))

In [136]:
categorical_columns = train_data.select_dtypes(exclude=["number"]).columns.drop('ID')
categorical_columns

Index(['Gender', 'Region_Code', 'Occupation', 'Channel_Code', 'Credit_Product',
       'Is_Active'],
      dtype='object')

In [137]:
train_data.set_index('ID',inplace=True)
test_data.set_index('ID',inplace=True)
#train_data.drop('ID',axis=1,inplace=True)
#test_data.drop('ID',axis=1,inplace=True)


In [138]:
Y_train = train_data[['Is_Lead']]
X_train = train_data.drop('Is_Lead', axis='columns')
X_test = test_data

X_train.head(),Y_train.head(), X_test.head()


(          Gender  Age Region_Code  ... Credit_Product Avg_Account_Balance  Is_Active
 ID                                 ...                                              
 NNVBBKZB  Female   73       RG268  ...             No             1045696         No
 IDD62UNG  Female   30       RG277  ...             No              581988         No
 HD3DSEMC  Female   56       RG268  ...             No             1484315        Yes
 BF3NC7KV    Male   34       RG270  ...             No              470454         No
 TEASRWXV  Female   30       RG282  ...             No              886787         No
 
 [5 rows x 9 columns],           Is_Lead
 ID               
 NNVBBKZB        0
 IDD62UNG        0
 HD3DSEMC        0
 BF3NC7KV        0
 TEASRWXV        0,           Gender  Age Region_Code  ... Credit_Product Avg_Account_Balance  Is_Active
 ID                                 ...                                              
 VBENBARO    Male   29       RG254  ...            Yes              7

In [139]:
X_train.shape,Y_train.shape, X_test.shape

((245725, 9), (245725, 1), (105312, 9))

In [140]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# prepare input data
def prepare_inputs(X_train, X_test):
	oe = OrdinalEncoder()
	oe.fit(X_train)
	X_train_enc = oe.transform(X_train)
	X_test_enc = oe.transform(X_test)
	return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	#y_test_enc = le.transform(y_test)
	return y_train_enc#, y_test_enc

In [141]:
X_train[categorical_columns], X_test[categorical_columns] = prepare_inputs(X_train[categorical_columns], X_test[categorical_columns])
X_train.head()

Unnamed: 0_level_0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NNVBBKZB,0.0,73,18.0,1.0,2.0,43,0.0,1045696,0.0
IDD62UNG,0.0,30,27.0,2.0,0.0,32,0.0,581988,0.0
HD3DSEMC,0.0,56,18.0,3.0,2.0,26,0.0,1484315,1.0
BF3NC7KV,1.0,34,20.0,2.0,0.0,19,0.0,470454,0.0
TEASRWXV,0.0,30,32.0,2.0,0.0,33,0.0,886787,0.0


In [142]:
X_test.head()


Unnamed: 0_level_0,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
VBENBARO,1.0,29,4.0,1.0,0.0,25,1.0,742366,0.0
CCMEWNKY,1.0,43,18.0,1.0,1.0,49,0.0,925537,0.0
VK3KGA9M,1.0,31,20.0,2.0,0.0,14,0.0,215949,0.0
TT8RPZVC,1.0,29,22.0,1.0,0.0,33,0.0,868070,0.0
SHQZEYTZ,0.0,29,20.0,1.0,0.0,19,0.0,657087,0.0


In [143]:
for col in ['Gender','Region_Code','Occupation','Channel_Code','Vintage','Credit_Product']:
  print(col, X_train[col].unique(),X_train[col].nunique())

Gender [0. 1.] 2
Region_Code [18. 27. 20. 32. 11. 15. 33.  4. 19.  7. 29. 30.  2. 34.  9. 31.  8. 16.
 10. 24.  6. 25. 23. 17. 22.  1. 12. 14. 28. 26. 13.  0.  5.  3. 21.] 35
Occupation [1. 2. 3. 0.] 4
Channel_Code [2. 0. 1. 3.] 4
Vintage [ 43  32  26  19  33  20  13  38  49 123  14  31  57  69  97  15  63  21
  99  56  87  62  55 103 104  93  61  91  27  39  50  45  25 117  80  81
  73  67 109  92 122  75  79 121 111 116  85 115  86  37  74 110  44   9
  68  51  98 105 127 129 133   8   7 128 135 134] 66
Credit_Product [0. 1.] 2


In [144]:
X_train_copy =  X_train.copy()
X_test_copy =  X_test.copy()

In [None]:
X_train =  X_train_copy.copy()
X_test =  X_test_copy.copy()

In [120]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_train

array([[0.        , 0.80645161, 0.52941176, ..., 0.        , 0.09920475,
        0.        ],
       [0.        , 0.11290323, 0.79411765, ..., 0.        , 0.0543206 ,
        0.        ],
       [0.        , 0.53225806, 0.52941176, ..., 0.        , 0.14166044,
        1.        ],
       ...,
       [0.        , 0.0483871 , 0.91176471, ..., 0.        , 0.06290342,
        0.        ],
       [0.        , 0.08064516, 0.67647059, ..., 0.        , 0.03743159,
        0.        ],
       [1.        , 0.09677419, 0.55882353, ..., 0.        , 0.10729479,
        0.        ]])

In [118]:
# define the model
model = Sequential()
model.add(Dense(10, input_dim=X_train.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train, Y_train, epochs=100, batch_size=16, verbose=2)
# evaluate the keras model
#_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

15358/15358 - 15s - loss: 2924.6692 - accuracy: 0.6271
Epoch 2/100
15358/15358 - 14s - loss: 230.8072 - accuracy: 0.6354
Epoch 3/100
15358/15358 - 14s - loss: 214.3715 - accuracy: 0.6385
Epoch 4/100
15358/15358 - 15s - loss: 218.9323 - accuracy: 0.6413
Epoch 5/100
15358/15358 - 14s - loss: 208.9730 - accuracy: 0.6440
Epoch 6/100


KeyboardInterrupt: ignored

In [117]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,stratify=Y,test_size=0.1,random_state=1)

NameError: ignored

In [None]:
X_train.shape

(221152, 10)

In [None]:
Y_test.shape

(24573,)

In [None]:
#X_train['Credit_Product'].fillna(X_train['Credit_Product'].mode()[0], inplace=True)
#X_train.head()

In [None]:
#X_train.Credit_Product.replace(('Yes','No'),(1,0),inplace=True)

In [None]:
def datapreprocess(X_dat):
  X_dat.drop('ID',axis='columns',inplace=True)
  X_dat.Credit_Product.replace(('Yes','No'),(1,0),inplace=True)
  X_dat.Is_Active.replace(('Yes','No'),(1,0),inplace=True)
  X_dat['Credit_Product'].fillna(X_dat['Credit_Product'].mode()[0], inplace=True)
  X_dat = pd.get_dummies(data=X_dat,columns=['Gender','Occupation','Channel_Code'])
  X_dat.drop('Region_Code',axis='columns',inplace=True)
  col_scale = ["Age","Vintage","Avg_Account_Balance"]
  scaler = MinMaxScaler()
  X_dat[col_scale] = scaler.fit_transform(X_dat[col_scale])
  return X_dat


In [None]:
X_train_prep = datapreprocess(X_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [None]:
X_train_prep.sample(10)

Unnamed: 0,Age,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Gender_Female,Gender_Male,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4
75337,0.080645,0.09375,1.0,0.032233,0,0,1,0,0,1,0,1,0,0,0
144756,0.048387,0.0625,0.0,0.072939,0,1,0,0,0,1,0,1,0,0,0
80801,0.177419,0.148438,0.0,0.096279,0,0,1,0,1,0,0,1,0,0,0
206307,0.064516,0.054688,1.0,0.146036,0,1,0,0,0,1,0,1,0,0,0
236067,0.354839,0.515625,1.0,0.070748,1,0,1,0,0,0,1,0,1,0,0
172865,0.548387,0.296875,1.0,0.084338,0,1,0,0,0,0,1,0,0,1,0
206076,0.048387,0.109375,0.0,0.126458,0,1,0,0,0,1,0,1,0,0,0
20981,0.5,0.328125,1.0,0.057272,0,0,1,0,0,0,1,0,1,0,0
102513,0.064516,0.046875,0.0,0.079761,0,0,1,0,0,1,0,1,0,0,0
220661,0.370968,0.523438,0.0,0.105511,0,0,1,0,0,0,1,0,1,0,0


In [None]:
X_train_prep.describe()

Unnamed: 0,Age,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Gender_Female,Gender_Male,Occupation_Entrepreneur,Occupation_Other,Occupation_Salaried,Occupation_Self_Employed,Channel_Code_X1,Channel_Code_X2,Channel_Code_X3,Channel_Code_X4
count,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0,221152.0
mean,0.336466,0.311832,0.29307,0.107161,0.38871,0.454163,0.545837,0.010902,0.285962,0.293074,0.410062,0.421592,0.27595,0.279663,0.022794
std,0.239287,0.252607,0.455171,0.082347,0.487458,0.497896,0.497896,0.103842,0.451872,0.455173,0.491846,0.493815,0.446993,0.448835,0.149247
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.112903,0.101562,0.0,0.0565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.322581,0.195312,0.0,0.084602,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.5,0.515625,1.0,0.130302,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
X_train_prep.shape

(221152, 15)

In [None]:
model1 = Sequential([
                          
                          Dense(16,input_shape=(15,),kernel_regularizer=regularizers.l2(0.001),activation='relu'),
                          Dropout(0.2),
                          Dense(8,kernel_regularizer=regularizers.l2(0.001),activation='relu'),
                          Dropout(0.2),
                          Dense(1,activation='sigmoid')
                          
])

In [None]:
early_stopping = EarlyStopping(monitor='val_prc',patience=30)
#learning_rate_reduction = ReduceLROnPlateau(factor=0.2,patience=15)

In [None]:
from tensorflow import keras

In [None]:
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]


In [None]:
model1.compile(optimizer='adam',
                  loss='binary_crossentropy',metrics=METRICS)

In [None]:
history = model1.fit(X_train_prep, Y_train, epochs=100, batch_size=256, validation_split=0.12,
                         callbacks=[early_stopping,tf.keras.callbacks.CSVLogger("results.csv")])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
