In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
import xgboost as xgb

from imblearn.over_sampling import SMOTE, RandomOverSampler

In [None]:
import cml.data_v1 as cmldata

# Sample in-code customization of spark configurations
#from pyspark import SparkContext
#SparkContext.setSystemProperty('spark.executor.cores', '1')
#SparkContext.setSystemProperty('spark.executor.memory', '2g')

CONNECTION_NAME = "go01-aw-dl"
conn = cmldata.get_connection(CONNECTION_NAME)
spark = conn.get_spark_session()

# Sample usage to run query through spark
EXAMPLE_SQL_QUERY = "show databases"
spark.sql(EXAMPLE_SQL_QUERY).show()


In [19]:
data = pd.read_csv('/home/cdsw/train.csv')

In [20]:
data

Unnamed: 0,ID,Gender,Age,Region_Code,Occupation,Channel_Code,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Is_Lead
0,NNVBBKZB,Female,73,RG268,Other,X3,43,No,1045696,No,0
1,IDD62UNG,Female,30,RG277,Salaried,X1,32,No,581988,No,0
2,HD3DSEMC,Female,56,RG268,Self_Employed,X3,26,No,1484315,Yes,0
3,BF3NC7KV,Male,34,RG270,Salaried,X1,19,No,470454,No,0
4,TEASRWXV,Female,30,RG282,Salaried,X1,33,No,886787,No,0
...,...,...,...,...,...,...,...,...,...,...,...
245720,BPAWWXZN,Male,51,RG284,Self_Employed,X3,109,,1925586,No,0
245721,HFNB7JY8,Male,27,RG268,Salaried,X1,15,No,862952,Yes,0
245722,GEHAUCWT,Female,26,RG281,Salaried,X1,13,No,670659,No,0
245723,GE7V8SAH,Female,28,RG273,Salaried,X1,31,No,407504,No,0


In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245725 entries, 0 to 245724
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   245725 non-null  object
 1   Gender               245725 non-null  object
 2   Age                  245725 non-null  int64 
 3   Region_Code          245725 non-null  object
 4   Occupation           245725 non-null  object
 5   Channel_Code         245725 non-null  object
 6   Vintage              245725 non-null  int64 
 7   Credit_Product       216400 non-null  object
 8   Avg_Account_Balance  245725 non-null  int64 
 9   Is_Active            245725 non-null  object
 10  Is_Lead              245725 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 20.6+ MB


In [22]:
data.describe()

Unnamed: 0,Age,Vintage,Avg_Account_Balance,Is_Lead
count,245725.0,245725.0,245725.0,245725.0
mean,43.856307,46.959141,1128403.0,0.237208
std,14.828672,32.353136,852936.4,0.425372
min,23.0,7.0,20790.0,0.0
25%,30.0,20.0,604310.0,0.0
50%,43.0,32.0,894601.0,0.0
75%,54.0,73.0,1366666.0,0.0
max,85.0,135.0,10352010.0,1.0


In [23]:
data.isna().sum()

ID                         0
Gender                     0
Age                        0
Region_Code                0
Occupation                 0
Channel_Code               0
Vintage                    0
Credit_Product         29325
Avg_Account_Balance        0
Is_Active                  0
Is_Lead                    0
dtype: int64

In [24]:
data['Credit_Product'].value_counts()

Credit_Product
No     144357
Yes     72043
Name: count, dtype: int64

In [25]:
data.dtypes

ID                     object
Gender                 object
Age                     int64
Region_Code            object
Occupation             object
Channel_Code           object
Vintage                 int64
Credit_Product         object
Avg_Account_Balance     int64
Is_Active              object
Is_Lead                 int64
dtype: object

In [26]:
for col in data.columns:
    if data[col].dtypes == 'object':
        print(col, data[col].unique())

ID ['NNVBBKZB' 'IDD62UNG' 'HD3DSEMC' ... 'GEHAUCWT' 'GE7V8SAH' 'BOCZSWLJ']
Gender ['Female' 'Male']
Region_Code ['RG268' 'RG277' 'RG270' 'RG282' 'RG261' 'RG265' 'RG283' 'RG254' 'RG269'
 'RG257' 'RG279' 'RG280' 'RG252' 'RG284' 'RG259' 'RG281' 'RG258' 'RG266'
 'RG260' 'RG274' 'RG256' 'RG275' 'RG273' 'RG267' 'RG272' 'RG251' 'RG262'
 'RG264' 'RG278' 'RG276' 'RG263' 'RG250' 'RG255' 'RG253' 'RG271']
Occupation ['Other' 'Salaried' 'Self_Employed' 'Entrepreneur']
Channel_Code ['X3' 'X1' 'X2' 'X4']
Credit_Product ['No' nan 'Yes']
Is_Active ['No' 'Yes']


In [27]:
import seaborn as sns
sns.heatmap(data.corr(),annot=True)

ValueError: could not convert string to float: 'NNVBBKZB'

In [34]:
data = data.drop(['ID','Region_Code'],axis=1)

KeyError: "['ID', 'Region_Code'] not found in axis"

In [35]:
def preprocess_inputs(df):
    df = df.copy()
    
    df['Gender'] = df['Gender'].replace({'Female' : 1,
                                         'Male'   : 0})
    df['Credit_Product'] = df['Credit_Product'].replace({ np.nan : 'Not Sure'})
    df['Credit_Product'] = df['Credit_Product'].replace({ 'No' : 0,
                                                         'Not Sure':1,
                                                         'Yes' : 2})
    df['Is_Active'] = df['Is_Active'].replace({ 'No'  : 0,
                                                'Yes' : 1})
    columns = ['Occupation','Channel_Code']
    for column in columns:
        dummies = pd.get_dummies(df[column])
        df = pd.concat([df, dummies],axis=1)
        df = df.drop(column, axis=1)
        
    y = df['Is_Lead']  
    X = df.drop('Is_Lead',axis=1)    
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=42)
    
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns, index = X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)
    
    return X_train, X_test, y_train, y_test

In [36]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)


X_train

Unnamed: 0,Gender,Age,Vintage,Credit_Product,Avg_Account_Balance,Is_Active,Entrepreneur,Other,Salaried,Self_Employed,X1,X2,X3,X4
138403,1.094636,-0.933883,-0.832754,-0.793351,0.697878,-0.796206,-0.105404,-0.631862,1.552111,-0.834208,-0.854686,-0.616215,1.603567,-0.152254
117015,1.094636,0.749235,-0.307410,-0.793351,-0.963535,-0.796206,-0.105404,-0.631862,-0.644284,1.198741,1.170021,-0.616215,-0.623610,-0.152254
322,1.094636,-0.731909,-1.049072,-0.793351,-0.282440,-0.796206,-0.105404,-0.631862,1.552111,-0.834208,1.170021,-0.616215,-0.623610,-0.152254
64910,1.094636,0.277962,1.392232,1.449411,0.115046,1.255956,-0.105404,-0.631862,-0.644284,1.198741,-0.854686,-0.616215,1.603567,-0.152254
39919,1.094636,-0.866558,-0.863657,-0.793351,-0.479967,-0.796206,-0.105404,1.582623,-0.644284,-0.834208,1.170021,-0.616215,-0.623610,-0.152254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,-0.913545,-0.058661,-0.863657,1.449411,1.389735,-0.796206,-0.105404,-0.631862,-0.644284,1.198741,-0.854686,1.622811,-0.623610,-0.152254
103694,-0.913545,-0.327960,-1.049072,-0.793351,-0.319747,-0.796206,-0.105404,-0.631862,-0.644284,1.198741,-0.854686,-0.616215,-0.623610,6.567976
131932,-0.913545,0.951210,0.496057,1.449411,-0.070329,1.255956,-0.105404,-0.631862,-0.644284,1.198741,-0.854686,1.622811,-0.623610,-0.152254
146867,-0.913545,-1.001208,-0.461923,-0.793351,0.649166,-0.796206,-0.105404,1.582623,-0.644284,-0.834208,1.170021,-0.616215,-0.623610,-0.152254


In [37]:
data['Is_Lead'].value_counts()

Is_Lead
0    187437
1     58288
Name: count, dtype: int64

In [38]:
smote = SMOTE(random_state=23)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [39]:
models = {
    '  Logistic Regression': LogisticRegression(),
    '           Linear SVM': LinearSVC(),
    '        XGBClassifier': xgb.XGBClassifier(),
    '    Gradient Boosting': GradientBoostingClassifier(),
    '        Decision Tree': DecisionTreeClassifier(),
    '        Random Forest': RandomForestClassifier(),
    ' KNeighborsClassifier': KNeighborsClassifier(),
    '   Bagging Classifier': BaggingClassifier()
}

for name, model in models.items():
    model = model.fit(X_train_smote, y_train_smote)
    print(name + " trained")

  Logistic Regression trained
           Linear SVM trained
        XGBClassifier trained
    Gradient Boosting trained
        Decision Tree trained
        Random Forest trained
 KNeighborsClassifier trained
   Bagging Classifier trained


In [40]:
for name, model in models.items():
    print(name + ": {:.2f}%".format(model.score(X_test, y_test) * 100))

  Logistic Regression: 71.04%
           Linear SVM: 70.85%
        XGBClassifier: 84.71%
    Gradient Boosting: 82.71%
        Decision Tree: 78.30%
        Random Forest: 82.49%
 KNeighborsClassifier: 77.66%
   Bagging Classifier: 83.12%


In [41]:
import tensorflow as tf

In [42]:
inputs = tf.keras.Input(shape=(X_train_smote.shape[1],))

x = tf.keras.layers.Dense(128, activation = 'relu')(inputs)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dense(64, activation = 'relu')(x)
x = tf.keras.layers.Dropout(0.6)(x)
x = tf.keras.layers.Dense(32, activation = 'relu')(x)
x = tf.keras.layers.Dropout(0.2)(x)

outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)


model = tf.keras.Model(inputs = inputs, outputs = outputs)

model.compile(optimizer="adam",
             loss="binary_crossentropy",
             metrics=["binary_accuracy"])
model.summary()

In [43]:
epochs=100
history = model.fit(
  X_train_smote,
  y_train_smote,
  validation_split=0.2,
  epochs=epochs,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=3,
            restore_best_weights=True
            )
    ]
)

Epoch 1/100
[1m6559/6559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - binary_accuracy: 0.7648 - loss: 0.5044 - val_binary_accuracy: 0.6423 - val_loss: 0.5867
Epoch 2/100
[1m6559/6559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - binary_accuracy: 0.8114 - loss: 0.4360 - val_binary_accuracy: 0.6799 - val_loss: 0.6487
Epoch 3/100
[1m6559/6559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - binary_accuracy: 0.8130 - loss: 0.4340 - val_binary_accuracy: 0.7458 - val_loss: 0.5447
Epoch 4/100
[1m6559/6559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - binary_accuracy: 0.8143 - loss: 0.4311 - val_binary_accuracy: 0.7000 - val_loss: 0.6785
Epoch 5/100
[1m6559/6559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - binary_accuracy: 0.8159 - loss: 0.4303 - val_binary_accuracy: 0.7114 - val_loss: 0.5867
Epoch 6/100
[1m6559/6559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - binary_a

In [44]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'My test loss is {loss*100:.2f}% and test accuracy is {accuracy*100:.2f}%')

[1m2304/2304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 735us/step - binary_accuracy: 0.8306 - loss: 0.4141
My test loss is 41.45% and test accuracy is 83.09%
