In [None]:
#AUC Score Optimization: Ensemble and Regularization
#Author: Sudesh V Khillare

In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [2]:
data=pd.read_csv('train.csv')
data.head()

Unnamed: 0,age,cost_of_ad,device_type,gender,in_initial_launch_location,income,n_drivers,n_vehicles,prior_ins_tenure,outcome
0,56,0.005737,iPhone,M,0,62717,2,1,4,0
1,50,0.004733,desktop,F,0,64328,2,3,2,0
2,54,0.004129,laptop,M,0,83439,1,3,7,0
3,16,0.005117,Android,F,0,30110,2,3,0,0
4,37,0.003635,desktop,M,0,76565,2,1,5,0


In [3]:
#Checking for missing entries: Some of values in Gender are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
age                           10000 non-null int64
cost_of_ad                    10000 non-null float64
device_type                   10000 non-null object
gender                        9731 non-null object
in_initial_launch_location    10000 non-null int64
income                        10000 non-null int64
n_drivers                     10000 non-null int64
n_vehicles                    10000 non-null int64
prior_ins_tenure              10000 non-null int64
outcome                       10000 non-null int64
dtypes: float64(1), int64(7), object(2)
memory usage: 781.3+ KB


In [4]:
def Data_Preprocessing(data):
    
    #Using simple backfill and forward technique to fill missing values.
    data.fillna(method='bfill',inplace=True)
    data.fillna(method='ffill',inplace=True)
    
    #Feature Engineering
    data['Multiple_Vehicle']=data.apply(lambda x: 1 if x.n_vehicles>1 else 0, axis=1)
    
    def Age_Bin(Age):
        Val=-1
        if Age <=20:
            Val=1
        if Age >20 and Age<=40:
            Val=2
        else:
            Val=3
        return Val
    data['Age_Bin']=data.apply(lambda x: Age_Bin(x.age), axis=1)
    
    
    def Ads_Bin(Cost):
        if Cost <0.004:
            Val=1
        if Cost >0.004 and Cost <0.005:
            Val=2
        if Cost >0.005 and Cost <0.006:
            Val=3
        if Cost >0.006:
            Val=4
        return Val
    data['Cheap_Ads']=data.apply(lambda x: Ads_Bin(x.cost_of_ad), axis=1)
    
    
    def Income_Bin(Income):
        Val=0
        if Income<=25000:
            Val=1
        if Income>25000 and Income<=50000:
            Val=2
        if Income>50000 and Income<=75000:
            Val=3
        if Income>75000:
            Val=4
        return Val
    data['Income_Bin']=data.apply(lambda x: Income_Bin(x.income), axis=1)
    
    
    
    def Tenure_Bin(Income):
        if Income<6:
            Val=0
        else:
            Val=1
        return Val
    data['Tenure_Bin']=data.apply(lambda x: Tenure_Bin(x.prior_ins_tenure), axis=1)
    
    
    def Mobile_Operations(Device):
        if Device=='iPhone' or Device=='Android':
            Val=1
        else:
            Val=0
        return Val
    data['Mobile_Operations']=data.apply(lambda x: Mobile_Operations(x.device_type), axis=1)
    
    
    def Tenure_And_Income(income,tenure):
        Val=tenure/income
        return Val
    data['Tenure_And_Income']=data.apply(lambda x: Tenure_And_Income(x.income,x.prior_ins_tenure), axis=1)
    
    
    def Number_Of_Vehicle_Per_Driver(Driver,Vehicle):
        Val=Vehicle/Driver
        return Val
    data['Number_Of_Vehicle_Per_Driver']=data.apply(lambda x: Number_Of_Vehicle_Per_Driver(x.n_drivers,x.n_vehicles), axis=1)
    
    
    #Label Encoding: Handling Categorical Variable
    from sklearn.preprocessing  import LabelEncoder
    def MultilabelEncoder(ColumnList, DataFrame):
        for i in ColumnList:
            labelencoder_x=LabelEncoder()
            DataFrame[i]=labelencoder_x.fit_transform(DataFrame[i].astype(str))

    ColumnList=['device_type','gender']
    MultilabelEncoder(ColumnList,data)
    
    return data

In [5]:
data=Data_Preprocessing(data)
data.head()

Unnamed: 0,age,cost_of_ad,device_type,gender,in_initial_launch_location,income,n_drivers,n_vehicles,prior_ins_tenure,outcome,Multiple_Vehicle,Age_Bin,Cheap_Ads,Income_Bin,Tenure_Bin,Mobile_Operations,Tenure_And_Income,Number_Of_Vehicle_Per_Driver
0,56,0.005737,2,1,0,62717,2,1,4,0,0,3,3,3,0,1,6.4e-05,0.5
1,50,0.004733,1,0,0,64328,2,3,2,0,1,3,2,3,0,0,3.1e-05,1.5
2,54,0.004129,3,1,0,83439,1,3,7,0,1,3,2,4,1,0,8.4e-05,3.0
3,16,0.005117,0,0,0,30110,2,3,0,0,1,3,3,2,0,1,0.0,1.5
4,37,0.003635,1,1,0,76565,2,1,5,0,0,2,1,4,0,0,6.5e-05,0.5


In [6]:
def Data_Scaling(data):
    #Storing all column names in variable
    Columns=data.columns
    #Scaling Data 
    scaler = StandardScaler()
    #print(scaler.fit(data))
    scaled_data = scaler.fit_transform(data)
    data_Scaled = pd.DataFrame(scaled_data)
    data_Scaled.columns =Columns
    return data_Scaled

In [7]:
Correlation=data.corr()
Correlation["outcome"].sort_values()

Multiple_Vehicle               -0.200100
n_vehicles                     -0.193192
Number_Of_Vehicle_Per_Driver   -0.191886
device_type                    -0.166496
cost_of_ad                     -0.077210
Cheap_Ads                      -0.069282
Tenure_And_Income              -0.038633
Tenure_Bin                     -0.038285
prior_ins_tenure               -0.029934
age                            -0.011907
income                          0.006375
Income_Bin                      0.017566
Age_Bin                         0.019438
Mobile_Operations               0.052894
n_drivers                       0.100105
in_initial_launch_location      0.116577
gender                          0.137204
outcome                         1.000000
Name: outcome, dtype: float64

In [8]:
X=data[['Multiple_Vehicle','Mobile_Operations','gender','device_type','Cheap_Ads','Tenure_Bin','age','Income_Bin','n_drivers','in_initial_launch_location']]
Y=data['outcome']

Predictor_Columns=X.columns
#Scaling the data
X_Scaled=Data_Scaling(X)

#Dataset split is 80% Training and 20% Test
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X_Scaled, Y, test_size=0.2, random_state=42)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [9]:
#Handling Data Imbalance:Synthetic Minority Oversampling Technique
sm=SMOTE()
X_Resampled, Y_Resampled=sm.fit_sample(X_Train,Y_Train)
unique, counts=np.unique(Y_Resampled,return_counts= True)
print(unique, counts)

[0 1] [7197 7197]


# Model Analysis

In [80]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
Random_Forest=RandomForestClassifier(max_depth=12, random_state=42,n_estimators=100)
model1=Random_Forest.fit(X_Resampled,Y_Resampled)
RF_Pred=Random_Forest.predict_proba(X_Test)[:,1]

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,RF_Pred)
print('AUC Score for Random Forest Classifier: %.2f' % auc)

AUC Score for Random Forest Classifier: 0.81


In [81]:
#Extra Tree Classfier
from sklearn.ensemble import ExtraTreesClassifier
Extra_Tree=ExtraTreesClassifier(max_depth=8, random_state=42)
model2=Extra_Tree.fit(X_Resampled, Y_Resampled)
Y_Pred_ET=Extra_Tree.predict_proba(X_Test)[:,1]

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,Y_Pred_ET)
print('AUC Score for Extra Tree Classifier: %.2f' % auc)

AUC Score for Extra Tree Classifier: 0.83




In [12]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [50,100,125,150,175,200,225,250,275,300]
# Number of features to consider at every split
max_features = [2,4,6,8,10]
# Maximum number of levels in tree
max_depth = [2,4,8,12,16,20,32,40,64,128]
# Minimum number of samples required to split a node
min_samples_split = [2, 5,8,10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [50, 100, 125, 150, 175, 200, 225, 250, 275, 300], 'max_features': [2, 4, 6, 8, 10], 'max_depth': [2, 4, 8, 12, 16, 20, 32, 40, 64, 128], 'min_samples_split': [2, 5, 8, 10], 'min_samples_leaf': [1, 2, 4, 6, 8], 'bootstrap': [True, False]}


In [13]:
ET = ExtraTreesClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
ET_random = RandomizedSearchCV(estimator = ET, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
ET_random.fit(X_Resampled,Y_Resampled)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [50, 100, 125, 150, 175, 200, 225, 250, 275, 300], 'max_features': [2, 4, 6, 8, 10], 'max_depth': [2, 4, 8, 12, 16, 20, 32, 40, 64, 128], 'min_samples_split': [2, 5, 8, 10], 'min_samples_leaf': [1, 2, 4, 6, 8], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [14]:
#Getting the best parameters
ET_random.best_params_

{'n_estimators': 150,
 'min_samples_split': 8,
 'min_samples_leaf': 1,
 'max_features': 2,
 'max_depth': 32,
 'bootstrap': False}

In [82]:
#Extra Tree Classfier: Hyper Parameter Tuned
from sklearn.ensemble import ExtraTreesClassifier
Extra_Tree_Tuned=ExtraTreesClassifier(max_depth=12, random_state=42, n_estimators=150,min_samples_split=2,
                               min_samples_leaf=4,max_features=2,bootstrap=False)
model2=Extra_Tree_Tuned.fit(X_Resampled, Y_Resampled)
Y_Pred_ET_Tuned=Extra_Tree_Tuned.predict_proba(X_Test)[:,1]

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,Y_Pred_ET_Tuned)
print('AUC Score for Hyper Parameter Tuned Extra Tree Classifier is: %.2f' % auc)

AUC Score for Hyper Parameter Tuned Extra Tree Classifier is: 0.83


In [83]:
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
Grad_Boost=GradientBoostingClassifier(max_depth=3,random_state=42)
model3=Grad_Boost.fit(X_Resampled, Y_Resampled)
Y_Pred_Grad=Grad_Boost.predict_proba(X_Test)[:,1]

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,Y_Pred_Grad)
print('AUC Score for Gradient Boosting Classifier is: %.2f' % auc)

AUC Score for Gradient Boosting Classifier is: 0.83


In [84]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
Logistic_Regression=LogisticRegression(random_state=42,solver='lbfgs')
model4=Logistic_Regression.fit(X_Resampled,Y_Resampled)
Y_Pred_Log=Logistic_Regression.predict_proba(X_Test)[:,1]

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,Y_Pred_Log)
print('AUC Score for Logistic Regression Classifier is: %.2f' % auc)

AUC Score for Logistic Regression Classifier is: 0.80


In [86]:
#Voting Classifier
from sklearn.ensemble import VotingClassifier
Voting_Classifier=VotingClassifier(estimators=[('Extra_Tree',Extra_Tree),('Grad_Boost',Grad_Boost),('Logistic_Regression',Logistic_Regression)],
                                  voting='soft')

for clf in (Extra_Tree,Grad_Boost,Logistic_Regression,Voting_Classifier):
    clf.fit(X_Resampled,Y_Resampled)
    Voting_Prediction=clf.predict_proba(X_Test)[:,1]

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,Voting_Prediction)
print('AUC Score for Voting Classifier Classifier: %.2f' % auc)

AUC Score for Voting Classifier Classifier: 0.83


In [87]:
#Support Vector Classifier
from sklearn.svm import SVC
SVM_clf=SVC(kernel='poly', probability=True)
SVM_clf.fit(X_Resampled,Y_Resampled)
Y_Pred_SVM=SVM_clf.predict_proba(X_Test)[:,1]

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,Y_Pred_SVM)
print('AUC Score for Support Vector Classifier: %.2f' % auc)



AUC Score for Support Vector Classifier: 0.80


In [88]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
Decision_Tree = DecisionTreeClassifier(random_state = 42,max_depth=4,
                                    criterion = 'gini',  splitter='best', min_samples_leaf=2, min_samples_split=4)
Decision_Tree.fit(X_Resampled,Y_Resampled)
Y_Pred_DT=Decision_Tree.predict_proba(X_Test)[:,1]

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,Y_Pred_DT)
print('AUC Score for Random Forest Classifier: %.2f' % auc)

AUC Score for Random Forest Classifier: 0.78


In [72]:
from sklearn.ensemble import AdaBoostClassifier
bdt = AdaBoostClassifier(ExtraTreesClassifier(max_depth=8, random_state=42, n_estimators=150,min_samples_split=2,
                               min_samples_leaf=4,max_features=2,bootstrap=False),
                         algorithm="SAMME",
                         n_estimators=100)
bdt.fit(X_Resampled,Y_Resampled)

#Model Evaluation: AUC SCORE
Pred=bdt.predict_proba(X_Test)[:,1]
auc = roc_auc_score(Y_Test,Pred)
print('AUC Score for Boosted Extra Tree Classifier: %.2f' % auc)

AUC Score for Boosted Extra Tree Classifier: 0.79


In [89]:
#Neural Network
import tensorflow as tf
from keras import backend as K

#Creating own evaluation parameter: AUC
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

# Set callback functions to early stop training and save the best model so far
from keras.callbacks import EarlyStopping, ModelCheckpoint
callback = [EarlyStopping(monitor='auc', patience=50, mode=max),
             ModelCheckpoint(filepath='best_model.h5', monitor='auc', save_best_only=True)]

# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Activation, Dense


# Initialising the ANN
NN_classifier = Sequential()
# Adding the input layer and the first hidden layer
NN_classifier.add(Dense(units =16 , kernel_initializer = 'uniform', activation = 'relu', input_dim = 10))
# Adding the second hidden layer
NN_classifier.add(Dense(units = 8, kernel_initializer = 'uniform', activation = 'relu'))
NN_classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
# Compiling the ANN
NN_classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics =[auc])
# Fitting the ANN to the Training set
NN_classifier.fit(X_Resampled,Y_Resampled, batch_size = 100, epochs = 150, callbacks=callback)



Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150


<keras.callbacks.History at 0x1af2ea03518>

In [90]:
ANN_predictions = NN_classifier.predict(X_Test)
ANN_predictions=np.ravel(ANN_predictions)
ANN_predictions

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,ANN_predictions)
print('AUC Score for Random Forest Classifier: %.2f' % auc)

AUC Score for Random Forest Classifier: 0.81


In [91]:
#Ensemble of all Models: Combining probabilities and Normalizing it
Total=max(Y_Pred_ET_Tuned)+max(Y_Pred_Grad)+max(Y_Pred_Log)+max(ANN_predictions+max(Y_Pred_DT))
Final_Prob=(Y_Pred_ET_Tuned+Y_Pred_Grad+Y_Pred_Log+ANN_predictions+Y_Pred_DT)/Total

#Model Evaluation: AUC SCORE
auc = roc_auc_score(Y_Test,Final_Prob)
print('AUC Score for Combined Ensemble Classifier: %f' % auc)

AUC Score for Combined Ensemble Classifier: 0.826875


# Test Data Analysis

In [92]:
#Creating complete funciton for data preprocessing, feature engineering and data scaling
#Predicting the individual probabilities, creating ensemble and giving final output by attaching
#Additional Calumn.

def Final_Output(data):
    Output=data.copy()
    data=Data_Preprocessing(data)
    data=Data_Scaling(data)
    #data=data[['Multiple_Vehicle','Mobile_Operations','gender','device_type','Cheap_Ads','Tenure_Bin','age','Income_Bin','n_drivers','in_initial_launch_location']]
    data=data[Predictor_Columns]
    Y_Pred_ET_Tuned=Extra_Tree_Tuned.predict_proba(data)[:,1]
    Y_Pred_Grad=Grad_Boost.predict_proba(data)[:,1]
    Y_Pred_Log=Logistic_Regression.predict_proba(data)[:,1]
    Y_Pred_DT=Decision_Tree.predict_proba(data)[:,1]
    ANN_predictions = NN_classifier.predict(data)
    ANN_predictions=np.ravel(ANN_predictions)
    
    Total=max(Y_Pred_ET_Tuned)+max(Y_Pred_Grad)+max(Y_Pred_Log)+max(ANN_predictions+max(Y_Pred_DT))
    Final_Prob=(Y_Pred_ET_Tuned+Y_Pred_Grad+Y_Pred_Log+ANN_predictions+Y_Pred_DT)/Total
    
    Output['Outcome']=Final_Prob
    return Output

In [93]:
#Loading the dataset and passing it to above function to generate final output.
test=pd.read_csv('test.csv')
result=Final_Output(test)
result.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,age,cost_of_ad,device_type,gender,in_initial_launch_location,income,n_drivers,n_vehicles,prior_ins_tenure,Outcome
0,34,0.005134,Android,F,1,40376,1,3,7,0.391433
1,53,0.005223,desktop,F,1,84511,1,1,11,0.716239
2,46,0.004939,laptop,F,0,79322,1,1,4,0.450985
3,36,0.004924,Android,F,0,63295,1,2,0,0.276288
4,28,0.005146,other,F,1,36170,1,3,3,0.270376


In [95]:
#Converting notebook to HTML
!jupyter nbconvert --to html Blue_Owl_Machine_Learning_Assignment.ipynb

[NbConvertApp] Converting notebook Blue_Owl_Machine_Learning_Assignment.ipynb to html
[NbConvertApp] Writing 362129 bytes to Blue_Owl_Machine_Learning_Assignment.html


Code By Sudesh V Khillare