In [1]:
"""
Created on Wed Jan 23 11:48:32 2019

@author: Andrew Welsh

This notebook is for doing model parameter tuning grid searches and feature selection using historical data, 
with intention of finding the optimal parameters to use for the live model fit. 

The data must be extracted via SQL prior to inclusion in this notebook (until a data connection directly to 
the database from Seth's machines is configured), then saved via CSV, which is read here. 

Intended use is for initial model development, as well as when new features are added.

Output: knowledge of optimal model parameters to attain best fit
"""

#import pypyodbc 
import pandas as pd
import churn_common as p #imports F2 score function, model_fit function

# imported data already has nulls cleaned and unused columns dropped
# churn_cloud_model_training_fit.csv is output from 'churn_cloud_model_training_fit.py' python script
cloud = pd.read_csv('churn_cloud_model_training_fit.csv', encoding='ISO-8859-1')

print('Query complete')
pd.options.display.max_columns = None
#cloud = cloud.iloc[1:5000, :].copy() #select first 5k rows for fast testing
cloud = cloud.loc[cloud['avg per line item invoiced in last 6 months'].notnull()] 


Query complete


In [2]:
#index column from CSV is imported as a separate column, labeled 'Unnamed: 0', therefore must be dropped before model fit
cloud.drop(['Unnamed: 0'], axis=1, inplace=True)

#generate TMK date column
cloud['time_month_key_dt']=pd.to_datetime(
        cloud['time_month_key'].apply(str).str.slice(stop=4)+"-"+
        cloud['time_month_key'].apply(str).str.slice(start=4)+"-01")

#hold out evaluation dataset of last 3 months
#Months are hard-coded here, use df['time_month_key'].max() to find max TMK, configure eval set to be 3 most recent months
#Example: max TMK 201812; eval data is 201810 to 201812 inclusive, cloud_num is 201809 and earlier
#min TMK is 201507
#max TMK is 201809

#define model training dataset, set to TMK < max(TMK)-3
cloud_train = cloud.loc[cloud['time_month_key']<201806]
cloud_train.set_index(['account_number','time_month_key'])

#hold out evaluation dataset of last 3 months
cloud_val = cloud.loc[(cloud['time_month_key']>=201806) & (cloud['time_month_key']<201809)]
cloud_val.set_index(['account_number','time_month_key'])

#the table head for the eval data should output below. Most accounts should have 3 rows each (3 latest TMKs).

Unnamed: 0_level_0,Unnamed: 1_level_0,account type - cloud uk,mi_mo - core,mi_mo - managed,mi_mo - mi,mi_mo - mo,mi_mo - unassigned,tam_changed,tam_ratio_changed,tam_acct_ratio,is_cloud_rackconnect_linked,target,average_invoiced_last_12_months,acc_tenure_in_months,seasonal_control,total_invoiced_in_month,baseline_period_start,total number of tickets,avg days to close ticket,avg minutes to first customer comment,avg minutes to first racker comment,avg minutes to started in progress,avg minutes to started in progress - severity urgent,num tickets severity - emergency,number of tickets feedback received,number of tickets solved,rate of tickets feedback received,rate of tickets solved,number of commie tickets,total invoiced in last 6 months,avg per line item invoiced in last 6 months,avg mthly num of invoiced items in last 6 months,total invoiced in last 6 months vs prior 6 months,avg invoiced in last 6 months vs prior 6 months,avg mthly num of invoiced items in last 6 months vs prior 6 months,total invoiced in last 3 months vs prior 3 months,avg invoiced in last 3 months vs prior 3 months,avg mthly num of invoiced items in last 3 months vs prior 3 months,per unit price - next gen servers,per unit price - cloud block storage,per unit price - cloud files,per unit price - first gen servers,per unit price - cloud load balancer,per unit price - cloud sites,per unit price - cloud backup,per unit price - total outgoing bw,per unit price - cloud monitoring,per unit price - cloud databases,per unit price - cloud queues,how many units - next gen servers,how many units - cloud block storage,how many units - cloud files,how many units - first gen servers,how many units - cloud load balancer,how many units - cloud sites,how many units - cloud backup,how many units - total outgoing bw,how many units - cloud monitoring,how many units - cloud databases,how many units - cloud queues,pct of invoice - next gen servers,pct of invoice - cloud block storage,pct of invoice - cloud files,pct of invoice - first gen servers,pct of invoice - cloud load balancer,pct of invoice - cloud sites,pct of invoice - cloud backup,pct of invoice - total outgoing bw,pct of invoice - cloud monitoring,pct of invoice - cloud databases,pct of invoice - cloud queues,num opportunities last 6 months,num opportunities won last 6 months,num opportunities lost last 6 months,pct opportunities won last 6 months,pct opportunities lost last 6 months,num opportunities last 3 months,num opportunities won last 3 months,num opportunities lost last 3 months,pct opportunities won last 3 months,pct opportunities lost last 3 months,total value of opportunities last 6 months,total value of opportunities last 3 months,average value of opportunities last 6 months,average value of opportunities last 3 months,num opportunities,num opps allow_quote - not allowed,num opps allow_quote - allowed,num opps bucket_influence - null,num opps bucket_influence - marketing,num opps bucket_source - sales,num opps bucket_source - null,num opps bucket_source - marketing,num opps category - upgrade,num opps category - new,num opps category - new footprint,num opps category - cloud net revenue,num opps category - new logo,num opps category - migration,num opps category - null,num opps commission_role - null,num opps commission_role - pay commissions,num opps competitors - null,num opps competitors - in-house,num opps competitors - other,num opps contract_length - null,num opps contract_length - 12,num opps contract_length - 1,num opps contract_length - 0,num opps contract_length - 24,num opps cvp_verified - false,num opps cvp_verified - true,"num opps data_quality_description - missing: lead source, next ste","num opps data_quality_description - missing: amount, next steps",num opps data_quality_description - missing: next steps,num opps data_quality_description - missing: lead source,num opps data_quality_description - all opportunity details captur,num opps data_quality_description - missing: amount,num opps data_quality_score - 60,num opps data_quality_score - 40,num opps data_quality_score - 80,num opps data_quality_score - 100,num opps econnect_received - false,num opps econnect_received - true,num opps focus_area - null,num opps focus_area - dedicated,num opps focus_area - cloud office,num opps focus_area - openstack public,num opps focus_area - tricore,num opps focus_area - amazon,num opps forecastcategory - closed,num opps forecastcategory - omitted,num opps forecastcategory - pipeline,num opps forecastcategoryname - closed,num opps forecastcategoryname - omitted,num opps forecastcategoryname - pipeline,num opps iswon - true,num opps iswon - false,num opps leadsource - null,num opps leadsource - chat,num opps leadsource - call in,num opps leadsource - partner network,num opps leadsource - outbound,num opps leadsource - site submission,num opps leadsource - unknown,num opps live_call - false,num opps market_source - null,num opps market_source - no,num opps market_source - yes,num opps nutcase_deal_probability - 0,num opps on_demand_reconciled - false,num opps on_demand_reconciled - true,num opps pain_point - null,num opps pain_point - other,num opps pain_point - servicenow,num opps probability - 100,num opps probability - 0,num opps probability - 15,num opps requested_products - null,num opps requested_products - hosting only,num opps support_unit - null,num opps support_unit - tricore,num opps support_unit - enterprise,num opps support_unit - smb,num opps support_unit - email & apps,num opps ticket_type - null,num opps ticket_type - upgrade,num opps typex - dedicated/private cloud,num opps typex - revenue ticket,num opps typex - mail contract signup,num opps typex - rackspace cloud,num opps typex - tricore,num opps typex - aws,num opps what_did_we_do_well - null,num opps what_did_we_do_well - solution fit,num opps why_did_we_lose - null,num opps why_did_we_lose - no response,num opps why_did_we_lose - unresponsive,num opps why_did_we_lose - project abandoned,num opps why_did_we_lose - existing opp/closed via ticket,num opps why_did_we_lose - price,account_sla_type_num,num distinct account_team_name,num distinct account_manager,num distinct account_bdc,num distinct account_primary_contact,num distinct account_region,num distinct account_billing_street,num distinct account_billing_city,num distinct account_billing_state,num distinct account_billing_postal_code,num distinct account_billing_country,num distinct account_geographic_location,last_survey_score,second_last_survey_score,change_in_survey_score,rating_promoter_to_promoter,rating_promoter_to_passive,rating_promoter_to_detractor,rating_passive_to_promoter,rating_passive_to_passive,rating_passive_to_detractor,rating_detractor_to_promoter,rating_detractor_to_passive,rating_detractor_to_detractor,ownership - private,ownership - public,ownership - unknown,ownership - subsidiary,ownership - other,site - branch,site - headquarters,site - single location,company_review_priority,number_of_customer_accounts,number_of_cloud_accounts,time_month_key_dt
account_number,time_month_key,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1
883600,201808,0,1,0,0,0,0,0,1,121393.0,0,0,29.660000,55,0,30.11,201708,0,0,0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.0,178.92,13.763076,13.0,1.010847,0.933089,1.083333,1.021694,0.875738,1.166666,25.267142,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,1.166666,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.988542,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,2018-08-01
883600,201806,0,1,0,0,0,0,0,1,121374.0,0,0,29.639166,53,0,29.50,201706,0,0,0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.0,178.67,13.743846,13.0,1.009435,0.931786,1.083333,1.018870,0.873317,1.166666,25.267142,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,1.166666,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.989925,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,2018-06-01
883643,201806,0,1,0,0,0,0,0,0,0.0,0,0,1293.680833,53,0,1635.62,201706,2,3,8596,154,4560,0,0,1.0,1.0,0.500000,0.500000,0.0,8815.94,99.055505,89.0,1.314197,1.166534,1.126582,1.165365,1.139468,1.022727,180.665000,38.898333,1.303846,0.0,95.937777,0.0,10.093333,0.0,0.0,142.230000,0.0,4.666666,1.000000,2.166666,0.0,3.000000,0.0,1.0,0.0,0.0,2.000000,0.0,0.573803,0.026473,0.001922,0.0,0.195881,0.0,0.006869,0.0,0.0,0.193599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,2018-06-01
883642,201808,0,1,0,0,0,0,0,0,0.0,0,1,32.194166,55,0,4.46,201708,0,0,0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.0,76.26,12.710000,6.0,0.245944,0.245944,1.000000,0.314600,0.314600,1.000000,0.000000,0.000000,12.710000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,6.0,0.0,0.0,2018-08-01
883699,201806,0,0,1,0,0,0,0,1,121374.0,0,1,479.160000,53,0,4.56,201706,2,7,0,256,281,0,0,0.0,2.0,0.000000,1.000000,0.0,1478.20,52.792857,28.0,0.346043,0.655010,0.528301,0.009975,0.083128,0.120000,147.824444,18.070000,4.765000,0.0,3.267142,0.0,0.000000,0.0,0.0,0.000000,0.0,1.500000,0.500000,1.000000,0.0,1.166666,0.0,0.0,0.0,0.0,0.000000,0.0,0.900027,0.036672,0.019341,0.0,0.015471,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1,1,1,1,1,2,2,1,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,2018-06-01
883643,201807,0,1,0,0,0,0,0,0,0.0,0,0,1371.695000,54,0,1665.70,201707,3,7,4752,349,4560,0,0,4.0,2.0,1.333333,0.666667,0.0,9170.60,103.040449,89.0,1.258014,1.187339,1.059523,1.119531,1.197638,0.934782,191.800740,42.903333,1.278571,0.0,100.304444,0.0,10.038333,0.0,0.0,153.180833,0.0,4.500000,1.000000,2.333333,0.0,3.000000,0.0,1.0,0.0,0.0,2.000000,0.0,0.564698,0.028070,0.001951,0.0,0.196876,0.0,0.006567,0.0,0.0,0.200441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,2018-07-01
883689,201806,0,1,0,0,0,0,0,1,121374.0,0,0,28.890000,53,0,29.37,201706,0,0,0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.0,173.88,5.796000,30.0,1.006250,1.006250,1.000000,1.009708,1.009708,1.000000,8.230000,0.000000,0.300000,0.0,0.000000,0.0,10.040000,0.0,0.0,0.000000,0.0,2.000000,0.000000,1.000000,0.0,0.000000,0.0,1.0,0.0,0.0,0.000000,0.0,0.567977,0.000000,0.010351,0.0,0.000000,0.0,0.346445,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,2,1,1,1,1,1,1,1,1,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,2018-06-01
883661,201806,0,1,0,0,0,0,0,1,121374.0,0,0,6.549166,53,0,7.50,201706,0,0,0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.0,41.76,3.212307,13.0,1.133858,1.046638,1.083333,1.213036,1.415209,0.857142,0.000000,0.000000,3.212307,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,2.166666,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,2,1,1,1,1,1,1,1,1,1,1,10,10,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,2018-06-01
883701,201806,0,0,0,0,1,0,0,0,0.0,0,0,1402.205833,53,0,1436.51,201706,2,7,1070,703,707,0,0,1.0,1.0,0.500000,0.500000,0.0,8690.64,167.127692,52.0,1.068193,1.006566,1.061224,1.011368,0.866887,1.166666,214.760500,0.000000,14.535000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,6.666666,0.000000,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.988468,0.000000,0.010034,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1,1,1,1,1,1,1,1,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,2018-06-01
883719,201807,0,0,1,0,0,0,0,0,0.0,1,0,625.985833,54,0,627.28,201707,0,0,0,0,0,0,0,0.0,0.0,0.000000,0.000000,0.0,3756.07,156.502916,24.0,1.000082,1.000082,1.000000,0.999834,0.999834,1.000000,305.530000,11.860000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,2.000000,1.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.976115,0.018945,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,3,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,2018-07-01


In [3]:
#split datasets into low-med-hi revenue segments

#training dataset
cloud_train_low, cloud_train_mid, cloud_train_hi = p.split_df_revenue_segments(cloud_train, 'average_invoiced_last_12_months', low=1000, high=10000)

#validation dataset
cloud_val_low, cloud_val_mid, cloud_val_hi = p.split_df_revenue_segments(cloud_val, 'average_invoiced_last_12_months', low=1000, high=10000)

print("Train-test-eval split complete")

Train-test-eval split complete


In [6]:
#==================================================================================================================
#==================================================================================================================
# Resample minority class in training dataset
#==================================================================================================================
#==================================================================================================================

#for repeatable results, set seed equal to an integer; for random assignment, set seed to None
seed = 155

cloud_train_low_rus = p.AndrewsRandomUnderSampler(cloud_train_low, 'target', seed)
cloud_train_mid_rus = p.AndrewsRandomUnderSampler(cloud_train_mid, 'target', seed)
cloud_train_hi_rus = p.AndrewsRandomUnderSampler(cloud_train_hi, 'target', seed)

print('Minority class resampling complete')

Minority class resampling complete


In [5]:
#define lists of target (y) and predictor (x) columns
exclude_col = ['target'
               ,'average_invoiced_last_12_months'
               ,'account_number'
               ,'time_month_key'
               ,'month_order'
               ,'time_month_key_dt'
               ]

include_col = ['3 mth pct change number of device_status = computer no longer active'
               ]

#predictors = list(cloud_train.columns[~cloud_train.columns.isin(exclude_col)])
predictors = list(cloud_train.columns[cloud_train.columns.isin(include_col)])

#create copy of the X and Y columns
x1_train_res_low = cloud_train_low_rus[predictors] #uses list of columns excluding target (predictors) to select columns for x_low
y1_train_res_low = cloud_train_low_rus['target']

x1_train_res_mid = cloud_train_mid_rus[predictors]
y1_train_res_mid = cloud_train_mid_rus['target']

x1_train_res_hi = cloud_train_hi_rus[predictors]
y1_train_res_hi = cloud_train_hi_rus['target']

print("train ready")

#create copy of the X and Y columns for validation data
x_val_low = cloud_val_low[predictors] #uses list of columns excluding target (predictors) to select columns for x_low
y1_val_low = cloud_val_low['target']

x_val_mid = cloud_val_mid[predictors]
y1_val_mid = cloud_val_mid['target']

x_val_hi = cloud_val_hi[predictors]
y1_val_hi = cloud_val_hi['target']

print("eval ready")

train ready
eval ready


In [7]:
#Create time series split cross validators, cross-validated by time month key, on training dataset (test data generated 
#within the cross-validator routine, and used for grid search scoring in GridSearchCV API)
cvsplits = 4
cv_low = p.TimeSeriesSplitTMK(n_splits=cvsplits).split(cloud_train_low_rus)
cv_mid = p.TimeSeriesSplitTMK(n_splits=cvsplits).split(cloud_train_mid_rus)
cv_hi = p.TimeSeriesSplitTMK(n_splits=cvsplits).split(cloud_train_hi_rus)

In [8]:
#import libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import make_scorer, fbeta_score, matthews_corrcoef
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import time
import datetime
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [None]:
#==================================================================================================================
#==================================================================================================================
#==================================================================================================================
#Initial model run with no parameters tuned
#==================================================================================================================
#==================================================================================================================
#==================================================================================================================

xgb_final_low = XGBClassifier(
        learning_rate=0.05, 
        n_estimators=1100, 
        max_depth=13, 
        max_features=13, 
        subsample=0.7,
        min_child_weight=1,
        gamma=0, 
        colsample_bytree=0.7, 
        scale_pos_weight=1, 
        reg_alpha=0.1,
        reg_lambda=1,
        random_state=seed, 
        eval_metric='auc',
        objective='binary:logistic',
        nthread=128
        )

xgb_final_mid = XGBClassifier(
        learning_rate=0.05, 
        n_estimators=1300, 
        max_depth=9, 
        max_features=13, 
        subsample=0.7,
        min_child_weight=1,
        gamma=0, 
        colsample_bytree=0.7, 
        scale_pos_weight=1, 
        reg_alpha=1e-05,
        reg_lambda=0.01,
        random_state=seed, 
        eval_metric='auc',
        objective='binary:logistic',
        nthread=128
        )

xgb_final_hi = XGBClassifier(
        learning_rate=0.05, 
        n_estimators=1300, 
        max_depth=9, 
        max_features=13, 
        subsample=0.7,
        min_child_weight=1,
        gamma=0, 
        colsample_bytree=0.7, 
        scale_pos_weight=1, 
        reg_alpha=1e-05,
        reg_lambda=1,
        random_state=seed, 
        eval_metric='auc',
        objective='binary:logistic',
        nthread=128
        )

p.modeleval(xgb_final_low, x1_train_res_low, y1_train_res_low.values.ravel(), x_val_low, y1_val_low, predictors)
p.modeleval(xgb_final_mid, x1_train_res_mid, y1_train_res_mid.values.ravel(), x_val_mid, y1_val_mid, predictors)
p.modeleval(xgb_final_hi, x1_train_res_hi, y1_train_res_hi.values.ravel(), x_val_hi, y1_val_hi, predictors)

In [9]:
#==================================================================================================================
#==================================================================================================================
#==================================================================================================================
#PARAMETER TUNING
#==================================================================================================================
#==================================================================================================================
#==================================================================================================================

#defining the grid search metric scorer. Using F2 scores; this weighs the recall twice as much as the precision
f2_scorer = make_scorer(fbeta_score, beta=2)

#Using F0.5 scores weighs the precision twice as much as the recall
#f05_scorer = make_scorer(fbeta_score, beta=0.5)

#mcc = make_scorer(matthews_corrcoef)

In [10]:
print('Run first grid search for parameter tuning for low revenue accts')

#
#
#
#Tune learning rate, gamma and number of estimators
#
#
#

param_test1 = {
  'learning_rate':[0.005,0.01],#test values of 0.05 to 0.3. must be between 0 and 1
#  'gamma':[i/2 for i in range(0,2)], #test values of 0 to 3, in steps of 0.5. must be non-negative, default 0
#  'gamma':[0,5,10], #test values of 1, 5 and 10. default is 0, but overfitting is a problem
  'n_estimators':range(500,1501,500) #test values of 500 to 1500, in steps of 500
}

start_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

g1_low = GridSearchCV(estimator = XGBClassifier(
#        learning_rate=0.1, 
#        n_estimators=500, 
        max_depth=13, 
        max_features=13, 
        subsample=0.8,
        min_child_weight=1,
        gamma=0, 
        colsample_bytree=0.8, 
        scale_pos_weight=1, 
        reg_alpha=0.1,
        reg_lambda=1e-05,
        random_state=seed, 
        eval_metric='auc',
        objective='binary:logistic', #try binary:hinge?
        max_delta_step=0,
        nthread=-1), param_grid = param_test1, scoring=f2_scorer, n_jobs=-1,iid=False, cv=cv_low
                     )
#1st iteration {'gamma': 1, 'learning_rate': 0.01, 'n_estimators': 1250}

g1_mid = GridSearchCV(estimator = XGBClassifier(
#        learning_rate=0.1, 
#        n_estimators=500, 
        max_depth=13, 
        max_features=13, 
        subsample=0.8,
        min_child_weight=1,
        gamma=0, 
        colsample_bytree=0.8, 
        scale_pos_weight=1, 
        reg_alpha=0.1,
        reg_lambda=1e-05,
        random_state=seed, 
        eval_metric='auc',
        objective='binary:logistic',
        max_delta_step=0,
        nthread=-1), param_grid = param_test1, scoring=f2_scorer, n_jobs=-1,iid=False, cv=cv_mid
                     )
#1st iteration {'gamma': 1, 'learning_rate': 0.005, 'n_estimators': 1000}

g1_hi = GridSearchCV(estimator = XGBClassifier(
#        learning_rate=0.1, 
#        n_estimators=1500, 
        max_depth=13, 
        max_features=13, 
        subsample=0.8,
        min_child_weight=1,
        gamma=0, 
        colsample_bytree=0.7, 
        scale_pos_weight=1, 
        reg_alpha=0.1,
        reg_lambda=1e-05,
        random_state=seed, 
        eval_metric='auc',
        objective='binary:logistic',
        max_delta_step=0,
        nthread=-1), param_grid = param_test1, scoring=f2_scorer, n_jobs=-1,iid=False, cv=cv_hi
                     )
#1st iteration {'gamma': 0, 'learning_rate': 0.005, 'n_estimators': 1500 (tested 1000-1500)}
#2nd iteration {'gamma': 0, 'learning_rate': 0.01, 'n_estimators': 1500 (tested 1500-2000)}

g1_low.fit(x1_train_res_low,y1_train_res_low)
g1_mid.fit(x1_train_res_mid,y1_train_res_mid)
g1_hi.fit(x1_train_res_hi,y1_train_res_hi)


end_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

Run first grid search for parameter tuning for low revenue accts


(low revenue)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, eval_metric='auc', gamma=0.0,
       learning_rate=0.1, max_delta_step=0, max_depth=13, max_features=13,
       min_child_weight=1, missing=None, n_estimators=1400, n_jobs=1,
       nthread=-1, objective='binary:logistic', random_state=15,
       reg_alpha=0.1, reg_lambda=1e-05, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [11]:
g1_low_params = g1_low.best_estimator_
g1_mid_params = g1_mid.best_estimator_
g1_hi_params = g1_hi.best_estimator_

print(g1_low.best_estimator_)
print(g1_low.best_params_)
print(g1_low.best_score_)

print(g1_mid.best_estimator_)
print(g1_mid.best_params_)
print(g1_mid.best_score_)

print(g1_hi.best_estimator_)
print(g1_hi.best_params_)
print(g1_hi.best_score_)

print(start_time)
print(end_time)

print(g1_low.cv_results_)
print(g1_mid.cv_results_)
print(g1_hi.cv_results_)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, eval_metric='auc', gamma=0,
       learning_rate=0.01, max_delta_step=0, max_depth=13, max_features=13,
       min_child_weight=1, missing=None, n_estimators=1500, n_jobs=1,
       nthread=-1, objective='binary:logistic', random_state=155,
       reg_alpha=0.1, reg_lambda=1e-05, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)
{'learning_rate': 0.01, 'n_estimators': 1500}
0.8499199685075913
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, eval_metric='auc', gamma=0,
       learning_rate=0.005, max_delta_step=0, max_depth=13,
       max_features=13, min_child_weight=1, missing=None,
       n_estimators=1000, n_jobs=1, nthread=-1,
       objective='binary:logistic', random_state=155, reg_alpha=0.1,
       reg_lambda=1e-05, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8)
{'learning_rate': 0.005, 'n_estimators'

In [None]:
Low:
    XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, eval_metric='auc', gamma=0,
       learning_rate=0.005, max_delta_step=0, max_depth=13,
       max_features=13, min_child_weight=1, missing=None,
       n_estimators=1750, n_jobs=1, nthread=-1,
       objective='binary:logistic', random_state=17, reg_alpha=0.1,
       reg_lambda=1e-05, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8)
{'learning_rate': 0.005, 'n_estimators': 1750}
0.7607220178419795
Mid: 
    XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, eval_metric='auc', gamma=0,
       learning_rate=0.01, max_delta_step=0, max_depth=13, max_features=13,
       min_child_weight=1, missing=None, n_estimators=1500, n_jobs=1,
       nthread=-1, objective='binary:logistic', random_state=17,
       reg_alpha=0.1, reg_lambda=1e-05, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)
{'learning_rate': 0.01, 'n_estimators': 1500}
0.7877350749486776

In [None]:
#
#
#
#
#Tune max_delta_step, max_depth and min_child_weight
#
#XGBoost docs say "Maximum delta step we allow each leaf output to be. If the value is set 
#to 0, it means there is no constraint. If it is set to a positive value, it can help making 
#the update step more conservative. Usually this parameter is not needed, but it might help 
#in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might 
#help control the update." 
#
#max_delta_step range is 0 to infinity
#
#
#
#

param_test2 = {
 'max_depth':range(11,16,2)
 ,'min_child_weight':range(1,3,1)
 ,'max_delta_step':range(0,3,1) #try values 0 through 2, increment by 1
}

#Create time series split cross validators, cross-validated by time month key, on training dataset (test data generated 
#within the cross-validator routine, and used for grid search scoring in GridSearchCV API)
cvsplits = 3
cv_low = p.TimeSeriesSplitTMK(n_splits=cvsplits).split(cloud_train_low_rus)
cv_mid = p.TimeSeriesSplitTMK(n_splits=cvsplits).split(cloud_train_mid_rus)
cv_hi = p.TimeSeriesSplitTMK(n_splits=cvsplits).split(cloud_train_hi_rus)

start_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

#g2_low = GridSearchCV(estimator = XGBClassifier(g1_low_params), param_grid = param_test2, scoring=f2_scorer
#                      ,n_jobs=-1,iid=False, cv=3)


#g2_mid = GridSearchCV(estimator = XGBClassifier(g1_mid_params), param_grid = param_test2, scoring=f2_scorer
#                      ,n_jobs=-1,iid=False, cv=3)

#g2_hi = GridSearchCV(estimator = XGBClassifier(g1_hi_params), param_grid = param_test2, scoring=f2_scorer
#                     ,n_jobs=-1,iid=False, cv=3)

g2_low = GridSearchCV(estimator = XGBClassifier(
#{'learning_rate': 0.005, 'n_estimators': 1750}
        base_score=0.5,
        booster='gbtree',
        colsample_bylevel=1,
        colsample_bytree=0.7,
        gamma=10, 
        learning_rate=0.0001,
#        max_delta_step=1, 
#        max_depth=15, 
#        min_child_weight=1, 
        missing=None,
        n_estimators=1500, 
        random_state=seed,
        reg_alpha=0.1,
        reg_lambda=1e-05,
        scale_pos_weight=1,
        silent=True,
        subsample=0.8,
        eval_metric='auc',
        objective='binary:logistic',
        nthread=-1), param_grid = param_test2, scoring=f2_scorer
                      ,n_jobs=-1,iid=False, cv=cv_low)

g2_mid = GridSearchCV(estimator = XGBClassifier(
        base_score=0.5,
        booster='gbtree',
        colsample_bylevel=1,
        colsample_bytree=0.9,
        gamma=1, 
        learning_rate=0.01,
#        max_delta_step=1, 
#        max_depth=15, 
#        min_child_weight=1, 
        missing=None,
        n_estimators=1500, 
        random_state=seed,
        reg_alpha=0.01,
        reg_lambda=0.1,
        scale_pos_weight=1,
        silent=True,
        subsample=0.7,
        eval_metric='auc',
        objective='binary:logistic',
        nthread=-1), param_grid = param_test2, scoring=f2_scorer
                      ,n_jobs=-1,iid=False, cv=cv_mid)
#1st iteration: {'max_delta_step': 1, 'max_depth': 11, 'min_child_weight': 1}

g2_hi = GridSearchCV(estimator = XGBClassifier(
        learning_rate=0.01, 
        n_estimators=1500, 
        max_depth=13, 
        max_features=13, 
        subsample=0.8,
        min_child_weight=1,
        gamma=0, 
        colsample_bytree=0.7, 
        scale_pos_weight=1, 
        reg_alpha=0.1,
        reg_lambda=1e-05,
        random_state=seed, 
        eval_metric='auc',
        objective='binary:logistic',
        max_delta_step=0,
        nthread=-1), param_grid = param_test2, scoring=f2_scorer, n_jobs=-1,iid=False, cv=cv_hi)
#1st iteration {'max_delta_step': 1, 'max_depth': 11, 'min_child_weight': 1}

g2_low.fit(x1_train_res_low,y1_train_res_low)
g2_mid.fit(x1_train_res_mid,y1_train_res_mid)
g2_hi.fit(x1_train_res_hi,y1_train_res_hi)

end_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
#g2_low_params = g2_low.best_estimator_
#g2_mid_params = g2_mid.best_estimator_
g2_hi_params = g2_hi.best_estimator_

#print(g2_low.best_estimator_)
#print(g2_low.best_params_)
#print(g2_low.best_score_)

#print(g2_mid.best_estimator_)
#print(g2_mid.best_params_)
#print(g2_mid.best_score_)

print(g2_hi.best_estimator_)
print(g2_hi.best_params_)
print(g2_hi.best_score_)

print(start_time)
print(end_time)

#print(g2_low.cv_results_)
#print(g2_mid.cv_results_)
print(g2_hi.cv_results_)


In [None]:
#
#
#
#
#Tune Subsample and colsample_bytree
#
#
#
#
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)], #test values of 0.0 to 0.4, in steps of 0.1
 'subsample':[i/10.0 for i in range(7,11)], #test values of 0.7 to 1.0, in steps of 0.1
 'colsample_bytree':[0.7,0.8,0.9,1]
}

start_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

#g3_low = GridSearchCV(estimator = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=1,
#       max_depth=19, min_child_weight=1, missing=None, n_estimators=2400,
#       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
#       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
#       silent=True, subsample=1), param_grid = param_test3, scoring=f2_scorer
#                      ,n_jobs=-1,iid=False, cv=3)


g3_mid = GridSearchCV(estimator = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=1,
       max_depth=15, min_child_weight=1, missing=None, n_estimators=2000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), param_grid = param_test3, scoring=f2_scorer
                      ,n_jobs=-1,iid=False, cv=2)


g3_hi = GridSearchCV(estimator = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=21, min_child_weight=1, missing=None, n_estimators=2000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1), param_grid = param_test3, scoring=f2_scorer
                     ,n_jobs=-1,iid=False, cv=2)

#g3_low.fit(x1_train_res_low,y1_train_res_low)
g3_mid.fit(x1_train_res_mid,y1_train_res_mid)
g3_hi.fit(x1_train_res_hi,y1_train_res_hi)

end_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
#g3_low_params = g3_low.best_estimator_
g3_mid_params = g3_mid.best_estimator_
g3_hi_params = g3_hi.best_estimator_

#print(g3_low.cv_results_)
#print(g3_low.best_estimator_)
#print(g3_low.best_params_)
#print(g3_low.best_score_)

#print(g3_mid.cv_results_)
print(g3_mid.best_estimator_)
print(g3_mid.best_params_)
print(g3_mid.best_score_)

#print(g3_hi.cv_results_)
print(g3_hi.best_estimator_)
print(g3_hi.best_params_)
print(g3_hi.best_score_)

print(start_time)
print(end_time)

In [None]:
#
#
#
#
#Tune Regularization alpha (L1)
# and lambda (L2)
#
#
#
#
param_test4 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
 'reg_lambda':[1e-5, 1e-2, 0.1, 1, 10, 50, 100]
}

start_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

#g4_low = GridSearchCV(estimator = XGBClassifier(g3_low_params), param_grid = param_test4, scoring=f2_scorer
#                      ,n_jobs=-1,iid=False, cv=2)


g4_mid = GridSearchCV(estimator = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.0, learning_rate=0.1,
       max_delta_step=1, max_depth=15, min_child_weight=1, missing=None,
       n_estimators=2000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.7), param_grid = param_test4, scoring=f2_scorer
                      ,n_jobs=-1,iid=False, cv=2)


g4_hi = GridSearchCV(estimator = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.0, learning_rate=0.1,
       max_delta_step=0, max_depth=21, min_child_weight=1, missing=None,
       n_estimators=2000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.7), param_grid = param_test4, scoring=f2_scorer
                     ,n_jobs=-1,iid=False, cv=2)

#g4_low.fit(x1_train_res_low,y1_train_res_low)
g4_mid.fit(x1_train_res_mid,y1_train_res_mid)
g4_hi.fit(x1_train_res_hi,y1_train_res_hi)

end_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
#g4_low_params = g4_low.best_estimator_
g4_mid_params = g4_mid.best_estimator_
g4_hi_params = g4_hi.best_estimator_

#print(g4_low.cv_results_)
#print(g4_low.best_estimator_)
#print(g4_low.best_params_)
#print(g4_low.best_score_)

#print(g4_mid.cv_results_)
print(g4_mid.best_estimator_)
print(g4_mid.best_params_)
print(g4_mid.best_score_)

#print(g4_hi.cv_results_)
print(g4_hi.best_estimator_)
print(g4_hi.best_params_)
print(g4_hi.best_score_)

print(start_time)
print(end_time)

In [None]:
#
#
#
#
#Tune learning rate and n_estimators
#
#
#
#
param_test5 = {
 'max_delta_step': range(0,6,1)
}

start_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

g5_low = GridSearchCV(estimator = XGBClassifier(g4_low_params), param_grid = param_test5, scoring=f2_scorer
                      ,n_jobs=-1,iid=False, cv=2)


g5_mid = GridSearchCV(estimator = XGBClassifier(g4_mid_params), param_grid = param_test5, scoring=f2_scorer
                      ,n_jobs=-1,iid=False, cv=2)


g5_hi = GridSearchCV(estimator = XGBClassifier(g4_hi_params), param_grid = param_test5, scoring=f2_scorer
                     ,n_jobs=-1,iid=False, cv=2)

g5_low.fit(x1_train_res_low,y1_train_res_low)
g5_mid.fit(x1_train_res_mid,y1_train_res_mid)
g5_hi.fit(x1_train_res_hi,y1_train_res_hi)

end_time = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

In [None]:
g5_low_params = g5_low.best_estimator_
g5_mid_params = g5_mid.best_estimator_
g5_hi_params = g5_hi.best_estimator_

#print(g5_low.cv_results_)
print(g5_low.best_estimator_)
print(g5_low.best_params_)
print(g5_low.best_score_)

#print(g5_mid.cv_results_)
print(g5_mid.best_estimator_)
print(g5_mid.best_params_)
print(g5_mid.best_score_)

#print(g5_hi.cv_results_)
print(g5_hi.best_estimator_)
print(g5_hi.best_params_)
print(g5_hi.best_score_)

print(start_time)
print(end_time)

In [None]:
#==================================================================================================================
#==================================================================================================================
#==================================================================================================================
#Final Model
#==================================================================================================================
#==================================================================================================================
#==================================================================================================================

xgb_final_low = XGBClassifier(
        colsample_bylevel=1,
        colsample_bytree=0.7,
        gamma=0, 
        learning_rate=0.005,
        max_delta_step=2, 
        max_depth=14, 
        max_features=13, 
        min_child_weight=0, 
        n_estimators=1750, 
#        random_state=seed,
        reg_alpha=0.1,
        reg_lambda=1e-05,
        scale_pos_weight=1,
        silent=True,
        subsample=0.8,
        eval_metric='auc',
        objective='binary:logistic',
        nthread=-1)

xgb_final_mid = XGBClassifier(
        colsample_bylevel=1,
        colsample_bytree=0.9,
        gamma=1, 
        learning_rate=0.01,
        max_delta_step=1, 
        max_depth=11, 
        min_child_weight=1, 
        missing=None,
        n_estimators=1500, 
#        random_state=seed,
        reg_alpha=0.01,
        reg_lambda=0.1,
        scale_pos_weight=1,
        silent=True,
        subsample=0.7,
        eval_metric='auc',
        objective='binary:logistic',
        nthread=-1)


xgb_final_hi = XGBClassifier(
        colsample_bylevel=1,
        colsample_bytree=0.7,
        gamma=1, 
        learning_rate=0.01,
        max_delta_step=0, 
        max_depth=13, 
        max_features=13, 
        min_child_weight=1, 
        missing=None,
        n_estimators=1500, 
#        random_state=seed,
        reg_alpha=0.01,
        reg_lambda=1e-05,
        scale_pos_weight=1,
        silent=True,
        subsample=0.8,
        eval_metric='auc',
        objective='binary:logistic',
        nthread=-1)

p.modeleval(xgb_final_low, x1_train_res_low, y1_train_res_low.values.ravel(), x_val_low, y1_val_low, predictors)
p.modeleval(xgb_final_mid, x1_train_res_mid, y1_train_res_mid.values.ravel(), x_val_mid, y1_val_mid, predictors)
p.modeleval(xgb_final_hi, x1_train_res_hi, y1_train_res_hi.values.ravel(), x_val_hi, y1_val_hi, predictors)

In [None]:
print('low revenue features')
fi_final_low = pd.Series(xgb_final_low.get_booster().get_fscore()).sort_values(ascending=False)
print(fi_final_low.head())
print('mid revenue features')
fi_final_mid = pd.Series(xgb_final_mid.get_booster().get_fscore()).sort_values(ascending=False)
print(fi_final_mid.head())
print('hi revenue features')
fi_final_hi = pd.Series(xgb_final_hi.get_booster().get_fscore()).sort_values(ascending=False)
print(fi_final_hi.head())