# Feature Engineering & Model Creation

In [3]:
#packages import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE
from sklearn.utils import resample

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [4]:
#dataset import
train_data = pd.read_csv("train.csv")

train_data['adexchange'] = train_data['adexchange'].fillna(0)
train_data['slotsize'] = train_data['slotwidth'].astype(str) + 'x' + train_data['slotheight'].astype(str)
train_data.head(3)

Unnamed: 0,click,weekday,hour,bidid,userid,useragent,IP,region,city,adexchange,...,slotvisibility,slotformat,slotprice,creative,bidprice,payprice,keypage,advertiser,usertag,slotsize
0,0,5,22,b7bea80521fdecd95d2d761a38c91c3f09618066,2e880fb7d690cf7377b2e42e701728e3f3c0e4c1,windows_ie,125.37.175.*,2,2,2.0,...,2,0,5,a4f763f78ef3eedfe614263b94a8924e,238,5,0f951a030abdaedd733ee8d114ce2944,3427,NaN,200x200
1,0,1,20,4f51205475678f5a124bc76b2c54163bf8eaa7eb,3a1fe01360ff8100e7d006b83b77a3e4c01d928c,windows_chrome,171.36.92.*,238,239,1.0,...,FourthView,Na,0,10722,294,23,,2821,NaN,300x250
2,0,3,13,b604e3fd054a658ab7ced4285ebf2ef54d2bd890,801d18a056b6fe6b06a794aef17fb0d6daff2414,windows_ie,59.46.106.*,40,41,2.0,...,2,0,5,798b2d49952d77f1eace9f23c210d0b5,238,24,0f951a030abdaedd733ee8d114ce2944,3427,10052100061386610110,250x250


In [7]:
'''Because there is such a huge disparity between non-clicks vs clicks, we'll perform a:
Negative downsampling (imbalanced): different samples sizes are used in this procedure. 
In all these samples all the observations from the minority class are kept and 
we take different number of observations from the majority class by performing sampling without replacement.
'''


# Separate majority and minority classes
train_data_majority = train_data[train_data.click==0]
train_data_minority = train_data[train_data.click==1]

# Downsample majority class
df_majority_downsampled = resample(train_data_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=len(train_data_minority),     # to match minority class
                                 random_state=123) # reproducible results

# Combine minority class with downsampled majority class
train_df = pd.concat([df_majority_downsampled, train_data_minority])

# Display new class counts
train_df.click.value_counts()
#print(train_df.click.value_counts())
    


1    1793
0    1793
Name: click, dtype: int64

In [8]:

cat_col = ['useragent','slotvisibility', 'slotformat', 'keypage', 'slotsize']

model_df = train_df[
['click',
 'weekday',
 'hour',
 #'bidid',
 #'userid',
 'useragent',
 #'IP',
 'region',
 'city',
 'adexchange',
 #'domain',
 #'url',
 #'urlid',
 #'slotid',
 'slotsize',
 'slotvisibility',
 'slotformat',
 'slotprice',
 #'creative',
 'bidprice',
 'payprice',
 'keypage',
 'advertiser',
 #'usertag'
]]


model_df = pd.get_dummies(model_df, columns=cat_col, prefix=cat_col)
print(model_df.shape)

X = model_df
y = model_df['click']
test = SelectKBest(score_func=chi2, k=20)
fit = test.fit(X, y)
np.set_printoptions(precision=2)
features = fit.transform(X)

model = LogisticRegression()
rfe = RFE(model, 20)
fit = rfe.fit(X, y)
#print("Num Features: %s" % (fit.n_features_))
#print("Selected Features: %s" % (fit.support_))
#print("Feature Ranking: %s" % (fit.ranking_))


(3586, 86)


In [16]:
#Grab the top 20 features
top_features = []
counter = 0
for r in  np.nditer(fit.ranking_):
    if r == 1:
        feature = model_df.columns[counter]
        top_features.append(feature)
    counter += 1

print(top_features, end=", ")

with open('features.pkl', 'wb') as f:
    pickle.dump(top_features, f)

['click', 'adexchange', 'useragent_android_safari', 'useragent_windows_chrome', 'useragent_windows_firefox', 'useragent_windows_ie', 'slotvisibility_0', 'slotvisibility_1', 'slotvisibility_2', 'slotvisibility_Na', 'slotvisibility_OtherView', 'slotformat_0', 'slotformat_1', 'slotformat_Na', 'keypage_361e128affece850342293213691a043', 'keypage_d29e59bf0f7f8243858b8183f14d4412', 'slotsize_1000x90', 'slotsize_250x250', 'slotsize_320x50', 'slotsize_950x90'], 

In [17]:
#now get all of the dataset and get the dummies
train_data_f = pd.get_dummies(train_data, columns=cat_col, prefix=cat_col)
X = train_data_f[top_features]
y= train_data_f['click']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4,random_state=1)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
# save the model to disk
import pickle
filename = 'Lin_model.sav'
pickle.dump(logreg, open(filename, 'wb'))

In [19]:
# load the model from disk
lin_model = pickle.load(open(filename, 'rb'))
print(loaded_model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


In [23]:
min_base_bid = 0
max_base_bid = 300/avgCTR
n_base_bids = 10000
base_bid = np.linspace(min_base_bid, max_base_bid, n_base_bids)
for i, bb in zip(range(n_base_bids), base_bid):
    print(bb)

0.0
40.678596582469176
81.35719316493835
122.03578974740753
162.7143863298767
203.39298291234587
244.07157949481507
284.7501760772842
325.4287726597534
366.1073692422226
406.78596582469174
447.46456240716094
488.14315898963014
528.8217555720993
569.5003521545684
610.1789487370377
650.8575453195068
691.536141901976
732.2147384844452
772.8933350669143
813.5719316493835
854.2505282318527
894.9291248143219
935.607721396791
976.2863179792603
1016.9649145617294
1057.6435111441986
1098.3221077266678
1139.0007043091368
1179.679300891606
1220.3578974740753
1261.0364940565444
1301.7150906390136
1342.3936872214829
1383.072283803952
1423.7508803864212
1464.4294769688904
1505.1080735513594
1545.7866701338287
1586.465266716298
1627.143863298767
1667.8224598812362
1708.5010564637055
1749.1796530461745
1789.8582496286438
1830.536846211113
1871.215442793582
1911.8940393760513
1952.5726359585206
1993.2512325409896
2033.9298291234588
2074.608425705928
2115.287022288397
2155.9656188708664
2196.64421545333

74360.47455275366
74401.15314933613
74441.8317459186
74482.51034250106
74523.18893908353
74563.867535666
74604.54613224846
74645.22472883094
74685.90332541341
74726.58192199588
74767.26051857835
74807.93911516081
74848.61771174328
74889.29630832575
74929.97490490822
74970.65350149068
75011.33209807317
75052.01069465563
75092.6892912381
75133.36788782057
75174.04648440304
75214.7250809855
75255.40367756797
75296.08227415044
75336.76087073292
75377.43946731539
75418.11806389785
75458.79666048032
75499.47525706279
75540.15385364526
75580.83245022772
75621.51104681019
75662.18964339267
75702.86823997514
75743.54683655761
75784.22543314008
75824.90402972254
75865.58262630501
75906.26122288748
75946.93981946995
75987.61841605241
76028.2970126349
76068.97560921736
76109.65420579983
76150.3328023823
76191.01139896477
76231.68999554723
76272.3685921297
76313.04718871217
76353.72578529465
76394.40438187712
76435.08297845958
76475.76157504205
76516.44017162452
76557.11876820699
76597.79736478945


150022.66419614633
150063.3427927288
150104.02138931127
150144.69998589373
150185.3785824762
150226.05717905867
150266.73577564114
150307.4143722236
150348.09296880607
150388.77156538854
150429.450161971
150470.12875855347
150510.80735513594
150551.4859517184
150592.16454830088
150632.84314488334
150673.52174146584
150714.2003380483
150754.87893463077
150795.55753121324
150836.2361277957
150876.91472437818
150917.59332096064
150958.2719175431
150998.95051412558
151039.62911070805
151080.3077072905
151120.98630387298
151161.66490045545
151202.34349703792
151243.02209362038
151283.70069020285
151324.37928678535
151365.05788336782
151405.73647995028
151446.41507653275
151487.09367311522
151527.77226969769
151568.45086628015
151609.12946286262
151649.8080594451
151690.48665602756
151731.16525261002
151771.8438491925
151812.52244577496
151853.20104235742
151893.8796389399
151934.55823552236
151975.23683210483
152015.91542868732
152056.5940252698
152097.27262185226
152137.95121843473
152178.

216694.8839948133
216735.56259139578
216776.24118797824
216816.9197845607
216857.59838114318
216898.27697772565
216938.95557430811
216979.63417089058
217020.31276747305
217060.99136405552
217101.66996063798
217142.34855722045
217183.02715380292
217223.7057503854
217264.38434696785
217305.06294355035
217345.74154013282
217386.4201367153
217427.09873329775
217467.77732988022
217508.4559264627
217549.13452304516
217589.81311962762
217630.4917162101
217671.17031279256
217711.84890937502
217752.5275059575
217793.20610253996
217833.88469912243
217874.5632957049
217915.24189228736
217955.92048886983
217996.59908545233
218037.2776820348
218077.95627861726
218118.63487519973
218159.3134717822
218199.99206836466
218240.67066494713
218281.3492615296
218322.02785811207
218362.70645469453
218403.385051277
218444.06364785947
218484.74224444194
218525.4208410244
218566.09943760687
218606.77803418934
218647.45663077183
218688.1352273543
218728.81382393677
218769.49242051924
218810.1710171017
218850.84

298052.0771597517
298092.7557563341
298133.4343529166
298174.11294949905
298214.79154608154
298255.470142664
298296.1487392465
298336.8273358289
298377.5059324114
298418.18452899385
298458.86312557635
298499.5417221588
298540.2203187413
298580.8989153238
298621.5775119062
298662.2561084887
298702.93470507115
298743.61330165365
298784.2918982361
298824.9704948186
298865.649091401
298906.3276879835
298947.00628456596
298987.68488114845
299028.3634777309
299069.0420743134
299109.7206708958
299150.3992674783
299191.07786406076
299231.75646064326
299272.43505722575
299313.1136538082
299353.7922503907
299394.4708469731
299435.1494435556
299475.82804013806
299516.50663672056
299557.185233303
299597.8638298855
299638.54242646793
299679.2210230504
299719.89961963287
299760.57821621536
299801.2568127978
299841.9354093803
299882.61400596274
299923.29260254523
299963.9711991277
300004.64979571017
300045.32839229266
300086.0069888751
300126.6855854576
300167.36418204004
300208.04277862253
300248.72

379368.59172810754
379409.27032469
379449.9489212725
379490.62751785497
379531.3061144374
379571.9847110199
379612.66330760234
379653.34190418484
379694.0205007673
379734.6990973498
379775.3776939322
379816.0562905147
379856.73488709715
379897.41348367964
379938.0920802621
379978.7706768446
380019.449273427
380060.1278700095
380100.80646659195
380141.48506317445
380182.16365975694
380222.8422563394
380263.5208529219
380304.1994495043
380344.8780460868
380385.55664266925
380426.23523925175
380466.9138358342
380507.5924324167
380548.2710289991
380588.9496255816
380629.62822216406
380670.30681874655
380710.985415329
380751.6640119115
380792.3426084939
380833.0212050764
380873.6998016589
380914.37839824136
380955.05699482386
380995.7355914063
381036.4141879888
381077.0927845712
381117.7713811537
381158.44997773616
381199.12857431866
381239.8071709011
381280.4857674836
381321.16436406603
381361.84296064853
381402.52155723097
381443.20015381346
381483.8787503959
381524.5573469784
381565.2359