In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import numpy
import xgboost as xgb



In [2]:
# Load the training log from train.csv (path is relative to the working directory).
train_df = pd.read_csv("train.csv")

In [3]:
# Load the validation log from validation.csv (path is relative to the working directory).
valid_df = pd.read_csv("validation.csv")

### Clean Data

In [4]:
# Replace the slotformat placeholder string "Na" with the sentinel code 10.
# A vectorised .loc assignment is used instead of an iterrows()/set_value()
# loop because:
#   * `value in "Na"` is a substring test, so "", "N" and "a" matched as well
#     and any non-string value raised TypeError;
#   * DataFrame.set_value is not available in newer pandas releases;
#   * a Python-level loop over every row is needlessly slow.
train_df.loc[train_df['slotformat'] == 'Na', 'slotformat'] = 10

# Encode adexchange as an integer, using 0 when the exchange is missing.
# pd.to_numeric copes with both shapes the CSV reader can produce: string codes
# ('1'..'4' plus the literal 'null') and an already-numeric column in which
# 'null' was parsed as NaN.  A string-keyed .map() turns the latter into an
# all-NaN column.
train_df['adexchange'] = (
    pd.to_numeric(train_df['adexchange'], errors='coerce').fillna(0).astype(int)
)

In [5]:
# Replace the slotformat placeholder string "Na" with the sentinel code 10.
# A vectorised .loc assignment is used instead of an iterrows()/set_value()
# loop because:
#   * `value in "Na"` is a substring test, so "", "N" and "a" matched as well
#     and any non-string value raised TypeError;
#   * DataFrame.set_value is not available in newer pandas releases;
#   * a Python-level loop over every row is needlessly slow.
valid_df.loc[valid_df['slotformat'] == 'Na', 'slotformat'] = 10

# Encode adexchange as an integer, using 0 when the exchange is missing.
# pd.to_numeric copes with both shapes the CSV reader can produce: string codes
# ('1'..'4' plus the literal 'null') and an already-numeric column in which
# 'null' was parsed as NaN.  A string-keyed .map() turns the latter into an
# all-NaN column.
valid_df['adexchange'] = (
    pd.to_numeric(valid_df['adexchange'], errors='coerce').fillna(0).astype(int)
)

In [6]:
# Derive slotprice_group as one tenth of slotprice, for the training and the
# validation frame alike.
# NOTE(review): true division yields floats rather than discrete buckets --
# confirm whether floor division was intended for the "grouping".
for frame in (train_df, valid_df):
    frame['slotprice_group'] = frame['slotprice'] / 10

In [7]:
# Split useragent ("<os>_<browser>") into separate os / browser columns.
#
# The parts are assigned column by column, which keeps them aligned on each
# frame's own index and makes the cell safe to re-run.  The previous
# pd.concat(axis=1) approach appended a second os/browser pair on every
# re-execution and would misalign rows whenever a frame's index was not 0..n-1.
for frame in (train_df, valid_df):
    # `n` is passed by keyword (newer pandas rejects the positional form);
    # reindex guarantees both part columns exist even if no value contains "_".
    parts = (
        frame['useragent']
        .str.split('_', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    frame['os'] = parts[0]
    frame['browser'] = parts[1]


In [8]:

# Lookup tables that turn the os / browser labels into integer codes.
# Labels missing from a table come out of .map() as NaN.
o = {
    'android': 1,
    'ios': 2,
    'linux': 3,
    'mac': 4,
    'windows': 5,
    'other': 6,
}
b = {
    'chrome': 1,
    'firefox': 2,
    'ie': 3,
    'maxthon': 4,
    'other': 5,
    'opera': 6,
    'safari': 7,
    'sogou': 8,
    'theworld': 9,
}

# Apply both encodings to the training and the validation frame.
for frame in (train_df, valid_df):
    frame['os'] = frame['os'].map(o)
    frame['browser'] = frame['browser'].map(b)



In [9]:
# Ad slot area = slotwidth x slotheight, added to both data sets.
train_df['area'] = train_df['slotwidth'].mul(train_df['slotheight'])
valid_df['area'] = valid_df['slotwidth'].mul(valid_df['slotheight'])


In [10]:
# Encode slotvisibility as a numeric code.  Numeric strings and the named
# views each get their own code; 'Na', "Null" and every value that is not in
# the table end up as 10 (unmapped values become NaN and are then filled).
d = {
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '0': 0,
    '255': 5,
    'FirstView': 6,
    'FourthView': 7,
    'Na': 10,
    'OtherView': 8,
    'SecondView': 9,
    "Null": 10,
}

train_df['slotvisibility'] = train_df['slotvisibility'].map(d).fillna(10)
valid_df['slotvisibility'] = valid_df['slotvisibility'].map(d).fillna(10)

In [11]:
# Fold weekday and hour into one feature: wh = weekday * 100 + hour.
train_df['wh'] = train_df['weekday'].mul(100).add(train_df['hour'])
valid_df['wh'] = valid_df['weekday'].mul(100).add(valid_df['hour'])

In [12]:
# Summarise the columns and dtypes of the prepared training frame.
train_df.info()
# (disabled exploration) row counts per payprice for non-clicked impressions:
#train_df[train_df["click"]==0].groupby(['payprice']).count()
 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2697738 entries, 0 to 2697737
Data columns (total 31 columns):
click              int64
weekday            int64
hour               int64
bidid              object
logtype            int64
userid             object
useragent          object
IP                 object
region             int64
city               int64
adexchange         int64
domain             object
url                object
urlid              object
slotid             object
slotwidth          int64
slotheight         int64
slotvisibility     float64
slotformat         object
slotprice          int64
creative           object
bidprice           int64
payprice           int64
keypage            object
advertiser         int64
usertag            object
slotprice_group    float64
os                 int64
browser            int64
area               int64
wh                 int64
dtypes: float64(2), int64(17), object(12)
memory usage: 638.0+ MB


### Bidding

In [13]:
# Columns used as model inputs.
features = [
    "wh",
    "city",
    "region",
    "area",
    "slotprice",
    "slotvisibility",
    "advertiser",
    "adexchange",
    "os",
    "browser",
]

# Training inputs (cast to int in one step) and the click target.
X_train = train_df[features].astype("int")
y_train = train_df['click']


In [14]:
# Fit a boosted-tree model with a logistic objective; its predictions are used
# further down as the estimated click probability (pCTR).
xgb_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.5,
    objective='binary:logistic',
)
clf = xgb.XGBRegressor(**xgb_params)
clf.fit(X_train, y_train, verbose=True)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.5, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [15]:
# Validation inputs: same feature columns as training, cast to int.
X_valid = valid_df[features].astype("int")

In [16]:
# Score every validation row with the fitted model.  The model was built with
# objective='binary:logistic', and the outputs are used below as per-row click
# probabilities (pCTR).
y_p_valid = clf.predict(X_valid)

In [17]:
# Mean predicted click probability over the validation set; used as the
# normaliser when predictions are scaled into bids.
# np.mean with a float64 accumulator replaces the Python-level sum()/len():
# it avoids an interpreted loop over every prediction and keeps the
# accumulation in double precision whatever dtype the predictions have.
avgCTR = np.mean(y_p_valid, dtype=np.float64)
print("Average CTR ", avgCTR)

Average CTR  0.000752739847911


In [18]:
def calculate_bid(base_bid, y_pred, avgCTR):
    """Scale predicted click probabilities into bid prices.

    Every prediction is multiplied by ``base_bid / avgCTR``, so a prediction
    equal to ``avgCTR`` produces exactly ``base_bid``.

    Parameters
    ----------
    base_bid : number
        Bid placed for an impression whose prediction equals ``avgCTR``.
    y_pred : array-like
        Predicted click probabilities.
    avgCTR : number
        Normalising click-through rate (must be non-zero).

    Returns
    -------
    The element-wise product of ``y_pred`` and ``base_bid / avgCTR``.
    """
    scale = base_bid / avgCTR
    return np.multiply(y_pred, scale)

In [19]:
def max_bid_base(prices, y_p_valid, avgCTR, df=None):
    """Pick the base bid whose won auctions have the highest click-through rate.

    For every candidate in ``prices`` the bids are computed with
    ``calculate_bid``; an auction counts as won when the logged ``bidprice`` is
    strictly below our bid.  The CTR of a candidate is clicks / won auctions
    (0 when nothing is won).  Ties go to the first candidate.

    Parameters
    ----------
    prices : sequence of numbers
        Candidate base bids; must be indexable and non-empty.
    y_p_valid : array-like
        Predicted click probabilities, one per row of ``df``.
    avgCTR : number
        Normalising click-through rate passed on to ``calculate_bid``.
    df : DataFrame, optional
        Frame with ``bidprice`` and ``click`` columns.  Defaults to the
        notebook-level ``valid_df`` so existing calls keep working.

    Returns
    -------
    The element of ``prices`` with the highest CTR.

    Raises
    ------
    ValueError
        If ``prices`` is empty.
    """
    if df is None:
        df = valid_df  # fall back to the notebook-level validation frame
    if len(prices) == 0:
        raise ValueError("prices must contain at least one candidate base bid")

    # Plain arrays: the loop below runs once per candidate, so avoid
    # per-iteration Series construction and Python-level sum() over a Series.
    market_price = df['bidprice'].values
    clicks = df['click'].values
    ration = np.zeros(len(prices))

    for i, base_bid in enumerate(prices):
        bids = np.asarray(calculate_bid(base_bid, y_p_valid, avgCTR))
        won = market_price < bids
        n_won = won.sum()
        if n_won:
            ration[i] = clicks[won].sum() / n_won  # clicked / won auctions

    best = ration.argmax()
    print("The bid_base with best CTR: ", prices[best])
    print("CTR for winning bid_base is ", ration.max())
    return prices[best]

In [24]:
# Candidate base bids to search over.
prices = range(1, 450, 1)

# Pick the base bid with the best CTR and turn the predictions into bids.
bid_base = max_bid_base(prices, y_p_valid, avgCTR)
predicted_bid_v = calculate_bid(bid_base, y_p_valid, avgCTR)

# An auction counts as won when the logged bidprice is strictly below our bid.
# The mask is built once and reused for clicks and cost.
won_mask = valid_df.bidprice < predicted_bid_v
impressions_c = valid_df.click[won_mask]
total_cost_c = valid_df.payprice[won_mask].sum()
clicks_c = impressions_c.sum()

# Cost per click is undefined without clicks; report NaN instead of dividing
# by zero.
cpc_c = total_cost_c / clicks_c if clicks_c else float("nan")

print("Cost for valid set ", total_cost_c, ", CPC ", cpc_c, " impressions ",
      len(impressions_c), " clicks ", clicks_c)

The bid_base with best CTR:  4
CTR for winning bid_base is  0.25
Cost for valid set  703 , CPC  703.0  impressions  4  clicks  1


In [21]:
# Cross-check of the total spend: sum the payprice of every validation auction
# whose logged bidprice is strictly below our bid at the chosen base bid.
tmp = calculate_bid(bid_base, y_p_valid, avgCTR)
won_auctions = valid_df.bidprice < tmp
tmp_2 = valid_df.payprice[won_auctions]
total_spend = sum(tmp_2)
print("Cost for valid set ", total_spend)


Cost for valid set  703


In [23]:
# Replay the validation auctions in file order under a fixed spending budget.
BUDGET = 6250

our_bids = calculate_bid(bid_base, y_p_valid, avgCTR) # our bid for every validation auction
price_payed = 0
clicked_won = 0
won = 0

# Plain arrays give positional access that does not depend on valid_df's index
# labels and avoid a per-element Series lookup on every iteration.
market_prices = valid_df.bidprice.values
pay_prices = valid_df.payprice.values
clicks = valid_df.click.values

for bid, market_price, pay_price, click in zip(our_bids, market_prices, pay_prices, clicks):
    # NOTE(review): ties count as a win here (<=) while the other cells of this
    # notebook use a strict <; confirm which rule is intended.
    if market_price <= bid:
        # Check affordability BEFORE paying, so the total spend (and the CPC
        # derived from it) never includes an auction we could not afford.
        if price_payed + pay_price >= BUDGET:
            break
        price_payed = price_payed + pay_price
        won = won + 1
        if click == 1:
            clicked_won = clicked_won + 1

print("With budget %d we won " % BUDGET, won, " auctions and on ", clicked_won, " was clicked" )

# Cost per click is undefined without clicks; report NaN instead of dividing
# by zero.
cpc = price_payed / clicked_won if clicked_won else float("nan")
print("CPC is then ", cpc)


With budget 6250 we won  4  auctions and on  1  was clicked
CPC is then  703.0


## Summary Results
The average predicted probability that an ad will be clicked (mean pCTR over the validation set):

avgCTR = 0.000752739847911

The bid_base with best CTR:  4

CTR for winning bid_base is  0.25

Cost for valid set  703 , 

CPC  703.0  

impressions  4  

clicks  1
