***
# Facebook Recruiting IV: Human or Robot?
Predict if an online bid is made by a machine or a human
***

## 1.0 - Libraries, Datasets
***

In [1]:
import pandas as pd
import numpy as np
# import missingno as msno
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import statistics

In [2]:
bids_df = pd.read_csv("data/bids.csv")
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

***
## 2.0 - Feature Engineering
***

***
### 2.1 - Time
***
#### Summary of time-related features generated (dataframe label in brackets):
- #### `instant_bids` (instant_bids_df): No. of simulataneous bids

    Bots would typically be able to perform multiple bids at the same exact time compared to humans


- #### `mean`, `std`, `min`, `50%`, `max`, `iqr_range` (time_stats_df)

    Various aggregation features


- #### `8h_1`, `8h_2`, `8h_3` (time_transform_df)
- #### `6h_1`, `6h_2`, `6h_3`, `6h_4` (time_transform_df)
- #### `4h_1`, `4h_2`, `4h_3`, `4h_4`, `4h_5`, `4h_6` (time_transform_df)

    These 8-hour / 6-hour / 4-hour timeframe features were generated to identify potential bidding patterns at a certain time of day. (e.g. a bidder could typically be more active bidding in the morning, or a bot could be bidding at wee-hours where humans would be asleep)
***

In [3]:
# Generating a separate dataframe for analyzing time-based features

time_df = bids_df[["bidder_id", "time"]].copy()
time_df

Unnamed: 0,bidder_id,time
0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,9759243157894736
1,668d393e858e8126275433046bbd35c6tywop,9759243157894736
2,aa5f360084278b35d746fa6af3a7a1a5ra3xe,9759243157894736
3,3939ac3ef7d472a59a9c5f893dd3e39fh9ofi,9759243157894736
4,8393c48eaf4b8fa96886edc7cf27b372dsibi,9759243157894736
...,...,...
7656329,626159dd6f2228ede002d9f9340f75b7puk8d,9709222052631578
7656330,a318ea333ceee1ba39a494476386136a826dv,9709222052631578
7656331,f5b2bbad20d1d7ded3ed960393bec0f40u6hn,9709222052631578
7656332,d4bd412590f5106b9d887a43c51b254eldo4f,9709222052631578


In [4]:
# Time difference between one bid to another, for the entire auction
time_df["time_diff"] = time_df.groupby("bidder_id")["time"].diff()
# time_df.to_csv("data/time_df.csv") # saving to csv in case takes too long to load

In [5]:
# Checking for any missing data in time_diff
print("Rows with missing data:", time_df["time_diff"].isna().sum())
# Removing those rows with missing data
time_df = time_df[["bidder_id", "time_diff"]].dropna()
print("Rows with missing data after removing:", time_df["time_diff"].isna().sum())

Rows with missing data: 6614
Rows with missing data after removing: 0


In [6]:
time_df

Unnamed: 0,bidder_id,time_diff
10,a58ace8b671a7531c88814bc86b2a34cf0crb,0.000000e+00
14,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,1.052632e+08
23,6c8ffec692e88d81e1b18d47818abb04v6llr,0.000000e+00
25,668d393e858e8126275433046bbd35c6tywop,1.578947e+08
27,aa5f360084278b35d746fa6af3a7a1a5ra3xe,1.578947e+08
...,...,...
7656329,626159dd6f2228ede002d9f9340f75b7puk8d,3.684211e+08
7656330,a318ea333ceee1ba39a494476386136a826dv,3.631579e+09
7656331,f5b2bbad20d1d7ded3ed960393bec0f40u6hn,5.263158e+07
7656332,d4bd412590f5106b9d887a43c51b254eldo4f,3.157895e+08


In [7]:
instant_bids_df = time_df[time_df["time_diff"] == 0].groupby("bidder_id").count().reset_index()
instant_bids_df = instant_bids_df.rename(columns = {"time_diff": "instant_bids"})
instant_bids_df ##### feature

Unnamed: 0,bidder_id,instant_bids
0,0053b78cde37c4384a20d2da9aa4272aym4pb,728
1,00a79ebd15f0b24a0a3b5794457cd8ed7dng1,29
2,00b519ec8ed5e370328451379bb708a306eoj,1
3,00e0f614d9dd32dd27f6080f472d2934emlos,15
4,019cf2d366df756c092c91e26f406acdozha7,1
...,...,...
1317,fef833fd1639d647db78851c77ca716b5lnxz,3
1318,ff58ffde976a4899dcd89597a7877e18lntgz,2
1319,ff74c92e8503ad93ac6c51695b373e644hzbn,305
1320,ff92ea4abd33ed38601287f0e1d6726dmgx1f,1


In [8]:
time_stats_df = time_df.groupby("bidder_id")["time_diff"].describe().reset_index()
time_stats_df

Unnamed: 0,bidder_id,count,mean,std,min,25%,50%,75%,max
0,002d229ffb247009810828f648afc2ef593rb,1.0,1.052632e+08,,1.052632e+08,1.052632e+08,1.052632e+08,1.052632e+08,1.052632e+08
1,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,2.0,3.297784e+13,3.904443e+13,5.369263e+12,1.917355e+13,3.297784e+13,4.678213e+13,6.058642e+13
2,00486a11dff552c4bd7696265724ff81yeo9v,19.0,4.018413e+12,1.153730e+13,3.684211e+08,1.101316e+11,9.177368e+11,1.806895e+12,5.094174e+13
3,0051aef3fdeacdadba664b9b3b07e04e4coc6,67.0,1.635106e+11,5.770740e+11,5.263158e+07,4.736842e+08,2.736842e+09,1.276316e+10,3.792368e+12
4,0053b78cde37c4384a20d2da9aa4272aym4pb,10938.0,7.065316e+09,4.784394e+11,0.000000e+00,1.578947e+08,3.684211e+08,1.157895e+09,5.002753e+13
...,...,...,...,...,...,...,...,...,...
5552,ffacbed056cbfaa60c1fcf51f0d381bddr3ly,8.0,8.825230e+12,1.803203e+13,2.736842e+09,4.560658e+11,1.948684e+12,5.720447e+12,5.289747e+13
5553,ffaf0a972a6dcb3910fd6b16045781e2ava5y,2.0,4.388158e+12,1.328914e+12,3.448474e+12,3.918316e+12,4.388158e+12,4.858000e+12,5.327842e+12
5554,ffbc0fdfbf19a8a9116b68714138f2902cc13,25074.0,5.440129e+08,6.661264e+08,0.000000e+00,1.052632e+08,3.157895e+08,6.842105e+08,8.842105e+09
5555,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,21.0,3.596346e+12,1.090881e+13,1.368421e+09,6.889474e+10,6.903684e+11,2.603526e+12,5.082974e+13


In [9]:
# Generating interquartile (IQR) range, i.e. spread of data, where:
# IQR = 75% - 25%
time_stats_df["iqr_range"] = time_stats_df["75%"] - time_stats_df["25%"]

# Dropping columns that are not as relevant
time_stats_df = time_stats_df.drop(columns = {"count", "25%", "75%"})

In [10]:
time_stats_df

Unnamed: 0,bidder_id,mean,std,min,50%,max,iqr_range
0,002d229ffb247009810828f648afc2ef593rb,1.052632e+08,,1.052632e+08,1.052632e+08,1.052632e+08,0.000000e+00
1,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3.297784e+13,3.904443e+13,5.369263e+12,3.297784e+13,6.058642e+13,2.760858e+13
2,00486a11dff552c4bd7696265724ff81yeo9v,4.018413e+12,1.153730e+13,3.684211e+08,9.177368e+11,5.094174e+13,1.696763e+12
3,0051aef3fdeacdadba664b9b3b07e04e4coc6,1.635106e+11,5.770740e+11,5.263158e+07,2.736842e+09,3.792368e+12,1.228947e+10
4,0053b78cde37c4384a20d2da9aa4272aym4pb,7.065316e+09,4.784394e+11,0.000000e+00,3.684211e+08,5.002753e+13,1.000000e+09
...,...,...,...,...,...,...,...
5552,ffacbed056cbfaa60c1fcf51f0d381bddr3ly,8.825230e+12,1.803203e+13,2.736842e+09,1.948684e+12,5.289747e+13,5.264382e+12
5553,ffaf0a972a6dcb3910fd6b16045781e2ava5y,4.388158e+12,1.328914e+12,3.448474e+12,4.388158e+12,5.327842e+12,9.396842e+11
5554,ffbc0fdfbf19a8a9116b68714138f2902cc13,5.440129e+08,6.661264e+08,0.000000e+00,3.157895e+08,8.842105e+09,5.789474e+08
5555,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,3.596346e+12,1.090881e+13,1.368421e+09,6.903684e+11,5.082974e+13,2.534632e+12


In [11]:
print(time_stats_df.isna().sum(), "\n")

# Filling NaN std values with zero based on the following rationale:
# - almost 10% of the values are NaN, removing them may have significant impact
# - the std values for the non-zero rows are very small,
#   so filling them with zeros would have minimal impact

time_stats_df = time_stats_df.fillna(0)
      
print(time_stats_df.isna().sum())

bidder_id      0
mean           0
std          521
min            0
50%            0
max            0
iqr_range      0
dtype: int64 

bidder_id    0
mean         0
std          0
min          0
50%          0
max          0
iqr_range    0
dtype: int64


In [12]:
time_stats_df ##### feature

Unnamed: 0,bidder_id,mean,std,min,50%,max,iqr_range
0,002d229ffb247009810828f648afc2ef593rb,1.052632e+08,0.000000e+00,1.052632e+08,1.052632e+08,1.052632e+08,0.000000e+00
1,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3.297784e+13,3.904443e+13,5.369263e+12,3.297784e+13,6.058642e+13,2.760858e+13
2,00486a11dff552c4bd7696265724ff81yeo9v,4.018413e+12,1.153730e+13,3.684211e+08,9.177368e+11,5.094174e+13,1.696763e+12
3,0051aef3fdeacdadba664b9b3b07e04e4coc6,1.635106e+11,5.770740e+11,5.263158e+07,2.736842e+09,3.792368e+12,1.228947e+10
4,0053b78cde37c4384a20d2da9aa4272aym4pb,7.065316e+09,4.784394e+11,0.000000e+00,3.684211e+08,5.002753e+13,1.000000e+09
...,...,...,...,...,...,...,...
5552,ffacbed056cbfaa60c1fcf51f0d381bddr3ly,8.825230e+12,1.803203e+13,2.736842e+09,1.948684e+12,5.289747e+13,5.264382e+12
5553,ffaf0a972a6dcb3910fd6b16045781e2ava5y,4.388158e+12,1.328914e+12,3.448474e+12,4.388158e+12,5.327842e+12,9.396842e+11
5554,ffbc0fdfbf19a8a9116b68714138f2902cc13,5.440129e+08,6.661264e+08,0.000000e+00,3.157895e+08,8.842105e+09,5.789474e+08
5555,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,3.596346e+12,1.090881e+13,1.368421e+09,6.903684e+11,5.082974e+13,2.534632e+12


In [13]:
# Attempting to decipher time 

time_transform_df = bids_df[["bidder_id", "time"]].copy()
time_transform_df = time_transform_df.groupby("bidder_id")["time"].mean().reset_index()
time_transform_df["time_transform"] = pd.to_datetime(time_transform_df["time"])

# Comparing time_transform to time:
# - The last 9 digits are the same.
# - The year is 1970, which is an unlikely year that online auctions took place/gained traction.
# This could mean that time may not have been clarified properly.
# Nonetheless it is good enough of a reference to use for comparison + further feature engineering.

# Separate hour / minute / second from time_transform
time_transform_df["hour"] = time_transform_df["time_transform"].dt.hour
time_transform_df["minute"] = time_transform_df["time_transform"].dt.minute
time_transform_df["second"] = time_transform_df["time_transform"].dt.second

time_transform_df = time_transform_df.drop(columns = ["time"])
time_transform_df

Unnamed: 0,bidder_id,time_transform,hour,minute,second
0,001068c415025a009fee375a12cff4fcnht8y,1970-04-23 08:12:25.052631578,8,12,25
1,002d229ffb247009810828f648afc2ef593rb,1970-04-24 00:59:04.157894736,0,59,4
2,0030a2dd87ad2733e0873062e4f83954mkj86,1970-04-23 07:42:33.947368420,7,42,33
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1970-04-23 01:29:59.333333334,1,29,59
4,00486a11dff552c4bd7696265724ff81yeo9v,1970-04-22 17:43:45.771052630,17,43,45
...,...,...,...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,1970-04-24 00:35:16.592177152,0,35,16
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,1970-04-22 19:50:36.657894736,19,50,36
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1970-04-22 14:26:19.947368420,14,26,19
6612,ffd62646d600b759a985d45918bd6f0431vmz,1970-04-24 00:50:05.502615726,0,50,5


In [14]:
# Defining three different 8-hour periods

time_transform_df["8h_1"] = 0
time_transform_df["8h_2"] = 0
time_transform_df["8h_3"] = 0

# Assigning each bidder to his/her repective 6-hour period

hour = time_transform_df["hour"]

for i in time_transform_df.index:
    
    if (hour.iloc[i] >= 0) and (hour.iloc[i] < 8):
        time_transform_df.loc[i, "8h_1"] = 1
        
    elif (hour.iloc[i] >= 8) and (hour.iloc[i] < 16):
        time_transform_df.loc[i, "8h_2"] = 1
                           
    else:
        time_transform_df.loc[i, "8h_3"] = 1

In [15]:
# Defining four different 6-hour periods

time_transform_df["6h_1"] = 0
time_transform_df["6h_2"] = 0
time_transform_df["6h_3"] = 0
time_transform_df["6h_4"] = 0

# Assigning each bidder to his/her repective 6-hour period

hour = time_transform_df["hour"]

for i in time_transform_df.index:
    
    if (hour.iloc[i] >= 0) and (hour.iloc[i] < 6):
        time_transform_df.loc[i, "6h_1"] = 1
        
    elif (hour.iloc[i] >= 6) and (hour.iloc[i] < 12):
        time_transform_df.loc[i, "6h_2"] = 1
            
    elif (hour.iloc[i] >= 12) and (hour.iloc[i] < 18):
        time_transform_df.loc[i, "6h_3"] = 1
                
    else:
        time_transform_df.loc[i, "6h_4"] = 1

In [16]:
# Defining six different 4-hour periods

time_transform_df["4h_1"] = 0
time_transform_df["4h_2"] = 0
time_transform_df["4h_3"] = 0
time_transform_df["4h_4"] = 0
time_transform_df["4h_5"] = 0
time_transform_df["4h_6"] = 0

# Assigning each bidder to his/her repective 4-hour period

hour = time_transform_df["hour"]

for i in time_transform_df.index:
    
    if (hour.iloc[i] >= 0) and (hour.iloc[i] < 4):
        time_transform_df.loc[i, "4h_1"] = 1
        
    elif (hour.iloc[i] >= 4) and (hour.iloc[i] < 8):
        time_transform_df.loc[i, "4h_2"] = 1

    elif (hour.iloc[i] >= 8) and (hour.iloc[i] < 12):
        time_transform_df.loc[i, "4h_3"] = 1        

    elif (hour.iloc[i] >= 12) and (hour.iloc[i] < 16):
        time_transform_df.loc[i, "4h_4"] = 1        

    elif (hour.iloc[i] >= 16) and (hour.iloc[i] < 20):
        time_transform_df.loc[i, "4h_5"] = 1           

    else:
        time_transform_df.loc[i, "4h_6"] = 1

In [17]:
time_transform_df

Unnamed: 0,bidder_id,time_transform,hour,minute,second,8h_1,8h_2,8h_3,6h_1,6h_2,6h_3,6h_4,4h_1,4h_2,4h_3,4h_4,4h_5,4h_6
0,001068c415025a009fee375a12cff4fcnht8y,1970-04-23 08:12:25.052631578,8,12,25,0,1,0,0,1,0,0,0,0,1,0,0,0
1,002d229ffb247009810828f648afc2ef593rb,1970-04-24 00:59:04.157894736,0,59,4,1,0,0,1,0,0,0,1,0,0,0,0,0
2,0030a2dd87ad2733e0873062e4f83954mkj86,1970-04-23 07:42:33.947368420,7,42,33,1,0,0,0,1,0,0,0,1,0,0,0,0
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1970-04-23 01:29:59.333333334,1,29,59,1,0,0,1,0,0,0,1,0,0,0,0,0
4,00486a11dff552c4bd7696265724ff81yeo9v,1970-04-22 17:43:45.771052630,17,43,45,0,0,1,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,1970-04-24 00:35:16.592177152,0,35,16,1,0,0,1,0,0,0,1,0,0,0,0,0
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,1970-04-22 19:50:36.657894736,19,50,36,0,0,1,0,0,0,1,0,0,0,0,1,0
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1970-04-22 14:26:19.947368420,14,26,19,0,1,0,0,0,1,0,0,0,0,1,0,0
6612,ffd62646d600b759a985d45918bd6f0431vmz,1970-04-24 00:50:05.502615726,0,50,5,1,0,0,1,0,0,0,1,0,0,0,0,0


***
### 2.2 - Auction
***
#### Summary of auction-related features generated (dataframe label in brackets):
- #### `first_bid`, `last_bid` (auction_df)

    No. of times each bidder was the first or last bid of an auction
    
    Bots may have a higher tendency to detect a new auction on the site through webscraping and be the first bidder.
    
    Last bids in this case would assume that the bidder was the winner of the auction.
    
        
- ####  `num_bids` (count_auctions_df)

    Total no. of bids each bidder made over the entire dataset


- #### `bids_per_auction` (bids_per_auction_df)

    Average no. of bids each bidder made per auction


- #### `max_bids_per_auction` (max_bids_per_auction_df)

    Maximum no. of bids each bidder made for an auction
    
    A higher value would typically either mean one of the following:
    - The human really wanted the item in that auction very badly
    - The bot was designed to win that auction at all cost


- #### `ip_per_auction` (ip_per_auction_df)

    Average no. of IP addresses each bidder used (for bidding) per auction
    
    IP addresses are generally range-bound for a human (don't differ too much), while bots may be using spoofed IP addresses to avoid detection (and hence are less likely to reuse any IP addresses, hence increasing the no. of IP addresses used)
***

In [18]:
first_bid_df = bids_df.sort_values(['auction', 'time'])
first_bid_df = first_bid_df.groupby('auction').first().reset_index()
first_bid_df = first_bid_df.groupby('bidder_id').count()['bid_id'].reset_index()
first_bid_df = first_bid_df.rename(columns = {'bid_id': 'first_bid'})
# first_bid_df

In [19]:
last_bid_df = bids_df.sort_values(['auction', 'time'], ascending = [True, False])
last_bid_df = last_bid_df.groupby('auction').first().reset_index()
last_bid_df = last_bid_df.groupby('bidder_id').count()['bid_id'].reset_index()
last_bid_df = last_bid_df.rename(columns = {'bid_id': 'last_bid'})
# last_bid_df

In [20]:
auction_df = first_bid_df.merge(last_bid_df, on = "bidder_id", how = "left")
auction_df = auction_df.fillna(0)
auction_df ##### feature

Unnamed: 0,bidder_id,first_bid,last_bid
0,0053b78cde37c4384a20d2da9aa4272aym4pb,4,6.0
1,00a79ebd15f0b24a0a3b5794457cd8ed7dng1,7,0.0
2,00e0f614d9dd32dd27f6080f472d2934emlos,6,3.0
3,019cf2d366df756c092c91e26f406acdozha7,2,0.0
4,01cda526658455000913950f20cf31a2q6nsf,8,9.0
...,...,...,...
1465,ff58ffde976a4899dcd89597a7877e18lntgz,1,1.0
1466,ff74c92e8503ad93ac6c51695b373e644hzbn,28,37.0
1467,ffa7b0b0f144b1594131d99e50c17a0bwbym3,4,4.0
1468,ffbc0fdfbf19a8a9116b68714138f2902cc13,11,16.0


In [21]:
# Total number of bids each bidder made

count_auctions_df = bids_df.groupby("bidder_id")["auction"].count().reset_index()
count_auctions_df = count_auctions_df.rename(columns = {"auction": "num_bids"})
count_auctions_df ##### feature

Unnamed: 0,bidder_id,num_bids
0,001068c415025a009fee375a12cff4fcnht8y,1
1,002d229ffb247009810828f648afc2ef593rb,2
2,0030a2dd87ad2733e0873062e4f83954mkj86,1
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3
4,00486a11dff552c4bd7696265724ff81yeo9v,20
...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,25075
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,22
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1
6612,ffd62646d600b759a985d45918bd6f0431vmz,664


In [22]:
bids_per_auction_df = bids_df.groupby('bidder_id').nunique()[['bid_id', 'auction']].reset_index()
bids_per_auction_df = bids_per_auction_df.rename(columns = {"bid_id": "bid_count", "auction": "auction_count"})
bids_per_auction_df["bids_per_auction"] = bids_per_auction_df["bid_count"] / bids_per_auction_df["auction_count"]
bids_per_auction_df = bids_per_auction_df.drop(columns = ["bid_count", "auction_count"])
bids_per_auction_df ##### feature

Unnamed: 0,bidder_id,bids_per_auction
0,001068c415025a009fee375a12cff4fcnht8y,1.000000
1,002d229ffb247009810828f648afc2ef593rb,2.000000
2,0030a2dd87ad2733e0873062e4f83954mkj86,1.000000
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1.000000
4,00486a11dff552c4bd7696265724ff81yeo9v,1.538462
...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,39.364207
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,1.466667
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1.000000
6612,ffd62646d600b759a985d45918bd6f0431vmz,12.072727


In [23]:
# Calculating max no. of bids per auction
max_bids_per_auction_df = bids_df.groupby(["bidder_id", "auction"]).nunique(["bid_id", "device"]).reset_index()
max_bids_per_auction_df = max_bids_per_auction_df.groupby("bidder_id").max().reset_index()
max_bids_per_auction_df = max_bids_per_auction_df.drop(columns = ["auction", "merchandise", "device", "time", "country", "ip", "url"])
max_bids_per_auction_df = max_bids_per_auction_df.rename(columns = {"bid_id": "max_bids_per_auction"})
max_bids_per_auction_df ##### feature

Unnamed: 0,bidder_id,max_bids_per_auction
0,001068c415025a009fee375a12cff4fcnht8y,1
1,002d229ffb247009810828f648afc2ef593rb,2
2,0030a2dd87ad2733e0873062e4f83954mkj86,1
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1
4,00486a11dff552c4bd7696265724ff81yeo9v,3
...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,3682
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,4
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1
6612,ffd62646d600b759a985d45918bd6f0431vmz,78


In [24]:
ip_per_auction_df = bids_df.groupby('bidder_id').nunique()[['ip', 'auction']].reset_index()
ip_per_auction_df = ip_per_auction_df.rename(columns = {"ip": "ip_count", "auction": "auction_count"})
ip_per_auction_df["ip_per_auction"] = ip_per_auction_df["ip_count"] / ip_per_auction_df["auction_count"]
ip_per_auction_df = ip_per_auction_df.drop(columns = ["ip_count", "auction_count"])
ip_per_auction_df ##### feature

Unnamed: 0,bidder_id,ip_per_auction
0,001068c415025a009fee375a12cff4fcnht8y,1.000000
1,002d229ffb247009810828f648afc2ef593rb,1.000000
2,0030a2dd87ad2733e0873062e4f83954mkj86,1.000000
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1.000000
4,00486a11dff552c4bd7696265724ff81yeo9v,0.769231
...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,29.397174
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,1.200000
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1.000000
6612,ffd62646d600b759a985d45918bd6f0431vmz,0.672727


***
### 2.3 - Device
***
#### Summary of device-related features generated (dataframe label in brackets):
- #### `max_bids_per_device` (max_bids_per_device_df)

    Maximum no. of devices each bidder used. 
    (Similar rationale as below)
    
    
- #### `mean_bids_per_device` (mean_bids_per_device_df)

    Average no. of devices each bidder used (for bidding) per auction

    Humans typically stick with their same devices to participate in auctions, while bots may be using spoofed device IDs/emulators to avoid detection (and hence are less likely to reuse any, hence increasing the no. of devices used)
***

In [25]:
bids_df.head(3)

Unnamed: 0,bid_id,bidder_id,auction,merchandise,device,time,country,ip,url
0,0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,ewmzr,jewelry,phone0,9759243157894736,us,69.166.231.58,vasstdc27m7nks3
1,1,668d393e858e8126275433046bbd35c6tywop,aeqok,furniture,phone1,9759243157894736,in,50.201.125.84,jmqlhflrzwuay9c
2,2,aa5f360084278b35d746fa6af3a7a1a5ra3xe,wa00e,home goods,phone2,9759243157894736,py,112.54.208.157,vasstdc27m7nks3


In [26]:
bids_device_auction_df = bids_df.groupby(["bidder_id", "device"]).nunique(["bid_id", "auction"]).reset_index()
bids_device_auction_df

Unnamed: 0,bidder_id,device,bid_id,auction,merchandise,time,country,ip,url
0,001068c415025a009fee375a12cff4fcnht8y,phone561,1,1,1,1,1,1,1
1,002d229ffb247009810828f648afc2ef593rb,phone219,1,1,1,1,1,1,1
2,002d229ffb247009810828f648afc2ef593rb,phone640,1,1,1,1,1,1,1
3,0030a2dd87ad2733e0873062e4f83954mkj86,phone313,1,1,1,1,1,1,1
4,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,phone102,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...
485711,ffd62646d600b759a985d45918bd6f0431vmz,phone91,1,1,1,1,1,1,1
485712,ffd62646d600b759a985d45918bd6f0431vmz,phone94,1,1,1,1,1,1,1
485713,ffd62646d600b759a985d45918bd6f0431vmz,phone97,5,4,1,5,1,3,2
485714,ffd62646d600b759a985d45918bd6f0431vmz,phone98,1,1,1,1,1,1,1


In [27]:
# Calculating max no. of bids per device

max_bids_per_device_df = bids_device_auction_df.groupby("bidder_id").max().reset_index()
max_bids_per_device_df = max_bids_per_device_df.drop(columns = ["device", "auction", "merchandise", "time", "country", "ip", "url"])
max_bids_per_device_df = max_bids_per_device_df.rename(columns = {"bid_id": "max_bids_per_device"})
max_bids_per_device_df ##### feature

Unnamed: 0,bidder_id,max_bids_per_device
0,001068c415025a009fee375a12cff4fcnht8y,1
1,002d229ffb247009810828f648afc2ef593rb,1
2,0030a2dd87ad2733e0873062e4f83954mkj86,1
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1
4,00486a11dff552c4bd7696265724ff81yeo9v,7
...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,2432
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,4
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1
6612,ffd62646d600b759a985d45918bd6f0431vmz,200


In [28]:
# Calculating average no. of bids per device
mean_bids_per_device_df = bids_device_auction_df.groupby("bidder_id").mean().reset_index()
mean_bids_per_device_df = mean_bids_per_device_df.drop(columns = ["auction", "merchandise", "time", "country", "ip", "url"])
mean_bids_per_device_df = mean_bids_per_device_df.rename(columns = {"bid_id": "mean_bids_per_device"})
mean_bids_per_device_df ##### feature

Unnamed: 0,bidder_id,mean_bids_per_device
0,001068c415025a009fee375a12cff4fcnht8y,1.000000
1,002d229ffb247009810828f648afc2ef593rb,1.000000
2,0030a2dd87ad2733e0873062e4f83954mkj86,1.000000
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1.000000
4,00486a11dff552c4bd7696265724ff81yeo9v,2.500000
...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,31.660354
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,1.692308
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1.000000
6612,ffd62646d600b759a985d45918bd6f0431vmz,6.916667


***
### 2.4 - IP, URL
***
#### Summary of IP and URL-related features generated (dataframe label in brackets):
- #### `ip_per_bidder` (ip_per_bidder_df)

    No. of unique IP addresses each bidder used.
    Similar to `ip_per_auction`.


- #### `bid_per_ip` (bid_per_ip_df)

    No. of bids made per IP address

- #### `bid_per_url` (bid_per_url_df)

    No. of bids made per URL
***

In [29]:
bids_df.head(3)

Unnamed: 0,bid_id,bidder_id,auction,merchandise,device,time,country,ip,url
0,0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,ewmzr,jewelry,phone0,9759243157894736,us,69.166.231.58,vasstdc27m7nks3
1,1,668d393e858e8126275433046bbd35c6tywop,aeqok,furniture,phone1,9759243157894736,in,50.201.125.84,jmqlhflrzwuay9c
2,2,aa5f360084278b35d746fa6af3a7a1a5ra3xe,wa00e,home goods,phone2,9759243157894736,py,112.54.208.157,vasstdc27m7nks3


In [30]:
# Calculating IP per bidder (bots may have different IP addresses)

ip_per_bidder_df = bids_df.groupby(["bidder_id", "auction"])["ip"].count().reset_index()
ip_per_bidder_df = ip_per_bidder_df.groupby("bidder_id").mean().reset_index()
ip_per_bidder_df = ip_per_bidder_df.rename(columns = {"ip": "ip_per_bidder"})
ip_per_bidder_df ##### feature

Unnamed: 0,bidder_id,ip_per_bidder
0,001068c415025a009fee375a12cff4fcnht8y,1.000000
1,002d229ffb247009810828f648afc2ef593rb,2.000000
2,0030a2dd87ad2733e0873062e4f83954mkj86,1.000000
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1.000000
4,00486a11dff552c4bd7696265724ff81yeo9v,1.538462
...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,39.364207
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,1.466667
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1.000000
6612,ffd62646d600b759a985d45918bd6f0431vmz,12.072727


In [31]:
# Calculating bids per IP (bots may use multiple spoofed IP addresses to bid)
bid_per_ip_df = bids_df.groupby(["bidder_id", "ip"])["bid_id"].count().reset_index()
bid_per_ip_df = bid_per_ip_df.groupby("bidder_id").mean().reset_index()
bid_per_ip_df = bid_per_ip_df.rename(columns = {"bid_id": "bid_per_ip"}) 
bid_per_ip_df ##### feature

Unnamed: 0,bidder_id,bid_per_ip
0,001068c415025a009fee375a12cff4fcnht8y,1.000000
1,002d229ffb247009810828f648afc2ef593rb,2.000000
2,0030a2dd87ad2733e0873062e4f83954mkj86,1.000000
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1.000000
4,00486a11dff552c4bd7696265724ff81yeo9v,2.000000
...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,1.339047
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,1.222222
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1.000000
6612,ffd62646d600b759a985d45918bd6f0431vmz,17.945946


In [32]:
# Calculating bids per URL (bots are likely to "camp" on the same url and bid repeatedly, especially nearer to end of an auction)

bid_per_url_df = bids_df.groupby(["bidder_id", "url"])["bid_id"].count().reset_index()
bid_per_url_df = bid_per_url_df.groupby("bidder_id").mean().reset_index()
bid_per_url_df = bid_per_url_df.rename(columns = {"bid_id": "bid_per_url"}) 
bid_per_url_df ##### feature

Unnamed: 0,bidder_id,bid_per_url
0,001068c415025a009fee375a12cff4fcnht8y,1.000000
1,002d229ffb247009810828f648afc2ef593rb,2.000000
2,0030a2dd87ad2733e0873062e4f83954mkj86,1.000000
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,1.500000
4,00486a11dff552c4bd7696265724ff81yeo9v,2.857143
...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,3.119169
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,1.833333
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1.000000
6612,ffd62646d600b759a985d45918bd6f0431vmz,4.611111


***
### 2.5 - Additional Features
***
#### Summary of additional features generated (dataframe label in brackets):
- #### (bidder_unique_df)

    Using .nunique() to generate no. of unique features based on the original ones provided in the dataset.
***

In [33]:
bidder_unique_df = bids_df.groupby("bidder_id").nunique()
bidder_unique_df = bidder_unique_df.drop(columns=['bidder_id',"bid_id"], errors='ignore').reset_index()
bidder_unique_df ##### feature

Unnamed: 0,bidder_id,auction,merchandise,device,time,country,ip,url
0,001068c415025a009fee375a12cff4fcnht8y,1,1,1,1,1,1,1
1,002d229ffb247009810828f648afc2ef593rb,1,1,2,2,1,1,1
2,0030a2dd87ad2733e0873062e4f83954mkj86,1,1,1,1,1,1,1
3,003180b29c6a5f8f1d84a6b7b6f7be57tjj1o,3,1,3,3,1,3,2
4,00486a11dff552c4bd7696265724ff81yeo9v,13,1,8,20,1,10,7
...,...,...,...,...,...,...,...,...
6609,ffbc0fdfbf19a8a9116b68714138f2902cc13,637,1,792,23487,102,18726,8039
6610,ffc4e2dd2cc08249f299cab46ecbfacfobmr3,15,1,13,22,6,18,12
6611,ffd29eb307a4c54610dd2d3d212bf3bagmmpl,1,1,1,1,1,1,1
6612,ffd62646d600b759a985d45918bd6f0431vmz,55,1,96,664,1,37,144


***
### 2.6 - Feature Consolidation
***

In [34]:
## generate chart to deduce that payment_account and address has no relevance/significance

In [35]:
train_df = train_df.drop(columns = ["payment_account", "address"])

In [36]:
# Consolidating time-related features:

train_time_df = train_df.merge(instant_bids_df, on = "bidder_id", how = "left")
train_time_df = train_time_df.merge(time_stats_df, on = "bidder_id", how = "left")
train_time_df = train_time_df.merge(time_transform_df, on = "bidder_id", how = "left")

In [37]:
train_time_df = train_time_df.fillna(0) # fill NaN rows with zeroes

In [38]:
train_time_df

Unnamed: 0,bidder_id,outcome,instant_bids,mean,std,min,50%,max,iqr_range,time_transform,...,6h_1,6h_2,6h_3,6h_4,4h_1,4h_2,4h_3,4h_4,4h_5,4h_6
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,0.0,5.711121e+11,7.438736e+11,5.210526e+09,3.458421e+11,3.167632e+12,6.250263e+11,1970-04-24 00:48:46.934210526,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,624f258b49e77713fc34034560f93fb3hu3jo,0.0,0.0,3.233579e+12,1.759654e+12,1.989316e+12,3.233579e+12,4.477842e+12,1.244263e+12,1970-04-24 01:01:05.929824560,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1c5f4fc669099bfbfac515cd26997bd12ruaj,0.0,0.0,2.379000e+12,8.618847e+11,1.450842e+12,2.532053e+12,3.154105e+12,8.516316e+11,1970-04-24 01:43:14.684210526,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,4bee9aba2abda51bf43d639013d6efe12iycd,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1970-04-23 08:14:02.210526316,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4ab12bc61c82ddd9c2d65e60555808acqgos1,0.0,0.0,7.777888e+10,2.154285e+11,5.263158e+07,1.368421e+10,1.619211e+12,3.888158e+10,1970-04-23 23:41:58.088624786,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008,369515b3af4f8ca582f90271d30b14b6r52aw,0.0,3.0,2.099162e+12,9.464286e+12,0.000000e+00,6.842105e+08,5.605500e+13,9.436842e+10,1970-04-22 20:14:59.976608186,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2009,f939c17ffc7c39ac9b35b69e5e75179fv9pe2,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1970-04-23 07:52:31.684210526,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2010,c806dbb2decba0ed3c4ff5e2e60a74c2wjvbl,0.0,0.0,9.798474e+12,0.000000e+00,9.798474e+12,9.798474e+12,9.798474e+12,0.000000e+00,1970-04-24 01:15:01.289473684,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2011,0381a69b7a061e9ace2798fd48f1f537mgq57,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1970-04-22 12:52:34.421052632,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [39]:
# Consolidating auction-related features:

train_auction_df = train_df.merge(auction_df, on = "bidder_id", how = "left")
train_auction_df = train_auction_df.merge(count_auctions_df, on = "bidder_id", how = "left")
train_auction_df = train_auction_df.merge(bids_per_auction_df, on = "bidder_id", how = "left")
train_auction_df = train_auction_df.merge(max_bids_per_auction_df, on = "bidder_id", how = "left")
train_auction_df = train_auction_df.merge(ip_per_auction_df, on = "bidder_id", how = "left")

In [40]:
train_auction_df = train_auction_df.fillna(0) # fill NaN rows with zeroes

In [41]:
train_auction_df

Unnamed: 0,bidder_id,outcome,first_bid,last_bid,num_bids,bids_per_auction,max_bids_per_auction,ip_per_auction
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,0.0,0.0,24.0,1.333333,3.0,1.111111
1,624f258b49e77713fc34034560f93fb3hu3jo,0.0,0.0,0.0,3.0,3.000000,3.0,3.000000
2,1c5f4fc669099bfbfac515cd26997bd12ruaj,0.0,0.0,0.0,4.0,1.000000,1.0,1.000000
3,4bee9aba2abda51bf43d639013d6efe12iycd,0.0,0.0,0.0,1.0,1.000000,1.0,1.000000
4,4ab12bc61c82ddd9c2d65e60555808acqgos1,0.0,0.0,0.0,155.0,6.739130,38.0,5.347826
...,...,...,...,...,...,...,...,...
2008,369515b3af4f8ca582f90271d30b14b6r52aw,0.0,1.0,0.0,36.0,1.440000,4.0,0.200000
2009,f939c17ffc7c39ac9b35b69e5e75179fv9pe2,0.0,0.0,0.0,1.0,1.000000,1.0,1.000000
2010,c806dbb2decba0ed3c4ff5e2e60a74c2wjvbl,0.0,0.0,0.0,2.0,2.000000,2.0,2.000000
2011,0381a69b7a061e9ace2798fd48f1f537mgq57,0.0,0.0,0.0,1.0,1.000000,1.0,1.000000


In [42]:
# Consolidating device-related features:

train_device_df = train_df.merge(max_bids_per_device_df, on = "bidder_id", how = "left")
train_device_df = train_device_df.merge(mean_bids_per_device_df, on = "bidder_id", how = "left")

In [43]:
train_device_df = train_device_df.fillna(0)

In [44]:
train_device_df

Unnamed: 0,bidder_id,outcome,max_bids_per_device,mean_bids_per_device
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,6.0,1.714286
1,624f258b49e77713fc34034560f93fb3hu3jo,0.0,2.0,1.500000
2,1c5f4fc669099bfbfac515cd26997bd12ruaj,0.0,3.0,2.000000
3,4bee9aba2abda51bf43d639013d6efe12iycd,0.0,1.0,1.000000
4,4ab12bc61c82ddd9c2d65e60555808acqgos1,0.0,19.0,2.924528
...,...,...,...,...
2008,369515b3af4f8ca582f90271d30b14b6r52aw,0.0,33.0,9.000000
2009,f939c17ffc7c39ac9b35b69e5e75179fv9pe2,0.0,1.0,1.000000
2010,c806dbb2decba0ed3c4ff5e2e60a74c2wjvbl,0.0,1.0,1.000000
2011,0381a69b7a061e9ace2798fd48f1f537mgq57,0.0,1.0,1.000000


In [45]:
### Final feature consolidation

train_combined_df = train_time_df.merge(train_auction_df, on = "bidder_id", how = "left")
train_combined_df = train_combined_df.merge(train_device_df, on = "bidder_id", how = "left")
train_combined_df = train_combined_df.drop(columns = ["outcome_x", "outcome_y"])
train_combined_df = train_combined_df.merge(bidder_unique_df, on = "bidder_id", how = "left")

In [46]:
train_combined2_df = train_combined_df.merge(bid_per_url_df, on = "bidder_id", how = "left")
train_combined2_df = train_combined2_df.merge(ip_per_bidder_df, on ="bidder_id", how = "left")
train_combined2_df = train_combined2_df.merge(bid_per_ip_df, on = "bidder_id", how = "left")

train_combined2_df = train_combined2_df.fillna(0)

In [47]:
train_combined2_df

Unnamed: 0,bidder_id,instant_bids,mean,std,min,50%,max,iqr_range,time_transform,hour,...,auction,merchandise,device,time,country,ip,url,bid_per_url,ip_per_bidder,bid_per_ip
0,91a3c57b13234af24875c56fb7e2b2f4rb56a,0.0,5.711121e+11,7.438736e+11,5.210526e+09,3.458421e+11,3.167632e+12,6.250263e+11,1970-04-24 00:48:46.934210526,0.0,...,18.0,1.0,14.0,24.0,6.0,20.0,1.0,24.000000,1.333333,1.200000
1,624f258b49e77713fc34034560f93fb3hu3jo,0.0,3.233579e+12,1.759654e+12,1.989316e+12,3.233579e+12,4.477842e+12,1.244263e+12,1970-04-24 01:01:05.929824560,1.0,...,1.0,1.0,2.0,3.0,1.0,3.0,2.0,1.500000,3.000000,1.000000
2,1c5f4fc669099bfbfac515cd26997bd12ruaj,0.0,2.379000e+12,8.618847e+11,1.450842e+12,2.532053e+12,3.154105e+12,8.516316e+11,1970-04-24 01:43:14.684210526,1.0,...,4.0,1.0,2.0,4.0,1.0,4.0,2.0,2.000000,1.000000,1.000000
3,4bee9aba2abda51bf43d639013d6efe12iycd,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1970-04-23 08:14:02.210526316,8.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.000000,1.000000,1.000000
4,4ab12bc61c82ddd9c2d65e60555808acqgos1,0.0,7.777888e+10,2.154285e+11,5.263158e+07,1.368421e+10,1.619211e+12,3.888158e+10,1970-04-23 23:41:58.088624786,23.0,...,23.0,1.0,53.0,155.0,2.0,123.0,91.0,1.703297,6.739130,1.260163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008,369515b3af4f8ca582f90271d30b14b6r52aw,3.0,2.099162e+12,9.464286e+12,0.000000e+00,6.842105e+08,5.605500e+13,9.436842e+10,1970-04-22 20:14:59.976608186,20.0,...,25.0,1.0,4.0,33.0,4.0,5.0,2.0,18.000000,1.440000,7.200000
2009,f939c17ffc7c39ac9b35b69e5e75179fv9pe2,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1970-04-23 07:52:31.684210526,7.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.000000,1.000000,1.000000
2010,c806dbb2decba0ed3c4ff5e2e60a74c2wjvbl,0.0,9.798474e+12,0.000000e+00,9.798474e+12,9.798474e+12,9.798474e+12,0.000000e+00,1970-04-24 01:15:01.289473684,1.0,...,1.0,1.0,2.0,2.0,1.0,2.0,1.0,2.000000,2.000000,1.000000
2011,0381a69b7a061e9ace2798fd48f1f537mgq57,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1970-04-22 12:52:34.421052632,12.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.000000,1.000000,1.000000


***
### Ideas for potential features (that were not implemented)
***

1) Final stretch bidding frequency

- A bot that was designed to win as many auctions as possible, would likely be designed in such a way that its bidding frequency would be much higher in the last stretch of each auction (e.g. final 5 minutes).


2) 



***
## 3.0 - Model Training
***

In [48]:
print(train_combined2_df.drop(columns = ["outcome", "bidder_id"]).columns)

features = ['instant_bids', 'mean', 'std', 'min', '50%', 'max',
       'iqr_range', 'first_bid', 'last_bid', 'num_bids', 'bids_per_auction',
       'max_bids_per_auction', 'max_bids_per_device', 'mean_bids_per_device',
       'auction', 'merchandise', 'device', 'time', 'country', 'ip', 'url',
        'bid_per_url', 'ip_per_bidder', 'bid_per_ip', 'ip_per_auction', 
        '8h_1', '8h_2', '8h_3',
        '6h_1', '6h_2', '6h_3', '6h_4',
        '4h_1', '4h_2', '4h_3', '4h_4', '4h_5', '4h_6']

Index(['instant_bids', 'mean', 'std', 'min', '50%', 'max', 'iqr_range',
       'time_transform', 'hour', 'minute', 'second', '8h_1', '8h_2', '8h_3',
       '6h_1', '6h_2', '6h_3', '6h_4', '4h_1', '4h_2', '4h_3', '4h_4', '4h_5',
       '4h_6', 'first_bid', 'last_bid', 'num_bids', 'bids_per_auction',
       'max_bids_per_auction', 'ip_per_auction', 'max_bids_per_device',
       'mean_bids_per_device', 'auction', 'merchandise', 'device', 'time',
       'country', 'ip', 'url', 'bid_per_url', 'ip_per_bidder', 'bid_per_ip'],
      dtype='object')


***
#### Using train-test-split + XGBoost
***

In [49]:
# X = train_combined2_df[features]
# y = train_combined2_df["outcome"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [50]:
# model = xgb.XGBClassifier(random_state = 0,
#                           learning_rate = 0.6,
#                           max_depth = 100
#                           )
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)

# accuracy_test = metrics.accuracy_score(y_test, y_pred)
# auc_test = metrics.roc_auc_score(y_test, y_pred)
# print("XGBoost Accuracy:", accuracy_test)
# print("XGBoost AUC:", auc_test)

In [51]:
# model = xgb.XGBClassifier(learning_rate = 0.1, random_state = 0)

# XGBoost Accuracy: 0.9602649006622517
# XGBoost AUC: 0.7077429284084276

In [52]:
# model = xgb.XGBClassifier(random_state = 0,
#                           learning_rate = 0.4,
#                           max_depth = 100)

# XGBoost Accuracy: 0.9635761589403974
# XGBoost AUC: 0.7237701003024996

***
#### Using StratifiedKFold cross-validation + RandomForestClassifier
***

In [53]:
# %%time

# # Trying StratifiedKFold cross-validation

# X = train_combined2_df[features]
# y = train_combined2_df["outcome"]

# skf = StratifiedKFold(n_splits = 3)
# k_fold_AUC = []

# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]
     
#     model = RandomForestClassifier(random_state = 0,
#                                    max_depth = 20,
#                                    n_estimators = 250,
#                                    min_samples_leaf = 1)
    
#     model.fit(X_train, y_train)
#     y_proba = []
          
#     sub_y_proba = model.predict_proba(X_test)[:,1]
#     y_proba.append(sub_y_proba)
#     y_proba = np.mean(y_proba, axis = 0)

#     AUC = metrics.roc_auc_score(y_test, y_proba)
#     k_fold_AUC.append(AUC)

# mean_AUC = np.mean(k_fold_AUC)

# print(k_fold_AUC, "\n")
# print("StratifiedKFold Mean AUC:", mean_AUC)

# [0.9139117185335672, 0.9199833779665713, 0.9487196765498652] 

# StratifiedKFold Mean AUC: 0.9275382576833345

***
#### Using GridSearchCV for StratifiedKFold + RandomForestClassifier
***

In [54]:
%%time

import time

# Trying GridSearchCV

param_grid = [{'n_estimators': [100, 150, 200, 250, 300],
#                'max_features': [2, 3],
               'max_depth': [10, 20, 30, 40, 50],
               'min_samples_split': [2, 3, 4, 5, 6],
               'min_samples_leaf': [1, 2, 3, 4, 5]}]

optimal_params = GridSearchCV(RandomForestClassifier(),
                              param_grid,
                              cv = 10,
                              scoring = 'roc_auc',
                              n_jobs = -1)

auc_values = {}
auc_values_sorted = {}

X = train_combined2_df[features]
y = train_combined2_df["outcome"]

skf = StratifiedKFold(n_splits = 3)
k_fold_AUC = []

for train_index, test_index in skf.split(X, y):
    start = time.time()
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    optimal_params.fit(X_train, y_train)
    
    model = RandomForestClassifier(random_state = 0,
                                   n_estimators = optimal_params.best_params_['n_estimators'],
#                                    max_features = optimal_params.best_params_['max_features'],
                                   max_depth = optimal_params.best_params_['max_depth'],
                                   min_samples_split =  optimal_params.best_params_['min_samples_split'],
                                   min_samples_leaf =  optimal_params.best_params_['min_samples_leaf'])
    
    model.fit(X_train, y_train)
    
    y_pred_proba_output = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba_output)
    auc_values[metrics.auc(fpr, tpr)] = [optimal_params.best_params_['n_estimators'],
#                                  optimal_params.best_params_['max_features'],
                                         optimal_params.best_params_['max_depth'],
                                 optimal_params.best_params_['min_samples_split'],
                                 optimal_params.best_params_['min_samples_leaf']]
    
    for k, v in sorted(auc_values.items(), key = lambda x:x[0], reverse = True):
        auc_values_sorted[k] = v
        
    print(f"Optimal parameters are: {auc_values_sorted}")
    
    end = time.time()
    print(f"Time elapsed: {(end - start):.4f} seconds")
    
# Optimal parameters are: {0.919313879397913: [150, 20, 4, 3]}
# Time elapsed: 501.4450 seconds
# Optimal parameters are: {0.919313879397913: [150, 20, 4, 3], 0.915920214239542: [150, 20, 3, 2]}
# Time elapsed: 475.3431 seconds
# Optimal parameters are: {0.919313879397913: [150, 20, 4, 3], 0.915920214239542: [150, 20, 3, 2], 0.9484052111410601: [100, 10, 6, 3]}
# Time elapsed: 476.4395 seconds
# Wall time: 24min 13s

Optimal parameters are: {0.9112568104164742: [100, 20, 5, 4]}
Time elapsed: 994.6521 seconds
Optimal parameters are: {0.9112568104164742: [100, 20, 5, 4], 0.9159894727121618: [100, 50, 2, 2]}
Time elapsed: 1000.6035 seconds
Optimal parameters are: {0.9112568104164742: [100, 20, 5, 4], 0.9159894727121618: [100, 50, 2, 2], 0.9393530997304582: [150, 10, 2, 1]}
Time elapsed: 985.1261 seconds
Wall time: 49min 40s


In [57]:
%%time

# Applying optimal parameters from above GridSearchCV

X = train_combined2_df[features]
y = train_combined2_df["outcome"]

skf = StratifiedKFold(n_splits = 3)
k_fold_AUC = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
     
    model = RandomForestClassifier(random_state = 0,
                                   n_estimators = 150,
                                   max_depth = 30,
                                   min_samples_split = 3,
                                   min_samples_leaf = 3)
    
    model.fit(X_train, y_train)
    y_proba = []
          
    sub_y_proba = model.predict_proba(X_test)[:,1]
    y_proba.append(sub_y_proba)
    y_proba = np.mean(y_proba, axis = 0)

    AUC = metrics.roc_auc_score(y_test, y_proba)
    k_fold_AUC.append(AUC)

mean_AUC = np.mean(k_fold_AUC)

print(k_fold_AUC, "\n")
print("StratifiedKFold Mean AUC:", mean_AUC)


#     model = RandomForestClassifier(random_state = 0,
#                                    n_estimators = 150,
#                                    max_depth = 30,
#                                    min_samples_split = 3,
#                                    min_samples_leaf = 3)
# [0.919313879397913, 0.9190137593498938, 0.9497529200359389] 

# StratifiedKFold Mean AUC: 0.9293601862612485
# Public score: 0.89885
# Private score: 0.93822

[0.919313879397913, 0.9190137593498938, 0.9497529200359389] 

StratifiedKFold Mean AUC: 0.9293601862612485
Wall time: 971 ms


***
## 4.0 - Final Model
***

In [58]:
test_df = test_df.drop(columns = ["payment_account", "address"])

In [59]:
test_time_df = test_df.merge(instant_bids_df, on = "bidder_id", how = "left")
test_time_df = test_time_df.merge(time_stats_df, on = "bidder_id", how = "left")
test_time_df = test_time_df.merge(time_transform_df, on = "bidder_id", how = "left")
test_time_df = test_time_df.fillna(0)

In [60]:
test_auction_df = test_df.merge(auction_df, on = "bidder_id", how = "left")
test_auction_df = test_auction_df.merge(count_auctions_df, on = "bidder_id", how = "left")
test_auction_df = test_auction_df.merge(bids_per_auction_df, on = "bidder_id", how = "left")
test_auction_df = test_auction_df.merge(max_bids_per_auction_df, on = "bidder_id", how = "left")
test_auction_df = test_auction_df.merge(ip_per_auction_df, on = "bidder_id", how = "left")
test_auction_df = test_auction_df.fillna(0)

In [61]:
test_device_df = test_df.merge(max_bids_per_device_df, on = "bidder_id", how = "left")
test_device_df = test_device_df.merge(mean_bids_per_device_df, on = "bidder_id", how = "left")
test_device_df = test_device_df.fillna(0)

In [62]:
### Final feature consolidation

test_combined_df = test_time_df.merge(test_auction_df, on = "bidder_id", how = "left")
test_combined_df = test_combined_df.merge(test_device_df, on = "bidder_id", how = "left")
test_combined_df = test_combined_df.merge(bidder_unique_df, on = "bidder_id", how = "left")

In [63]:
# test_combined2_df = test_combined_df.merge(entropy_df, on = "bidder_id", how = "left")
test_combined2_df = test_combined_df.merge(bid_per_url_df, on = "bidder_id", how = "left")
test_combined2_df = test_combined2_df.merge(ip_per_bidder_df, on ="bidder_id", how = "left")
test_combined2_df = test_combined2_df.merge(bid_per_ip_df, on = "bidder_id", how = "left")
test_combined2_df = test_combined2_df.fillna(0)

In [64]:
final_model = model
final_model.fit(X, y)

X_kaggle = test_combined2_df[features]

probabilities = final_model.predict_proba(X_kaggle)
print("probabilities: \n", probabilities, "\n")

kaggle_preds = probabilities[:,1]
print("kaggle_preds: \n", kaggle_preds)

probabilities: 
 [[1.         0.        ]
 [0.98344444 0.01655556]
 [0.99032291 0.00967709]
 ...
 [0.86593001 0.13406999]
 [1.         0.        ]
 [0.98453968 0.01546032]] 

kaggle_preds: 
 [0.         0.01655556 0.00967709 ... 0.13406999 0.         0.01546032]


In [65]:
output_dataframe = pd.DataFrame({
    'bidder_id': test_combined2_df['bidder_id'],
    'prediction': kaggle_preds
})
output_dataframe.to_csv('my_predictions_ivan_facebook_2_clean.csv', index=False)