In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# Read the input tables from CSV. All paths are relative to the working
# directory: bids/train/test live under ../data, while the sample submission
# file sits directly in the parent directory.
bids, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './../data/bids.csv',
        './../data/train.csv',
        './../data/test.csv',
    )
)
sample_submission = pd.read_csv('./../sampleSubmission.csv')

In [2]:
# Preview the first five rows of the raw bid log to check which columns were loaded.
bids.head(n=5)

Unnamed: 0,bid_id,bidder_id,auction,merchandise,device,time,country,ip,url
0,0,8dac2b259fd1c6d1120e519fb1ac14fbqvax8,ewmzr,jewelry,phone0,9759243157894736,us,69.166.231.58,vasstdc27m7nks3
1,1,668d393e858e8126275433046bbd35c6tywop,aeqok,furniture,phone1,9759243157894736,in,50.201.125.84,jmqlhflrzwuay9c
2,2,aa5f360084278b35d746fa6af3a7a1a5ra3xe,wa00e,home goods,phone2,9759243157894736,py,112.54.208.157,vasstdc27m7nks3
3,3,3939ac3ef7d472a59a9c5f893dd3e39fh9ofi,jefix,jewelry,phone4,9759243157894736,in,18.99.175.133,vasstdc27m7nks3
4,4,8393c48eaf4b8fa96886edc7cf27b372dsibi,jefix,jewelry,phone5,9759243157894736,in,145.138.5.37,vasstdc27m7nks3


In [3]:
# Existing features: one row per bidder with simple volume / diversity counts.
#
# All five statistics come from a single groupby over the bid log and are
# attached with one left join, instead of regrouping `bids` and merging once
# per feature.
bids_by_bidder = bids.groupby('bidder_id')
bidder_counts = pd.concat(
    [
        bids_by_bidder['bid_id'].count().rename('n_bids'),          # N bids
        bids_by_bidder['device'].nunique().rename('n_device'),      # N device
        bids_by_bidder['auction'].nunique().rename('n_auction'),    # N auctions
        bids_by_bidder['country'].nunique().rename('n_country'),    # N countries
        bids_by_bidder['ip'].nunique().rename('n_ip'),              # N ips
    ],
    axis=1,
)

# The base frame lists each bidder once, in order of first appearance in
# `bids`; the left join on `bidder_id` keeps that order and row count.
bidder_df = pd.DataFrame({'bidder_id': bids.bidder_id.unique()}).join(bidder_counts, on='bidder_id')

In [4]:
# New features

# Average N bids per auction and Std of N bids per auction
def value_counts_avg(x):
    """Return the mean number of occurrences per distinct value of ``x``.

    ``x`` is a pandas Series. When used as a groupby aggregator on the
    ``auction`` column this is the average number of bids a bidder places
    per auction. The function name becomes the output column label when it
    is passed to ``agg`` in a list.
    """
    occurrences = x.value_counts()
    return occurrences.mean()

def value_counts_std(x):
    """Return the standard deviation of the per-value occurrence counts of ``x``.

    ``x`` is a pandas Series. The result uses pandas' default sample
    standard deviation, so a Series with a single distinct value yields NaN.
    The function name becomes the output column label when it is passed to
    ``agg`` in a list.
    """
    occurrences = x.value_counts()
    return occurrences.std()

# Aggregate each bidder's `auction` column with both helpers (one output
# column per function, labelled by function name), left-join the result onto
# the bidder table, then give the two columns their feature names.
bidder_df = (
    bidder_df
    .join(
        bids.groupby(['bidder_id'])['auction'].agg([value_counts_avg, value_counts_std]),
        on='bidder_id',
    )
    .rename(columns={
        'value_counts_avg': 'auction_value_counts_avg',
        'value_counts_std': 'auction_value_counts_std',
    })
)


In [5]:
# Average N device per auction and Std of N device per auction
#
# Count the distinct devices for every (bidder, auction) pair once, then take
# the mean and the std of those counts per bidder in a single aggregation
# (the pair-level groupby is the expensive step, so it is not repeated).
df_tmp = (
    bids.groupby(['bidder_id', 'auction'])['device'].nunique()
    .groupby(level='bidder_id')
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'n_device_per_auction', 'std': 'std_device_per_auction'})
)
bidder_df = bidder_df.join(df_tmp, on='bidder_id')

# Drop the reference to the intermediate frame.
df_tmp = None

In [6]:
# Average N country per auction and Std of N country per auction
#
# Count the distinct countries for every (bidder, auction) pair once, then
# take the mean and the std of those counts per bidder in a single
# aggregation (the pair-level groupby is the expensive step, so it is not
# repeated).
df_tmp = (
    bids.groupby(['bidder_id', 'auction'])['country'].nunique()
    .groupby(level='bidder_id')
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'n_country_per_auction', 'std': 'std_country_per_auction'})
)
bidder_df = bidder_df.join(df_tmp, on='bidder_id')

# Drop the reference to the intermediate frame.
df_tmp = None

In [7]:
# Average N ip per auction and Std of N ip per auction
#
# Count the distinct IPs for every (bidder, auction) pair once, then take the
# mean and the std of those counts per bidder in a single aggregation (the
# pair-level groupby is the expensive step, so it is not repeated).
df_tmp = (
    bids.groupby(['bidder_id', 'auction'])['ip'].nunique()
    .groupby(level='bidder_id')
    .agg(['mean', 'std'])
    .rename(columns={'mean': 'n_ip_per_auction', 'std': 'std_ip_per_auction'})
)
bidder_df = bidder_df.join(df_tmp, on='bidder_id')

# Drop the reference to the intermediate frame.
df_tmp = None

In [8]:
from sklearn.model_selection import train_test_split

# Zero-fill the missing feature values *before* attaching the columns from
# `train`, so that NaNs introduced by the left merge (bidders that are not in
# `train`) are left as NaN rather than being turned into 0.
bidder_df = bidder_df.fillna(0).merge(train, how='left', on='bidder_id')

# Only bidders listed in `train` take part in model selection: hold out 33%
# of them for validation, with a fixed seed so the split is repeatable.
is_train_bidder = bidder_df.bidder_id.isin(train.bidder_id)
bidder_df_train, bidder_df_validation = train_test_split(
    bidder_df[is_train_bidder],
    test_size=0.33,
    random_state=42,
)

In [9]:
from sklearn.svm import SVC

# Every candidate model input: the five per-bidder counts followed by the
# eight per-auction statistics.
# NOTE(review): the following cell rebinds `feature_list` to only the five
# count features, so this full list is not what the later fits use.
feature_list = [
    'n_bids',
    'n_device',
    'n_auction',
    'n_country',
    'n_ip',
    'auction_value_counts_avg',
    'auction_value_counts_std',
    'n_device_per_auction',
    'std_device_per_auction',
    'n_country_per_auction',
    'std_country_per_auction',
    'n_ip_per_auction',
    'std_ip_per_auction',
]

In [10]:
# Forward-selection probe: add each new feature on its own to the five base
# count features, fit an SVM on the training split and report the validation
# ROC AUC for that feature set.
feature_list = ['n_bids', 'n_device', 'n_auction', 'n_country', 'n_ip']
new_feature_list = [
    'auction_value_counts_avg', 'auction_value_counts_std',
    'n_device_per_auction', 'std_device_per_auction',
    'n_country_per_auction', 'std_country_per_auction',
    'n_ip_per_auction', 'std_ip_per_auction',
]

for feature in new_feature_list:
    feature_select_list = feature_list + [feature]
    print("Adding feature: ", feature)
    # probability=True fits the probability model on internally shuffled
    # folds; seeding it (same seed as the train/validation split) makes the
    # reported scores repeatable across runs.
    clf = SVC(class_weight='balanced', probability=True, random_state=42)
    clf.fit(bidder_df_train[feature_select_list], bidder_df_train.outcome)
    result = clf.predict_proba(bidder_df_validation[feature_select_list])
    # Column 1 is the probability of the second class in clf.classes_
    # (presumably outcome == 1 - confirm against the label encoding).
    print(roc_auc_score(bidder_df_validation.outcome, result[:, 1]))

Adding feature:  auction_value_counts_avg
0.829061289588
Adding feature:  auction_value_counts_std
0.828092959672
Adding feature:  n_device_per_auction
0.828434723172
Adding feature:  std_device_per_auction
0.828377762588
Adding feature:  n_country_per_auction
0.828320802005
Adding feature:  std_country_per_auction
0.828092959672
Adding feature:  n_ip_per_auction
0.829403053087
Adding feature:  std_ip_per_auction
0.827124629756


In [11]:
# Baseline: the same SVM fitted on `feature_list` only, scored by ROC AUC on
# the validation split, as the reference for the feature-addition runs.
print("Base line: ")
# probability=True fits the probability model on internally shuffled folds;
# seeding it (same seed as the train/validation split) makes the reported
# score repeatable across runs.
clf = SVC(class_weight='balanced', probability=True, random_state=42)
clf.fit(bidder_df_train[feature_list], bidder_df_train.outcome)

result = clf.predict_proba(bidder_df_validation[feature_list])
# Column 1 is the probability of the second class in clf.classes_
# (presumably outcome == 1 - confirm against the label encoding).
print(roc_auc_score(bidder_df_validation.outcome, result[:, 1]))

Base line: 
0.827238550923


In [15]:
# Combined run: base features plus every new feature except
# 'std_ip_per_auction', scored by ROC AUC on the validation split.
new_good_feature_list = [
    'auction_value_counts_avg', 'auction_value_counts_std',
    'n_device_per_auction', 'std_device_per_auction',
    'n_country_per_auction', 'std_country_per_auction',
    'n_ip_per_auction',
]

feature_select_list = feature_list + new_good_feature_list

# Build a fresh, seeded estimator here instead of refitting whichever `clf`
# a previously executed cell left behind, so the score does not depend on
# the order in which cells were run.
clf = SVC(class_weight='balanced', probability=True, random_state=42)
clf.fit(bidder_df_train[feature_select_list], bidder_df_train.outcome)

result = clf.predict_proba(bidder_df_validation[feature_select_list])
print(roc_auc_score(bidder_df_validation.outcome, result[:, 1]))

0.822254499886


In [14]:
# Combined run: base features plus 'auction_value_counts_avg' and
# 'n_ip_per_auction', scored by ROC AUC on the validation split.
new_good_feature_list = ['auction_value_counts_avg', 'n_ip_per_auction']

feature_select_list = feature_list + new_good_feature_list

# Build a fresh, seeded estimator here instead of refitting whichever `clf`
# a previously executed cell left behind, so the score does not depend on
# the order in which cells were run.
clf = SVC(class_weight='balanced', probability=True, random_state=42)
clf.fit(bidder_df_train[feature_select_list], bidder_df_train.outcome)

result = clf.predict_proba(bidder_df_validation[feature_select_list])
print(roc_auc_score(bidder_df_validation.outcome, result[:, 1]))

0.830257461836
