In [2]:
import pandas as pd
import numpy as np

In [46]:
# Read the bid log once and pre-build the three groupings reused by the
# feature cells below (per bidder, per auction, per bidder-and-auction).
bids = pd.read_csv('useful_bids.csv')
bids_g_bidder, bids_g_auction, bids_g_bidder_auction = (
    bids.groupby(keys)
    for keys in ('bidder_id', 'auction', ['bidder_id', 'auction'])
)

In [10]:
from constants import *
def generate_basic_features_per_bidder(group):
    """Build the basic count features for one group of bids.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group (this notebook groups by ``bidder_id``). The
        columns ``auction``, ``country``, ``device``, ``ip``, ``url`` and
        ``merchandise`` are read.

    Returns
    -------
    pandas.Series
        ``bids_cnt`` (row count), ``auction_cnt``, ``mean_bids_per_auction``,
        a distinct-value count ``<column>_cnt`` for country, device, ip, url
        and merchandise, the number of bids per merchandise value, and a 0/1
        flag per country. Every name in the module-level ``categories`` and
        ``countries_list`` is present, defaulting to 0.

    Notes
    -----
    ``Series.unique()`` keeps NaN, so a missing value counts as one distinct
    value, and a missing country produces a ``'nan'`` flag via ``str()``.
    """
    feature = dict()
    feature['bids_cnt'] = group.shape[0]
    feature['auction_cnt'] = group['auction'].unique().shape[0]
    # `* 1.0` forces true division when run under Python 2.
    feature['mean_bids_per_auction'] = feature['bids_cnt'] * 1.0 / feature['auction_cnt']

    for column in ('country', 'device', 'ip', 'url', 'merchandise'):
        feature[column + '_cnt'] = group[column].unique().shape[0]

    # Start every known category / country at zero so that all groups yield
    # the same set of columns.
    feature.update(dict.fromkeys(categories, 0))
    feature.update(dict.fromkeys(countries_list, 0))

    # Bids per merchandise value. Series.iteritems() was removed in
    # pandas 2.0; to_dict() gives the same (value -> count) pairs.
    feature.update(group['merchandise'].value_counts().to_dict())

    for country in group['country'].unique():
        feature[str(country)] = 1

    return pd.Series(feature)
    
# One row of basic features per bidder, saved for later use.
basic_feat_per_bidder = bids_g_bidder.apply(generate_basic_features_per_bidder)
basic_feat_per_bidder.to_csv('basic_per_bidder.csv')
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print only appended a stray "None" line.
basic_feat_per_bidder.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6614 entries, 001068c415025a009fee375a12cff4fcnht8y to fff2c070d8200e0a09150bd81452ce29ngcnv
Columns: 218 entries, ad to zz
dtypes: float64(218)
memory usage: 11.1+ MB
None


In [18]:
# Column-wise value ranges: first for every column that is neither a country
# flag nor a merchandise category, then for the merchandise categories.
columns_interested = [
    name for name in basic_feat_per_bidder.columns
    if not (name in countries_list or name in categories)
]
count_features = basic_feat_per_bidder[columns_interested]
category_features = basic_feat_per_bidder[categories]

print(np.min(count_features, axis=0))
print('')
print(np.max(count_features, axis=0))
print('')
print(np.min(category_features, axis=0))
print('')
print(np.max(category_features, axis=0))

auction_cnt              1.0
bids_cnt                 1.0
country_cnt              1.0
device_cnt               1.0
ip_cnt                   1.0
mean_bids_per_auction    1.0
merchandise_cnt          1.0
url_cnt                  1.0
dtype: float64

auction_cnt                1726.000000
bids_cnt                 515033.000000
country_cnt                 179.000000
device_cnt                 2618.000000
ip_cnt                   111918.000000
mean_bids_per_auction      1327.366667
merchandise_cnt               2.000000
url_cnt                   81376.000000
dtype: float64

auto parts          0.0
books and music     0.0
clothing            0.0
computers           0.0
furniture           0.0
home goods          0.0
jewelry             0.0
mobile              0.0
office equipment    0.0
sporting goods      0.0
dtype: float64

auto parts            7344.0
books and music      16040.0
clothing              8087.0
computers            15026.0
furniture            66027.0
home goods          236

In [38]:
def max_time_idx(group):
    """Return the index label of the row with the largest ``time`` in ``group``."""
    latest_label = group['time'].idxmax()
    return latest_label
# For every auction, find the index label of its bid with the largest time...
idx = bids_g_auction.apply(max_time_idx)
# ...and pull those rows out of the bid log: one "last bid" per auction.
last_bids = bids.loc[idx]
def cnt_auctions_win(group):
    """Return the number of rows in ``group`` (the first entry of its shape)."""
    row_count = group.shape[0]
    return row_count
# Number of auctions whose last bid belongs to each bidder.
wins_per_bidder = last_bids.groupby('bidder_id').size()
auctions_win = wins_per_bidder.reset_index(name='auctions_won_cnt')
print(auctions_win.columns)
print(auctions_win.head(10))
auctions_win.to_csv('auctions_win.csv')

Index([u'bidder_id', u'auctions_won_cnt'], dtype='object')
                                 bidder_id  auctions_won_cnt
0    00a0517965f18610417ee784a05f494d4dw6e                 1
1    01067975436d123f717ee5aba0dd4bbfa0937                 2
2    01cda526658455000913950f20cf31a2q6nsf                10
3    022ac3a7ce986049d9a4bede83ccf9ddctvg6                14
4    0318e0a173f7c65db40116b903884c854x258                54
5    037d9f10da403d8d5f94b6e2957a3702f6x07               227
6    03a1e81cb7bcd15014489a6f752c9d5b7pvel                 1
7    03aafab9868455b78f0723eda86698ba9v9rm                11
8    041d869edcadf595b90e3b5248903183pn754                 5
9    047558bebab9292a5d4c85d6ee768784j2hdd                 8
10   0496c24a6a7f593f28bf7896350e946advl6i                 2
11   0547dbd3ecb4a293913f138d60b586fdf4ane                 2
12   054b51d059fe981ca8cc4e0d4c49aabb5mck7                16
13   055282b75717345ba6f53d40e9e6d6c56qrzp                41
14   0565a780f2c5b46837bbe

In [47]:
# Shift the time column so the earliest bid sits at 0; spacing is unchanged.
print('%s %s' % (np.min(bids['time']), np.max(bids['time'])))
min_time = np.min(bids['time'])
# Vectorised subtraction: same values as applying a lambda to every element,
# without a Python-level call per row.
bids['time'] = bids['time'] - min_time
print('%s %s' % (np.min(bids['time']), np.max(bids['time'])))

9631916842105263 9772885210526315
0 140968368421052


In [49]:
def generate_time_features_per_bidder(group):
    """Summarise the gaps between consecutive bids of one group.

    The ``time`` column of ``group`` is sorted ascending and differenced. The
    returned Series holds the min, max, mean, standard deviation (numpy
    default, i.e. population) and median of those gaps, plus ``tdiff_zeros``,
    the number of gaps equal to zero. With fewer than two rows there are no
    gaps and every statistic is reported as 0.
    """
    ordered_times = group.sort_values(by=['time'])['time']
    gaps = np.ediff1d(ordered_times)

    if len(gaps) > 0:
        # Zero gaps = all gaps minus the non-zero ones.
        zero_gaps = gaps.shape[0] - np.count_nonzero(gaps)
        summary = {'tdiff_min': np.min(gaps),
                   'tdiff_max': np.max(gaps),
                   'tdiff_mean': np.mean(gaps),
                   'tdiff_std': np.std(gaps),
                   'tdiff_median': np.median(gaps),
                   'tdiff_zeros': zero_gaps}
    else:
        summary = {'tdiff_min': 0,
                   'tdiff_max': 0,
                   'tdiff_mean': 0,
                   'tdiff_std': 0,
                   'tdiff_median': 0,
                   'tdiff_zeros': 0}

    return pd.Series(summary)
    
# Bid-gap statistics for every bidder, saved for later use.
time_feat_per_bidder = bids_g_bidder.apply(generate_time_features_per_bidder)
# DataFrame.info() prints its own report and returns None; printing the
# return value only appended a stray "None" line.
time_feat_per_bidder.info()
print(time_feat_per_bidder.head(10))
time_feat_per_bidder.to_csv('tdiff_feat_per_bidder.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 1984 entries, 001068c415025a009fee375a12cff4fcnht8y to fff2c070d8200e0a09150bd81452ce29ngcnv
Data columns (total 6 columns):
tdiff_max       1984 non-null float64
tdiff_mean      1984 non-null float64
tdiff_median    1984 non-null float64
tdiff_min       1984 non-null float64
tdiff_std       1984 non-null float64
tdiff_zeros     1984 non-null float64
dtypes: float64(6)
memory usage: 108.5+ KB
None
                                          tdiff_max    tdiff_mean  \
bidder_id                                                           
001068c415025a009fee375a12cff4fcnht8y  0.000000e+00  0.000000e+00   
0030a2dd87ad2733e0873062e4f83954mkj86  0.000000e+00  0.000000e+00   
00a0517965f18610417ee784a05f494d4dw6e  5.012521e+13  5.486831e+11   
00cc97158e6f4cb8eac3c0075918b7ffi5k8o  5.491395e+13  2.748400e+13   
01067975436d123f717ee5aba0dd4bbfa0937  9.704211e+11  2.507283e+10   
012441119bcf83b23d4768bb72cea6d6carua  2.583895e+12  5.622344e+11   
01

In [51]:
def generate_price_features_per_bidder(group):
    """Return min / max / mean / std / median of ``group['time']`` as ``price_*``.

    NOTE(review): the features are named ``price_*`` but are computed from
    the ``time`` column. Confirm whether time is an intended stand-in for
    price or whether a different column was meant.
    """
    times = group['time']
    reducers = [('price_min', np.min),
                ('price_max', np.max),
                ('price_mean', np.mean),
                ('price_std', np.std),
                ('price_median', np.median)]
    summary = {}
    for feature_name, reduce_fn in reducers:
        summary[feature_name] = reduce_fn(times)
    return pd.Series(summary)
    
# price_* statistics for every bidder, saved for later use.
price_feat_per_bidder = bids_g_bidder.apply(generate_price_features_per_bidder)
# DataFrame.info() prints its own report and returns None; printing the
# return value only appended a stray "None" line.
price_feat_per_bidder.info()
print(price_feat_per_bidder.head(10))
price_feat_per_bidder.to_csv('price_feat_per_bidder.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 1984 entries, 001068c415025a009fee375a12cff4fcnht8y to fff2c070d8200e0a09150bd81452ce29ngcnv
Data columns (total 5 columns):
price_max       1984 non-null float64
price_mean      1984 non-null float64
price_median    1984 non-null float64
price_min       1984 non-null float64
price_std       1984 non-null float64
dtypes: float64(5)
memory usage: 93.0+ KB
None
                                          price_max    price_mean  \
bidder_id                                                           
001068c415025a009fee375a12cff4fcnht8y  7.442821e+13  7.442821e+13   
0030a2dd87ad2733e0873062e4f83954mkj86  7.263711e+13  7.263711e+13   
00a0517965f18610417ee784a05f494d4dw6e  7.719842e+13  6.381892e+13   
00cc97158e6f4cb8eac3c0075918b7ffi5k8o  6.419305e+13  2.756574e+13   
01067975436d123f717ee5aba0dd4bbfa0937  1.409241e+14  1.334935e+14   
012441119bcf83b23d4768bb72cea6d6carua  1.402336e+14  1.340105e+14   
01255c2c7c5578c186873422fc00fd7afwk8k  1.

In [67]:
from collections import defaultdict
# bidder_id -> gaps between each of that bidder's bids and the bid placed
# immediately before it (by time) in the same auction.
time_response = defaultdict(list)

for auction_id, auction_bids in bids_g_auction:
    ordered = auction_bids.sort_values(by=['time'])
    # Difference the sorted time column once per auction instead of walking
    # the rows with iterrows(), which builds a Series for every row.
    gaps = np.diff(ordered['time'].values)
    # The first bid of an auction has no predecessor, hence the [1:] slice.
    responders = ordered['bidder_id'].values[1:]
    for bidder_id, gap in zip(responders, gaps):
        time_response[bidder_id].append(gap)

# One summary record per bidder that has at least one gap.
response_records = []
for bidder_id, bidder_gaps in time_response.items():
    response_records.append({'bidder_id': bidder_id,
                             'response_min': np.min(bidder_gaps),
                             'response_max': np.max(bidder_gaps),
                             'response_mean': np.mean(bidder_gaps),
                             'response_std': np.std(bidder_gaps),
                             'response_median': np.median(bidder_gaps)})

time_response_df = pd.DataFrame(response_records)
# DataFrame.info() prints its own report and returns None; printing the
# return value only appended a stray "None" line.
time_response_df.info()
print(time_response_df.head(10))
time_response_df.to_csv('time_response_feat.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1981 entries, 0 to 1980
Data columns (total 6 columns):
bidder_id          1981 non-null object
response_max       1981 non-null int64
response_mean      1981 non-null float64
response_median    1981 non-null float64
response_min       1981 non-null int64
response_std       1981 non-null float64
dtypes: float64(3), int64(2), object(1)
memory usage: 92.9+ KB
None
                               bidder_id  response_max  response_mean  \
0  b9080651d16e3d6d2d83e1c5debb55b0qpwbv     473684211   4.736842e+08   
1  de4388600d997803223748e48bfc7b16ggyrh    2473684211   2.131579e+09   
2  3919be53bf4bd411ba25acfdc1362b49u4vle    3105263158   3.105263e+09   
3  a4c40fd7a4c1040283fb2b84611eaad240849   11526315790   8.106547e+08   
4  86a626c876e7eb4be35f4f0b0a78f97a1rn7h  201631578947   3.598684e+10   
5  e0fecfdf4029d9a91664e703a178397718lyl     105263158   7.894737e+07   
6  d13cd9e0a7b95c0a35f3456014865230qku5q   85526315790   1.455263e+10   
7

In [68]:
# Write every derived feature table to its CSV file.
for frame, csv_path in [(auctions_win, 'auctions_win.csv'),
                        (time_feat_per_bidder, 'tdiff_feat_per_bidder.csv'),
                        (price_feat_per_bidder, 'price_feat_per_bidder.csv'),
                        (time_response_df, 'time_response_feat.csv')]:
    frame.to_csv(csv_path)

In [74]:
# sanity check: no non-positive win counts and no negative statistics
print('auctions_won_cnt %s' % np.count_nonzero(auctions_win['auctions_won_cnt'] <= 0))

for df in [time_feat_per_bidder, price_feat_per_bidder, time_response_df]:
    # time_response_df keeps bidder ids in a 'bidder_id' column, while the
    # other two tables are indexed by bidder id. Counting the index of
    # time_response_df would count row positions, not bidders.
    if 'bidder_id' in df.columns:
        bidder_ids = df['bidder_id']
    else:
        bidder_ids = df.index
    print('unique bidder %s' % bidder_ids.unique().shape[0])
    for column in df.columns:
        if column == 'bidder_id':
            # Identifier strings: comparing them with 0 is meaningless and
            # raises TypeError on Python 3.
            continue
        print('%s %s' % (column, np.count_nonzero(df[column] < 0)))

 auctions_won_cnt 0
unique bidder 1984
tdiff_max 0
tdiff_mean 0
tdiff_median 0
tdiff_min 0
tdiff_std 0
tdiff_zeros 0
unique bidder 1984
price_max 0
price_mean 0
price_median 0
price_min 0
price_std 0
unique bidder 1981
bidder_id 0
response_max 0
response_mean 0
response_median 0
response_min 0
response_std 0


In [78]:
# Outer-join the per-bidder tables so a bidder missing from any one of them
# is kept; the gaps left by the joins are then filled with 0.
complex_feat = pd.merge(auctions_win, price_feat_per_bidder, left_on='bidder_id', right_index=True, how='outer')
complex_feat = complex_feat.merge(time_feat_per_bidder, left_on='bidder_id', right_index=True, how='outer')
complex_feat = complex_feat.merge(time_response_df, on='bidder_id', how='outer')
# Plain assignment instead of inplace=True keeps the cell re-runnable.
complex_feat = complex_feat.fillna(0)
# DataFrame.info() prints its own report and returns None; printing the
# return value only appended a stray "None" line.
complex_feat.info()
print(complex_feat.head(10))
complex_feat.to_csv('complex_feat.csv')

# pd.merge defaults to an inner join: only bidders present in both
# complex_feat and basic_feat_per_bidder end up in all_feat.
all_feat = pd.merge(complex_feat, basic_feat_per_bidder, left_on='bidder_id', right_index=True)
all_feat.info()
print(all_feat.head(5))
all_feat.to_csv('all_feat.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1984 entries, 0 to 1983
Data columns (total 18 columns):
bidder_id           1984 non-null object
auctions_won_cnt    1984 non-null float64
price_max           1984 non-null float64
price_mean          1984 non-null float64
price_median        1984 non-null float64
price_min           1984 non-null float64
price_std           1984 non-null float64
tdiff_max           1984 non-null float64
tdiff_mean          1984 non-null float64
tdiff_median        1984 non-null float64
tdiff_min           1984 non-null float64
tdiff_std           1984 non-null float64
tdiff_zeros         1984 non-null float64
response_max        1984 non-null float64
response_mean       1984 non-null float64
response_median     1984 non-null float64
response_min        1984 non-null float64
response_std        1984 non-null float64
dtypes: float64(17), object(1)
memory usage: 294.5+ KB
None
                               bidder_id  auctions_won_cnt     price_max  \
0 

In [77]:
# Display the index of the basic feature table (the bidder ids it is keyed by).
basic_feat_per_bidder.index

Index([u'001068c415025a009fee375a12cff4fcnht8y',
       u'002d229ffb247009810828f648afc2ef593rb',
       u'0030a2dd87ad2733e0873062e4f83954mkj86',
       u'003180b29c6a5f8f1d84a6b7b6f7be57tjj1o',
       u'00486a11dff552c4bd7696265724ff81yeo9v',
       u'0051aef3fdeacdadba664b9b3b07e04e4coc6',
       u'0053b78cde37c4384a20d2da9aa4272aym4pb',
       u'0061edfc5b07ff3d70d693883a38d370oy4fs',
       u'00862324eb508ca5202b6d4e5f1a80fc3t3lp',
       u'009479273c288b1dd096dc3087653499lrx3c',
       ...
       u'ff8a8bcb0ecfd4c8881ab22abeba75a9u4q95',
       u'ff92ea4abd33ed38601287f0e1d6726dmgx1f',
       u'ffa7b0b0f144b1594131d99e50c17a0bwbym3',
       u'ffacbed056cbfaa60c1fcf51f0d381bddr3ly',
       u'ffaf0a972a6dcb3910fd6b16045781e2ava5y',
       u'ffbc0fdfbf19a8a9116b68714138f2902cc13',
       u'ffc4e2dd2cc08249f299cab46ecbfacfobmr3',
       u'ffd29eb307a4c54610dd2d3d212bf3bagmmpl',
       u'ffd62646d600b759a985d45918bd6f0431vmz',
       u'fff2c070d8200e0a09150bd81452ce29ngcnv'],
      dt