In [1]:
import pandas as pd
import numpy as np
# Raise pandas' display limits so long and wide frames are shown up to 1000 rows/columns.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
from multiprocessing import Pool

In [3]:
# Read every raw table from the parent directory.
train = pd.read_csv('../train.csv')
test = pd.read_csv('../test_QyjYwdj.csv')
camp = pd.read_csv('../campaign_data.csv')
trans = pd.read_csv('../customer_transaction_data.csv')
cust = pd.read_csv('../customer_demographics.csv')
coupon = pd.read_csv('../coupon_item_mapping.csv')
item = pd.read_csv('../item_data.csv')

# Campaign dates are parsed as day/month/two-digit-year.
for date_col in ('start_date', 'end_date'):
    camp[date_col] = pd.to_datetime(camp[date_col], format='%d/%m/%y')

In [4]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


In [5]:
def camp_feats(df):
    """Add calendar and spacing features to a campaign table.

    The input frame is copied first, so the caller's object is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``start_date`` and ``end_date``.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``start_date`` with a fresh 0..n-1 index and the
        extra columns ``camp_duration``, ``campaign_start_day``,
        ``campaign_start_dow``, ``campaign_end_day``, ``campaign_end_dow``,
        ``days_since_last_campaign_start``, ``days_since_last_campaign_end``
        and ``days_between_start_end_of_prev_campaign``. The three
        "previous campaign" columns are NaN on the first (earliest) row.
    """
    # Work on a copy: adding columns directly would silently alter the
    # caller's frame while the returned (sorted) frame is a different object.
    df = df.copy()

    # Per-campaign calendar features.
    df['camp_duration'] = (df['end_date'] - df['start_date']).dt.days
    df['campaign_start_day'] = df['start_date'].dt.day
    df['campaign_start_dow'] = df['start_date'].dt.dayofweek
    df['campaign_end_day'] = df['end_date'].dt.day
    df['campaign_end_dow'] = df['end_date'].dt.dayofweek

    # Order campaigns chronologically so shift(1) refers to the previous one.
    df = df.sort_values('start_date').reset_index(drop=True)
    prev_start = df['start_date'].shift(1)
    prev_end = df['end_date'].shift(1)
    df['days_since_last_campaign_start'] = (df['start_date'] - prev_start).dt.days
    df['days_since_last_campaign_end'] = (df['end_date'] - prev_end).dt.days
    df['days_between_start_end_of_prev_campaign'] = (df['start_date'] - prev_end).dt.days
    return df

In [6]:
# Replace the campaign table with its feature-enriched version (sorted by start_date).
camp=camp_feats(camp)

In [7]:
# Left-join the campaign attributes onto every train/test row via campaign_id.
train = pd.merge(train, camp, how='left', on='campaign_id')
test = pd.merge(test, camp, how='left', on='campaign_id')

In [8]:
# Attach item attributes to each coupon-item pair (inner join on item_id).
coupon = pd.merge(coupon, item, how='inner', on='item_id')

In [9]:
# Preview the coupon-item mapping after item attributes were joined in.
coupon.head()

Unnamed: 0,coupon_id,item_id,brand,brand_type,category
0,105,37,56,Local,Grocery
1,6,37,56,Local,Grocery
2,22,37,56,Local,Grocery
3,31,37,56,Local,Grocery
4,107,75,56,Local,Grocery


In [10]:
def _distinct_per_group(frame, key, value):
    """Return ``{key value: number of distinct `value` entries}`` for ``frame``."""
    return frame.groupby(key)[value].nunique().to_dict()


# How many distinct items / brands / categories each coupon covers.
coupon_item_counts = _distinct_per_group(coupon, 'coupon_id', 'item_id')
coupon_brand_counts = _distinct_per_group(coupon, 'coupon_id', 'brand')
coupon_category_counts = _distinct_per_group(coupon, 'coupon_id', 'category')

# How many distinct coupons reference each item / brand / category.
item_coupon_counts = _distinct_per_group(coupon, 'item_id', 'coupon_id')
brand_coupon_counts = _distinct_per_group(coupon, 'brand', 'coupon_id')
category_coupon_counts = _distinct_per_group(coupon, 'category', 'coupon_id')

In [11]:
# Map the coupon counts onto the item table; keys missing from a lookup get 0.
for _feature, _key, _counts in (
    ('num_coupons_per_item', 'item_id', item_coupon_counts),
    ('num_coupons_per_brand', 'brand', brand_coupon_counts),
    ('num_coupons_per_category', 'category', category_coupon_counts),
):
    item[_feature] = item[_key].map(_counts).fillna(0)

In [12]:
# Number of distinct items sharing each item's brand and each item's category.
for _key in ('brand', 'category'):
    _sizes = item.groupby(_key)['item_id'].nunique().to_dict()
    item['num_items_per_' + _key] = item[_key].map(_sizes)

In [13]:
# Preview the item table with the coupon-count and group-size columns added.
item.head()

Unnamed: 0,item_id,brand,brand_type,category,num_coupons_per_item,num_coupons_per_brand,num_coupons_per_category,num_items_per_brand,num_items_per_category
0,1,1,Established,Grocery,2.0,10.0,776.0,1091,32448
1,2,1,Established,Miscellaneous,0.0,10.0,17.0,1091,385
2,3,56,Local,Bakery,0.0,297.0,12.0,10480,1679
3,4,56,Local,Grocery,3.0,297.0,776.0,10480,32448
4,5,56,Local,Grocery,0.0,297.0,776.0,10480,32448


In [14]:
# Enrich every transaction line with its item's attributes (inner join on item_id).
trans = pd.merge(trans, item, how='inner', on='item_id')

In [15]:
# Price with other_discount subtracted, then with coupon_discount subtracted as well.
trans['price_after_OD'] = trans.selling_price.sub(trans.other_discount)
trans['cost_price'] = trans.price_after_OD.sub(trans.coupon_discount)

# Parse the transaction date and put rows in chronological order.
trans['date'] = pd.to_datetime(trans.date)
trans.sort_values(by='date', inplace=True)

# 1 where coupon_discount is negative, else 0.
trans['coupon_redeemed'] = trans.coupon_discount.lt(0).astype('int')

In [16]:
# Preview the date-sorted transactions with the derived price columns.
trans.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,brand,brand_type,category,num_coupons_per_item,num_coupons_per_brand,num_coupons_per_category,num_items_per_brand,num_items_per_category,price_after_OD,cost_price,coupon_redeemed
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,45.95,45.95,0
37078,2012-01-02,464,30518,2,70.53,-21.37,0.0,56,Local,Grocery,3.0,297.0,776.0,10480,32448,91.9,91.9,0
38585,2012-01-02,464,30979,4,178.1,-34.2,0.0,56,Local,Grocery,0.0,297.0,776.0,10480,32448,212.3,212.3,0
38723,2012-01-02,464,31215,1,71.24,-45.95,0.0,714,Established,Grocery,1.0,8.0,776.0,380,32448,117.19,117.19,0
38880,2012-01-02,464,31267,1,44.52,-19.23,0.0,56,Local,Grocery,0.0,297.0,776.0,10480,32448,63.75,63.75,0


In [17]:
# Per-customer exponentially weighted running mean (halflife=2) of each
# monetary / redemption column, computed in the frame's current row order.
#
# transform() is used rather than apply(): transform always returns a result
# aligned to the original index, whereas apply() with a transform-like function
# prepends the group key to the index on pandas >= 2.0, which makes the column
# assignment fail.
for _col in (
    'selling_price',
    'other_discount',
    'coupon_discount',
    'price_after_OD',
    'cost_price',
    'coupon_redeemed',
):
    trans[_col + '_ewm'] = (
        trans.groupby('customer_id')[_col]
        .transform(lambda s: s.ewm(halflife=2).mean())
    )

In [18]:
# Spot-check the new *_ewm columns on the transactions of a single item (26830).
trans[trans.item_id==26830]

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,brand,brand_type,category,num_coupons_per_item,num_coupons_per_brand,num_coupons_per_category,num_items_per_brand,num_items_per_category,price_after_OD,cost_price,coupon_redeemed,selling_price_ewm,other_discount_ewm,coupon_discount_ewm,price_after_OD_ewm,cost_price_ewm,coupon_redeemed_ewm
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,45.95,45.95,0,35.26,-10.69,0.0,45.95,45.95,0.0
1,2012-01-02,135,26830,1,35.26,-10.69,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,45.95,45.95,0,106.678116,-14.347565,0.0,121.025681,121.025681,0.0
2,2012-01-02,464,26830,1,35.26,-10.69,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,45.95,45.95,0,79.318079,-31.041409,-5.321,110.359488,115.680488,0.1522597
3,2012-01-12,1185,26830,1,45.95,0.0,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,45.95,45.95,0,122.96691,-8.152265,0.0,131.119175,131.119175,0.0
5,2012-01-13,1383,26830,1,45.95,0.0,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,45.95,45.95,0,100.065218,-15.339407,0.0,115.404625,115.404625,0.0
4,2012-01-13,1140,26830,1,45.95,0.0,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,45.95,45.95,0,124.943844,-12.306492,-1.304107,137.250336,138.554443,0.03661165
6,2012-01-22,1396,26830,1,53.07,0.0,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,53.07,53.07,0,53.219117,-14.60517,0.0,67.824287,67.824287,0.0
7,2012-01-28,1223,26830,1,53.07,0.0,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,53.07,53.07,0,53.07,0.0,0.0,53.07,53.07,0.0
8,2012-01-29,329,26830,1,53.07,0.0,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,53.07,53.07,0,85.251398,-5.345067,-0.0001591928,90.596465,90.596624,2.234598e-06
9,2012-02-01,436,26830,1,53.07,0.0,0.0,56,Local,Natural Products,3.0,297.0,24.0,10480,2533,53.07,53.07,0,49.236268,-7.038124,0.0,56.274391,56.274391,0.0


In [19]:
def return_stats(x):
    """Summarise ``x`` as the list ``[mean, sum, std, max]``.

    ``x`` is any object exposing those four zero-argument methods
    (e.g. a pandas Series); each is called once, in that order.
    """
    return [getattr(x, stat)() for stat in ('mean', 'sum', 'std', 'max')]

In [20]:
# Stack train and test so history features are built for both in one pass.
# The two frames do not have identical columns, so `sort` is passed explicitly:
# leaving it unset makes older pandas emit a FutureWarning about column sorting.
# Column order is irrelevant here because only named columns are selected later.
df = pd.concat((train, test), axis=0, sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [21]:
# Narrow the stacked frame to the key columns that are iterated row by row
# when building customer-history features.
temp=df[['customer_id','start_date','coupon_id','id']]

In [22]:
def get_feats(temp):
    return [temp.shape[0],temp.item_id.nunique(),temp.brand.nunique(),temp.category.nunique(),temp.item_id.mode()[0]]+\
return_stats(temp.quantity)+return_stats(temp.selling_price)+return_stats(temp.other_discount)+return_stats(temp['coupon_redeemed'])+\
return_stats(temp.coupon_discount)+return_stats(temp.num_coupons_per_brand)+\
return_stats(temp.num_coupons_per_category)+return_stats(temp.num_coupons_per_item)+return_stats(temp.cost_price)+\
return_stats(temp.selling_price_ewm)+return_stats(temp.other_discount_ewm)+return_stats(temp.coupon_discount_ewm)+\
return_stats(temp.price_after_OD_ewm)+return_stats(temp.cost_price_ewm)+return_stats(temp.coupon_redeemed_ewm)

In [23]:
%%time
def get_feat(row):
    """Build the history feature row for one ``(index, Series)`` pair from ``iterrows()``.

    Selects the customer's transactions dated strictly before the row's
    ``start_date`` and returns ``[id] + get_feats(history)``; when there are
    no such transactions the 65 feature slots are filled with zeros.
    """
    record = row[1]
    same_customer = trans.customer_id == record['customer_id']
    before_campaign = trans.date < record['start_date']
    history = trans[same_customer & before_campaign]

    if history.shape[0] == 0:
        # No prior transactions: keep the row width with 65 zero placeholders.
        return [record.id] + np.repeat(0, 65).tolist()
    return [record.id] + get_feats(history)

# Compute one feature row per train/test row in 16 worker processes.
# The pool is created *before* the try block: if construction itself fails,
# the finally clause would otherwise hit an unbound `pool` and raise a
# NameError that hides the real error.
pool = Pool(16)
try:
    data_outputs = pool.map(get_feat, temp.iterrows())
finally:
    pool.close()
    pool.join()

# Column names, in the same order get_feats emits its values:
# five scalars, then mean/sum/std/max for each statistic prefix.
_stat_prefixes = [
    'qty', 'sp', 'od', 'coupon_redeemed', 'cd',
    'num_coupons_per_brand', 'num_coupons_per_category', 'num_coupons_per_item',
    'cost_price',
    'selling_price_ewm', 'other_discount_ewm', 'coupon_discount_ewm',
    'price_after_OD_ewm', 'cost_price_ewm', 'coupon_redeemed_ewm',
]
cols = ['id', 'trans_shape', 'unique_item', 'unique_brand', 'unique_category', 'item_id'] + [
    prefix + suffix
    for prefix in _stat_prefixes
    for suffix in ('_mean', '_sum', '_std', '_max')
]

# np.vstack replaces np.row_stack, which is a deprecated alias of it.
cust_hist = pd.DataFrame(np.vstack(data_outputs), columns=cols)

CPU times: user 23 s, sys: 832 ms, total: 23.8 s
Wall time: 2min 35s


In [24]:
# Sanity check: expect one row per stacked train+test row and 66 columns (id + 65 features).
cust_hist.shape

(128595, 66)

In [25]:
# Join the per-row history features back onto train and test by id (inner join).
train = pd.merge(train, cust_hist, how='inner', on='id')
test = pd.merge(test, cust_hist, how='inner', on='id')

In [26]:
# Check the training frame's row/column count after the inner join on id.
train.shape

(78369, 81)

In [27]:
# Left-join customer demographics; rows whose customer_id is absent from `cust` are kept.
train = pd.merge(train, cust, how='left', on='customer_id')
test = pd.merge(test, cust, how='left', on='customer_id')

In [28]:
# Final shapes of both frames after the demographics join.
train.shape,test.shape

((78369, 87), (50226, 86))

In [29]:
# Persist the feature-enriched train and test frames as pickles.
for _frame, _path in ((train, '../train_v2.pkl'), (test, '../test_v2.pkl')):
    _frame.to_pickle(_path)