In [1]:
#Loading essential libraries
import numpy as np
import pandas as pd

In [2]:
#Loading datasets: each CSV (path relative to the working directory) goes into its own DataFrame
#Train / test rows
tr, ts = pd.read_csv('train.csv'), pd.read_csv('test_QyjYwdj.csv')
#Campaign, coupon-to-item mapping and item tables
camp, coup, item = (pd.read_csv(csv_name) for csv_name in
                    ('campaign_data.csv', 'coupon_item_mapping.csv', 'item_data.csv'))
#Customer demographics and customer transactions
cust_dem, cust_trans = (pd.read_csv(csv_name) for csv_name in
                        ('customer_demographics.csv', 'customer_transaction_data.csv'))

In [3]:
#Deriving dayofweek, month, quarter, year from date in cust_trans
#`infer_datetime_format` is deprecated since pandas 2.0, so the parser's default inference is used
cust_trans['date'] = pd.to_datetime(cust_trans['date'])
trans_date = cust_trans['date'].dt
#`Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` returns the same English day names
cust_trans['date_weekday'] = trans_date.day_name()
cust_trans['date_month'] = trans_date.month
cust_trans['date_quarter'] = trans_date.quarter
cust_trans['date_year'] = trans_date.year

#Also changing the negative format of the discount columns to positive (absolute) amounts
cust_trans['other_discount'] = cust_trans['other_discount'].abs()
cust_trans['coupon_discount'] = cust_trans['coupon_discount'].abs()
cust_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 11 columns):
date               1324566 non-null datetime64[ns]
customer_id        1324566 non-null int64
item_id            1324566 non-null int64
quantity           1324566 non-null int64
selling_price      1324566 non-null float64
other_discount     1324566 non-null float64
coupon_discount    1324566 non-null float64
date_weekday       1324566 non-null object
date_month         1324566 non-null int64
date_quarter       1324566 non-null int64
date_year          1324566 non-null int64
dtypes: datetime64[ns](1), float64(3), int64(6), object(1)
memory usage: 111.2+ MB


In [4]:
#Merging cust_trans and item data on item_id; the left join keeps every row of cust_trans
cust_item = pd.merge(cust_trans, item, how='left', on='item_id')
cust_item.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1324566 entries, 0 to 1324565
Data columns (total 14 columns):
date               1324566 non-null datetime64[ns]
customer_id        1324566 non-null int64
item_id            1324566 non-null int64
quantity           1324566 non-null int64
selling_price      1324566 non-null float64
other_discount     1324566 non-null float64
coupon_discount    1324566 non-null float64
date_weekday       1324566 non-null object
date_month         1324566 non-null int64
date_quarter       1324566 non-null int64
date_year          1324566 non-null int64
brand              1324566 non-null int64
brand_type         1324566 non-null object
category           1324566 non-null object
dtypes: datetime64[ns](1), float64(3), int64(7), object(3)
memory usage: 151.6+ MB


#### Aggregating and creating Features for each customer

In [6]:
#What is the number of purchase transactions made by each customer?
#Total quantities of all the items purchased by each customer?
#Total number of unique items bought by each customer?
#Total number of unique brands bought by each customer?
#Average number of quantities bought by each customer?
#Total selling price, discount price, other discount price, coupon discount price,
#coupon discount percent & total discount percent by each customer?

agg_col = {'item_id': ['count', 'nunique'], 'brand': ['nunique'], 'quantity': ['sum', 'mean'],
           'selling_price': ['sum'], 'coupon_discount': ['sum'], 'other_discount': ['sum']}
cust_agg = cust_item.groupby(['customer_id']).agg(agg_col)
#Flatten the (column, aggregation) pairs into single names such as 'cust_quantity_sum'
cust_agg.columns = ['cust_' + '_'.join(col).strip() for col in cust_agg.columns.values]
#Rebind instead of renaming in place so re-running the cell always starts from the fresh aggregate
cust_agg = cust_agg.rename(columns={'cust_item_id_count': 'cust_trans_count'})
cust_agg['cust_coupdisc_price_perc'] = cust_agg['cust_coupon_discount_sum'] / cust_agg['cust_selling_price_sum']
cust_agg['cust_discount_price_perc'] = ((cust_agg['cust_coupon_discount_sum'] +
                                         cust_agg['cust_other_discount_sum']) / cust_agg['cust_selling_price_sum'])

#Number of unique campaigns / coupons each customer got targeted with in train and test combined
#`sort` must be the boolean False: the string 'False' is truthy and pandas 2.x rejects non-boolean values
#The combined frame is grouped once and reused for both features
targeted_by_cust = pd.concat([tr, ts], sort=False, axis=0).groupby('customer_id')
cust_agg['cust_campaign_nunique'] = cust_agg.index.map(targeted_by_cust['campaign_id'].nunique())
cust_agg['cust_coupon_nunique'] = cust_agg.index.map(targeted_by_cust['coupon_id'].nunique())

In [7]:
#Transactions on which a coupon discount was applied, grouped per customer
coupon_by_cust = cust_item.loc[cust_item['coupon_discount'] != 0].groupby('customer_id')

#What is the number of coupon discounted transactions made by each customer? (0 when there are none)
cust_agg['cust_coupdisc_trans_count'] = coupon_by_cust.size().reindex(cust_agg.index).fillna(0)

#Total coupon discounted quantities by each customer? (0 when there are none)
cust_agg['cust_coupdisc_quantity_sum'] = coupon_by_cust['quantity'].sum().reindex(cust_agg.index).fillna(0)

#Coupon discount percent by transactions and quantities by each customer?
cust_agg['cust_coupdisc_trans_perc'] = cust_agg['cust_coupdisc_trans_count'].div(cust_agg['cust_trans_count'])
cust_agg['cust_coupdisc_quantity_perc'] = cust_agg['cust_coupdisc_quantity_sum'].div(cust_agg['cust_quantity_sum'])

In [8]:
#Aggregation of customer data by Local & Established Brand types
brand_types = ('Local', 'Established')

#Filter the transactions once per brand type (all rows / coupon discounted rows) and reuse the groupings
all_groups, coupon_groups = {}, {}
for bt in brand_types:
    bt_rows = cust_item[cust_item['brand_type'] == bt]
    all_groups[bt] = bt_rows.groupby('customer_id')
    coupon_groups[bt] = bt_rows[bt_rows['coupon_discount'] != 0].groupby('customer_id')

#Total purchase transactions & quantities, then the coupon discounted transactions & quantities,
#made in Local and Established brand items by each customer (listed in the order the columns are added)
totals = [
    ('_trans_count', {bt: all_groups[bt].size() for bt in brand_types}),
    ('_quantity_sum', {bt: all_groups[bt]['quantity'].sum() for bt in brand_types}),
    ('_coupdisc_trans_count', {bt: coupon_groups[bt].size() for bt in brand_types}),
    ('_coupdisc_quantity_sum', {bt: coupon_groups[bt]['quantity'].sum() for bt in brand_types}),
]
for suffix, per_type in totals:
    for bt in brand_types:
        #Customers without matching transactions get 0 instead of NaN
        cust_agg['cust_' + bt + suffix] = per_type[bt].reindex(cust_agg.index).fillna(0)

#Coupon discount percent by transactions and quantities in Local and Established brand items by each customer?
#Vectorised division; the share is defined as 0 wherever the denominator is 0
ratios = [
    ('_coupdisc_trans_perc', '_coupdisc_trans_count', '_trans_count'),
    ('_coupdisc_quantity_perc', '_coupdisc_quantity_sum', '_quantity_sum'),
]
for suffix, numerator_suffix, denominator_suffix in ratios:
    for bt in brand_types:
        denominator = cust_agg['cust_' + bt + denominator_suffix]
        share = cust_agg['cust_' + bt + numerator_suffix] / denominator
        cust_agg['cust_' + bt + suffix] = share.where(denominator != 0, 0)

In [9]:
#Number of unique items, then number of unique brands, bought in Local and Established
#brand items by each customer (0 for customers with no transaction of that brand type)
for source_col, label in (('item_id', 'item'), ('brand', 'brand')):
    for bt in ('Local', 'Established'):
        cname = 'cust_' + bt + '_' + label + '_nunique'
        tmp_map = cust_item[cust_item['brand_type'] == bt].groupby(['customer_id'])[source_col].nunique()
        cust_agg[cname] = cust_agg.index.map(tmp_map)
        cust_agg[cname] = cust_agg[cname].fillna(0)

In [10]:
#Total selling price, coupon discount price, other discount price in Local and Established brand items by each customer?
#What is the coupon discount percent in price across Local and Established brand items by each customer?
brand_types = ('Local', 'Established')

#Filter and group the transactions once per brand type instead of once per feature
groups_by_type = {bt: cust_item[cust_item['brand_type'] == bt].groupby('customer_id') for bt in brand_types}

for source_col in ('selling_price', 'coupon_discount', 'other_discount'):
    for bt in brand_types:
        cname = 'cust_' + bt + '_brand_' + source_col + '_sum'
        #Customers without transactions of this brand type get 0 instead of NaN
        cust_agg[cname] = groups_by_type[bt][source_col].sum().reindex(cust_agg.index).fillna(0)

#coupon discount percent: vectorised division, defined as 0 wherever the selling price total is 0
for bt in brand_types:
    selling_total = cust_agg['cust_' + bt + '_brand_selling_price_sum']
    coupon_total = cust_agg['cust_' + bt + '_brand_coupon_discount_sum']
    cust_agg['cust_' + bt + '_coupdisc_price_perc'] = (coupon_total / selling_total).where(selling_total != 0, 0)

In [11]:
#Aggregation of customer data by item category
for category in cust_item['category'].unique():
    prefix = 'cust_' + category
    #Filter the transactions of this category once and reuse the groupings for every feature
    cat_rows = cust_item[cust_item['category'] == category]
    by_cust = cat_rows.groupby('customer_id')
    coupon_by_cust = cat_rows[cat_rows['coupon_discount'] != 0].groupby('customer_id')

    #Transactions, coupon discounted transactions, quantities, coupon discounted quantities,
    #unique items and unique brands in this category by each customer (in column order)
    totals = [
        ('_trans_count', by_cust.size()),
        ('_coupdisc_trans_count', coupon_by_cust.size()),
        ('_quantity_sum', by_cust['quantity'].sum()),
        ('_coupdisc_quantity_sum', coupon_by_cust['quantity'].sum()),
        ('_item_nunique', by_cust['item_id'].nunique()),
        ('_brand_nunique', by_cust['brand'].nunique()),
    ]
    for suffix, values in totals:
        #Customers without matching transactions get 0 instead of NaN
        cust_agg[prefix + suffix] = values.reindex(cust_agg.index).fillna(0)

    #Coupon discount percent by transactions and quantities in this category by each customer?
    #Vectorised division; the share is defined as 0 wherever the denominator is 0
    for suffix, numerator_suffix, denominator_suffix in (
            ('_coupdisc_trans_perc', '_coupdisc_trans_count', '_trans_count'),
            ('_coupdisc_quantity_perc', '_coupdisc_quantity_sum', '_quantity_sum')):
        denominator = cust_agg[prefix + denominator_suffix]
        share = cust_agg[prefix + numerator_suffix] / denominator
        cust_agg[prefix + suffix] = share.where(denominator != 0, 0)

    #Total selling price, coupon discount price, other discount price in this category by each customer?
    for source_col in ('selling_price', 'coupon_discount', 'other_discount'):
        cust_agg[prefix + '_' + source_col] = by_cust[source_col].sum().reindex(cust_agg.index).fillna(0)

    #Coupon discount percent and total discount percent in price in this category for each customer?
    selling_total = cust_agg[prefix + '_selling_price']
    coupon_total = cust_agg[prefix + '_coupon_discount']
    other_total = cust_agg[prefix + '_other_discount']
    cust_agg[prefix + '_coupdisc_price_perc'] = (coupon_total / selling_total).where(selling_total != 0, 0)
    cust_agg[prefix + '_discount_price_perc'] = ((coupon_total + other_total) / selling_total).where(
        selling_total != 0, 0)

In [12]:
#Aggregation of customer data by weekday of customer transaction date
brand_types = cust_item['brand_type'].unique()
for weekday in cust_item['date_weekday'].unique():
    prefix = 'cust_' + weekday
    #Filter the transactions of this weekday once and reuse the groupings for every feature
    day_rows = cust_item[cust_item['date_weekday'] == weekday]
    by_cust = day_rows.groupby('customer_id')
    coupon_by_cust = day_rows[day_rows['coupon_discount'] != 0].groupby('customer_id')

    #Number of unique items and unique brands bought on this weekday by each customer?
    cust_agg[prefix + '_item_nunique'] = by_cust['item_id'].nunique().reindex(cust_agg.index).fillna(0)
    cust_agg[prefix + '_brand_nunique'] = by_cust['brand'].nunique().reindex(cust_agg.index).fillna(0)

    #Number of unique items and brands per brand type bought on this weekday by each customer?
    for bt in brand_types:
        bt_by_cust = day_rows[day_rows['brand_type'] == bt].groupby('customer_id')
        bt_prefix = 'cust_' + bt + '_' + weekday
        cust_agg[bt_prefix + '_item_nunique'] = bt_by_cust['item_id'].nunique().reindex(cust_agg.index).fillna(0)
        cust_agg[bt_prefix + '_brand_nunique'] = bt_by_cust['brand'].nunique().reindex(cust_agg.index).fillna(0)

    #Transactions, quantities, coupon discounted transactions and coupon discounted quantities
    #on this weekday by each customer (customers without matching transactions get 0)
    totals = [
        ('_trans_count', by_cust.size()),
        ('_quantity_sum', by_cust['quantity'].sum()),
        ('_coupdisc_trans_count', coupon_by_cust.size()),
        ('_coupdisc_quantity_sum', coupon_by_cust['quantity'].sum()),
    ]
    for suffix, values in totals:
        cust_agg[prefix + suffix] = values.reindex(cust_agg.index).fillna(0)

    #Coupon discount percent in transactions and quantities on this weekday by each customer?
    #Vectorised division; the share is defined as 0 wherever the denominator is 0
    for suffix, numerator_suffix, denominator_suffix in (
            ('_coupdisc_trans_perc', '_coupdisc_trans_count', '_trans_count'),
            ('_coupdisc_quantity_perc', '_coupdisc_quantity_sum', '_quantity_sum')):
        denominator = cust_agg[prefix + denominator_suffix]
        share = cust_agg[prefix + numerator_suffix] / denominator
        cust_agg[prefix + suffix] = share.where(denominator != 0, 0)

    #Total selling price, coupon discount price, other discount price on this weekday by each customer?
    for source_col in ('selling_price', 'coupon_discount', 'other_discount'):
        cust_agg[prefix + '_' + source_col + '_sum'] = by_cust[source_col].sum().reindex(cust_agg.index).fillna(0)

    #Coupon discount & total discount percent in price on this weekday by each customer?
    selling_total = cust_agg[prefix + '_selling_price_sum']
    coupon_total = cust_agg[prefix + '_coupon_discount_sum']
    other_total = cust_agg[prefix + '_other_discount_sum']
    cust_agg[prefix + '_coupdisc_price_perc'] = (coupon_total / selling_total).where(selling_total != 0, 0)
    cust_agg[prefix + '_discount_price_perc'] = ((coupon_total + other_total) / selling_total).where(
        selling_total != 0, 0)
    

In [13]:
#Aggregation of customer data by month of customer transaction date
brand_types = cust_item['brand_type'].unique()
for month in cust_item['date_month'].unique():
    prefix = 'cust_month_' + str(month)
    #Filter the transactions of this month once and reuse the groupings for every feature
    month_rows = cust_item[cust_item['date_month'] == month]
    by_cust = month_rows.groupby('customer_id')
    coupon_by_cust = month_rows[month_rows['coupon_discount'] != 0].groupby('customer_id')

    #Number of unique items and unique brands bought in this month by each customer
    cust_agg[prefix + '_item_nunique'] = by_cust['item_id'].nunique().reindex(cust_agg.index).fillna(0)
    cust_agg[prefix + '_brand_nunique'] = by_cust['brand'].nunique().reindex(cust_agg.index).fillna(0)

    #Number of unique items and brands per brand type bought in this month by each customer
    for bt in brand_types:
        bt_by_cust = month_rows[month_rows['brand_type'] == bt].groupby('customer_id')
        bt_prefix = 'cust_month_' + bt + '_' + str(month)
        cust_agg[bt_prefix + '_item_nunique'] = bt_by_cust['item_id'].nunique().reindex(cust_agg.index).fillna(0)
        cust_agg[bt_prefix + '_brand_nunique'] = bt_by_cust['brand'].nunique().reindex(cust_agg.index).fillna(0)

    #Transactions, quantities, coupon discounted transactions and coupon discounted quantities
    #in this month by each customer (customers without matching transactions get 0)
    totals = [
        ('_trans_count', by_cust.size()),
        ('_quantity_sum', by_cust['quantity'].sum()),
        ('_coupdisc_trans_count', coupon_by_cust.size()),
        ('_coupdisc_quantity_sum', coupon_by_cust['quantity'].sum()),
    ]
    for suffix, values in totals:
        cust_agg[prefix + suffix] = values.reindex(cust_agg.index).fillna(0)

    #Coupon discount percent in transactions and quantities in this month by each customer
    #Vectorised division; the share is defined as 0 wherever the denominator is 0
    for suffix, numerator_suffix, denominator_suffix in (
            ('_coupdisc_trans_perc', '_coupdisc_trans_count', '_trans_count'),
            ('_coupdisc_quantity_perc', '_coupdisc_quantity_sum', '_quantity_sum')):
        denominator = cust_agg[prefix + denominator_suffix]
        share = cust_agg[prefix + numerator_suffix] / denominator
        cust_agg[prefix + suffix] = share.where(denominator != 0, 0)

    #Total selling price, coupon discount price, other discount price in this month by each customer
    for source_col in ('selling_price', 'coupon_discount', 'other_discount'):
        cust_agg[prefix + '_' + source_col + '_sum'] = by_cust[source_col].sum().reindex(cust_agg.index).fillna(0)

    #Coupon discount & total discount percent in price in this month by each customer
    selling_total = cust_agg[prefix + '_selling_price_sum']
    coupon_total = cust_agg[prefix + '_coupon_discount_sum']
    other_total = cust_agg[prefix + '_other_discount_sum']
    cust_agg[prefix + '_coupdisc_price_perc'] = (coupon_total / selling_total).where(selling_total != 0, 0)
    cust_agg[prefix + '_discount_price_perc'] = ((coupon_total + other_total) / selling_total).where(
        selling_total != 0, 0)

In [14]:
#Aggregation of customer data by quarter of customer transaction date
brand_types = cust_item['brand_type'].unique()
for quarter in cust_item['date_quarter'].unique():
    prefix = 'cust_quarter_' + str(quarter)
    #Filter the transactions of this quarter once and reuse the groupings for every feature
    quarter_rows = cust_item[cust_item['date_quarter'] == quarter]
    by_cust = quarter_rows.groupby('customer_id')
    coupon_by_cust = quarter_rows[quarter_rows['coupon_discount'] != 0].groupby('customer_id')

    #Number of unique items and unique brands bought in this quarter by each customer
    cust_agg[prefix + '_item_nunique'] = by_cust['item_id'].nunique().reindex(cust_agg.index).fillna(0)
    cust_agg[prefix + '_brand_nunique'] = by_cust['brand'].nunique().reindex(cust_agg.index).fillna(0)

    #Number of unique items and brands per brand type bought in this quarter by each customer
    for bt in brand_types:
        bt_by_cust = quarter_rows[quarter_rows['brand_type'] == bt].groupby('customer_id')
        bt_prefix = 'cust_quarter_' + bt + '_' + str(quarter)
        cust_agg[bt_prefix + '_item_nunique'] = bt_by_cust['item_id'].nunique().reindex(cust_agg.index).fillna(0)
        cust_agg[bt_prefix + '_brand_nunique'] = bt_by_cust['brand'].nunique().reindex(cust_agg.index).fillna(0)

    #Transactions, quantities, coupon discounted transactions and coupon discounted quantities
    #in this quarter by each customer (customers without matching transactions get 0)
    totals = [
        ('_trans_count', by_cust.size()),
        ('_quantity_sum', by_cust['quantity'].sum()),
        ('_coupdisc_trans_count', coupon_by_cust.size()),
        ('_coupdisc_quantity_sum', coupon_by_cust['quantity'].sum()),
    ]
    for suffix, values in totals:
        cust_agg[prefix + suffix] = values.reindex(cust_agg.index).fillna(0)

    #Coupon discount percent in transactions and quantities in this quarter by each customer
    #Vectorised division; the share is defined as 0 wherever the denominator is 0
    for suffix, numerator_suffix, denominator_suffix in (
            ('_coupdisc_trans_perc', '_coupdisc_trans_count', '_trans_count'),
            ('_coupdisc_quantity_perc', '_coupdisc_quantity_sum', '_quantity_sum')):
        denominator = cust_agg[prefix + denominator_suffix]
        share = cust_agg[prefix + numerator_suffix] / denominator
        cust_agg[prefix + suffix] = share.where(denominator != 0, 0)

    #Total selling price, coupon discount price, other discount price in this quarter by each customer
    for source_col in ('selling_price', 'coupon_discount', 'other_discount'):
        cust_agg[prefix + '_' + source_col + '_sum'] = by_cust[source_col].sum().reindex(cust_agg.index).fillna(0)

    #Coupon discount & total discount percent in price in this quarter by each customer
    selling_total = cust_agg[prefix + '_selling_price_sum']
    coupon_total = cust_agg[prefix + '_coupon_discount_sum']
    other_total = cust_agg[prefix + '_other_discount_sum']
    cust_agg[prefix + '_coupdisc_price_perc'] = (coupon_total / selling_total).where(selling_total != 0, 0)
    cust_agg[prefix + '_discount_price_perc'] = ((coupon_total + other_total) / selling_total).where(
        selling_total != 0, 0)

In [15]:
#Number of unique items bought by each customer, which were not bought by any other customer?
#An item counts for a customer exactly when that customer is its only buyer, so a single pass over
#the distinct (customer, item) pairs replaces rescanning the whole table once per customer
buyer_pairs = cust_item[['customer_id', 'item_id']].drop_duplicates()
#After de-duplication an item appears once per buyer; a non-duplicated item has a single buyer
single_buyer = ~buyer_pairs['item_id'].duplicated(keep=False)
tmp_map = buyer_pairs[single_buyer].groupby('customer_id').size()

cname1 = 'cust_distinct_item_count'
#Customers owning no exclusive item are absent from tmp_map and get 0
cust_agg[cname1] = tmp_map.reindex(cust_agg.index).fillna(0)

In [16]:
#Number of unique brands bought by each customer, which were not bought by any other customer?
#A brand counts for a customer exactly when that customer is its only buyer, so a single pass over
#the distinct (customer, brand) pairs replaces rescanning the whole table once per customer
buyer_pairs = cust_item[['customer_id', 'brand']].drop_duplicates()
#After de-duplication a brand appears once per buyer; a non-duplicated brand has a single buyer
single_buyer = ~buyer_pairs['brand'].duplicated(keep=False)
tmp_map = buyer_pairs[single_buyer].groupby('customer_id').size()

cname1 = 'cust_distinct_brand_count'
#Customers owning no exclusive brand are absent from tmp_map and get 0
cust_agg[cname1] = tmp_map.reindex(cust_agg.index).fillna(0)

In [17]:
#Number of Local and Established unique items bought by each customer, which were not bought by any other customer?
#Within each brand type an item counts for a customer exactly when that customer is its only buyer,
#so one pass over the distinct (customer, item) pairs replaces rescanning the table once per customer
for bt in ('Local', 'Established'):
    buyer_pairs = cust_item.loc[cust_item['brand_type'] == bt, ['customer_id', 'item_id']].drop_duplicates()
    #After de-duplication an item appears once per buyer; a non-duplicated item has a single buyer
    single_buyer = ~buyer_pairs['item_id'].duplicated(keep=False)
    tmp_map = buyer_pairs[single_buyer].groupby('customer_id').size()

    cname1 = 'cust_distinct_' + bt + '_item_count'
    #Customers owning no exclusive item of this brand type are absent from tmp_map and get 0
    cust_agg[cname1] = tmp_map.reindex(cust_agg.index).fillna(0)

In [18]:
#Number of Local and Established unique brands bought by each customer, which were not bought by any other customer?
#Within each brand type a brand counts for a customer exactly when that customer is its only buyer,
#so one pass over the distinct (customer, brand) pairs replaces rescanning the table once per customer
for bt in ('Local', 'Established'):
    buyer_pairs = cust_item.loc[cust_item['brand_type'] == bt, ['customer_id', 'brand']].drop_duplicates()
    #After de-duplication a brand appears once per buyer; a non-duplicated brand has a single buyer
    single_buyer = ~buyer_pairs['brand'].duplicated(keep=False)
    tmp_map = buyer_pairs[single_buyer].groupby('customer_id').size()

    cname1 = 'cust_distinct_' + bt + '_brand_count'
    #Customers owning no exclusive brand of this brand type are absent from tmp_map and get 0
    cust_agg[cname1] = tmp_map.reindex(cust_agg.index).fillna(0)

In [19]:
#Write the per-customer features to disk; reset_index turns the customer_id index into a regular column
cust_agg_out = cust_agg.reset_index()
cust_agg_out.to_csv('cust_agg.csv', index=False)

In [20]:
#Print a summary of the finished per-customer feature table (index range, column count, dtypes, memory usage)
cust_agg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1582 entries, 1 to 1582
Columns: 684 entries, cust_trans_count to cust_distinct_Established_brand_count
dtypes: float64(660), int64(24)
memory usage: 8.3 MB
