In [1]:
#Loading essential libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
from catboost import CatBoostClassifier

In [2]:
# Read the raw competition tables from the working directory.
# The names are unpacked in the same order as the file list below.
tr, camp, coup, item, cust_dem, cust_trans, ts = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train.csv',
        'campaign_data.csv',
        'coupon_item_mapping.csv',
        'item_data.csv',
        'customer_demographics.csv',
        'customer_transaction_data.csv',
        'test_QyjYwdj.csv',
    )
)

In [3]:
# Read the customer-level, coupon-level and campaign-level aggregation tables
# that were prepared earlier and saved as CSV.
cust_agg, coup_agg, camp_agg = (
    pd.read_csv(csv_name)
    for csv_name in ('cust_agg.csv', 'coup_agg.csv', 'camp_agg.csv')
)

In [4]:
# Inspect the customer demographics table; info() shows two columns with missing values
cust_dem.info()

# marital_status and no_of_children are each missing for more than 40% of customers,
# so missingness is kept as its own 'NA' category instead of being imputed.
for sparse_col in ('marital_status', 'no_of_children'):
    cust_dem[sparse_col] = cust_dem[sparse_col].fillna('NA')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 7 columns):
customer_id       760 non-null int64
age_range         760 non-null object
marital_status    431 non-null object
rented            760 non-null int64
family_size       760 non-null object
no_of_children    222 non-null object
income_bracket    760 non-null int64
dtypes: int64(3), object(4)
memory usage: 41.7+ KB


In [5]:
# Attach the customer demographics to every train and test row
tr = pd.merge(tr, cust_dem, how='left', on='customer_id')
ts = pd.merge(ts, cust_dem, how='left', on='customer_id')

# Convert the discount columns from negative amounts to their positive magnitudes
for disc_col in ('coupon_discount', 'other_discount'):
    cust_trans[disc_col] = cust_trans[disc_col].abs()

In [6]:
# Parse the campaign start/end dates, reading ambiguous dates as day-first.
# `infer_datetime_format` is not passed: it is deprecated since pandas 2.0,
# where consistent-format inference is already the default behaviour.
camp['start_date'] = pd.to_datetime(camp['start_date'], dayfirst=True)
camp['end_date'] = pd.to_datetime(camp['end_date'], dayfirst=True)

# Campaign length in days, counting both the start day and the end day
camp['camp_duration_days'] = (camp['end_date'] - camp['start_date']).dt.days + 1

# Campaign length rounded to the nearest whole week (reuses the day count above)
camp['camp_duration_weeks'] = (camp['camp_duration_days'] / 7).round()

# Calendar month of the start and end dates
camp['camp_start_month'] = camp['start_date'].dt.month
camp['camp_end_month'] = camp['end_date'].dt.month

# Calendar quarter of the start and end dates
camp['camp_start_quarter'] = camp['start_date'].dt.quarter
camp['camp_end_quarter'] = camp['end_date'].dt.quarter

# Calendar year of the start and end dates
camp['camp_start_year'] = camp['start_date'].dt.year
camp['camp_end_year'] = camp['end_date'].dt.year

In [7]:
# Add the campaign attributes to train and test; the raw date columns are
# dropped once merged because the derived duration/month/quarter/year columns remain.
tr = pd.merge(tr, camp, how='left', on='campaign_id').drop(columns=['start_date', 'end_date'])
ts = pd.merge(ts, camp, how='left', on='campaign_id').drop(columns=['start_date', 'end_date'])

In [8]:
#Mean Encoding on training data
def MeanEncoding(df, col, trgt, alpha=5, splits=4):
    """Add a smoothed, out-of-fold target-mean encoding of ``col`` to ``df``.

    For every fold, the encoding of a category is computed on the other folds as
    ``(count * mean + prior * alpha) / (count + alpha)``, where ``count`` and
    ``mean`` describe ``trgt`` within the category and ``prior`` is the mean of
    ``trgt`` over the whole of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``col`` and ``trgt``. It is modified in place: a new
        column ``col + '_Enc'`` is added (or overwritten).
    col : str
        Name of the categorical column to encode.
    trgt : str
        Name of the target column.
    alpha : int or float, default 5
        Smoothing weight given to the prior.
    splits : int, default 4
        Number of shuffled K-fold splits (fixed ``random_state=100``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the float column ``col + '_Enc'``. Rows whose
        category does not occur in the other folds are left as NaN.
    """
    # Prior and folds come from `df` itself, not from a global frame, so the
    # function works for any frame passed in.
    prior = df[trgt].mean()
    newcol = col + '_Enc'
    encoded = np.full(len(df), np.nan, dtype='float64')

    kf = KFold(n_splits=splits, random_state=100, shuffle=True)
    for fit_pos, apply_pos in kf.split(df):
        # KFold yields positions, so select with iloc; this stays correct even
        # when the index of `df` is not 0..n-1.
        stats = df.iloc[fit_pos].groupby(col)[trgt].agg(['count', 'mean'])
        smoothed = (stats['count'] * stats['mean'] + prior * alpha) / (stats['count'] + alpha)
        encoded[apply_pos] = np.asarray(df[col].iloc[apply_pos].map(smoothed), dtype='float64')

    df[newcol] = encoded
    return df

In [9]:
# Mean encoding with MeanEncoding's defaults: 4 folds, regularisation parameter (alpha) 5
col_list = [
    'campaign_id', 'coupon_id', 'customer_id', 'age_range', 'rented',
    'family_size', 'income_bracket', 'campaign_type',
    'camp_start_month', 'camp_end_month', 'camp_start_quarter',
    'camp_end_quarter', 'camp_start_year', 'camp_end_year',
    'marital_status', 'no_of_children',
]

trgt = 'redemption_status'

for cat_col in col_list:
    newcol = cat_col + '_Enc'
    # Train rows receive the out-of-fold encoding
    tr = MeanEncoding(tr, col=cat_col, trgt=trgt)
    # Test rows receive, per category, the average of the train encodings
    map_enc = tr.groupby([cat_col])[newcol].mean()
    ts[newcol] = ts[cat_col].map(map_enc).astype('float')
    # Encodings that are still missing fall back to the overall train target mean
    tr[newcol] = tr[newcol].fillna(tr[trgt].mean())
    ts[newcol] = ts[newcol].fillna(tr[trgt].mean())

#### Creating new features based on coupon id and customer id combination in training and test dataset

In [14]:
# Customer transactions and coupon-item mappings, each joined with the item attributes
cust_item = pd.merge(cust_trans, item, how='left', on='item_id')
coup_item = pd.merge(coup, item, how='left', on='item_id')

In [15]:
# Number of distinct items covered by the coupon that the customer has bought
x1 = cust_item[['customer_id', 'item_id']].drop_duplicates()
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every customer who bought one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['item_id'].nunique().rename('coupcust_items_bought').reset_index()
# Coupon/customer pairs without any matching purchase get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_items_bought'] = tr['coupcust_items_bought'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_items_bought'] = ts['coupcust_items_bought'].fillna(0)

In [16]:
# Total quantity bought by the customer of items covered by the coupon
x1 = cust_item[['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every customer transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_quantity_sum'] = tr['coupcust_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_quantity_sum'] = ts['coupcust_quantity_sum'].fillna(0)

In [17]:
# Number of transactions by the customer on items covered by the coupon
x1 = cust_item[['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every customer transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_trans_count'] = tr['coupcust_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_trans_count'] = ts['coupcust_trans_count'].fillna(0)

In [18]:
# Number of transactions with a coupon discount by the customer on items covered by the coupon
x1 = cust_item.loc[cust_item['coupon_discount'] != 0, ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_coupdisc_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_coupdisc_trans_count'] = tr['coupcust_coupdisc_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_coupdisc_trans_count'] = ts['coupcust_coupdisc_trans_count'].fillna(0)

In [19]:
# Number of transactions with an other (non-coupon) discount by the customer on items covered by the coupon
x1 = cust_item.loc[cust_item['other_discount'] != 0, ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_otherdisc_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_otherdisc_trans_count'] = tr['coupcust_otherdisc_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_otherdisc_trans_count'] = ts['coupcust_otherdisc_trans_count'].fillna(0)

In [20]:
# Number of transactions with any discount (coupon or other) by the customer on items covered by the coupon
any_discount = (cust_item['other_discount'] != 0) | (cust_item['coupon_discount'] != 0)
x1 = cust_item.loc[any_discount, ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_discount_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_discount_trans_count'] = tr['coupcust_discount_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_discount_trans_count'] = ts['coupcust_discount_trans_count'].fillna(0)

In [21]:
cname1 = 'coupcust_trans_count'
cname2 = 'coupcust_coupdisc_trans_count'
cname3 = 'coupcust_otherdisc_trans_count'
cname4 = 'coupcust_discount_trans_count'
cname5 = 'coupcust_coupdisc_trans_perc'
cname6 = 'coupcust_otherdisc_trans_perc'
cname7 = 'coupcust_discount_trans_perc'

# Share of the coupon-item transactions that carried a coupon / other / any discount.
# Computed column-wise instead of row by row; rows with a zero transaction count get 0.
for frame in (tr, ts):
    has_trans = frame[cname1] != 0
    frame[cname5] = (frame[cname2] / frame[cname1]).where(has_trans, 0)
    frame[cname6] = (frame[cname3] / frame[cname1]).where(has_trans, 0)
    frame[cname7] = (frame[cname4] / frame[cname1]).where(has_trans, 0)

In [22]:
# Total quantity in coupon-discount transactions by the customer on items covered by the coupon
x1 = cust_item.loc[cust_item['coupon_discount'] != 0, ['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_coupdisc_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_coupdisc_quantity_sum'] = tr['coupcust_coupdisc_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_coupdisc_quantity_sum'] = ts['coupcust_coupdisc_quantity_sum'].fillna(0)

In [23]:
# Total quantity in other-discount transactions by the customer on items covered by the coupon
x1 = cust_item.loc[cust_item['other_discount'] != 0, ['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_otherdisc_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_otherdisc_quantity_sum'] = tr['coupcust_otherdisc_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_otherdisc_quantity_sum'] = ts['coupcust_otherdisc_quantity_sum'].fillna(0)

In [24]:
# Total quantity in transactions with any discount (coupon or other) by the customer on items covered by the coupon
any_discount = (cust_item['other_discount'] != 0) | (cust_item['coupon_discount'] != 0)
x1 = cust_item.loc[any_discount, ['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_discount_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_discount_quantity_sum'] = tr['coupcust_discount_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_discount_quantity_sum'] = ts['coupcust_discount_quantity_sum'].fillna(0)

In [25]:
cname1 = 'coupcust_quantity_sum'
cname2 = 'coupcust_coupdisc_quantity_sum'
cname3 = 'coupcust_otherdisc_quantity_sum'
cname4 = 'coupcust_discount_quantity_sum'
cname5 = 'coupcust_coupdisc_quantity_perc'
cname6 = 'coupcust_otherdisc_quantity_perc'
cname7 = 'coupcust_discount_quantity_perc'

# Share of the coupon-item quantity that was bought with a coupon / other / any discount.
# Computed column-wise instead of row by row; rows with a zero total quantity get 0.
for frame in (tr, ts):
    has_quantity = frame[cname1] != 0
    frame[cname5] = (frame[cname2] / frame[cname1]).where(has_quantity, 0)
    frame[cname6] = (frame[cname3] / frame[cname1]).where(has_quantity, 0)
    frame[cname7] = (frame[cname4] / frame[cname1]).where(has_quantity, 0)

In [26]:
# Total selling price over the customer's transactions on items covered by the coupon
x1 = cust_item[['customer_id', 'item_id', 'selling_price']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every customer transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['selling_price'].sum().rename('coupcust_selling_price_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_selling_price_sum'] = tr['coupcust_selling_price_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_selling_price_sum'] = ts['coupcust_selling_price_sum'].fillna(0)

In [27]:
# Total coupon discount over the customer's transactions on items covered by the coupon
x1 = cust_item[['customer_id', 'item_id', 'coupon_discount']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every customer transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['coupon_discount'].sum().rename('coupcust_coupon_discount_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_coupon_discount_sum'] = tr['coupcust_coupon_discount_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_coupon_discount_sum'] = ts['coupcust_coupon_discount_sum'].fillna(0)

In [28]:
# Total other (non-coupon) discount over the customer's transactions on items covered by the coupon
x1 = cust_item[['customer_id', 'item_id', 'other_discount']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every customer transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['other_discount'].sum().rename('coupcust_other_discount_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_other_discount_sum'] = tr['coupcust_other_discount_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_other_discount_sum'] = ts['coupcust_other_discount_sum'].fillna(0)

In [29]:
cname1 = 'coupcust_selling_price_sum'
cname2 = 'coupcust_coupon_discount_sum'
cname3 = 'coupcust_other_discount_sum'
cname4 = 'coupcust_coupdisc_price_perc'
cname5 = 'coupcust_otherdisc_price_perc'
cname6 = 'coupcust_discount_price_perc'

# Coupon / other / combined discount as a fraction of the total selling price.
# Computed column-wise instead of row by row; rows with a zero selling-price total get 0.
for frame in (tr, ts):
    has_sales = frame[cname1] != 0
    frame[cname4] = (frame[cname2] / frame[cname1]).where(has_sales, 0)
    frame[cname5] = (frame[cname3] / frame[cname1]).where(has_sales, 0)
    frame[cname6] = ((frame[cname2] + frame[cname3]) / frame[cname1]).where(has_sales, 0)

In [30]:
# Number of distinct Local-brand items covered by the coupon that the customer has bought
x1 = cust_item.loc[cust_item['brand_type'] == 'Local', ['customer_id', 'item_id']].drop_duplicates()
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every customer who bought one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['item_id'].nunique().rename('coupcust_Local_items_bought').reset_index()
# Coupon/customer pairs without any matching purchase get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Local_items_bought'] = tr['coupcust_Local_items_bought'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Local_items_bought'] = ts['coupcust_Local_items_bought'].fillna(0)

In [31]:
# Number of distinct Established-brand items covered by the coupon that the customer has bought
x1 = cust_item.loc[cust_item['brand_type'] == 'Established', ['customer_id', 'item_id']].drop_duplicates()
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every customer who bought one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['item_id'].nunique().rename('coupcust_Established_items_bought').reset_index()
# Coupon/customer pairs without any matching purchase get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Established_items_bought'] = tr['coupcust_Established_items_bought'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Established_items_bought'] = ts['coupcust_Established_items_bought'].fillna(0)

In [32]:
# Number of Local-brand transactions by the customer on items covered by the coupon
x1 = cust_item.loc[cust_item['brand_type'] == 'Local', ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_Local_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Local_trans_count'] = tr['coupcust_Local_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Local_trans_count'] = ts['coupcust_Local_trans_count'].fillna(0)

In [33]:
# Number of Established-brand transactions by the customer on items covered by the coupon
x1 = cust_item.loc[cust_item['brand_type'] == 'Established', ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_Established_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Established_trans_count'] = tr['coupcust_Established_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Established_trans_count'] = ts['coupcust_Established_trans_count'].fillna(0)

In [34]:
# Number of Local-brand coupon-discount transactions by the customer on items covered by the coupon
local_coupdisc = (cust_item['brand_type'] == 'Local') & (cust_item['coupon_discount'] != 0)
x1 = cust_item.loc[local_coupdisc, ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_Local_coupdisc_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Local_coupdisc_trans_count'] = tr['coupcust_Local_coupdisc_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Local_coupdisc_trans_count'] = ts['coupcust_Local_coupdisc_trans_count'].fillna(0)

In [35]:
# Number of Established-brand coupon-discount transactions by the customer on items covered by the coupon
established_coupdisc = (cust_item['brand_type'] == 'Established') & (cust_item['coupon_discount'] != 0)
x1 = cust_item.loc[established_coupdisc, ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_Established_coupdisc_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Established_coupdisc_trans_count'] = tr['coupcust_Established_coupdisc_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Established_coupdisc_trans_count'] = ts['coupcust_Established_coupdisc_trans_count'].fillna(0)

In [36]:
# Number of Local-brand other-discount transactions by the customer on items covered by the coupon
local_otherdisc = (cust_item['brand_type'] == 'Local') & (cust_item['other_discount'] != 0)
x1 = cust_item.loc[local_otherdisc, ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_Local_otherdisc_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Local_otherdisc_trans_count'] = tr['coupcust_Local_otherdisc_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Local_otherdisc_trans_count'] = ts['coupcust_Local_otherdisc_trans_count'].fillna(0)

In [37]:
# Number of Established-brand other-discount transactions by the customer on items covered by the coupon
established_otherdisc = (cust_item['brand_type'] == 'Established') & (cust_item['other_discount'] != 0)
x1 = cust_item.loc[established_otherdisc, ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_Established_otherdisc_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Established_otherdisc_trans_count'] = tr['coupcust_Established_otherdisc_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Established_otherdisc_trans_count'] = ts['coupcust_Established_otherdisc_trans_count'].fillna(0)

In [38]:
# Number of Local-brand transactions with any discount (coupon or other) by the customer
# on items covered by the coupon
local_discount = (cust_item['brand_type'] == 'Local') & (
    (cust_item['other_discount'] != 0) | (cust_item['coupon_discount'] != 0)
)
x1 = cust_item.loc[local_discount, ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_Local_discount_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Local_discount_trans_count'] = tr['coupcust_Local_discount_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Local_discount_trans_count'] = ts['coupcust_Local_discount_trans_count'].fillna(0)

In [39]:
# Number of Established-brand transactions with any discount (coupon or other) by the customer
# on items covered by the coupon
established_discount = (cust_item['brand_type'] == 'Established') & (
    (cust_item['other_discount'] != 0) | (cust_item['coupon_discount'] != 0)
)
x1 = cust_item.loc[established_discount, ['customer_id', 'item_id']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id']).size().rename('coupcust_Established_discount_trans_count').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Established_discount_trans_count'] = tr['coupcust_Established_discount_trans_count'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Established_discount_trans_count'] = ts['coupcust_Established_discount_trans_count'].fillna(0)

In [40]:
# Per brand type: share of the coupon-item transactions that carried a coupon / other / any discount
br_l = ['Local', 'Established']
for brand in br_l:
    cname1 = 'coupcust_' + brand + '_trans_count'
    cname2 = 'coupcust_' + brand + '_coupdisc_trans_count'
    cname3 = 'coupcust_' + brand + '_otherdisc_trans_count'
    cname4 = 'coupcust_' + brand + '_discount_trans_count'
    cname5 = 'coupcust_' + brand + '_coupdisc_trans_perc'
    cname6 = 'coupcust_' + brand + '_otherdisc_trans_perc'
    cname7 = 'coupcust_' + brand + '_discount_trans_perc'

    # Computed column-wise instead of row by row; rows with a zero transaction count get 0.
    for frame in (tr, ts):
        has_trans = frame[cname1] != 0
        frame[cname5] = (frame[cname2] / frame[cname1]).where(has_trans, 0)
        frame[cname6] = (frame[cname3] / frame[cname1]).where(has_trans, 0)
        frame[cname7] = (frame[cname4] / frame[cname1]).where(has_trans, 0)

In [41]:
# Total quantity of Local-brand items bought by the customer among items covered by the coupon
x1 = cust_item.loc[cust_item['brand_type'] == 'Local', ['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_Local_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Local_quantity_sum'] = tr['coupcust_Local_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Local_quantity_sum'] = ts['coupcust_Local_quantity_sum'].fillna(0)

In [42]:
# Total quantity of Established-brand items bought by the customer among items covered by the coupon
x1 = cust_item.loc[cust_item['brand_type'] == 'Established', ['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_Established_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Established_quantity_sum'] = tr['coupcust_Established_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Established_quantity_sum'] = ts['coupcust_Established_quantity_sum'].fillna(0)

In [43]:
# Total Local-brand quantity in coupon-discount transactions by the customer on items covered by the coupon
local_coupdisc = (cust_item['brand_type'] == 'Local') & (cust_item['coupon_discount'] != 0)
x1 = cust_item.loc[local_coupdisc, ['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_Local_coupdisc_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Local_coupdisc_quantity_sum'] = tr['coupcust_Local_coupdisc_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Local_coupdisc_quantity_sum'] = ts['coupcust_Local_coupdisc_quantity_sum'].fillna(0)

In [44]:
# Total Established-brand quantity in coupon-discount transactions by the customer on items covered by the coupon
established_coupdisc = (cust_item['brand_type'] == 'Established') & (cust_item['coupon_discount'] != 0)
x1 = cust_item.loc[established_coupdisc, ['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_Established_coupdisc_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Established_coupdisc_quantity_sum'] = tr['coupcust_Established_coupdisc_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Established_coupdisc_quantity_sum'] = ts['coupcust_Established_coupdisc_quantity_sum'].fillna(0)

In [45]:
# Total Local-brand quantity in other-discount transactions by the customer on items covered by the coupon
local_otherdisc = (cust_item['brand_type'] == 'Local') & (cust_item['other_discount'] != 0)
x1 = cust_item.loc[local_otherdisc, ['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_Local_otherdisc_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Local_otherdisc_quantity_sum'] = tr['coupcust_Local_otherdisc_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Local_otherdisc_quantity_sum'] = ts['coupcust_Local_otherdisc_quantity_sum'].fillna(0)

In [46]:
# Total Established-brand quantity in other-discount transactions by the customer on items covered by the coupon
established_otherdisc = (cust_item['brand_type'] == 'Established') & (cust_item['other_discount'] != 0)
x1 = cust_item.loc[established_otherdisc, ['customer_id', 'item_id', 'quantity']]
x2 = coup_item[['coupon_id', 'item_id']]
# Inner join on item_id pairs each coupon with every such transaction on one of its items
x3 = x2.merge(x1, on='item_id', how='inner')
tmp = x3.groupby(['coupon_id', 'customer_id'])['quantity'].sum().rename('coupcust_Established_otherdisc_quantity_sum').reset_index()
# Coupon/customer pairs without any matching transaction get 0
tr = tr.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
tr['coupcust_Established_otherdisc_quantity_sum'] = tr['coupcust_Established_otherdisc_quantity_sum'].fillna(0)
ts = ts.merge(tmp, on=['coupon_id', 'customer_id'], how='left')
ts['coupcust_Established_otherdisc_quantity_sum'] = ts['coupcust_Established_otherdisc_quantity_sum'].fillna(0)

In [47]:
# Total quantity of 'Local' brand_type purchases made with any discount
# (non-zero other_discount OR non-zero coupon_discount), summed per
# (coupon_id, customer_id) over the items listed for that coupon in coup_item.
feat_col = 'coupcust_Local_discount_quantity_sum'
any_discount = (cust_item['other_discount'] != 0) | (cust_item['coupon_discount'] != 0)
purchases = cust_item.loc[(cust_item['brand_type'] == 'Local') & any_discount,
                          ['customer_id', 'item_id', 'quantity']]
# Inner join on item_id pairs each coupon_id in coup_item with the matching purchase rows.
coupon_purchases = coup_item[['coupon_id', 'item_id']].merge(purchases, on='item_id', how='inner')
coupcust_feat = (coupon_purchases
                 .groupby(['coupon_id', 'customer_id'])['quantity'].sum()
                 .reset_index()
                 .rename(columns={'quantity': feat_col}))
# Left merge keeps every train/test row; (coupon, customer) pairs without a match get 0.
tr = tr.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
tr[feat_col] = tr[feat_col].fillna(0)
ts = ts.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
ts[feat_col] = ts[feat_col].fillna(0)

In [48]:
# Total quantity of 'Established' brand_type purchases made with any discount
# (non-zero other_discount OR non-zero coupon_discount), summed per
# (coupon_id, customer_id) over the items listed for that coupon in coup_item.
feat_col = 'coupcust_Established_discount_quantity_sum'
any_discount = (cust_item['other_discount'] != 0) | (cust_item['coupon_discount'] != 0)
purchases = cust_item.loc[(cust_item['brand_type'] == 'Established') & any_discount,
                          ['customer_id', 'item_id', 'quantity']]
# Inner join on item_id pairs each coupon_id in coup_item with the matching purchase rows.
coupon_purchases = coup_item[['coupon_id', 'item_id']].merge(purchases, on='item_id', how='inner')
coupcust_feat = (coupon_purchases
                 .groupby(['coupon_id', 'customer_id'])['quantity'].sum()
                 .reset_index()
                 .rename(columns={'quantity': feat_col}))
# Left merge keeps every train/test row; (coupon, customer) pairs without a match get 0.
tr = tr.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
tr[feat_col] = tr[feat_col].fillna(0)
ts = ts.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
ts[feat_col] = ts[feat_col].fillna(0)

In [49]:
# For each brand type, express the coupon-discount, other-discount and any-discount
# quantity sums as a fraction of that brand type's total quantity sum.
br_l = ['Local', 'Established']
for brand in br_l:
    prefix = 'coupcust_' + str(brand) + '_'
    total_col = prefix + 'quantity_sum'
    # (numerator column, derived percentage column), in the order the columns are created.
    ratio_cols = [
        (prefix + 'coupdisc_quantity_sum', prefix + 'coupdisc_quantity_perc'),
        (prefix + 'otherdisc_quantity_sum', prefix + 'otherdisc_quantity_perc'),
        (prefix + 'discount_quantity_sum', prefix + 'discount_quantity_perc'),
    ]
    for num_col, perc_col in ratio_cols:
        for frame in (tr, ts):
            # Vectorised guarded division: rows whose total is 0 get 0 rather than inf/NaN.
            frame[perc_col] = (frame[num_col] / frame[total_col]).where(frame[total_col] != 0, 0)

In [50]:
# Total selling_price of 'Local' brand_type purchases (all transactions), summed per
# (coupon_id, customer_id) over the items listed for that coupon in coup_item.
feat_col = 'coupcust_Local_selling_price_sum'
purchases = cust_item.loc[cust_item['brand_type'] == 'Local',
                          ['customer_id', 'item_id', 'selling_price']]
# Inner join on item_id pairs each coupon_id in coup_item with the matching purchase rows.
coupon_purchases = coup_item[['coupon_id', 'item_id']].merge(purchases, on='item_id', how='inner')
coupcust_feat = (coupon_purchases
                 .groupby(['coupon_id', 'customer_id'])['selling_price'].sum()
                 .reset_index()
                 .rename(columns={'selling_price': feat_col}))
# Left merge keeps every train/test row; (coupon, customer) pairs without a match get 0.
tr = tr.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
tr[feat_col] = tr[feat_col].fillna(0)
ts = ts.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
ts[feat_col] = ts[feat_col].fillna(0)

In [51]:
# Total selling_price of 'Established' brand_type purchases (all transactions), summed per
# (coupon_id, customer_id) over the items listed for that coupon in coup_item.
feat_col = 'coupcust_Established_selling_price_sum'
purchases = cust_item.loc[cust_item['brand_type'] == 'Established',
                          ['customer_id', 'item_id', 'selling_price']]
# Inner join on item_id pairs each coupon_id in coup_item with the matching purchase rows.
coupon_purchases = coup_item[['coupon_id', 'item_id']].merge(purchases, on='item_id', how='inner')
coupcust_feat = (coupon_purchases
                 .groupby(['coupon_id', 'customer_id'])['selling_price'].sum()
                 .reset_index()
                 .rename(columns={'selling_price': feat_col}))
# Left merge keeps every train/test row; (coupon, customer) pairs without a match get 0.
tr = tr.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
tr[feat_col] = tr[feat_col].fillna(0)
ts = ts.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
ts[feat_col] = ts[feat_col].fillna(0)

In [52]:
# Total coupon_discount on 'Local' brand_type purchases (all transactions), summed per
# (coupon_id, customer_id) over the items listed for that coupon in coup_item.
feat_col = 'coupcust_Local_coupon_discount_sum'
purchases = cust_item.loc[cust_item['brand_type'] == 'Local',
                          ['customer_id', 'item_id', 'coupon_discount']]
# Inner join on item_id pairs each coupon_id in coup_item with the matching purchase rows.
coupon_purchases = coup_item[['coupon_id', 'item_id']].merge(purchases, on='item_id', how='inner')
coupcust_feat = (coupon_purchases
                 .groupby(['coupon_id', 'customer_id'])['coupon_discount'].sum()
                 .reset_index()
                 .rename(columns={'coupon_discount': feat_col}))
# Left merge keeps every train/test row; (coupon, customer) pairs without a match get 0.
tr = tr.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
tr[feat_col] = tr[feat_col].fillna(0)
ts = ts.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
ts[feat_col] = ts[feat_col].fillna(0)

In [53]:
# Total coupon_discount on 'Established' brand_type purchases (all transactions), summed per
# (coupon_id, customer_id) over the items listed for that coupon in coup_item.
feat_col = 'coupcust_Established_coupon_discount_sum'
purchases = cust_item.loc[cust_item['brand_type'] == 'Established',
                          ['customer_id', 'item_id', 'coupon_discount']]
# Inner join on item_id pairs each coupon_id in coup_item with the matching purchase rows.
coupon_purchases = coup_item[['coupon_id', 'item_id']].merge(purchases, on='item_id', how='inner')
coupcust_feat = (coupon_purchases
                 .groupby(['coupon_id', 'customer_id'])['coupon_discount'].sum()
                 .reset_index()
                 .rename(columns={'coupon_discount': feat_col}))
# Left merge keeps every train/test row; (coupon, customer) pairs without a match get 0.
tr = tr.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
tr[feat_col] = tr[feat_col].fillna(0)
ts = ts.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
ts[feat_col] = ts[feat_col].fillna(0)

In [54]:
# Total other_discount on 'Local' brand_type purchases (all transactions), summed per
# (coupon_id, customer_id) over the items listed for that coupon in coup_item.
feat_col = 'coupcust_Local_other_discount_sum'
purchases = cust_item.loc[cust_item['brand_type'] == 'Local',
                          ['customer_id', 'item_id', 'other_discount']]
# Inner join on item_id pairs each coupon_id in coup_item with the matching purchase rows.
coupon_purchases = coup_item[['coupon_id', 'item_id']].merge(purchases, on='item_id', how='inner')
coupcust_feat = (coupon_purchases
                 .groupby(['coupon_id', 'customer_id'])['other_discount'].sum()
                 .reset_index()
                 .rename(columns={'other_discount': feat_col}))
# Left merge keeps every train/test row; (coupon, customer) pairs without a match get 0.
tr = tr.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
tr[feat_col] = tr[feat_col].fillna(0)
ts = ts.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
ts[feat_col] = ts[feat_col].fillna(0)

In [55]:
# Total other_discount on 'Established' brand_type purchases (all transactions), summed per
# (coupon_id, customer_id) over the items listed for that coupon in coup_item.
feat_col = 'coupcust_Established_other_discount_sum'
purchases = cust_item.loc[cust_item['brand_type'] == 'Established',
                          ['customer_id', 'item_id', 'other_discount']]
# Inner join on item_id pairs each coupon_id in coup_item with the matching purchase rows.
coupon_purchases = coup_item[['coupon_id', 'item_id']].merge(purchases, on='item_id', how='inner')
coupcust_feat = (coupon_purchases
                 .groupby(['coupon_id', 'customer_id'])['other_discount'].sum()
                 .reset_index()
                 .rename(columns={'other_discount': feat_col}))
# Left merge keeps every train/test row; (coupon, customer) pairs without a match get 0.
tr = tr.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
tr[feat_col] = tr[feat_col].fillna(0)
ts = ts.merge(coupcust_feat, on=['coupon_id', 'customer_id'], how='left')
ts[feat_col] = ts[feat_col].fillna(0)

In [56]:
# For each brand type, express the coupon discount, the other discount and their sum
# as a fraction of that brand type's total selling price.
br_l = ['Local', 'Established']
for brand in br_l:
    prefix = 'coupcust_' + str(brand) + '_'
    price_col = prefix + 'selling_price_sum'
    coup_col = prefix + 'coupon_discount_sum'
    other_col = prefix + 'other_discount_sum'

    for frame in (tr, ts):
        # Vectorised guarded division: rows whose selling-price sum is 0 get 0
        # rather than inf/NaN.
        has_price = frame[price_col] != 0
        frame[prefix + 'coupdisc_price_perc'] = (frame[coup_col] / frame[price_col]).where(has_price, 0)
        frame[prefix + 'otherdisc_price_perc'] = (frame[other_col] / frame[price_col]).where(has_price, 0)
        frame[prefix + 'discount_price_perc'] = (
            (frame[coup_col] + frame[other_col]) / frame[price_col]).where(has_price, 0)

In [370]:
# Display the column index of the training frame to review the features built so far.
tr.columns

Index(['id', 'campaign_id', 'coupon_id', 'customer_id', 'redemption_status',
       'age_range', 'marital_status', 'rented', 'family_size',
       'no_of_children', 'income_bracket', 'campaign_type',
       'camp_duration_days', 'camp_duration_weeks', 'camp_start_month',
       'camp_end_month', 'camp_start_quarter', 'camp_end_quarter',
       'camp_start_year', 'camp_end_year', 'campaign_id_Enc', 'coupon_id_Enc',
       'customer_id_Enc', 'age_range_Enc', 'rented_Enc', 'family_size_Enc',
       'income_bracket_Enc', 'campaign_type_Enc', 'camp_start_month_Enc',
       'camp_end_month_Enc', 'camp_start_quarter_Enc', 'camp_end_quarter_Enc',
       'camp_start_year_Enc', 'camp_end_year_Enc', 'marital_status_Enc',
       'no_of_children_Enc', 'coupcust_items_bought', 'coupcust_quantity_sum',
       'coupcust_trans_count', 'coupcust_coupdisc_trans_count',
       'coupcust_otherdisc_trans_count', 'coupcust_discount_trans_count',
       'coupcust_coupdisc_trans_perc', 'coupcust_otherdisc_tran

In [57]:
# Build the modelling frame from the training data and split it for validation.
# NOTE(review): 'id' is not in any drop list below, so it appears to stay in the
# feature matrix -- confirm that is intended.

# Columns removed from the training frame before modelling.
drop_col = [
    'age_range', 'marital_status', 'rented', 'family_size', 'no_of_children',
    'income_bracket', 'campaign_type', 'camp_start_month', 'camp_end_month',
    'camp_start_quarter', 'camp_end_quarter', 'camp_start_year', 'camp_end_year',
]

# Attach campaign-level then coupon-level aggregates; each join key is dropped
# right after it has been used.
tr1 = (
    tr.drop(columns=drop_col)
      .merge(camp_agg, on='campaign_id', how='left')
      .drop(columns=['campaign_id'])
      .merge(coup_agg, on='coupon_id', how='left')
      .drop(columns=['coupon_id'])
)
# Attach customer-level aggregates and drop the key plus two brand-nunique columns.
model_data = (
    tr1.merge(cust_agg, on='customer_id', how='left')
       .drop(columns=['customer_id', 'coup_Local_brand_nunique',
                      'coup_Established_brand_nunique'])
)

# Stratified 75/25 train/validation split on redemption_status.
X_train, X_test, y_train, y_test = train_test_split(
    model_data.drop(columns=['redemption_status']),
    model_data['redemption_status'],
    test_size=0.25,
    random_state=100,
    stratify=model_data['redemption_status'],
)

In [58]:
# Random forest trial: fit on the training split, then print the 25 largest feature
# importances, the out-of-bag score, and ROC-AUC plus confusion matrix for the
# training split followed by the validation split.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    random_state=100,
    n_jobs=-1,
    oob_score=True,
    class_weight="balanced_subsample",
)
rf.fit(X_train, y_train)

top_importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns)
      .sort_values(ascending=False)
      .head(25)
)
print("Feature Importance:\n" + str(top_importances))
print(rf.oob_score_)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(roc_auc_score(labels, rf.predict_proba(features)[:, 1]))
    print(confusion_matrix(labels, rf.predict(features)))

Feature Importance:
coupcust_Established_coupdisc_quantity_perc     0.058836
coupcust_Established_coupon_discount_sum        0.058085
coupcust_Established_discount_trans_perc        0.057590
coupcust_coupdisc_price_perc                    0.056858
coupcust_coupdisc_trans_perc                    0.052620
coupcust_trans_count                            0.041921
coupcust_coupon_discount_sum                    0.038618
coupcust_coupdisc_quantity_sum                  0.037388
coupcust_Established_coupdisc_price_perc        0.036280
coupcust_quantity_sum                           0.034956
coupcust_Established_discount_trans_count       0.027994
coupcust_Established_coupdisc_quantity_sum      0.027629
coupcust_coupdisc_trans_count                   0.025707
coupcust_otherdisc_quantity_perc                0.020779
coupcust_items_bought                           0.020369
coupcust_selling_price_sum                      0.019787
coupcust_Established_otherdisc_quantity_perc    0.016685
coupcust_Es

In [59]:
# CatBoost trial: up to 4000 iterations with AUC as the eval metric.
# Early stopping (100 rounds) is monitored on the validation split, which is
# also the split scored below.
cat = CatBoostClassifier(iterations=4000, eval_metric='AUC')
fit_params = dict(early_stopping_rounds=100, eval_set=[(X_test, y_test)], verbose=500)
cat.fit(X_train, y_train, **fit_params)

# Positive-class probabilities for the training and validation splits.
y_pred1_prob, y_pred2_prob = (cat.predict_proba(split)[:, 1] for split in (X_train, X_test))
print(roc_auc_score(y_train, y_pred1_prob))
print(roc_auc_score(y_test, y_pred2_prob))

Learning rate set to 0.06612
0:	test: 0.9872888	best: 0.9872888 (0)	total: 435ms	remaining: 29m
500:	test: 0.9988672	best: 0.9988779 (492)	total: 2m 25s	remaining: 16m 56s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.9988807751
bestIteration = 507

Shrink model to first 508 iterations.
1.0
0.998880775090141


In [61]:
# LightGBM trial on the train/validation split.
lgbm = LGBMClassifier(
    objective='binary',
    n_estimators=4000,
    learning_rate=0.05,
    num_leaves=8,
    min_child_samples=250,
    max_bin=31,
    colsample_bytree=0.5,
    subsample=0.8,
    subsample_freq=1,
    scale_pos_weight=2.5,
)
# Both splits are passed as eval sets and AUC is logged; early stopping uses
# 100 rounds of patience.
fit_params = dict(
    early_stopping_rounds=100,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=500,
    eval_metric='auc',
)
lgbm.fit(X_train, y_train, **fit_params)

# Positive-class probabilities for the training and validation splits.
y_pred1_prob, y_pred2_prob = (lgbm.predict_proba(split)[:, 1] for split in (X_train, X_test))
print(roc_auc_score(y_train, y_pred1_prob))
print(roc_auc_score(y_test, y_pred2_prob))

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[210]	training's auc: 0.999955	training's binary_logloss: 0.00351396	valid_1's auc: 0.998914	valid_1's binary_logloss: 0.0078413
0.9999551980089455
0.9989136102164797


In [62]:
# LGBM was selected based on accuracy and speed.
# Rebuild the full training frame (no hold-out) and the matching test frame for
# the final predictions. This rebinds X_train / y_train to the complete training data.

# Columns removed from both frames before modelling.
drop_col = [
    'age_range', 'marital_status', 'rented', 'family_size', 'no_of_children',
    'income_bracket', 'campaign_type', 'camp_start_month', 'camp_end_month',
    'camp_start_quarter', 'camp_end_quarter', 'camp_start_year', 'camp_end_year',
]

# Train: attach campaign-level then coupon-level aggregates, dropping each join key after use.
tr1 = (
    tr.drop(columns=drop_col)
      .merge(camp_agg, on='campaign_id', how='left')
      .drop(columns=['campaign_id'])
      .merge(coup_agg, on='coupon_id', how='left')
      .drop(columns=['coupon_id'])
)
# Train: attach customer-level aggregates and drop the key plus two brand-nunique columns.
model_data = (
    tr1.merge(cust_agg, on='customer_id', how='left')
       .drop(columns=['customer_id', 'coup_Local_brand_nunique',
                      'coup_Established_brand_nunique'])
)

X_train = model_data.drop(columns=['redemption_status'])
y_train = model_data['redemption_status']

# Test: the same sequence of merges and drops applied to the test frame.
ts1 = (
    ts.drop(columns=drop_col)
      .merge(camp_agg, on='campaign_id', how='left')
      .drop(columns=['campaign_id'])
      .merge(coup_agg, on='coupon_id', how='left')
      .drop(columns=['coupon_id'])
)
test_data = (
    ts1.merge(cust_agg, on='customer_id', how='left')
       .drop(columns=['customer_id', 'coup_Local_brand_nunique',
                      'coup_Established_brand_nunique'])
)

In [64]:
def kfoldvalidationLGBM(X_train, y_train, X_test, splits=10):
    """Train one LightGBM classifier per stratified fold and average the test predictions.

    For each fold a model is fitted on the training part with early stopping
    (100 rounds, AUC logged on both the training and held-out parts). The
    held-out AUC is printed per fold, and the AUC over all held-out
    predictions pooled together is printed at the end.

    Parameters
    ----------
    X_train, y_train : objects supporting positional ``.iloc`` indexing
        Features and binary labels used for the stratified folds.
    X_test : feature matrix accepted by ``predict_proba``
        Rows to score with every fold model.
    splits : int, default 10
        Number of stratified folds (shuffled, random_state=100).

    Returns
    -------
    numpy.ndarray
        Positive-class probability for each row of ``X_test``, averaged over
        the fold models.
    """
    folds = StratifiedKFold(n_splits=splits, random_state=100, shuffle=True)
    test_preds = []   # one array of X_test probabilities per fold
    held_labels = []  # held-out labels, fold by fold
    held_preds = []   # held-out probabilities, fold by fold

    for fold_no, (fit_idx, val_idx) in enumerate(folds.split(X_train, y_train), start=1):
        X_fit, y_fit = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        model = LGBMClassifier(
            objective='binary',
            n_estimators=4000,
            learning_rate=0.05,
            num_leaves=8,
            min_child_samples=250,
            max_bin=63,
            colsample_bytree=0.5,
            subsample=0.8,
            subsample_freq=1,
            scale_pos_weight=1,
        )
        model.fit(
            X_fit, y_fit,
            early_stopping_rounds=100,
            eval_set=[(X_fit, y_fit), (X_val, y_val)],
            verbose=500,
            eval_metric='auc',
        )

        print('Fold :', fold_no)
        val_pred = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]
        print('AUC Score:\t', roc_auc_score(y_val, val_pred))

        held_labels.append(y_val)
        held_preds.append(val_pred)
        test_preds.append(model.predict_proba(X_test)[:, 1])

    print('Total AUC Score:\t',
          roc_auc_score(np.concatenate(held_labels), np.concatenate(held_preds)))
    return np.mean(test_preds, 0)

In [388]:
# Final test-set probabilities: the mean of the per-fold LGBM predictions (10 folds).
pred_lgbm = kfoldvalidationLGBM(X_train=X_train, y_train=y_train, X_test=test_data, splits=10)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[117]	training's auc: 0.999616	training's binary_logloss: 0.00543611	valid_1's auc: 0.999058	valid_1's binary_logloss: 0.00791353
Fold : 1
AUC Score:	 0.9990578221930512
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[134]	training's auc: 0.999716	training's binary_logloss: 0.00481732	valid_1's auc: 0.998597	valid_1's binary_logloss: 0.00939188
Fold : 2
AUC Score:	 0.9985973195570707
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[185]	training's auc: 0.999859	training's binary_logloss: 0.003964	valid_1's auc: 0.998993	valid_1's binary_logloss: 0.00800056
Fold : 3
AUC Score:	 0.9989925402101727
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[231]	training's auc: 0.999934	training's binary_logloss: 0.00332066	valid_1's auc: 0.999008	valid_1's b

In [389]:
# Preparing the final probability data for submission: test ids alongside the
# averaged predicted probabilities, written to CSV without the index.
out = ts[['id']].assign(redemption_status=pred_lgbm)
out.to_csv('KFoldLGBM_sub1.csv', index=False)