In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import os
import datetime

# Read the eight raw competition tables from the working directory.
# The target names and the file names below are listed in the same order.
(
    c_demographics,
    c_transaction,
    item_data,
    coupon_item_map,
    campaignData,
    train,
    test,
    sampleSubm,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'customer_demographics.csv',
        'customer_transaction_data.csv',
        'item_data.csv',
        'coupon_item_mapping.csv',
        'campaign_data.csv',
        'train.csv',
        'test_QyjYwdj.csv',
        'sample_submission_Byiv0dS.csv',
    )
)

# Combined "<brand_type>_<category>" label for every item.
item_data = item_data.assign(
    brand_type_category=item_data['brand_type'] + '_' + item_data['category']
)

# Attach the item attributes to each transaction row (left join keeps every
# transaction) and parse the ISO-formatted transaction date.
c_transaction = (
    c_transaction
    .merge(item_data, how='left', on='item_id')
    .assign(date=lambda txn: pd.to_datetime(txn['date'], format='%Y-%m-%d'))
)

# Campaign dates are written as dd/mm/yy. Parse both, order the campaigns by
# start date and keep the campaign length as a timedelta column.
campaignData = (
    campaignData
    .assign(
        start_date=lambda camp: pd.to_datetime(camp['start_date'], format='%d/%m/%y'),
        end_date=lambda camp: pd.to_datetime(camp['end_date'], format='%d/%m/%y'),
    )
    .sort_values(by='start_date')
    .assign(duration=lambda camp: camp['end_date'] - camp['start_date'])
)

In [2]:
from tqdm import tqdm

In [3]:
# ---------------------------------------------------------------------------
# Transaction-level features
# ---------------------------------------------------------------------------
# Sum of both discount columns for the transaction row.
c_transaction['total_disc'] = c_transaction['other_discount'] + c_transaction['coupon_discount']
# NOTE(review): the discount is *added* to price * quantity; this only yields a
# net bill if discounts are stored as non-positive numbers - confirm.
c_transaction['Total_Bill'] = c_transaction['selling_price'] * c_transaction['quantity'] + c_transaction['total_disc']

# Calendar features derived from the (already parsed) transaction date.
c_transaction['dayofweek'] = c_transaction['date'].dt.dayofweek
c_transaction['month'] = c_transaction['date'].dt.month
c_transaction['is_monthEnd'] = c_transaction['date'].dt.day > 25
c_transaction['is_monthStart'] = c_transaction['date'].dt.day < 7

# Discount magnitude relative to the selling price. Reuses total_disc instead
# of recomputing the same sum. A selling_price of 0 yields inf/NaN here.
c_transaction['disc_perc'] = c_transaction['total_disc'].abs() / c_transaction['selling_price']
# True when the coupon_discount value is negative.
c_transaction['is_coupoun_disc'] = c_transaction['coupon_discount'] < 0
c_transaction['brand_category'] = c_transaction['brand'].astype('str') + '_' + c_transaction['category']

# coupon_id -> array of item_ids attached to that coupon.
# Built in a single pass over the mapping; the previous version filtered the
# whole mapping once per coupon. sort=False keeps the coupons in order of
# first appearance, i.e. the same order `.unique()` produced.
coupon_item_dict = {
    coupon_id: item_ids.values
    for coupon_id, item_ids in coupon_item_map.groupby('coupon_id', sort=False)['item_id']
}
c_transaction['dayofmonth'] = c_transaction['date'].dt.day

# ---------------------------------------------------------------------------
# Demographic features
# ---------------------------------------------------------------------------
# Replace each age bracket label by one representative number.
treat_age = {'70+' : 72, '46-55' : 50.5, '36-45' : 40.5, '26-35' : 30.5, '56-70' : 63, '18-25' : 21.5}
# .map (like the two mappings below) gives NaN for a missing or unknown
# bracket instead of raising KeyError from a dict lookup.
c_demographics['age_approximated'] = c_demographics['age_range'].map(treat_age)
c_demographics['income_bucket_div_age'] = c_demographics['income_bracket'] / c_demographics['age_approximated']

# The two mappings below overwrite their source column: labels that are not a
# key of the dict become NaN, and running this cell a second time on the
# already-numeric columns would turn every value into NaN.
treat_family_size = {'1' : 1, '2' : 2, '3' : 3, '4' : 4, '5+' : 5}
c_demographics['family_size'] = c_demographics['family_size'].map(treat_family_size)
c_demographics['income_div_family_size'] = c_demographics['income_bracket'] / c_demographics['family_size']

c_demographics['no_of_children'] = c_demographics['no_of_children'].map({'1' : 1, '2' : 2, '3+' : 3})
c_demographics['income_bucket_div_no_of_child'] = c_demographics['income_bracket'] / c_demographics['no_of_children']

c_demographics['family_to_number_of_child'] = c_demographics['family_size'] / c_demographics['no_of_children']

# ---------------------------------------------------------------------------
# Campaign features
# ---------------------------------------------------------------------------
# Campaign length in whole days. Derived from the two date columns rather than
# from the existing `duration` column, so re-running the cell does not fail on
# `.dt` once `duration` already holds integers.
campaignData['duration'] = (campaignData['end_date'] - campaignData['start_date']).dt.days

In [4]:
# `train` / `test` were read from the raw CSVs in the first cell; from here on
# they are replaced by the frames stored in these pickle files.
# SECURITY NOTE: unpickling can execute arbitrary code - only load pickle
# files that you produced yourself.
train = pd.read_pickle('grpd_by_train.pkl')
test = pd.read_pickle('grpd_by_test.pkl')

# Not used in this cell; kept so that a later cell relying on the name still finds it.
aggDataTrain = pd.DataFrame()

# Output schema (and column order) shared by the train and the test aggregates.
COUPON_AGG_COLUMNS = [
    'cID',
    'coup',
    'nUniqueCust',
    'CoupounSize',
    'CoupSellingPriceMean',
    'CoupSellingPriceStd',
    'CoupSellingPriceMin',
    'CoupSellingPriceMax',
    'CoupCouponDiscMean',
    'CoupCouponDiscStd',
    'CoupCouponDiscMax',
    'CoupCouponDiscMin',
    'CoupOtherDiscMean',
    'CoupOtherDiscStd',
    'CoupOtherDiscMax',
    'CoupOtherDiscMin',
    'CoupBrandMode',
    'CoupBrandTypeMode',
    'CoupFav8DayofWeek',
    'CoupBrandNunique',
    'CoupBrandTypeNunique',
    'CoupQuantMean',
    'CoupQuantStd',
    'CoupQuantMax',
    'CoupQuantMin',
    'CoupTotalDiscMean',
    'CoupTotalDiscStd',
    'CoupTotalDiscMax',
    'CoupTotalDiscMin',
    'CoupTotalBillMean',
    'CoupTotalBillStd',
    'CoupTotalBillMax',
    'CoupTotalBillMin',
    'CoupIsMonthEndMean',
    'CoupIsMonthStartMean',
]

# Output-column prefix -> transaction column that is summarised.
_COUPON_NUMERIC_SOURCES = {
    'CoupSellingPrice': 'selling_price',
    'CoupCouponDisc': 'coupon_discount',
    'CoupOtherDisc': 'other_discount',
    'CoupQuant': 'quantity',
    'CoupTotalDisc': 'total_disc',
    'CoupTotalBill': 'Total_Bill',
}

# Output-column suffix -> name of the pandas Series reduction producing it.
_COUPON_STAT_METHODS = {'Mean': 'mean', 'Std': 'std', 'Max': 'max', 'Min': 'min'}


def _most_frequent(values):
    """Return the most frequent non-null entry of `values`, or NaN if there is none."""
    counts = values.value_counts()
    return counts.index[0] if len(counts) else np.nan


def _coupon_history_row(campaign_id, coupon_id, coupon_txn, n_items):
    """Summarise the past transactions of one coupon's items as a single record.

    Parameters
    ----------
    campaign_id, coupon_id
        Identifiers stored in the 'cID' and 'coup' fields of the record.
    coupon_txn : pandas.DataFrame
        Transactions restricted to the coupon's items; may be empty, in which
        case the statistics are NaN and the nunique counts are 0.
    n_items : int
        Number of items attached to the coupon ('CoupounSize').

    Returns
    -------
    dict
        One value per name in COUPON_AGG_COLUMNS.
    """
    row = {
        'cID': campaign_id,
        'coup': coupon_id,
        'nUniqueCust': coupon_txn['customer_id'].nunique(),
        'CoupounSize': n_items,
        # Each mode is resolved on its own; a column without any non-null value
        # gives NaN without blanking the other two.
        'CoupBrandMode': _most_frequent(coupon_txn['brand']),
        'CoupBrandTypeMode': _most_frequent(coupon_txn['brand_type_category']),
        'CoupFav8DayofWeek': _most_frequent(coupon_txn['dayofweek']),
        'CoupBrandNunique': coupon_txn['brand'].nunique(),
        'CoupBrandTypeNunique': coupon_txn['brand_type_category'].nunique(),
        'CoupIsMonthEndMean': coupon_txn['is_monthEnd'].mean(),
        'CoupIsMonthStartMean': coupon_txn['is_monthStart'].mean(),
    }
    # mean / std / max / min for every numeric source column. Building the key
    # from the suffix guarantees that '...Min' really holds the minimum and
    # '...Max' the maximum (the selling-price pair used to be swapped).
    for prefix, source_col in _COUPON_NUMERIC_SOURCES.items():
        series = coupon_txn[source_col]
        for suffix, method_name in _COUPON_STAT_METHODS.items():
            row[prefix + suffix] = getattr(series, method_name)()
    return row


def build_coupon_aggregates(pairs, transactions, campaigns, coupon_items):
    """Build one row of historical item statistics per (campaign, coupon) pair.

    For every campaign in `pairs`, only the transactions dated strictly before
    the campaign's start date are considered; for every coupon of that campaign
    they are further restricted to the coupon's items and then summarised.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Needs the columns 'campaign_id' and 'coupon_id'.
    transactions : pandas.DataFrame
        Transaction rows with the engineered columns used by
        `_coupon_history_row` plus 'date' and 'item_id'.
    campaigns : pandas.DataFrame
        Needs the columns 'campaign_id' and 'start_date'.
    coupon_items : mapping
        coupon_id -> collection of item_ids.

    Returns
    -------
    pandas.DataFrame
        Columns are COUPON_AGG_COLUMNS, in that order.

    Raises
    ------
    IndexError
        If a campaign id from `pairs` is absent from `campaigns`.
    KeyError
        If a coupon id from `pairs` is absent from `coupon_items`.
    """
    records = []
    for campaign_id in pairs['campaign_id'].unique():
        start_date = campaigns.loc[campaigns['campaign_id'] == campaign_id, 'start_date'].values[0]
        print("#" * 10 + '{} is {}'.format(campaign_id, start_date) + '#' * 10)
        # The campaign-level filter is computed once and reused for all its coupons.
        history = transactions[transactions['date'] < start_date]
        coupon_ids = pairs.loc[pairs['campaign_id'] == campaign_id, 'coupon_id'].unique()
        for coupon_id in coupon_ids:
            item_ids = coupon_items[coupon_id]
            coupon_txn = history[history['item_id'].isin(item_ids)]
            records.append(
                _coupon_history_row(campaign_id, coupon_id, coupon_txn, len(item_ids))
            )
    # `columns=` fixes the column order independently of the dict key order.
    return pd.DataFrame(records, columns=COUPON_AGG_COLUMNS)


CoupounAggTrain = build_coupon_aggregates(train, c_transaction, campaignData, coupon_item_dict)
CoupounAggTest = build_coupon_aggregates(test, c_transaction, campaignData, coupon_item_dict)

CoupounAggTest.to_csv('CoupounAggTest.csv', index = False)
CoupounAggTrain.to_csv('CoupounAggTrain.csv', index = False)

##########13 is 2013-05-19T00:00:00.000000000##########
##########9 is 2013-03-11T00:00:00.000000000##########
##########8 is 2013-02-16T00:00:00.000000000##########
##########11 is 2013-04-22T00:00:00.000000000##########
##########29 is 2012-10-08T00:00:00.000000000##########
##########30 is 2012-11-19T00:00:00.000000000##########
##########2 is 2012-12-17T00:00:00.000000000##########
##########5 is 2013-01-12T00:00:00.000000000##########
##########12 is 2013-04-22T00:00:00.000000000##########
##########26 is 2012-08-12T00:00:00.000000000##########
##########3 is 2012-12-22T00:00:00.000000000##########
##########4 is 2013-01-07T00:00:00.000000000##########
##########10 is 2013-04-08T00:00:00.000000000##########
##########7 is 2013-02-02T00:00:00.000000000##########
##########28 is 2012-09-16T00:00:00.000000000##########
##########27 is 2012-08-25T00:00:00.000000000##########
##########6 is 2013-01-28T00:00:00.000000000##########
##########1 is 2012-12-12T00:00:00.000000000##########
2