In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import logging
import gc
from multiprocessing import Pool
from functools import partial

price_threshold=10
logging.basicConfig(filename='V2_Multiprocessing_20190126.log', level=logging.INFO)

In [2]:
last_saturday=datetime.datetime.now().date()-datetime.timedelta(days=(datetime.datetime.now().date().weekday()+2))
print(last_saturday)
last_saturday=datetime.date(2019,1,26) # To be changed to the running Tuesday
print(last_saturday)

2019-02-16
2019-01-26


In [3]:
output_folder="/home/jian/celery/DBasket/output/"

output_folder=output_folder+str(last_saturday)+"/"

try:
    os.stat(output_folder)
except:
    os.mkdir(output_folder)

In [4]:
def recursive_file_gen(my_root_dir):
    for root, dirs, files in os.walk(my_root_dir):
        for file in files:
            yield os.path.join(root, file)
            
most_recent_daily_data=list(recursive_file_gen("/home/jian/BigLots/"))
most_recent_daily_data=[x for x in most_recent_daily_data if ("MediaStormDailySales" in x) and (str(last_saturday) in x)]

if len(most_recent_daily_data)==1:
    most_recent_daily_data=most_recent_daily_data[0]
else:
    most_recent_daily_data=np.nan
    logging.info("Last Weekly Daily Data Error", str(datetime.datetime.now()))

In [5]:
most_recent_daily_data

'/home/jian/BigLots/2019_by_weeks/MediaStorm_2019-01-26/MediaStormDailySales20190129-111506-486.txt'

In [6]:
data=pd.read_table(most_recent_daily_data,dtype=str,sep="|")
print("len_sub_class_id:",data['subclass_id'].apply(lambda x: len(x)).unique())
print("len_class_code_id:",data['class_code_id'].apply(lambda x: len(x)).unique())
data['subclass_id']=data['subclass_id'].apply(lambda x: x.zfill(3))
data['product_comb']=data['class_code_id']+"-"+data['subclass_id']

data['subclass_transaction_amt']=data['subclass_transaction_amt'].astype(float)
data['subclass_transaction_units']=data['subclass_transaction_units'].astype(int)
data=data[(data['subclass_transaction_amt']>0) & (data['subclass_transaction_units']>0)]

data['price']=data['subclass_transaction_amt']/data['subclass_transaction_units']


len_sub_class_id: [1 2 3]
len_class_code_id: [5]


In [7]:
taxonomy=pd.read_csv("/home/jian/BigLots/static_files/ProductTaxonomy/MediaStormProductTaxonomy20190201-133832-059.txt",dtype=str,sep="|")
taxonomy['subclass_id']=taxonomy['subclass_id'].apply(lambda x: x.zfill(3))
division_id_id_name=pd.read_table("/home/jian/BigLots/static_files/MediaStorm Data Extract - Division Names.txt",dtype=str,sep="|")
department_id_name=pd.read_table("/home/jian/BigLots/static_files/MediaStorm Data Extract - Department Names.txt",dtype=str,sep="|")
class_id_name=pd.read_table("/home/jian/BigLots/static_files/MediaStorm Data Extract - Class Names.txt",dtype=str,sep="|",encoding ='ISO-8859-1')
# 

data_item_avg_price=data[['product_comb','price']].groupby(['product_comb'])['price'].mean().to_frame().reset_index()
data_item_avg_price=data_item_avg_price.rename(columns={"price":"avg_price"})

data_item_avg_price['class_code_id']=data_item_avg_price['product_comb'].apply(lambda x: x.split("-")[0])
data_item_avg_price['subclass_id']=data_item_avg_price['product_comb'].apply(lambda x: x.split("-")[1])

data_item_avg_price=pd.merge(data_item_avg_price,taxonomy,on=['class_code_id','subclass_id'],how="left")


data_item_avg_price=pd.merge(data_item_avg_price,division_id_id_name,on="division_id",how="left")
data_item_avg_price=pd.merge(data_item_avg_price,department_id_name,on="department_id",how="left")
data_item_avg_price=pd.merge(data_item_avg_price,class_id_name,on="class_code_id",how="left")
data_item_avg_price=data_item_avg_price[['product_comb','avg_price','division_id','division_desc','department_id','department_desc',
                                         'class_code_id','class_code_desc','subclass_id','subclass_desc']]

data_item_avg_price.to_csv(output_folder+"/Price_"+str(last_saturday)+".csv",index=False)

In [8]:
# $10 of all items as in the email on 2019-01-14

product_comb_under_10_set=set(data_item_avg_price[data_item_avg_price['avg_price']<price_threshold]['product_comb'].unique().tolist())
product_comb_10_and_above_list=data_item_avg_price[data_item_avg_price['avg_price']>=10]['product_comb'].unique().tolist()
product_comb_10_and_above_df=data_item_avg_price.sort_values('avg_price',ascending=False)
product_comb_10_and_above_df=product_comb_10_and_above_df[product_comb_10_and_above_df['avg_price']>=10].reset_index()
del product_comb_10_and_above_df['index']

print(data.shape)
data=data[~data['product_comb'].isin(product_comb_under_10_set)]
data_under_10=data[data['product_comb'].isin(product_comb_under_10_set)]
data=data.reset_index()
del data['index']
print(data.shape)
dict_item_avg_price=data_item_avg_price.set_index(['product_comb'])['avg_price'].to_dict()

(11438315, 10)
(926237, 10)


In [9]:
del data['class_code_id']
del data['subclass_id']
data_NonRewards=data[pd.isnull(data['customer_id_hashed'])]
data_Rewards=data[~pd.isnull(data['customer_id_hashed'])]

print("Rewards - Row_RawData:",data_Rewards.shape)
print("Rewards - Unique_id:", len(data_Rewards['customer_id_hashed'].unique()))

print("Non_Rewards - Row_RawData:",data_NonRewards.shape)
print("Non_Rewards - Unique_id:", len(data_NonRewards['customer_id_hashed'].unique()))
# data=data[(data['subclass_transaction_amt']>0) & (data['subclass_transaction_units']>0)] #Already filtered at the beginning

gc.collect()

Rewards - Row_RawData: (598533, 8)
Rewards - Unique_id: 381977
Non_Rewards - Row_RawData: (327704, 8)
Non_Rewards - Unique_id: 1


196

In [50]:
data_Rewards.shape

(598533, 8)

In [11]:
data_Rewards[['location_id','transaction_dt','transaction_id','customer_id_hashed']].drop_duplicates().shape

(420169, 4)

In [12]:
def count_unique(x):
    return len(set(x))

# Get the count of actual transactions

In [13]:
Rewards_transactions_list=data_Rewards.groupby(['location_id','transaction_dt','transaction_id','customer_id_hashed'])['product_comb'].apply(list).to_frame().reset_index().rename(columns={"product_comb":"basket_list"})
Rewards_transactions_units_sales=data_Rewards.groupby(['location_id','transaction_dt','transaction_id','customer_id_hashed'])['subclass_transaction_units','subclass_transaction_amt'].sum().reset_index().rename(columns={"subclass_transaction_units":"total_item_units","subclass_transaction_amt":"total_item_revenue"})
Rewards_transactions=pd.merge(Rewards_transactions_list,Rewards_transactions_units_sales,on=['location_id','transaction_dt','transaction_id','customer_id_hashed'],how="left")
Rewards_transactions['basket_str']=Rewards_transactions['basket_list'].apply(lambda x: sorted(x)).astype(str)
Rewards_transactions['transactin_id_given']=[x for x in range(1,len(Rewards_transactions)+1)]
Rewards_transactions['types']=Rewards_transactions['basket_list'].apply(lambda x: len(x))


In [14]:
Rewards_Trans_by_ID=Rewards_transactions.groupby(['customer_id_hashed'])['transactin_id_given'].count().to_frame().reset_index().rename(columns={"transactin_id_given":"trans_count"})

Rewards_IDCounts_by_Trans=Rewards_Trans_by_ID.groupby(['trans_count'])['customer_id_hashed'].count().to_frame().reset_index()
df_Rewards_IDCounts_by_Trans=Rewards_IDCounts_by_Trans.copy()
df_Rewards_IDCounts_by_Trans['trans_count']=np.where(df_Rewards_IDCounts_by_Trans['trans_count']>=3,"3+",df_Rewards_IDCounts_by_Trans['trans_count'])
df_Rewards_IDCounts_by_Trans['trans_count']=df_Rewards_IDCounts_by_Trans['trans_count'].replace(1,"1").replace(2,"2")
df_Rewards_IDCounts_by_Trans=df_Rewards_IDCounts_by_Trans.groupby(['trans_count'])['customer_id_hashed'].sum().to_frame().reset_index().rename(columns={"customer_id_hashed":"ID_Counts"})
df_Rewards_IDCounts_by_Trans['Label']="Rewards_ID"
df_Rewards_IDCounts_by_Trans=df_Rewards_IDCounts_by_Trans[['Label','trans_count','ID_Counts']]

In [15]:
df_Non_Rewards_Trans_Count=data_NonRewards[['location_id','transaction_dt','transaction_id']].drop_duplicates()

df_output_1_count_by_trans_of_ids_price_10Plus=df_Rewards_IDCounts_by_Trans.append(pd.DataFrame({'Label':"Non_Rewards_Trans",'trans_count':"1+",'ID_Counts':len(df_Non_Rewards_Trans_Count)},index=[3]))
df_output_1_count_by_trans_of_ids_price_10Plus=df_output_1_count_by_trans_of_ids_price_10Plus[['Label','trans_count','ID_Counts']]
df_output_1_count_by_trans_of_ids_price_10Plus

Unnamed: 0,Label,trans_count,ID_Counts
0,Rewards_ID,1,350009
1,Rewards_ID,2,27668
2,Rewards_ID,3+,4300
3,Non_Rewards_Trans,1+,247437


In [16]:
Rewards_data_transactions_list=data_Rewards.groupby(['location_id','transaction_dt','transaction_id','customer_id_hashed'])['product_comb'].apply(list).to_frame().reset_index().rename(columns={"product_comb":"basket_list"})
Rewards_data_transactions_units_sales=data_Rewards.groupby(['location_id','transaction_dt','transaction_id','customer_id_hashed'])['subclass_transaction_units','subclass_transaction_amt'].sum().reset_index().rename(columns={"subclass_transaction_units":"total_item_units","subclass_transaction_amt":"total_item_revenue"})

Rewards_data_transactions=pd.merge(Rewards_data_transactions_list,Rewards_data_transactions_units_sales,on=['location_id','transaction_dt','transaction_id','customer_id_hashed'],how="left")
Rewards_data_transactions['basket_str']=Rewards_data_transactions['basket_list'].apply(lambda x: sorted(x)).astype(str)
Rewards_data_transactions['transactin_id_given']=[x for x in range(1,len(Rewards_data_transactions)+1)]
Rewards_data_transactions['types']=Rewards_data_transactions['basket_list'].apply(lambda x: len(x))

# To save


Rewards_data_transactions=pd.merge(data_Rewards,Rewards_data_transactions,on=["location_id","transaction_dt","transaction_id","customer_id_hashed"],how="left")
apply_func={"subclass_transaction_units":"sum","transactin_id_given":"count","subclass_transaction_amt":"sum"}

single_prod_df=Rewards_data_transactions.groupby(['product_comb'])['subclass_transaction_units','transactin_id_given','subclass_transaction_amt'].agg(apply_func).reset_index().rename(columns={"subclass_transaction_units":"Total_Units","transactin_id_given":"Total_Trans","subclass_transaction_amt":"revenue"})
total_unit=single_prod_df['Total_Units'].sum()
total_trans=len(Rewards_data_transactions)

single_prod_df['prob_unit']=single_prod_df['Total_Units']/total_unit
single_prod_df['prob_tran']=single_prod_df['Total_Trans']/total_trans

dict_single_prod_unit=single_prod_df.set_index(['product_comb'])['prob_unit'].to_dict()
dict_single_prod_tran=single_prod_df.set_index(['product_comb'])['prob_tran'].to_dict()


In [17]:
Rewards_Trans_by_ID=Rewards_Trans_by_ID.rename(columns={"trans_count":"trans_count_by_id"})
Rewards_data_transactions=pd.merge(Rewards_data_transactions,Rewards_Trans_by_ID,on="customer_id_hashed",how="left")
Rewards_data_transactions['trans_count_by_id']=np.where(Rewards_data_transactions['trans_count_by_id']>=3,"3+",Rewards_data_transactions['trans_count_by_id'])
Rewards_data_transactions['trans_count_by_id']=Rewards_data_transactions['trans_count_by_id'].replace(1,"1").replace(2,"2")


In [18]:
Rewards_data_transactions.shape

(598533, 15)

In [19]:
df_output_2_1_count_by_trans_of_ids_price_10Plus=Rewards_data_transactions.groupby(['trans_count_by_id','types'])['transactin_id_given'].apply(count_unique).reset_index().rename(columns={"transactin_id_given":"Transaction_Count"})
df_output_2_1_count_by_trans_of_ids_price_10Plus_actual=df_output_2_1_count_by_trans_of_ids_price_10Plus.copy()
df_output_2_1_count_by_trans_of_ids_price_10Plus['types']=np.where(df_output_2_1_count_by_trans_of_ids_price_10Plus['types']>=6,"6+",df_output_2_1_count_by_trans_of_ids_price_10Plus['types'])

df_output_2_1_count_by_trans_of_ids_price_10Plus=df_output_2_1_count_by_trans_of_ids_price_10Plus.groupby(['trans_count_by_id','types'])['Transaction_Count'].sum().reset_index()
df_output_2_1_count_by_trans_of_ids_price_10Plus=df_output_2_1_count_by_trans_of_ids_price_10Plus.pivot_table(index="types",columns="trans_count_by_id",values="Transaction_Count").reset_index().rename(columns={"types":"item_types"})

df_output_2_1_count_by_trans_of_ids_price_10Plus=df_output_2_1_count_by_trans_of_ids_price_10Plus.sort_values("item_types")
df_output_2_1_count_by_trans_of_ids_price_10Plus['Label']="Rewards"
df_output_2_1_count_by_trans_of_ids_price_10Plus

trans_count_by_id,item_types,1,2,3+,Label
0,1,258806,38048,9977,Rewards
1,2,61944,10920,2994,Rewards
2,3,18449,3740,1023,Rewards
3,4,6256,1378,406,Rewards
4,5,2405,663,215,Rewards
5,6+,2149,587,209,Rewards


In [20]:
df_output_2_2_count_by_trans_of_ids_price_10Plus=data_NonRewards.groupby(['location_id','transaction_dt','transaction_id'])['product_comb'].apply(list).reset_index()
df_output_2_2_count_by_trans_of_ids_price_10Plus['item_types']=df_output_2_2_count_by_trans_of_ids_price_10Plus['product_comb'].apply(len)
df_output_2_2_count_by_trans_of_ids_price_10Plus_actual=df_output_2_2_count_by_trans_of_ids_price_10Plus.groupby(['item_types'])['transaction_id'].count().to_frame().reset_index()
df_output_2_2_count_by_trans_of_ids_price_10Plus=df_output_2_2_count_by_trans_of_ids_price_10Plus_actual.copy()
df_output_2_2_count_by_trans_of_ids_price_10Plus['item_types']=np.where(df_output_2_2_count_by_trans_of_ids_price_10Plus['item_types']>=6,"6+",df_output_2_2_count_by_trans_of_ids_price_10Plus['item_types'])
df_output_2_2_count_by_trans_of_ids_price_10Plus=df_output_2_2_count_by_trans_of_ids_price_10Plus.groupby(['item_types'])['transaction_id'].sum().to_frame().reset_index().rename(columns={"transaction_id":"Transaction_Count"})
df_output_2_2_count_by_trans_of_ids_price_10Plus['Label']="Non_Rewards"
df_output_2_2_count_by_trans_of_ids_price_10Plus

Unnamed: 0,item_types,Transaction_Count,Label
0,1,195298,Non_Rewards
1,2,36310,Non_Rewards
2,3,9886,Non_Rewards
3,4,3312,Non_Rewards
4,5,1265,Non_Rewards
5,6+,1366,Non_Rewards


In [21]:
data['customer_id_hashed']=data['customer_id_hashed'].fillna("nan")
df_output_3_count_by_trans_of_ids_price_10Plus=data.groupby(['location_id','transaction_dt','transaction_id','customer_id_hashed'])['product_comb'].apply(list).reset_index()
df_output_3_count_by_trans_of_ids_price_10Plus['item_types']=df_output_3_count_by_trans_of_ids_price_10Plus['product_comb'].apply(len)
df_output_3_count_by_trans_of_ids_price_10Plus_actual=df_output_3_count_by_trans_of_ids_price_10Plus.groupby(['item_types'])['transaction_id'].count().to_frame().reset_index()
df_output_3_count_by_trans_of_ids_price_10Plus=df_output_3_count_by_trans_of_ids_price_10Plus_actual.copy()
df_output_3_count_by_trans_of_ids_price_10Plus['item_types']=np.where(df_output_3_count_by_trans_of_ids_price_10Plus['item_types']>=6,"6+",df_output_3_count_by_trans_of_ids_price_10Plus['item_types'])
df_output_3_count_by_trans_of_ids_price_10Plus=df_output_3_count_by_trans_of_ids_price_10Plus.groupby(['item_types'])['transaction_id'].sum().to_frame().reset_index().rename(columns={"transaction_id":"Transaction_Count"})
df_output_3_count_by_trans_of_ids_price_10Plus['Label']="Rewards_and_NonRewards"
df_output_3_count_by_trans_of_ids_price_10Plus


Unnamed: 0,item_types,Transaction_Count,Label
0,1,502129,Rewards_and_NonRewards
1,2,112168,Rewards_and_NonRewards
2,3,33098,Rewards_and_NonRewards
3,4,11352,Rewards_and_NonRewards
4,5,4548,Rewards_and_NonRewards
5,6+,4311,Rewards_and_NonRewards


In [22]:
writer=pd.ExcelWriter(output_folder+"BL_Transaction_Summary_JL_"+str(datetime.datetime.now().date())+".xlsx",engine="xlsxwriter")
df_output_1_count_by_trans_of_ids_price_10Plus.to_excel(writer,"summary_1_transactions_ids",index=False)
df_output_2_1_count_by_trans_of_ids_price_10Plus.to_excel(writer,"summary_2_1_Rewards_trans_items",index=False)
df_output_2_2_count_by_trans_of_ids_price_10Plus.to_excel(writer,"summary_2_2_NonRew_trans_items",index=False)
df_output_3_count_by_trans_of_ids_price_10Plus.to_excel(writer,"summary_3_all_transactions",index=False)
writer.save()

del data
gc.collect()

102

In [23]:
unique_id_df=Rewards_data_transactions.groupby(['product_comb'])['customer_id_hashed'].apply(count_unique).to_frame().reset_index().rename(columns={"customer_id_hashed":"unique_ids"})
single_prod_df=pd.merge(single_prod_df,unique_id_df,on="product_comb")

In [24]:
data_basket=Rewards_data_transactions.groupby(['basket_str'])['total_item_units','total_item_revenue','transactin_id_given'].agg(
            {"total_item_units":"sum","total_item_revenue":"sum","transactin_id_given":"count"}).reset_index().rename(columns={"transactin_id_given":"trans_count"})
data_basket['basket_list']=data_basket['basket_str'].apply(eval)
data_basket['item_types']=data_basket['basket_list'].apply(len)
data_basket=data_basket.sort_values(['item_types','basket_str'])

data_basket=data_basket.reset_index()
del data_basket['index']

unique_id_by_basket=Rewards_data_transactions.groupby(['basket_str'])['customer_id_hashed'].apply(lambda x: len(set(x))).to_frame().reset_index().rename(columns={'customer_id_hashed':"unique_ids"})
data_basket=pd.merge(data_basket,unique_id_by_basket,on="basket_str",how="left")

In [25]:
# data_basket.to_csv("/home/jian/Projects/Big_Lots/Analysis/2018_Q4/Product_Basket/data_for_freq_dist_JL_"+str(datetime.datetime.now().date())+".csv",index=False)

In [26]:
from itertools import combinations
def findsubsets(total_set,item_counts):
    return list(set(combinations(total_set, item_counts)))

for i in range(2,5): # Up to 4 only
    locals()['set_'+str(i)+"_comb"]=[]
    output_1_basket_str_list_i=sorted(data_basket[data_basket['item_types']==i]['basket_str'].unique().tolist())
    output_2_basket_str_list_i_plus=[]
    basket_str_list_i_plus=data_basket[data_basket['item_types']>i]['basket_str'].unique().tolist()
    
    
    for set_str in basket_str_list_i_plus:
        set_list=eval(set_str)
        output_2_basket_str_list_i_plus=list(set(output_2_basket_str_list_i_plus+[str(list(x)) for x in findsubsets(set_list,i)]))
        
    locals()['set_'+str(i)+"_comb"]=sorted(list(set(output_1_basket_str_list_i+output_2_basket_str_list_i_plus)))
    print(i, datetime.datetime.now())
    print(len(locals()['set_'+str(i)+"_comb"]))
    

2 2019-02-22 10:37:18.165541
28610
3 2019-02-22 10:38:38.550591
177381
4 2019-02-22 10:39:21.164348
517779


In [27]:
'''
basket_transaction_2_plus=data_basket[data_basket['item_types']>=2][['basket_str','trans_count']]
basket_transaction_3_plus=data_basket[data_basket['item_types']>=3][['basket_str','trans_count']]
basket_transaction_4_plus=data_basket[data_basket['item_types']>=4][['basket_str','trans_count']]
basket_transaction_5_plus=data_basket[data_basket['item_types']>=5][['basket_str','trans_count']]
'''

"\nbasket_transaction_2_plus=data_basket[data_basket['item_types']>=2][['basket_str','trans_count']]\nbasket_transaction_3_plus=data_basket[data_basket['item_types']>=3][['basket_str','trans_count']]\nbasket_transaction_4_plus=data_basket[data_basket['item_types']>=4][['basket_str','trans_count']]\nbasket_transaction_5_plus=data_basket[data_basket['item_types']>=5][['basket_str','trans_count']]\n"

In [28]:
list_set_all=set_2_comb+set_3_comb+set_4_comb
total_len=len(list_set_all)
total_len

723770

In [29]:
data_basket.head(2)

Unnamed: 0,basket_str,total_item_units,total_item_revenue,trans_count,basket_list,item_types,unique_ids
0,['11003-003'],51994,625086.84,39403,[11003-003],1,38751
1,['11014-019'],4,44.65,4,[11014-019],1,4


In [30]:
processors=30

interval=int(np.floor(total_len/processors))
# list_set_all_subset_0=list_set_all_subset_[:interval]
# 0 to 9, 10 in total
all_list_of_input=[]
for i in range(processors-1): 
    #1 to 9
    locals()['list_set_all_subset_'+str(i)]=list_set_all[interval*i:interval*(i+1)]
    all_list_of_input=all_list_of_input+[locals()['list_set_all_subset_'+str(i)]]
locals()['list_set_all_subset_'+str(processors-1)]=list_set_all[interval*(processors-1):]
all_list_of_input=all_list_of_input+[locals()['list_set_all_subset_'+str(processors-1)]]

In [31]:
Rewards_data_transactions.shape

(598533, 15)

In [32]:
def getting_BAI_items(list_set_subset_i):
    i_counter=0
    dict_basket_support_trans={}
    dict_basket_support_items={}
    dict_basket_BAI_trans={}
    dict_basket_BAI_items={}
    dict_basket_unique_ids={}
    dict_basket_revenue={} #revenue only for the selected subset of items
    for basket_n in list_set_subset_i:
        basket_n_list=eval(basket_n)
        len_items=len(basket_n_list)
        
        df=Rewards_data_transactions[Rewards_data_transactions['product_comb'].isin(basket_n_list)][['basket_str','transactin_id_given','subclass_transaction_units','customer_id_hashed','subclass_transaction_amt']]
        
        trans_denominator=1
        items_denominator=1
        
        for k in range(len_items):
            globals()['basket_item_'+str(k)]=basket_n_list[k]
            df=df[df['basket_str'].apply(lambda x: globals()['basket_item_'+str(k)] in x)]
            trans_denominator=trans_denominator*dict_single_prod_tran[globals()['basket_item_'+str(k)]]
            items_denominator=items_denominator*dict_single_prod_unit[globals()['basket_item_'+str(k)]]

        trans_basket=len(df['transactin_id_given'].unique())
        items_basket=df['subclass_transaction_units'].sum()
        unique_ids_basket=len(df['customer_id_hashed'].unique())
        revenue_bakset=df['subclass_transaction_amt'].sum()
        
        dict_basket_support_trans.update({basket_n:trans_basket})
        dict_basket_support_items.update({basket_n:items_basket})
        dict_basket_unique_ids.update({basket_n:unique_ids_basket})
        dict_basket_revenue.update({basket_n:revenue_bakset})

        BAI_basket_trans=(trans_basket/total_trans)/trans_denominator*100
        BAI_basket_items=(items_basket/total_unit)/items_denominator*100
        
        dict_basket_BAI_trans.update({basket_n:BAI_basket_trans})
        dict_basket_BAI_items.update({basket_n:BAI_basket_items})
        
        i_counter+=1
        if i_counter%1000==10:
            logging.info(str(datetime.datetime.now())+"|"+str(i_counter))
    results_json={}
    results_json.update({"dict_basket_support_trans":dict_basket_support_trans})
    results_json.update({"dict_basket_support_items":dict_basket_support_items})
    results_json.update({"dict_basket_BAI_trans":dict_basket_BAI_trans})
    results_json.update({"dict_basket_BAI_items":dict_basket_BAI_items})
    results_json.update({"dict_basket_unique_ids":dict_basket_unique_ids})
    results_json.update({"dict_basket_revenue":dict_basket_revenue})
    
    return results_json

In [33]:
from multiprocessing import Pool

result_dict_basket_support_trans={}
result_dict_basket_support_items={}
result_dict_basket_BAI_trans={}
result_dict_basket_BAI_items={}
result_dict_basket_unique_ids={}
result_dict_basket_revenue={}

if __name__ == '__main__':
    p = Pool(processors)
    result=p.map(getting_BAI_items, all_list_of_input)
    for res in result:
        if res is not None:
            result_dict_basket_support_trans.update(res["dict_basket_support_trans"])
            result_dict_basket_support_items.update(res["dict_basket_support_items"])
            result_dict_basket_BAI_trans.update(res["dict_basket_BAI_trans"])
            result_dict_basket_BAI_items.update(res["dict_basket_BAI_items"])
            result_dict_basket_unique_ids.update(res['dict_basket_unique_ids'])
            result_dict_basket_revenue.update(res['dict_basket_revenue'])
    p.close()
    p.join()
    

In [34]:
all_list_of_input[0][0]

"['11003-003', '11014-019']"

In [35]:
output_1=data_basket[data_basket['item_types']==1]
output_2=data_basket[data_basket['item_types'].isin([2,3,4])]
output_3=data_basket[data_basket['item_types']>=5]

output_1['BAI_trans']=100
output_1['BAI_items']=100

output_2['BAI_trans']=output_2['basket_str'].apply(lambda x: result_dict_basket_BAI_trans[x])
output_2['BAI_items']=output_2['basket_str'].apply(lambda x: result_dict_basket_BAI_items[x])

output_basket=output_1.append(output_2).append(output_3) # To add those only in multiple item trans
#E.g. [a,b,c,d] [a,c] doesn't exsit


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if

In [36]:
single_prod_df.head(2)

Unnamed: 0,product_comb,Total_Units,Total_Trans,revenue,prob_unit,prob_tran,unique_ids
0,11003-003,65971,50434,787479.59,0.086028,0.084263,49499
1,11014-019,5,5,56.4,7e-06,8e-06,5


In [37]:
single_prod_df['BAI_Trans']=100
single_prod_df['BAI_Items']=100


In [38]:
df1=pd.DataFrame(result_dict_basket_support_trans,index=['Total_Trans']).T.reset_index().rename(columns={"index":"basket_str"})
df2=pd.DataFrame(result_dict_basket_support_items,index=['Total_Units']).T.reset_index().rename(columns={"index":"basket_str"})
df3=pd.DataFrame(result_dict_basket_BAI_trans,index=['BAI_Trans']).T.reset_index().rename(columns={"index":"basket_str"})
df4=pd.DataFrame(result_dict_basket_BAI_items,index=['BAI_Items']).T.reset_index().rename(columns={"index":"basket_str"})
df5=pd.DataFrame(result_dict_basket_unique_ids,index=['unique_ids']).T.reset_index().rename(columns={"index":"basket_str"})
df6=pd.DataFrame(result_dict_basket_revenue,index=['revenue']).T.reset_index().rename(columns={"index":"basket_str"})

output_all_234_available=pd.merge(df1,df2,on='basket_str')
output_all_234_available=pd.merge(df3,output_all_234_available,on='basket_str')
output_all_234_available=pd.merge(df4,output_all_234_available,on='basket_str')
output_all_234_available=pd.merge(df5,output_all_234_available,on='basket_str')
output_all_234_available=pd.merge(df6,output_all_234_available,on='basket_str')

In [39]:
single_prod_df.head(2)

Unnamed: 0,product_comb,Total_Units,Total_Trans,revenue,prob_unit,prob_tran,unique_ids,BAI_Trans,BAI_Items
0,11003-003,65971,50434,787479.59,0.086028,0.084263,49499,100,100
1,11014-019,5,5,56.4,7e-06,8e-06,5,100,100


In [40]:
single_prod_df['basket_str']="['"+single_prod_df['product_comb']+"']"
del single_prod_df['product_comb']

output_all_1234_available=single_prod_df.append(output_all_234_available)
output_all_1234_available['basket_list']=output_all_1234_available['basket_str'].apply(eval)
output_all_1234_available['item_types']=output_all_1234_available['basket_list'].apply(len)
output_all_1234_available=output_all_1234_available.sort_values('item_types',ascending=True)

# All posibble from the shopped large basket 1-5
output_3=data_basket[data_basket['item_types']>=5]
output_3=output_3.rename(columns={"trans_count":"Total_Trans","total_item_units":"Total_Units","total_item_revenue":"revenue"})

output_all_1234_available=output_all_1234_available.append(output_3) #Appended >5

# Step 2

In [41]:
# Apply the BAI of items to the baskets

len(result_dict_basket_BAI_items)
data_item_avg_price_dict=data_item_avg_price.set_index(["product_comb"]).to_dict()['avg_price']
len(data_item_avg_price_dict)

1604

In [42]:
data_basket.head(2)

Unnamed: 0,basket_str,total_item_units,total_item_revenue,trans_count,basket_list,item_types,unique_ids
0,['11003-003'],51994,625086.84,39403,[11003-003],1,38751
1,['11014-019'],4,44.65,4,[11014-019],1,4


In [45]:
data_basket_item_2=data_basket[data_basket['item_types']==2]
data_basket_item_3=data_basket[data_basket['item_types']==3]
data_basket_item_4=data_basket[data_basket['item_types']==4]

In [44]:
data_basket_wide_actual=data_basket[data_basket['item_types']>1][['basket_list']]
output_df=pd.DataFrame()
def get_actual_wide_products(input_x):

    df=pd.DataFrame({"items":input_x},index=[ind for ind in range(len(input_x))])
    df['class']=df['items'].apply(lambda x: x.split("-")[0])
    df['price']=df['items'].apply(lambda x: dict_item_avg_price[x])
    df=df.sort_values("price",ascending=False).reset_index()
    del df['index']
    
    champion_item=df['items'].tolist()[0]
    champion_class=champion_item.split("-")[0]
    
    df=df[df['items']!=champion_item]
    df['identifier']=df['class'].apply(lambda x: str(x==champion_class)).replace("True","Complementary").replace("False","Secondary")
    df=df.sort_values(['identifier','price'],ascending=[True,False]).head(4).reset_index()
    del df['index']
    
    # empty_df=pd.DataFrame(columns=("items","price",'identifier'),index=[0])
    
    df_C=df[df['identifier']=="Complementary"]
    C_1=np.nan
    C_2=np.nan
    C_3=np.nan
    C_4=np.nan
    for i in range(len())
    
    return globals()['output_df']

SyntaxError: invalid syntax (<ipython-input-44-1f722d3c1a88>, line 26)

In [67]:
len(Rewards_data_transactions_copy['transactin_id_given'].unique())

420169

In [68]:
Rewards_data_transactions['transactin_id_given'].max()

420169

In [89]:
Rewards_data_transactions.head(2)

Unnamed: 0,location_id,transaction_dt,transaction_id,customer_id_hashed,subclass_transaction_units,subclass_transaction_amt,product_comb,price,basket_list,total_item_units,total_item_revenue,basket_str,transactin_id_given,types,trans_count_by_id
0,4517,2019-01-26,1966,61f240fee1095d40aeee11bc7b11b0441f30d5a6c587dc...,1,30.0,34503-004,30.0,"[34503-004, 34504-003, 35078-002]",4,85.0,"['34503-004', '34504-003', '35078-002']",253265,3,2
1,5091,2019-01-26,9272,6f8e01cff2fe069d3e4b345c6b54a7b0c4a4bda408670a...,1,7.78,11003-003,7.78,[11003-003],1,7.78,['11003-003'],304597,1,1


In [87]:
data_NonRewards.shape

(327704, 8)

In [83]:
Rewards_data_transactions_copy=Rewards_data_transactions[['transactin_id_given','product_comb']]
Rewards_data_transactions_copy['avg_price']=Rewards_data_transactions_copy['product_comb'].apply(lambda x: dict_item_avg_price[x])
Rewards_data_transactions_copy['class_code_id']=Rewards_data_transactions_copy['product_comb'].apply(lambda x: x.split("-")[0])
actual_basket_from_trans=pd.DataFrame()

i_counter=0
for trans_id_given,group in Rewards_data_transactions_copy.groupby(['transactin_id_given']):
    group=group.sort_values("avg_price",ascending=False).reset_index()
    del group['index']
    group=group.head(5)
    champion_item=group['product_comb'][0]
    champion_class=champion_item.split("-")[0]
    
    group=group[group['product_comb']!=champion_item]
    complementary_head4=group[group['class_code_id']==champion_class].head(4)
    secondary_head4=group[group['class_code_id']!=champion_class].head(4)
    
    del complementary_head4['class_code_id']
    del secondary_head4['class_code_id']
    
    empty_df=pd.DataFrame(columns=["transactin_id_given","product_comb","avg_price"],index=[0])
    if len(complementary_head4)<4:
        len_to_fill_C=4-len(complementary_head4)
        complementary_head4=complementary_head4.append([empty_df]*len_to_fill_C,ignore_index=True)
    complementary_head4['catagory']="Complementary"
    complementary_head4=complementary_head4.reset_index()
    complementary_head4['index']=[str(x+1) for x in range(4)]
    complementary_head4['catagory']=complementary_head4['catagory']+"_"+complementary_head4['index']
    complementary_head4=complementary_head4[['catagory','product_comb']]
    
    if len(secondary_head4)<4:
        len_to_fill_S=4-len(secondary_head4)
        secondary_head4=secondary_head4.append([empty_df]*len_to_fill_S,ignore_index=True)
    secondary_head4['catagory']="Secondary"
    secondary_head4=secondary_head4.reset_index()
    secondary_head4['index']=[str(x+1) for x in range(4)]
    secondary_head4['catagory']=secondary_head4['catagory']+"_"+secondary_head4['index']
    secondary_head4=secondary_head4[['catagory','product_comb']]
    
    champion_df=pd.DataFrame({"product_comb":champion_item,"catagory":"Champion"},index=[0])
    
    df=champion_df.append(complementary_head4).append(secondary_head4)
    df.index=df['catagory']
    del df['catagory']
    df=df.T
    df['transactin_id_given']=trans_id_given
    actual_basket_from_trans=actual_basket_from_trans.append(df)
    
    i_counter+=1
    if i_counter%4000==10:
        print(i_counter,datetime.datetime.now())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


10 2019-02-22 15:48:35.474611


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


4010 2019-02-22 15:50:20.508004
8010 2019-02-22 15:52:16.599025


KeyboardInterrupt: 

In [None]:
group

In [76]:
secondary_head4.append([empty_df]*0,ignore_index=True)

IndexError: list index out of range

In [46]:
Rewards_data_transactions.head(2)

Unnamed: 0,location_id,transaction_dt,transaction_id,customer_id_hashed,subclass_transaction_units,subclass_transaction_amt,product_comb,price,basket_list,total_item_units,total_item_revenue,basket_str,transactin_id_given,types,trans_count_by_id
0,4517,2019-01-26,1966,61f240fee1095d40aeee11bc7b11b0441f30d5a6c587dc...,1,30.0,34503-004,30.0,"[34503-004, 34504-003, 35078-002]",4,85.0,"['34503-004', '34504-003', '35078-002']",253265,3,2
1,5091,2019-01-26,9272,6f8e01cff2fe069d3e4b345c6b54a7b0c4a4bda408670a...,1,7.78,11003-003,7.78,[11003-003],1,7.78,['11003-003'],304597,1,1


In [None]:
'''
data_basket_wide_actual=data_basket[data_basket['item_types']>1][['basket_list']]
output_df=pd.DataFrame()
def get_actual_wide_products(input_x):

    df=pd.DataFrame({"items":input_x},index=[ind for ind in range(len(input_x))])
    df['class']=df['items'].apply(lambda x: x.split("-")[0])
    df['price']=df['items'].apply(lambda x: dict_item_avg_price[x])
    df=df.sort_values("price",ascending=False).reset_index()
    del df['index']
    
    champion_item=df['items'].tolist()[0]
    champion_class=champion_item.split("-")[0]
    
    df=df[df['items']!=champion_item]
    df['identifier']=df['class'].apply(lambda x: str(x==champion_class)).replace("True","Complementary").replace("False","Secondary")
    df=df.sort_values(['identifier','price'],ascending=[True,False]).head(4).reset_index()
    del df['index']
    
    # empty_df=pd.DataFrame(columns=("items","price",'identifier'),index=[0])
    
    df_C=df[df['identifier']=="Complementary"]
    df_S=df[df['identifier']=="Secondary"]
    C_1=np.nan
    C_2=np.nan
    C_3=np.nan
    C_4=np.nan
    
    S_1=np.nan
    S_2=np.nan
    S_3=np.nan
    S_4=np.nan
    
    for i in range(len(df_C)):
        locals()["C_"+str(i+1)]=df_C['items'][i]
    for i in range(len(df_S)):
        locals()["S_"+str(i+1)]=df_S['items'][i]
    return champion_item,C_1,C_2,C_3,C_4,S_1,S_2,S_3,S_4
'''

In [None]:
data_basket_wide_actual=data_basket[data_basket['item_types']>1][['basket_list']]
len(data_basket_wide_actual)

In [None]:
input_a=data_basket[['basket_list']][2112:2222]
print(datetime.datetime.now())
# input_a['Campion'],input_a['Complementary_SubClass_1'],input_a['Complementary_SubClass_2'],input_a['Complementary_SubClass_3'],input_a['Complementary_SubClass_4'],input_a['Secondary_SubClass_1'],input_a['Secondary_SubClass_2'],input_a['Secondary_SubClass_3'],input_a['Secondary_SubClass_4']
df_x=input_a['basket_list'].apply(lambda x: get_actual_wide_products(x))
print(datetime.datetime.now())



In [146]:
pd.DataFrame(df_x)

Unnamed: 0,basket_list
2112,"(31001-008, nan, nan, nan, nan, nan, nan, nan,..."
2113,"(21002-001, nan, nan, nan, nan, nan, nan, nan,..."
2114,"(21002-001, nan, nan, nan, nan, nan, nan, nan,..."
2115,"(21002-001, nan, nan, nan, nan, nan, nan, nan,..."
2116,"(21002-001, nan, nan, nan, nan, nan, nan, nan,..."
2117,"(21002-001, nan, nan, nan, nan, nan, nan, nan,..."
2118,"(21002-001, nan, nan, nan, nan, nan, nan, nan,..."
2119,"(21002-001, nan, nan, nan, nan, nan, nan, nan,..."
2120,"(21002-001, nan, nan, nan, nan, nan, nan, nan,..."
2121,"(21002-001, nan, nan, nan, nan, nan, nan, nan,..."


In [141]:
test=pd.DataFrame({"A":["a","b"]},index=[0,1])
def test_input(x):
    o_1=x*2
    o_2=x.upper()
    return o_1,o_2
test['B'],test['C']=test['A'].apply(lambda x: test_input(x))
test

Unnamed: 0,A,B,C
0,a,aa,bb
1,b,A,B


In [51]:
def brewak_basket_to_top_5(input_x):
    input_x=eval(input_x)
    len_input_x=len(input_x)
    df=pd.DataFrame({"subcalss_item":input_x},index=range(len(len_input_x)))
    df['price']=df['subcalss_item'].apply(lambda x: data_item_avg_price_dict[x])
    df=df.sort_values("price",ascending=False).head(5)
    
    chapion_subclass=df['subcalss_item'].tolist()[0]
    df['class_code_id']=df['subcalss_item'].apply(lambda x: x.split({"-"}[0]))
    chapion_subclass_id=df['class_code_id'].tolist(0)
    
    complementary_subclass_df=df[df['class_code_id']==chapion_class_id]
    
    if len(complementary_subclass_df)>0:
        complementary_subclass_df_1=complementary_subclass_df.head(3)
        complementary_subclass_df_2=

In [56]:
data_basket.shape

(43101, 7)

In [48]:
data_item_avg_price.head(2)

Unnamed: 0,product_comb,avg_price,division_id,division_desc,department_id,department_desc,class_code_id,class_code_desc,subclass_id,subclass_desc
0,11001-001,2.570352,1,FOOD,108,CANDYSNACK,11001,CANDY,1,Chocolate Bagged
1,11001-002,1.809745,1,FOOD,108,CANDYSNACK,11001,CANDY,2,Non-Chocolate Bagged


In [53]:
Rewards_data_transactions.head(2)

Unnamed: 0,location_id,transaction_dt,transaction_id,customer_id_hashed,subclass_transaction_units,subclass_transaction_amt,product_comb,price,basket_list,total_item_units,total_item_revenue,basket_str,transactin_id_given,types,trans_count_by_id
0,5214,2019-02-03,6854,bd1b066c07df0780663dd52cdb8f8ded3a1d23654602e9...,2,18.0,36406-004,9.0,[36406-004],2,18.0,['36406-004'],302206,1,2
1,1372,2019-02-09,7668,c1679177fcec7e56bf32ba9c1593bea4a12fa932238169...,2,20.0,36406-004,10.0,"[36406-004, 35080-010]",3,35.0,"['35080-010', '36406-004']",42354,2,1


In [54]:
Rewards_data_transactions['transactin_id_given'].max()

370493

In [26]:
writer=pd.ExcelWriter(output_folder+'BL_DBasket_Version2_JL_'+str(datetime.datetime.now().date())+".xlsx",engine="xlsxwriter")
# output=output[['basket_str','basket_list','BAI_trans','BAI_units','item_types','total_item_revenue','total_item_units','trans_count','unique_ids','price_list']]
output_all_1234_available=output_all_1234_available[['basket_list','BAI_Trans','BAI_Items','item_types','revenue','Total_Units','Total_Trans','unique_ids']]
output_all_1234_available=output_all_1234_available.sort_values(['item_types','BAI_Trans'],ascending=[True,False])
output_all_1234_available.to_excel(writer,"BAI_including_subsets",index=False)
data_basket.to_excel(writer,"basket_shopped_together",index=False)
writer.save()

logging.info("Done: "+str(datetime.datetime.now()))

In [27]:
output_all_1234_available.to_csv(output_folder+"BL_DBasket_Version2_BAI_output_JL_"+str(datetime.datetime.now().date())+".csv",index=False)
data_basket.to_csv(output_folder+"BL_DBasket_Version2_actual_whole_baskets_output_JL_"+str(datetime.datetime.now().date())+".csv",index=False)



In [33]:
output_all_1234_available.shape

(1242554, 8)

In [37]:
output_all_1234_available[output_all_1234_available['item_types']==1].to_csv(output_folder+"BL_DBasket_Version2_BAI_output_Item_1_JL_"+str(datetime.datetime.now().date())+".csv",index=False)
output_all_1234_available[output_all_1234_available['item_types']==2].to_csv(output_folder+"BL_DBasket_Version2_BAI_output_Item_2_JL_"+str(datetime.datetime.now().date())+".csv",index=False)
output_all_1234_available[output_all_1234_available['item_types']==3].to_csv(output_folder+"BL_DBasket_Version2_BAI_output_Item_3_JL_"+str(datetime.datetime.now().date())+".csv",index=False)
output_all_1234_available[output_all_1234_available['item_types']==4].to_csv(output_folder+"BL_DBasket_Version2_BAI_output_Item_4_JL_"+str(datetime.datetime.now().date())+".csv",index=False)
output_all_1234_available[output_all_1234_available['item_types']>=5].to_csv(output_folder+"BL_DBasket_Version2_BAI_output_Item_5Plus_JL_"+str(datetime.datetime.now().date())+".csv",index=False)


