In [20]:
import pandas as pd
import numpy as np
from datetime import datetime

In [21]:
# Load the transactions from the pipe-delimited export. index_col=False tells
# pandas not to use the first column as the row index.
data = pd.read_csv(
    'data_modified.csv',
    sep='|',
    index_col=False,
)
data.head()

Unnamed: 0,transaction_id,date,product_code,qty
0,0,2016-06-06,153,1.0
1,1,2016-06-07,153,1.0
2,2,2016-06-27,153,1.0
3,3,2016-06-28,153,1.0
4,4,2016-07-04,153,1.0


In [22]:
def getWeekNum(s):
    """Return the ISO year and ISO week of a date string as one integer.

    Args:
        s: Date formatted as 'YYYY-MM-DD'.

    Returns:
        int: ISO year followed by the two-digit ISO week number,
        e.g. '2019-03-15' -> 201911.

    Raises:
        ValueError: If ``s`` does not match the '%Y-%m-%d' format.
    """
    iso_year, iso_week = datetime.strptime(s, '%Y-%m-%d').isocalendar()[:2]
    # The week number never exceeds two digits, so shifting the year by two
    # decimal places gives the same result as concatenating year and
    # zero-padded week.
    return iso_year * 100 + iso_week

In [23]:
# Add placeholder columns (filled in by the next cell) right after the date
# column. DataFrame.insert raises ValueError when the column already exists,
# so existing columns are skipped to keep this cell safe to re-run.
for position, column in enumerate(('week', 'family', 'sub_family'), start=2):
    if column not in data.columns:
        data.insert(position, column, 0)

In [24]:
# Derive week, family and sub_family for every row with column-wise operations
# instead of a per-row .loc loop: same values, but far faster and independent
# of the index labels being exactly 0..n-1.
data['week'] = data['date'].map(getWeekNum)       # ISO year + week, e.g. 201623
data['family'] = data['product_code'] // 100      # e.g. 153 -> 1
data['sub_family'] = data['product_code'] // 10   # e.g. 153 -> 15
data.head()

Unnamed: 0,transaction_id,date,week,family,sub_family,product_code,qty
0,0,2016-06-06,201623,1,15,153,1.0
1,1,2016-06-07,201623,1,15,153,1.0
2,2,2016-06-27,201626,1,15,153,1.0
3,3,2016-06-28,201626,1,15,153,1.0
4,4,2016-07-04,201627,1,15,153,1.0


In [25]:
# Build the groupby objects for every (period, aggregation level) pair. These
# are lazy groupings: nothing is summed until a later cell aggregates 'qty'.
grouped_day_product_code, grouped_day_family, grouped_day_sub_family = (
    data.groupby(['date', level])
    for level in ('product_code', 'family', 'sub_family')
)

grouped_week_product_code, grouped_week_family, grouped_week_sub_family = (
    data.groupby(['week', level])
    for level in ('product_code', 'family', 'sub_family')
)

In [26]:
# Row labels: every distinct date, in ascending order.
unique_dates = np.sort(data.date.unique())

# Column labels: product codes, then families (code // 100), then
# sub-families (code // 10), each sorted numerically and rendered as strings.
unique_products = np.char.mod('%d', np.sort(data.product_code.unique()))
unique_fam = np.char.mod('%d', np.sort((data.product_code // 100).unique()))
unique_sub_fam = np.char.mod('%d', np.sort((data.product_code // 10).unique()))

# Empty (all-NaN) frame: one row per date, one column per aggregation label.
resultByDay = pd.DataFrame(
    columns=np.concatenate((unique_products, unique_fam, unique_sub_fam)),
    index=unique_dates,
)

In [73]:
# Fill the daily table with the summed quantities for each aggregation level.
# The groupby key already is the (date, code) pair, so 'qty' is aggregated once
# per level and the labels are read from the key instead of being recomputed
# with .max() on every group.
for grouped in (grouped_day_product_code, grouped_day_family, grouped_day_sub_family):
    for (day, code), total in grouped['qty'].sum().items():
        resultByDay.loc[day, str(code)] = total

# Combinations without sales are still NaN. Cast to float explicitly before
# filling: relying on fillna to downcast an object-dtype frame is deprecated
# in recent pandas.
resultByDay = resultByDay.astype(float).fillna(0)

In [74]:
# Preview the first rows of the daily totals table.
resultByDay.head()

Unnamed: 0,121,122,123,124,131,132,133,141,142,143,...,12,13,14,15,16,20,25,34,53,56
2016-01-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
2016-01-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2016-01-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
2016-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2016-01-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [31]:
# Row labels: every distinct year-week code, in ascending order.
unique_weeks = np.sort(data.week.unique())

# Empty (all-NaN) frame: one row per week, with the product / family /
# sub-family labels as columns.
resultByWeek = pd.DataFrame(
    columns=np.concatenate((unique_products, unique_fam, unique_sub_fam)),
    index=unique_weeks,
)

In [32]:
# Fill the weekly table with the summed quantities for each aggregation level.
# The groupby key already is the (week, code) pair, so 'qty' is aggregated once
# per level and the labels are read from the key instead of being recomputed
# with .max() on every group.
for grouped in (grouped_week_product_code, grouped_week_family, grouped_week_sub_family):
    for (week, code), total in grouped['qty'].sum().items():
        resultByWeek.loc[week, str(code)] = total

# Combinations without sales are still NaN. Cast to float explicitly before
# filling: relying on fillna to downcast an object-dtype frame is deprecated
# in recent pandas.
resultByWeek = resultByWeek.astype(float).fillna(0)

In [33]:
# Preview the first rows of the weekly totals table.
resultByWeek.head()

Unnamed: 0,121,122,123,124,131,132,133,141,142,143,...,12,13,14,15,16,20,25,34,53,56
201603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
201604,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,...,2.0,1.0,5.0,4.0,8.0,4.0,6.0,1.0,1.0,2.0
201605,1.0,9.0,0.0,1.0,4.0,8.0,3.0,11.0,7.0,6.0,...,11.0,15.0,29.0,34.0,51.0,0.0,0.0,0.0,0.0,0.0
201606,3.0,10.0,0.0,2.0,4.0,4.0,9.0,12.0,9.0,6.0,...,15.0,17.0,30.0,34.0,56.0,0.0,0.0,0.0,0.0,0.0
201607,1.0,10.0,6.0,0.0,3.0,7.0,12.0,10.0,3.0,13.0,...,17.0,22.0,34.0,5.0,60.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Add the two variation columns (versus the previous week and versus the
# previous year) at the end of the weekly table. Plain assignment is used
# instead of DataFrame.insert, which raises ValueError if the column already
# exists, so the cell can be re-run. The float placeholder matches the
# fractional values written into these columns later.
resultByWeek['prevWeek'] = 0.0
resultByWeek['prevYear'] = 0.0

In [35]:
# Week-over-week relative change of the family '1' column, in percent. The
# first week has no predecessor, so its value is NaN.
family_one = resultByWeek['1']
family_one_last_week = family_one.shift(1)
resultByWeek['prevWeek'] = ((family_one - family_one_last_week) / family_one_last_week) * 100

In [66]:
# Week-over-week relative change (in percent) of the total quantity across the
# family-level columns, i.e. the columns labelled '1' through '9'. The first
# week has no predecessor, so its value is NaN.
family_labels = [str(x) for x in range(1, 10)]
weekly_total = resultByWeek.loc[:, resultByWeek.columns.isin(family_labels)].agg('sum', axis=1)
last_week_total = weekly_total.shift(1)
resultByWeek['prevWeek'] = ((weekly_total - last_week_total) / last_week_total) * 100


In [54]:
# Relative change (in percent) versus the same ISO week of the previous year,
# based on the total quantity across the family-level columns '1'..'9'.
# Week codes are YYYYWW integers, so the same week one year earlier is code - 100.
family_labels = [str(x) for x in range(1, 10)]
weekly_total = (
    resultByWeek.loc[:, resultByWeek.columns.isin(family_labels)]
    .sum(axis=1)
    .astype(float)
)

# reindex (rather than .loc[week - 100]) yields NaN instead of a KeyError when
# the matching week is absent from the table, e.g. a week without any sales or
# an ISO week 53 that the previous year does not have.
year_ago_total = weekly_total.reindex(weekly_total.index - 100).to_numpy()

# Multiplied by 100 so the value really is a percentage, consistent with the
# prevWeek column.
year_over_year = (weekly_total - year_ago_total) / year_ago_total * 100

# Weeks less than one year after the first recorded week keep the 0 placeholder.
# Assigning the whole column at once also avoids writing floats one by one into
# an integer column.
has_history = resultByWeek.index >= resultByWeek.index[0] + 100
resultByWeek['prevYear'] = year_over_year.where(has_history, 0.0)

In [71]:
# Preview the first rows of the weekly table, now including the prevWeek and
# prevYear columns.
resultByWeek.head()

Unnamed: 0,121,122,123,124,131,132,133,141,142,143,...,14,15,16,20,25,34,53,56,prevWeek,prevYear
201603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,,0.0
201604,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,...,5.0,4.0,8.0,4.0,6.0,1.0,1.0,2.0,1033.333333,0.0
201605,1.0,9.0,0.0,1.0,4.0,8.0,3.0,11.0,7.0,6.0,...,29.0,34.0,51.0,0.0,0.0,0.0,0.0,0.0,311.764706,0.0
201606,3.0,10.0,0.0,2.0,4.0,4.0,9.0,12.0,9.0,6.0,...,30.0,34.0,56.0,0.0,0.0,0.0,0.0,0.0,8.571429,0.0
201607,1.0,10.0,6.0,0.0,3.0,7.0,12.0,10.0,3.0,13.0,...,34.0,5.0,60.0,0.0,0.0,0.0,0.0,0.0,-9.210526,0.0
