In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import ast
import operator

In [2]:
# Read the store export; the 'ds' column is parsed as datetimes and becomes the index.
dataset = pd.read_csv(
    'new_stores/store_0002.csv',
    header=0,
    infer_datetime_format=True,
    parse_dates=['ds'],
    index_col=['ds'],
)

# Number of distinct calendar days that appear in the index.
n_days = dataset.index.normalize().nunique()
print(f"days: {n_days}")

# Only these two columns are used by the rest of the notebook.
dataset = dataset[['sales','n_clients']]
dataset

days: 1743


Unnamed: 0_level_0,sales,n_clients
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02 10:00:00,121.00,2
2015-01-02 10:30:00,36.00,4
2015-01-02 11:00:00,131.00,5
2015-01-02 11:30:00,126.00,3
2015-01-02 12:30:00,6.00,1
...,...,...
2020-10-18 11:30:00,267.99,16
2020-10-18 12:00:00,616.90,16
2020-10-18 12:30:00,420.42,18
2020-10-18 13:00:00,183.64,9


In [3]:
# resample to hours
# Sum the rows of each hour directly. No sentinel value is written into 'sales', so an
# hour that mixes zero-sales rows with real sales keeps its exact total.
hourly_totals = dataset.resample('H').sum()

# resample() also produces a row for every hour that has no observations at all.
# Keep only the hours holding at least one non-null 'sales' reading; a genuine
# zero-sales hour is therefore kept (with sales == 0) instead of being mistaken for
# an empty hour.
# NOTE: hours whose net total is negative are kept as-is and are not clipped to 0.
hour_has_sales = dataset['sales'].resample('H').count() > 0

# .copy() gives later cells an independent frame they can assign into safely.
dataset = hourly_totals.loc[hour_has_sales].copy()

dataset

Unnamed: 0_level_0,sales,n_clients
ds,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02 10:00:00,157.00,6
2015-01-02 11:00:00,257.00,8
2015-01-02 12:00:00,6.00,1
2015-01-02 13:00:00,107.04,1
2015-01-02 15:00:00,178.00,10
...,...,...
2020-10-18 09:00:00,566.88,22
2020-10-18 10:00:00,953.23,39
2020-10-18 11:00:00,797.97,31
2020-10-18 12:00:00,1037.32,34


In [4]:
def get_most_regular_schedule(df):
    """Find the daily schedule (set of time stamps) that occurs on the most days.

    Parameters
    ----------
    df : DataFrame indexed by a DatetimeIndex.

    Returns
    -------
    (schedule, count, check_dict)
        schedule   : list of 'HH:MM:SS' strings, the most frequent daily schedule
        count      : number of days that have exactly that schedule
        check_dict : dict mapping str(list_of_times) -> number of days with it
    """
    check_dict = {}

    # One group per calendar day; the day's schedule is the list of its time stamps.
    for _, day_rows in df.groupby(df.index.date):
        # Lists are not hashable, so their string form is used as the dict key.
        sched_key = str([stamp.strftime("%H:%M:%S") for stamp in day_rows.index.time])
        check_dict[sched_key] = check_dict.get(sched_key, 0) + 1

    # Key with the highest day count (the first one encountered wins ties).
    best_key = max(check_dict, key=check_dict.get)

    # Turn the string key back into a real list of time strings.
    return ast.literal_eval(best_key), check_dict[best_key], check_dict
    

In [5]:
# reg_schedule: most frequent daily schedule (list of 'HH:MM:SS' strings)
# v:            number of days that follow exactly that schedule
# check_dict:   every observed schedule (as a string key) -> number of days
reg_schedule, v, check_dict = get_most_regular_schedule(dataset)
print(reg_schedule,v)

['08:00:00', '09:00:00', '10:00:00', '11:00:00', '12:00:00', '13:00:00', '14:00:00', '15:00:00', '16:00:00', '17:00:00', '18:00:00', '19:00:00'] 572


In [6]:
def fill_gaps(df, reg_schedule=None, hour_in=None, hour_out=None):
    """Add an all-NaN row for every schedule slot that a day is missing.

    For each calendar day present in ``df`` and each time in the schedule, a row is
    added when that (day, time) stamp is absent. The input frame is left untouched;
    a new, index-sorted frame is returned.

    Parameters
    ----------
    df : DataFrame with a DatetimeIndex and at least the columns 'sales' and
        'n_clients' (any further columns are filled with missing values too).
    reg_schedule : list of 'HH:MM:SS' strings, optional
        Times every day is expected to contain. Ignored when both ``hour_in`` and
        ``hour_out`` are given.
    hour_in, hour_out : str, optional
        'HH:MM:SS' bounds; when both are given the schedule is generated from
        ``hour_in`` to ``hour_out`` (inclusive) in 30-minute steps.

    Returns
    -------
    DataFrame
        Copy of ``df`` plus the added rows, sorted by index, with a new column
        'imputed' that is 1 where both 'sales' and 'n_clients' are null and -1
        otherwise.

    Raises
    ------
    ValueError
        If neither ``reg_schedule`` nor both ``hour_in``/``hour_out`` are supplied.
    """
    # create custom schedule with hour_in and hour_out
    if hour_in is not None and hour_out is not None:
        slot = datetime.strptime(hour_in, '%H:%M:%S')
        last_slot = datetime.strptime(hour_out, '%H:%M:%S')
        reg_schedule = []
        while slot <= last_slot:
            reg_schedule.append(slot.strftime('%H:%M:%S'))
            slot += timedelta(minutes=30)
        print(reg_schedule)

    if reg_schedule is None:
        raise ValueError("fill_gaps needs reg_schedule, or both hour_in and hour_out")

    # Calendar days present in the data (as midnight stamps), computed once.
    days = df.index.normalize().unique()

    # Collect every expected (day, time) stamp that the index does not contain.
    missing = df.index[:0]
    for entry in reg_schedule:
        slot_time = datetime.strptime(entry, '%H:%M:%S').time()
        print(slot_time)
        offset = timedelta(hours=slot_time.hour, minutes=slot_time.minute,
                           seconds=slot_time.second)
        missing = missing.union((days + offset).difference(df.index))

    n_gaps = len(missing)  # number of rows that will need imputation

    # Append all gap rows in one step. Reindexing by position (instead of by
    # timestamp) works for any number of columns and also when the original index
    # contains duplicate stamps; the new rows are filled with missing values.
    filled = df.reset_index(drop=True).reindex(pd.RangeIndex(len(df) + n_gaps))
    filled.index = df.index.append(missing)

    # add column "imputed"
    filled['imputed'] = np.where(
        pd.isnull(filled['n_clients']) & pd.isnull(filled['sales']), 1, -1)

    print("Total rows to input: "+str(n_gaps))

    # Dataframe with NaNs to be imputed and excess datetimes
    return filled.sort_index()

In [7]:
# Add a NaN row for every hour of the regular schedule that a day is missing.
# The trailing comment shows the alternative call that builds the schedule from
# hour_in / hour_out instead of using reg_schedule.
dataset = fill_gaps(dataset, reg_schedule)#, hour_in='08:00:00', hour_out='12:00:00')
dataset

08:00:00
09:00:00
10:00:00
11:00:00
12:00:00
13:00:00
14:00:00
15:00:00
16:00:00
17:00:00
18:00:00
19:00:00
Total rows to input: 4181


Unnamed: 0_level_0,sales,n_clients,imputed
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-02 08:00:00,,,1
2015-01-02 09:00:00,,,1
2015-01-02 10:00:00,157.0,6.0,-1
2015-01-02 11:00:00,257.0,8.0,-1
2015-01-02 12:00:00,6.0,1.0,-1
...,...,...,...
2020-10-18 15:00:00,,,1
2020-10-18 16:00:00,,,1
2020-10-18 17:00:00,,,1
2020-10-18 18:00:00,,,1


In [8]:
# Recompute the schedule counts now that the gaps have been filled
reg_schedule, v, check_dict = get_most_regular_schedule(dataset)

# List every distinct daily schedule together with the number of days using it
for sched, count in check_dict.items():
    print(f"{sched} : {count} \n")

['08:00:00', '09:00:00', '10:00:00', '11:00:00', '12:00:00', '13:00:00', '14:00:00', '15:00:00', '16:00:00', '17:00:00', '18:00:00', '19:00:00'] : 1690 

['08:00:00', '09:00:00', '10:00:00', '11:00:00', '12:00:00', '13:00:00', '14:00:00', '15:00:00', '16:00:00', '17:00:00', '18:00:00', '19:00:00', '20:00:00', '21:00:00'] : 1 

['08:00:00', '09:00:00', '10:00:00', '11:00:00', '12:00:00', '13:00:00', '14:00:00', '15:00:00', '16:00:00', '17:00:00', '18:00:00', '19:00:00', '23:00:00'] : 1 

['08:00:00', '09:00:00', '10:00:00', '11:00:00', '12:00:00', '13:00:00', '14:00:00', '15:00:00', '16:00:00', '17:00:00', '18:00:00', '19:00:00', '20:00:00', '21:00:00', '22:00:00', '23:00:00'] : 1 

['08:00:00', '09:00:00', '10:00:00', '11:00:00', '12:00:00', '13:00:00', '14:00:00', '15:00:00', '16:00:00', '17:00:00', '18:00:00', '19:00:00', '20:00:00'] : 5 

['08:00:00', '09:00:00', '10:00:00', '11:00:00', '12:00:00', '13:00:00', '14:00:00', '15:00:00', '16:00:00', '17:00:00', '18:00:00', '19:00:00', '

In [9]:
# uncomment to test a break in the schedule
# del reg_schedule[9:11]

In [10]:
# Analyze data to see if there is a close_day and which is it!
temp = dataset.resample('D').sum()
temp['weekday'] = temp.index.dayofweek
# The present dataset usually shuts down on sunday (weekday = 6)
# Rows are selected by partial date string through .loc; plain temp['2015-03'] is the
# deprecated row lookup that newer pandas versions treat as a column name.
temp.loc['2015-03']

  temp['2015-03']


Unnamed: 0_level_0,sales,n_clients,imputed,weekday
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-03-01,0.0,0.0,0,6
2015-03-02,1194.0,45.0,-6,0
2015-03-03,1334.26,45.0,-8,1
2015-03-04,1524.88,48.0,-6,2
2015-03-05,1470.02,41.0,-6,3
2015-03-06,1505.84,57.0,-8,4
2015-03-07,1478.52,42.0,2,5
2015-03-08,0.0,0.0,0,6
2015-03-09,1213.43,47.0,-6,0
2015-03-10,1185.58,37.0,-6,1


In [11]:
# Impute missing data day-by-day
def input_inday(df, method='linear'):
    """Interpolate missing values independently inside each calendar day.

    Every day found in the index is interpolated on its own, so values never leak
    from one day into the next. ``limit_direction='both'`` also fills NaNs at the
    start and end of a day.

    Parameters
    ----------
    df : DataFrame with a DatetimeIndex. It is modified in place.
    method : str, default 'linear'
        Interpolation method passed on to ``DataFrame.interpolate``.

    Returns
    -------
    DataFrame
        The same ``df`` object, with the interpolated values written back.
    """
    import warnings

    # Silence pandas assignment warnings only for the duration of this loop.
    # catch_warnings() restores the caller's warning filters on exit, even if an
    # exception is raised, instead of overwriting them globally.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # get list of days
        for dt in np.unique(df.index.date):
            day_key = str(dt)
            # impute each day individually
            df.loc[day_key] = df.loc[day_key].interpolate(method=method, limit_direction='both')

    return df

In [12]:
def findoffset_nextweekday_nonzero(d, weekday):
    """Days to jump ahead to reach the next same-weekday day whose 'imputed' flag is non-zero.

    The frame is reduced to one row per day (daily maximum) and restricted to the
    rows whose 'weekday' column equals ``weekday``.

    Parameters
    ----------
    d : DataFrame with a DatetimeIndex and the columns 'weekday' and 'imputed'.
    weekday : int, value compared against the 'weekday' column.

    Returns
    -------
    int
        -1 when the first matching day already has a non-zero 'imputed' value, or
        when no later matching day has one (a message is printed in that case);
        otherwise a multiple of 7: 7 for the next matching row, 14 for the one
        after, and so on.
    """
    daily = d.resample('D').max()
    same_weekday = daily[daily['weekday'] == weekday]

    # The first day is not flagged with 0: there is nothing to look up.
    if same_weekday.iloc[0].at['imputed'] != 0:
        #print('No sense invoquing this function!')
        return -1

    # Walk the following matching rows; each step corresponds to one more week.
    for weeks_ahead, flag in enumerate(same_weekday['imputed'].iloc[1:], start=1):
        if flag != 0:
            return 7 * weeks_ahead

    print('This weekday is never open!')
    print(weekday)
    return -1

In [13]:
def filter_input_schedule(df, schedule, close_day=-1):
    """Restrict the data to the opening hours and impute every missing value.

    Steps:
      1. Resample to a complete hourly grid (sum) and keep the hours between the
         first and last entry of ``schedule``.
      2. Drop the rows of the weekly closing day.
      3. Rows with imputed == 1 (gaps marked by ``fill_gaps``) are interpolated
         inside their own day via ``input_inday``.
      4. Rows with imputed == 0 (hours with no source rows at all, created by the
         resample) are filled by copying the same hour from another week: from a
         later week for days in the first week of data, otherwise from the
         previous week.

    Parameters
    ----------
    df : DataFrame with a DatetimeIndex and the columns 'sales', 'n_clients' and
        'imputed'. It is not modified.
    schedule : list of 'HH:MM:SS' strings; only the first and last entries are used.
    close_day : int, default -1
        Weekday (as in ``DatetimeIndex.dayofweek``) to remove; -1 removes nothing.

    Returns
    -------
    DataFrame
        New frame with the columns of ``df`` plus 'weekday'.
    """
    # resample again to get all line back
    df = df.resample('H').sum()

    # between times (copy: the assignments below must act on an independent frame,
    # not on a slice of the resampled one)
    df = df.between_time(schedule[0], schedule[-1]).copy()

    # delete close day from dataset
    if 7>= close_day >= -1:
        # get weekdays
        df['weekday'] = df.index.dayofweek
        df = df[df['weekday'] != close_day].copy()

    # perform in-day imputation
    df.loc[df['imputed'] == 1, ['sales','n_clients']] = np.nan
    df = input_inday(df, method='linear')

    # identify days with total zeros
    df.loc[df['imputed'] == 0, ['sales','n_clients']] = np.nan

    # CONSTANTS
    # SAMPLES_IN_A_DAY = len(schedule) # uncomment when break in schedule is dealt with
    # Derived from the `schedule` argument, not from a notebook-level global.
    SAMPLES_IN_A_DAY = (pd.to_datetime(schedule[-1]) - pd.to_datetime(schedule[0])).seconds//3600 + 1
    WEEK_DAYS = 7 if close_day == -1 else 6

    # Index of the first week of data; it does not change while values are filled
    # in, so it is computed once instead of once per row.
    first_week_index = df.head(WEEK_DAYS*SAMPLES_IN_A_DAY).index

    # if 1st week has missing days, get offset for replacement -> copy future days
    list_offsets = [findoffset_nextweekday_nonzero(df, dt.weekday())
                    for dt in np.unique(first_week_index.date)]

    # Rows are processed in chronological order on purpose: a day copied from the
    # previous week may itself have been filled earlier in this loop.
    for row in df.itertuples():
        if row.imputed != 0:
            continue

        if row.Index in first_week_index:
            # if 1st week has missing values, switch with the offset calculated above
            x = df.index.get_loc(row.Index) // SAMPLES_IN_A_DAY
            if list_offsets[x] < 0:
                # No later open day exists for this weekday: leave the row empty
                # rather than treating -1 as a one-day shift.
                continue
            source = row.Index + pd.DateOffset(days=list_offsets[x])
        else:
            # else replace missing day with same day from past week
            source = row.Index - pd.DateOffset(days=7)

        df.at[row.Index, 'sales'] = df.at[source, 'sales']
        df.at[row.Index, 'n_clients'] = df.at[source, 'n_clients']

    return df

In [14]:
# Keep the regular opening hours, drop weekday 6 (the closing day) and impute the gaps.
df = filter_input_schedule(dataset, reg_schedule, close_day=6)

In [15]:
# Sanity check: row count, dtypes and non-null counts of the imputed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 21768 entries, 2015-01-02 08:00:00 to 2020-10-17 19:00:00
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sales      21768 non-null  float64
 1   n_clients  21768 non-null  float64
 2   imputed    21768 non-null  int64  
 3   weekday    21768 non-null  int64  
dtypes: float64(2), int64(2)
memory usage: 1.3 MB


In [16]:
#

In [17]:
# Not used anymore in this version!!!
# The code that handles a break in the schedule may still be useful in the future

# Drop the rows whose time of day falls outside the given schedule
def filter_schedule(df, schedule):
    """Keep only the rows of ``df`` whose time lies inside ``schedule``.

    The schedule times are mapped to half-hour slot numbers. If the slots form one
    unbroken run, the frame is cut between the first and last schedule time. If
    not, the schedule is split at the first place where two consecutive slots are
    not adjacent, and the rows of both parts are kept.

    Parameters
    ----------
    df : DataFrame with a DatetimeIndex.
    schedule : list of 'HH:MM:SS' strings.

    Returns
    -------
    DataFrame with the rows inside the schedule.
    """
    stamps = pd.to_datetime(pd.Series(schedule))
    # half-hour slot number of every schedule entry (00:00 -> 0, 00:30 -> 1, ...)
    slots = stamps.dt.hour * 2 + stamps.dt.minute // 30

    # No break: the slots cover one contiguous range
    if max(slots) - min(slots) + 1 == len(slots):
        return df.between_time(schedule[0], schedule[-1])

    # Position of the first pair of consecutive entries that are not adjacent slots
    first_gap = int(np.flatnonzero(np.diff(slots.to_numpy()) != 1)[0])
    close_morning = schedule[first_gap]
    open_afternoon = schedule[first_gap + 1]
    #print('break from',close_morning,'to',open_afternoon)

    # Cut both parts out of the frame and put them back together in time order
    morning = df.between_time(schedule[0], close_morning)
    afternoon = df.between_time(open_afternoon, schedule[-1])
    return pd.concat([morning, afternoon]).sort_index()