In [13]:
import pandas as pd


In [14]:
# Load the three raw tables from the project's data directory.
df_features = pd.read_csv(filepath_or_buffer='../data/features.csv')
df_sales = pd.read_csv(filepath_or_buffer='../data/sales.csv')
df_stores = pd.read_csv(filepath_or_buffer='../data/stores.csv')

In [15]:
# Both tables store dates as day/month/year strings; convert them to
# datetimes so they can be merged on and used with the .dt accessor.
df_features['Date'] = df_features['Date'].pipe(pd.to_datetime, format='%d/%m/%Y')
df_sales['Date'] = df_sales['Date'].pipe(pd.to_datetime, format='%d/%m/%Y')


In [16]:
# Rename the features table's holiday flag so it does not collide with a
# same-named column coming from the sales side of the merge.
df_features = df_features.rename(columns={'IsHoliday': 'IsHolidayFeatures'})

# Attach store attributes to every sales row.
df_combined = df_sales.merge(df_stores, on='Store', how='left')

# Attach the per-store, per-date features, derive calendar columns from the
# date, index and order the rows by date, then discard the renamed flag.
df_final = (
    df_combined
    .merge(df_features, on=['Store', 'Date'], how='left')
    .assign(
        week=lambda merged: merged['Date'].dt.isocalendar().week,
        month=lambda merged: merged['Date'].dt.month,
        day=lambda merged: merged['Date'].dt.day,
    )
    .set_index('Date')
    .sort_index()
    .drop(columns=['IsHolidayFeatures'])
)


In [5]:
# Persist the merged, date-indexed table.
df_final.to_csv(path_or_buf='../data/combined_data.csv')

In [6]:
def prepare_forecasting_features_year(df):
    """Build 52-row-lag features for every (Store, Dept) series.

    For ``Weekly_Sales`` and each exogenous column this adds:

    * ``lag_yearly_<col>``: the value 52 rows earlier within the same
      (Store, Dept) group.
    * ``<col>_historical_week_avg``: the running mean of ``lag_yearly_<col>``
      over the rows of the same (Store, Dept, week) group seen so far,
      current row included.

    The average is a running mean rather than a whole-group mean on purpose:
    a whole-group mean hands an earlier row the lag values of later rows,
    and the next year's lag of ``Weekly_Sales`` is that earlier row's own
    ``Weekly_Sales``, i.e. the target leaks into its own feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Store``, ``Dept``, ``week``, ``Weekly_Sales`` and the
        exogenous columns listed in the body. Lags and running means follow
        row order, so rows must be in chronological order within each group.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy with the raw exogenous columns removed, the new feature
        columns added, and every row without ``lag_yearly_Weekly_Sales``
        dropped.
    """
    df = df.copy()
    exogenous_columns = ['Temperature', 'MarkDown1', 'Fuel_Price',
                         'MarkDown2', 'MarkDown3', 'MarkDown4',
                         'MarkDown5', 'CPI', 'Unemployment']
    # The target is lagged as well, but unlike the exogenous columns it is
    # kept in the output.
    shiftable_columns = ['Weekly_Sales'] + exogenous_columns
    week_keys = [df['Store'], df['Dept'], df['week']]

    for col in shiftable_columns:
        lag_col = f'lag_yearly_{col}'
        # shift() counts rows, not calendar weeks: a group with missing
        # weeks gets a lag that is not exactly one year old.
        df[lag_col] = df.groupby(['Store', 'Dept'])[col].shift(52)

        # Running sum / running count of the non-missing lag values seen so
        # far in this (Store, Dept, week) group. Missing lags add nothing to
        # either; a zero count becomes NaN so the average is NaN, not 0/0.
        seen_count = df[lag_col].notna().astype('int64').groupby(week_keys).cumsum()
        seen_total = df[lag_col].fillna(0).groupby(week_keys).cumsum()
        df[f'{col}_historical_week_avg'] = seen_total / seen_count.where(seen_count > 0)

    feature_df = df.drop(columns=exogenous_columns)
    return feature_df.dropna(subset=['lag_yearly_Weekly_Sales'])

In [17]:
def prepare_forecasting_features_4_weeks(df, lags=(4, 8, 16, 32)):
    """Build multi-lag features for every (Store, Dept) series.

    For ``Weekly_Sales`` and each exogenous column this adds:

    * ``lag_<k>_<col>`` for every ``k`` in ``lags``: the value ``k`` rows
      earlier within the same (Store, Dept) group.
    * ``<col>_historical_week_avg``: the mean of ``<col>`` over the
      *earlier* rows of the same (Store, Dept, week) group. The current row
      and all later rows are excluded, so ``Weekly_Sales_historical_week_avg``
      never contains the row's own ``Weekly_Sales`` or any future value.
      Rows with no earlier observation in their group get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Store``, ``Dept``, ``week``, ``Weekly_Sales`` and the
        exogenous columns listed in the body. Lags and prior-row means follow
        row order, so rows must be in chronological order within each group.
        The input frame is not modified.
    lags : iterable of int, optional
        Row offsets to generate lag columns for. Defaults to
        ``(4, 8, 16, 32)``.

    Returns
    -------
    pandas.DataFrame
        A copy with the raw exogenous columns removed, the new feature
        columns added, and every row without the largest ``Weekly_Sales``
        lag dropped.

    Raises
    ------
    ValueError
        If ``lags`` is empty.
    """
    lags = list(lags)
    if not lags:
        raise ValueError('lags must contain at least one lag')

    df = df.copy()
    exogenous_columns = ['Temperature', 'MarkDown1', 'Fuel_Price',
                         'MarkDown2', 'MarkDown3', 'MarkDown4',
                         'MarkDown5', 'CPI', 'Unemployment']
    # The target is lagged as well, but unlike the exogenous columns it is
    # kept in the output.
    shiftable_columns = ['Weekly_Sales'] + exogenous_columns

    for lag in lags:
        for col in shiftable_columns:
            # shift() counts rows, not calendar weeks.
            df[f'lag_{lag}_{col}'] = df.groupby(['Store', 'Dept'])[col].shift(lag)

    week_keys = [df['Store'], df['Dept'], df['week']]
    for col in shiftable_columns:
        # Running sum / count of non-missing values per (Store, Dept, week)
        # group, then moved down one row inside the group so each row only
        # sees the rows that precede it.
        running_total = df[col].fillna(0).groupby(week_keys).cumsum()
        running_count = df[col].notna().astype('int64').groupby(week_keys).cumsum()
        prior_total = running_total.groupby(week_keys).shift(1)
        prior_count = running_count.groupby(week_keys).shift(1)
        # A zero count becomes NaN so the average is NaN instead of 0/0.
        df[f'{col}_historical_week_avg'] = prior_total / prior_count.where(prior_count > 0)

    feature_df = df.drop(columns=exogenous_columns)
    # The largest lag is the last one to become available.
    return feature_df.dropna(subset=[f'lag_{max(lags)}_Weekly_Sales'])


In [8]:
# Yearly-lag feature table derived from the merged data.
df_year_prediction = prepare_forecasting_features_year(df=df_final)


In [9]:
# Multi-lag (4/8/16/32 rows) feature table derived from the merged data.
df_4_weeks_prediction = prepare_forecasting_features_4_weeks(df=df_final)

In [10]:
# Persist both engineered feature tables next to the raw data.
df_year_prediction.to_csv(path_or_buf='../data/year_lag_data.csv')
df_4_weeks_prediction.to_csv(path_or_buf='../data/4_weeks_lag_data.csv')

In [49]:
# Rows for store 1, department 1 only.
# The original cell read from `df`, which no notebook-level cell shown here
# defines (it only exists as a function parameter), so it fails with
# NameError on a fresh kernel.
# NOTE(review): df_final is assumed to be the intended frame — confirm.
store = df_final[(df_final['Store'] == 1) & (df_final['Dept'] == 1)]

In [51]:
# Write the single store/department slice to the working directory.
store.to_csv(path_or_buf='1.csv')