In [1]:
import pandas as pd

In [2]:
# Load the prepared 2022 and 2023 extracts. The first CSV column becomes the
# index and REPORT_DATE is parsed into datetimes.
df_2022 = pd.read_csv(
    "../DATA/prepared/INWI-22.csv",
    index_col=0,
    parse_dates=["REPORT_DATE"],
)
df_2023 = pd.read_csv(
    "../DATA/prepared/INWI-23.csv",
    index_col=0,
    parse_dates=["REPORT_DATE"],
)
# Cell output: (rows, columns) of each yearly frame.
df_2022.shape, df_2023.shape

((1127785, 7), (906065, 7))

## Concatenating both DataFrames into one before splitting

In [16]:
# Stack the 2022 rows on top of the 2023 rows and renumber them 0..n-1 in one
# step, discarding the per-file index values.
df = pd.concat([df_2022, df_2023], ignore_index=True)

In [17]:
# Date windows used to build the *6 promotion check column.
# Each entry is an inclusive (start, end) pair of pandas Timestamps.
# NOTE(review): the 2023-01-04 .. 2023-01-07 window spans 4 days while every
# other window spans 14 — confirm that "2023-01-07" is the intended end date.
stop_time_intervals = [
    (pd.to_datetime(first_day), pd.to_datetime(last_day))
    for first_day, last_day in (
        # Year 2022
        ("2022-01-03", "2022-01-16"),
        ("2022-04-11", "2022-04-24"),
        ("2022-06-10", "2022-06-23"),
        ("2022-09-22", "2022-10-05"),
        # Year 2023
        ("2023-01-04", "2023-01-07"),
        ("2023-04-03", "2023-04-16"),
        ("2023-06-01", "2023-06-14"),
        ("2023-11-18", "2023-12-01"),
    )
]


def is_not_in_promotion(date):
    """Tell whether `date` lies outside every window in `stop_time_intervals`.

    Parameters
    ----------
    date : pandas.Timestamp (or any value comparable with one)
        The date to check. Both window bounds are inclusive.

    Returns
    -------
    bool
        False as soon as one window contains `date`, True otherwise. A value
        that compares False against every bound (such as NaT) yields True.
    """
    for start, end in stop_time_intervals:
        if start <= date <= end:
            return False
    return True

# Flag every row whose REPORT_DATE falls outside all windows in
# stop_time_intervals. This is the vectorised equivalent of applying
# is_not_in_promotion row by row: Series.between is inclusive on both bounds by
# default, and missing dates match no window, so they end up flagged True.
in_promotion = pd.Series(False, index=df.index)
for window_start, window_end in stop_time_intervals:
    in_promotion |= df['REPORT_DATE'].between(window_start, window_end)
df['IS_CODE6_ENABLED'] = ~in_promotion

In [19]:
# Row position where the first 80% of df (train) ends and the remaining 20%
# (test) begins; int() truncates towards zero.
split_index = int(df.shape[0] * 0.8)

In [20]:
# Cut df positionally at split_index: rows before it form the training set,
# rows from it onwards form the test set.
df_train, df_test = df.iloc[:split_index], df.iloc[split_index:]
# Cell output: (rows, columns) of each part.
df_train.shape, df_test.shape

((1627080, 8), (406770, 8))

## Creating a dataset with one row per report date: the date, the summed revenue and the promotion flag

In [25]:
# Collapse df to one row per REPORT_DATE: REVENUE is summed over the group and
# IS_CODE6_ENABLED takes the first value found in the group. reset_index turns
# REPORT_DATE back into an ordinary column.
grouped_df = (
    df.groupby("REPORT_DATE")
    .agg(
        REVENUE=("REVENUE", "sum"),
        IS_CODE6_ENABLED=("IS_CODE6_ENABLED", "first"),
    )
    .reset_index()
)

In [26]:
# Adding the fields used for time-series forecasting.
# Convert REPORT_DATE once and derive every date-based column from it.
report_dates = pd.to_datetime(grouped_df['REPORT_DATE'])

# Integer representation of the date, as produced by pd.to_numeric.
grouped_df['TIMESTAMP_NUMERIC'] = pd.to_numeric(report_dates)

# Calendar components, inserted in the order listed here.
calendar_features = {
    'DAY_OF_WEEK': report_dates.dt.dayofweek,
    'DAY_OF_MONTH': report_dates.dt.day,
    'DAY_OF_YEAR': report_dates.dt.day_of_year,
    'WEEK_OF_YEAR': report_dates.dt.isocalendar().week,
    'MONTH': report_dates.dt.month,
    'QUARTER': report_dates.dt.quarter,
    'YEAR': report_dates.dt.year,
}
for feature_name, feature_values in calendar_features.items():
    grouped_df[feature_name] = feature_values

# Lagged revenue. shift() moves values down by a number of ROWS, so the first
# `lag_rows` rows of each column are NaN.
# NOTE(review): these are true 1/2/7/30-day lags only if grouped_df holds
# exactly one row per consecutive calendar day — TODO confirm no dates are missing.
revenue_lags = {
    'ONE_DAY_LAG': 1,
    'TWO_DAY_LAG': 2,
    'ONE_WEEK_LAG': 7,
    'ONE_MONTH_LAG': 30,
}
for lag_name, lag_rows in revenue_lags.items():
    grouped_df[lag_name] = grouped_df['REVENUE'].shift(lag_rows)

## Defining the feature matrix X and the target y for the training and test sets

In [27]:
# Positional 80/20 split of grouped_df: the first 80% of rows train, the rest test.
# NOTE(review): this rebinds split_index, which previously held the row-level
# split position for df; re-running the df split cell afterwards would use this value.
split_index = int(grouped_df.shape[0] * 0.8)

# Features are every column except the target. Only REVENUE is dropped, so
# REPORT_DATE stays in the matrix alongside the numeric columns.
# NOTE(review): with a datetime column included, .values yields a mixed-dtype
# array — confirm the downstream model expects REPORT_DATE here.
feature_frame = grouped_df.drop(columns=['REVENUE'])
target_series = grouped_df['REVENUE']

X_train = feature_frame.iloc[:split_index].values
y_train = target_series.iloc[:split_index].values

X_test = feature_frame.iloc[split_index:].values
y_test = target_series.iloc[split_index:].values