In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the aggregated sales table; the first CSV column is used as the row index.
sales_df = pd.read_csv(
    'out/sales_aggregated.csv',
    index_col=0,
)

In [3]:
# Swap rows and columns so that the former column labels become the row index.
sales_df = sales_df.T

In [4]:
# Clear the label left on the columns axis, then name the row index "Date"
# and move it into an ordinary column.
sales_df.columns.name = None
sales_df = sales_df.rename_axis("Date").reset_index()

# Preview the first rows of the wide table.
sales_df.head()

Unnamed: 0,Date,SCHAB Z KOŚCIĄ,SCHAB BEZ KOŚCI,KARCZEK BEZ KOŚCI,SZYNKA Z KOŚCIA,SZYNKA BEZ KOŚCI,ŁOPATKA Z KOŚCIA,ŁOPATKA BEZ KOŚCI,ŁOPATKA MIELONA,ŻEBERKO,...,SALCESON,KASZANKA,PASZTETOWA,BOCZEK WEDZ-GOT,BOCZEK WĘDZONY,PODGZRDLE WĘDZONE,GOLONKA WĘDZONA,ŻEBERKA WĘDZONE,PARÓWKI,WĘDLINY
0,01_2015,181.92,192.732,156.977,0.0,177.45,280.98,157.57,244.28,250.19,...,42.47,97.598,34.47,3.49,87.55,33.4,0.72,2.24,0.0,29.58
1,02_2015,174.21,179.31,171.419,2.84,213.17,246.35,165.918,211.5,263.57,...,41.65,66.74,20.947,16.515,56.85,18.72,0.96,0.0,0.0,27.04
2,03_2015,391.492,331.325,301.939,62.11,385.08,378.56,377.568,392.25,435.851,...,64.7,107.05,33.805,49.825,82.562,27.23,2.14,0.91,31.61,34.29
3,04_2015,330.39,177.61,228.79,26.91,219.12,450.51,218.15,361.44,300.84,...,51.75,79.78,33.255,44.86,58.5,75.57,0.0,0.0,37.69,28.973
4,05_2015,459.24,159.792,272.26,33.65,215.273,279.39,257.57,363.41,387.389,...,49.72,103.8,51.71,53.247,77.56,39.19,0.92,3.48,14.22,43.28


In [5]:
# Reshape wide -> long: keep "Date" as the identifier and unpivot every other
# column into ('Product', 'Sales in kg') pairs.
sales_df = sales_df.melt(
    id_vars=["Date"],
    var_name='Product',
    value_name='Sales in kg',
)

In [6]:
# Display the reshaped long-format frame (bare last expression of the cell).
sales_df

Unnamed: 0,Date,Product,Sales in kg
0,01_2015,SCHAB Z KOŚCIĄ,181.920
1,02_2015,SCHAB Z KOŚCIĄ,174.210
2,03_2015,SCHAB Z KOŚCIĄ,391.492
3,04_2015,SCHAB Z KOŚCIĄ,330.390
4,05_2015,SCHAB Z KOŚCIĄ,459.240
...,...,...,...
5395,08_2023,WĘDLINY,513.860
5396,09_2023,WĘDLINY,580.460
5397,10_2023,WĘDLINY,691.890
5398,11_2023,WĘDLINY,622.950


In [7]:
def extract_season(value):
    """Map a date label to a season code.

    The text before the first underscore of ``str(value)`` is taken as the
    month and compared against zero-padded two-digit month strings.

    Returns:
        "WIN" for 12/01/02, "SPR" for 03/04/05, "SUM" for 06/07/08,
        "FAL" for 09/10/11, and ``None`` for anything else (including
        months that are not zero-padded).
    """
    month_code = str(value).partition('_')[0]
    season_by_month = {
        '12': "WIN", '01': "WIN", '02': "WIN",
        '03': "SPR", '04': "SPR", '05': "SPR",
        '06': "SUM", '07': "SUM", '08': "SUM",
        '09': "FAL", '10': "FAL", '11': "FAL",
    }
    # .get() yields None for unrecognised months.
    return season_by_month.get(month_code)

def include_holidays(value):
    """Map a date label to a holiday tag based on its month only.

    The text before the first underscore of ``str(value)`` is taken as the
    month. Every March and April label is tagged "EASTER" and every December
    label "CHRISTMAS"; the actual calendar date of the holiday is not
    considered.

    Returns:
        "EASTER", "CHRISTMAS", or ``None`` for any other month.
    """
    month_code, _, _ = str(value).partition('_')
    if month_code == "12":
        return "CHRISTMAS"
    return "EASTER" if month_code in ('03', '04') else None


def feature_engineering(df):
    """Add 'Holiday' and 'Season' columns derived from the 'Date' column.

    Note: the columns are assigned on ``df`` itself, so the argument is
    modified in place; the same object is returned for convenience.

    Args:
        df: DataFrame with a 'Date' column.

    Returns:
        The same DataFrame, now carrying 'Holiday' and 'Season'.
    """
    derived_columns = (
        ('Holiday', include_holidays),
        ('Season', extract_season),
    )
    for column_name, derive in derived_columns:
        df[column_name] = df['Date'].apply(derive)
    return df
    
    
def encoding(df):
    """One-hot encode the 'Holiday' and 'Season' columns as integer indicators.

    Args:
        df: DataFrame containing 'Holiday' and 'Season' columns.

    Returns:
        A new DataFrame in which those two columns are replaced by
        ``Holiday_<value>`` / ``Season_<value>`` indicator columns of
        integer dtype; the other columns are carried over.
    """
    categorical_columns = ['Holiday', 'Season']
    return pd.get_dummies(df, columns=categorical_columns, dtype=int)


    
    

# Derive Holiday/Season on a copy of the long table, then one-hot encode a
# copy of that result.
sales_df = encoding(
    feature_engineering(sales_df.copy()).copy()
)

sales_df

Unnamed: 0,Date,Product,Sales in kg,Holiday_CHRISTMAS,Holiday_EASTER,Season_FAL,Season_SPR,Season_SUM,Season_WIN
0,01_2015,SCHAB Z KOŚCIĄ,181.920,0,0,0,0,0,1
1,02_2015,SCHAB Z KOŚCIĄ,174.210,0,0,0,0,0,1
2,03_2015,SCHAB Z KOŚCIĄ,391.492,0,1,0,1,0,0
3,04_2015,SCHAB Z KOŚCIĄ,330.390,0,1,0,1,0,0
4,05_2015,SCHAB Z KOŚCIĄ,459.240,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
5395,08_2023,WĘDLINY,513.860,0,0,0,0,1,0
5396,09_2023,WĘDLINY,580.460,0,0,1,0,0,0
5397,10_2023,WĘDLINY,691.890,0,0,1,0,0,0
5398,11_2023,WĘDLINY,622.950,0,0,1,0,0,0


In [9]:
# Print a summary of the encoded frame: column dtypes and non-null counts.
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5400 entries, 0 to 5399
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               5400 non-null   object 
 1   Product            5400 non-null   object 
 2   Sales in kg        5400 non-null   float64
 3   Holiday_CHRISTMAS  5400 non-null   int64  
 4   Holiday_EASTER     5400 non-null   int64  
 5   Season_FAL         5400 non-null   int64  
 6   Season_SPR         5400 non-null   int64  
 7   Season_SUM         5400 non-null   int64  
 8   Season_WIN         5400 non-null   int64  
dtypes: float64(1), int64(6), object(2)
memory usage: 379.8+ KB
