# **Importação das Bibliotecas**

In [1]:
import polars as pl
from datetime import datetime

# **Leitura dos Dados**

In [2]:
# Load the raw store/item daily sales table; the path is relative to the notebook folder.
raw_data = pl.read_csv(source="../data/01_raw/store_item_demand.csv")

# Preview the first five rows (same as the default head()).
raw_data.head(5)

date,store,item,sales
str,i64,i64,i64
"""2013-01-01""",1,1,13
"""2013-01-02""",1,1,11
"""2013-01-03""",1,1,14
"""2013-01-04""",1,1,13
"""2013-01-05""",1,1,10


# **Análise Inicial**

In [3]:
# Per-column summary (count, null_count, mean, std, min, quartiles, max) used as a
# first sanity check of ranges and missing values.
raw_data.describe()

statistic,date,store,item,sales
str,str,f64,f64,f64
"""count""","""913000""",913000.0,913000.0,913000.0
"""null_count""","""0""",0.0,0.0,0.0
"""mean""",,5.5,25.5,52.250287
"""std""",,2.872283,14.430878,28.801144
"""min""","""2013-01-01""",1.0,1.0,0.0
"""25%""",,3.0,13.0,30.0
"""50%""",,6.0,26.0,47.0
"""75%""",,8.0,38.0,70.0
"""max""","""2017-12-31""",10.0,50.0,231.0


## **Datas**

In [4]:
# Number of rows per (store, item) series; its height is the number of series.
gb_raw_data = raw_data.group_by(["store", "item"]).len().sort(by="len")

# Dates are ISO "YYYY-MM-DD" strings, so the string min/max are also the
# chronological bounds of the whole dataset.
start = datetime.strptime(raw_data["date"].min(), "%Y-%m-%d")
end = datetime.strptime(raw_data["date"].max(), "%Y-%m-%d")
all_dates = pl.datetime_range(start=start, end=end, interval="1d", eager=True).alias("date")
expected_days = all_dates.len()

# One aggregation pass instead of filtering the full table once per series.
# Every date lies inside [start, end] by construction, so a series holds exactly
# the full daily calendar iff it has `expected_days` distinct days (no gaps) and
# `expected_days` rows (no duplicates). Unlike an element-wise comparison against
# `all_dates`, this does not raise when a series has a different number of rows.
series_calendar = (
    raw_data
    .with_columns(pl.col("date").str.to_datetime())
    .group_by(["store", "item"])
    .agg(
        pl.len().alias("n_rows"),
        pl.col("date").n_unique().alias("n_days"),
    )
)

# Count of series whose dates match the complete calendar.
equal_dates = series_calendar.filter(
    (pl.col("n_rows") == expected_days) & (pl.col("n_days") == expected_days)
).height

if equal_dates == gb_raw_data.shape[0]:
    print("Todas as séries temporais possuem as mesmas datas.")
else:
    print("As séries temporais não possuem as mesmas datas.")

Todas as séries temporais possuem as mesmas datas.


## **Demandas intermitentes**

In [5]:
# Rows with no sales at all; many of them would indicate intermittent demand.
raw_data.filter(pl.col("sales").eq(0))

date,store,item,sales
str,i64,i64,i64
"""2014-01-15""",6,4,0


In [6]:
# Mean, standard deviation and coefficient of variation (std / mean) of daily
# sales for each (store, item) series.
grouped_data = (
    raw_data
    .group_by(["store", "item"])
    .agg(
        sales_mean=pl.col("sales").mean(),
        sales_std=pl.col("sales").std(),
    )
    .with_columns(
        (pl.col("sales_std") / pl.col("sales_mean")).alias("sales_std/mean")
    )
)

# Ascending order: the first and last rows are the least and most variable series.
grouped_data.sort("sales_std/mean")

store,item,sales_mean,sales_std,sales_std/mean
i64,i64,f64,f64,f64
8,15,108.047645,28.265443,0.261602
3,28,100.143483,26.219792,0.261822
2,28,112.638007,29.526526,0.262136
2,38,103.249179,27.084663,0.262323
8,13,103.819825,27.257209,0.262543
…,…,…,…,…
6,41,16.607338,5.902563,0.355419
7,27,15.213582,5.478561,0.36011
5,5,14.086528,5.157271,0.366114
7,5,12.733844,4.73128,0.371552


# **Conclusões**

- Nenhuma série possui valores faltantes (NaN);
- Nenhuma série possui demanda negativa ou algum valor absurdo que precise ser tratado; 
- Todas as séries possuem dados diários de "01/01/2013" até "31/12/2017", sem faltar nenhum dia;
- De acordo com a quantidade de demandas iguais a 0 (apenas 1 registro em 913.000) e os valores mínimo e máximo do coeficiente de variação das demandas (aproximadamente 0,26 e 0,37), não será necessário o uso de um modelo para demandas intermitentes.