In [1]:
import polars as pl

In [2]:
# Name of the data directory.
# NOTE(review): the loading cells visible in this notebook hard-code
# "../data/..." and never read this variable — confirm whether it is still needed.
data_path = "data"


In [8]:
# Unit sales reshaped from wide (one column per day) to long (one row per
# series and day). The result is a LazyFrame; nothing is read until collect().
#
# scan_csv is used instead of read_csv(...).lazy() so the wide source table
# is not loaded eagerly and then kept alive inside the lazy query plan.
sales = (
    pl.scan_csv("../data/sales_train_evaluation.csv")
    # Every column that is not listed as an identifier is stacked into
    # a ("d", "sales") pair.
    .melt(
        id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
        variable_name="d",
        value_name="sales",
    )
    .with_columns(
        # Drop the first two characters of the day label (e.g. "d_1" -> "1")
        # and store the remainder as an integer.
        pl.col("d").str.slice(2).cast(int).alias("days_since_start")
    )
)

In [9]:
# Calendar table reshaped so that each date appears once per state, with the
# state's value in a single "snap" column. The result is a LazyFrame.
#
# scan_csv is used instead of read_csv(...).lazy() so the file is only read
# when the final query is collected.
calendar = (
    pl.scan_csv("../data/calendar.csv")
    # All columns not listed here are stacked: their former column name goes
    # to "state_id", their value to "snap".
    # NOTE(review): presumably these are the per-state snap_<STATE> columns —
    # confirm against the CSV header.
    .melt(
        id_vars=[
            "date", "wm_yr_wk", "weekday", "wday", "month", "year", "d",
            "event_name_1", "event_type_1", "event_name_2", "event_type_2",
        ],
        variable_name="state_id",
        value_name="snap",
    )
    .with_columns(
        # Parse the ISO-formatted date strings into a Date column.
        pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"),
        # Keep only the last two characters of the former column name so the
        # value can be joined against the "state_id" column of the sales table.
        pl.col("state_id").str.slice(-2),
    )
)

In [11]:
# Sell prices as a LazyFrame. scan_csv replaces read_csv(...).lazy() so the
# file is read lazily at collect() time instead of being loaded up front.
prices = pl.scan_csv("../data/sell_prices.csv")

In [15]:
# Build the full table: attach the calendar row for each (day, state) and then
# the price for each (store, item, week), and materialise the result.
df = (
    sales
    .join(calendar, on=["d", "state_id"], how="inner")
    # NOTE: this is an inner join, so rows without a matching entry in the
    #       prices table are dropped. Per the original note, this automatically
    #       filters out the initial days on which an item had no listing yet.
    .join(prices, on=["store_id", "item_id", "wm_yr_wk"], how="inner")
    .collect()
)

In [16]:
# Preview the first rows of the joined table.
df.head()

id,item_id,dept_id,cat_id,store_id,state_id,d,sales,days_since_start,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap,sell_price
str,str,str,str,str,str,str,i64,i64,date,i64,str,i64,i64,i64,str,str,str,str,i64,f64
"""HOBBIES_1_008_…","""HOBBIES_1_008""","""HOBBIES_1""","""HOBBIES""","""CA_1""","""CA""","""d_1""",12,1,2011-01-29,11101,"""Saturday""",1,1,2011,,,,,0,0.46
"""HOBBIES_1_009_…","""HOBBIES_1_009""","""HOBBIES_1""","""HOBBIES""","""CA_1""","""CA""","""d_1""",2,1,2011-01-29,11101,"""Saturday""",1,1,2011,,,,,0,1.56
"""HOBBIES_1_010_…","""HOBBIES_1_010""","""HOBBIES_1""","""HOBBIES""","""CA_1""","""CA""","""d_1""",0,1,2011-01-29,11101,"""Saturday""",1,1,2011,,,,,0,3.17
"""HOBBIES_1_012_…","""HOBBIES_1_012""","""HOBBIES_1""","""HOBBIES""","""CA_1""","""CA""","""d_1""",0,1,2011-01-29,11101,"""Saturday""",1,1,2011,,,,,0,5.98
"""HOBBIES_1_015_…","""HOBBIES_1_015""","""HOBBIES_1""","""HOBBIES""","""CA_1""","""CA""","""d_1""",4,1,2011-01-29,11101,"""Saturday""",1,1,2011,,,,,0,0.7


In [17]:
# Persist the complete joined table (all columns) as Parquet.
df.write_parquet("../data/full_table.parquet")

In [18]:
# Persist a narrower copy of the joined table as Parquet. Compared with the
# full table it omits "d", "days_since_start", "wm_yr_wk", "weekday", "wday",
# "month" and "year".
(
    df
    .select([
        # series identifiers
        "id",
        "item_id",
        "dept_id",
        "cat_id",
        "store_id",
        "state_id",
        # target and date
        "sales",
        "date",
        # event columns
        "event_name_1",
        "event_type_1",
        "event_name_2",
        "event_type_2",
        # remaining features
        "snap",
        "sell_price",
    ])
    .write_parquet("../data/reduced_table.parquet")
)