In [1]:
#!pip install -q lightgbm pandas numpy

In [2]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor

# 1. Importando os arquivos

In [3]:
df27 = pd.read_parquet('part27.snappy.parquet')
df51 = pd.read_parquet('part51.snappy.parquet')
df71 = pd.read_parquet('part71.snappy.parquet')

In [4]:
df27.info()
df51.info()
df71.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14419 entries, 0 to 14418
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pdv            14419 non-null  object
 1   premise        14419 non-null  object
 2   categoria_pdv  14419 non-null  object
 3   zipcode        14419 non-null  int32 
dtypes: int32(1), object(3)
memory usage: 394.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6560698 entries, 0 to 6560697
Data columns (total 11 columns):
 #   Column               Dtype  
---  ------               -----  
 0   internal_store_id    object 
 1   internal_product_id  object 
 2   distributor_id       object 
 3   transaction_date     object 
 4   reference_date       object 
 5   quantity             float64
 6   gross_value          float64
 7   net_value            float64
 8   gross_profit         float64
 9   discount             float64
 10  taxes                float64
dtypes: float64(6), object(

In [5]:
# Keep df51 as the "fact table" with its native keys
tx = df51.rename(columns={"transaction_date":"dt"}).copy()
# enforce types
tx["dt"] = pd.to_datetime(tx["dt"], errors="coerce")
tx = tx.dropna(subset=["dt"])
tx["quantity"] = pd.to_numeric(tx["quantity"], errors="coerce").fillna(0.0).clip(lower=0)

In [6]:
# Rename dimensions to match df51 keys and LEFT join (do NOT inner-join)
stores = df27.rename(columns={"pdv":"internal_store_id"}).copy()
prods  = df71.rename(columns={"produto":"internal_product_id"}).copy()

# 2. Fazendo o join entre as tabelas

In [7]:
tx = tx.merge(stores, on="internal_store_id", how="left")
tx = tx.merge(prods,  on="internal_product_id", how="left")

In [8]:
tx.head()

Unnamed: 0,internal_store_id,internal_product_id,distributor_id,dt,reference_date,quantity,gross_value,net_value,gross_profit,discount,...,premise,categoria_pdv,zipcode,categoria,descricao,tipos,label,subcategoria,marca,fabricante
0,7384367747233276219,328903483604537190,9,2022-07-13,2022-07-01,1.0,38.125,37.890625,10.042625,3.95,...,Off Premise,Package/Liquor,80905.0,Package,BUD LIGHT CHELADA FUEGO 15/25 CN,Package,Core,Specialty,Bud Light Chelada Fuego,AB Anheuser Busch Inc
1,3536908514005606262,5418855670645487653,5,2022-03-21,2022-03-01,6.0,107.25,106.440002,24.732002,17.1,...,Off Premise,Package/Liquor,80239.0,Package,MICHELOB ULTRA 18/12 CN,Package,Core,Lager,Michelob Ultra,AB Anheuser Busch Inc
2,3138231730993449825,1087005562675741887,6,2022-09-06,2022-09-01,3.0,56.625,56.220001,14.124002,5.25,...,Off Premise,Package/Liquor,80634.0,Package,BUD LIGHT LIME 18/12 CN,Package,Core,Lager,Bud Light Lime,AB Anheuser Busch Inc
3,3681167389484217654,1401422983880045188,5,2022-09-11,2022-09-01,129.0,1037.160023,1037.160023,156.348026,479.880006,...,Off Premise,Package/Liquor,80226.0,Distilled Spirits,99 BUTTERSCOTCH 12/10/50ML 99PF,Allocated Spirits,,Liqueurs & Cordials,99 Butterscotch,Sazerac Spirits
4,7762413312337359369,6614994347738381720,4,2022-02-18,2022-02-01,1.0,26.23,23.950241,6.550241,0.0,...,Off Premise,Convenience,30096.0,Package,NB VOODOO RANGER IMPERIAL IPA 15/19.2 CN,Package,Core,IPA,New Belgium Voodoo Ranger Imperial IPA,NB New Belgium


In [9]:
tx_2022 = tx[(tx["dt"] >= "2022-01-01") & (tx["dt"] <= "2022-12-31")].copy()
tx_2022["week_end"] = tx_2022["dt"].dt.to_period("W-SAT").dt.end_time

In [10]:
# unit price / margin (basic)
eps = 1e-9
tx_2022["unit_price_gross"] = tx_2022["gross_value"] / (tx_2022["quantity"] + eps)
tx_2022["unit_price_net"]   = tx_2022["net_value"]   / (tx_2022["quantity"] + eps)
tx_2022["unit_margin"]      = tx_2022["gross_profit"]/ (tx_2022["quantity"] + eps)

In [11]:
tx_2022.head()

Unnamed: 0,internal_store_id,internal_product_id,distributor_id,dt,reference_date,quantity,gross_value,net_value,gross_profit,discount,...,descricao,tipos,label,subcategoria,marca,fabricante,week_end,unit_price_gross,unit_price_net,unit_margin
0,7384367747233276219,328903483604537190,9,2022-07-13,2022-07-01,1.0,38.125,37.890625,10.042625,3.95,...,BUD LIGHT CHELADA FUEGO 15/25 CN,Package,Core,Specialty,Bud Light Chelada Fuego,AB Anheuser Busch Inc,2022-07-16 23:59:59.999999999,38.125,37.890625,10.042625
1,3536908514005606262,5418855670645487653,5,2022-03-21,2022-03-01,6.0,107.25,106.440002,24.732002,17.1,...,MICHELOB ULTRA 18/12 CN,Package,Core,Lager,Michelob Ultra,AB Anheuser Busch Inc,2022-03-26 23:59:59.999999999,17.875,17.74,4.122
2,3138231730993449825,1087005562675741887,6,2022-09-06,2022-09-01,3.0,56.625,56.220001,14.124002,5.25,...,BUD LIGHT LIME 18/12 CN,Package,Core,Lager,Bud Light Lime,AB Anheuser Busch Inc,2022-09-10 23:59:59.999999999,18.875,18.74,4.708
3,3681167389484217654,1401422983880045188,5,2022-09-11,2022-09-01,129.0,1037.160023,1037.160023,156.348026,479.880006,...,99 BUTTERSCOTCH 12/10/50ML 99PF,Allocated Spirits,,Liqueurs & Cordials,99 Butterscotch,Sazerac Spirits,2022-09-17 23:59:59.999999999,8.04,8.04,1.212
4,7762413312337359369,6614994347738381720,4,2022-02-18,2022-02-01,1.0,26.23,23.950241,6.550241,0.0,...,NB VOODOO RANGER IMPERIAL IPA 15/19.2 CN,Package,Core,IPA,New Belgium Voodoo Ranger Imperial IPA,NB New Belgium,2022-02-19 23:59:59.999999999,26.23,23.950241,6.550241


In [12]:
# aggregate ONLY observed weeks
weekly = (
    tx_2022.groupby(["internal_store_id","internal_product_id","week_end"], as_index=False)
           .agg(
               quantidade=("quantity","sum"),
               price_gross=("unit_price_gross","mean"),
               price_net=("unit_price_net","mean"),
               margin=("unit_margin","mean"),
               disc=("discount","mean"),
               taxes=("taxes","mean"),
               premise=("premise","first"),
               categoria_pdv=("categoria_pdv","first"),
               zipcode=("zipcode","first"),
               categoria=("categoria","first"),
               tipos=("tipos","first"),
               label=("label","first"),
               subcategoria=("subcategoria","first"),
               marca=("marca","first"),
               fabricante=("fabricante","first"),
           )
           .sort_values(["internal_store_id","internal_product_id","week_end"])
           .reset_index(drop=True)
)

In [13]:
weekly = weekly.rename(columns={
    "internal_store_id":"pdv",
    "internal_product_id":"produto"
})

In [14]:
weekly.head(20)

Unnamed: 0,pdv,produto,week_end,quantidade,price_gross,price_net,margin,disc,taxes,premise,categoria_pdv,zipcode,categoria,tipos,label,subcategoria,marca,fabricante
0,1000237487041964405,1837429607327399565,2022-02-12 23:59:59.999999999,1.0,35.200001,32.920242,9.420242,0.0,2.279758,On Premise,Winery,30175.0,Package,Package,Core,IPA,Fire Maker Perfect Match IPA,Fire Maker Brewing Co
1,1000237487041964405,1837429607327399565,2022-02-19 23:59:59.999999999,2.0,35.200001,32.920242,9.420242,0.0,4.559516,On Premise,Winery,30175.0,Package,Package,Core,IPA,Fire Maker Perfect Match IPA,Fire Maker Brewing Co
2,1000237487041964405,1837429607327399565,2022-05-28 23:59:59.999999999,1.0,38.400001,36.120243,10.700243,0.0,2.279758,On Premise,Winery,30175.0,Package,Package,Core,IPA,Fire Maker Perfect Match IPA,Fire Maker Brewing Co
3,1000237487041964405,1837429607327399565,2022-06-25 23:59:59.999999999,2.0,38.400002,36.120243,10.700243,0.0,4.559516,On Premise,Winery,30175.0,Package,Package,Core,IPA,Fire Maker Perfect Match IPA,Fire Maker Brewing Co
4,1000237487041964405,1837429607327399565,2022-07-16 23:59:59.999999999,2.0,38.400002,36.120243,10.700243,0.0,4.559516,On Premise,Winery,30175.0,Package,Package,Core,IPA,Fire Maker Perfect Match IPA,Fire Maker Brewing Co
5,1000237487041964405,1837429607327399565,2022-08-27 23:59:59.999999999,2.0,38.400002,36.120243,10.700243,0.0,4.559516,On Premise,Winery,30175.0,Package,Package,Core,IPA,Fire Maker Perfect Match IPA,Fire Maker Brewing Co
6,1000237487041964405,1837429607327399565,2022-10-01 23:59:59.999999999,1.0,38.400001,36.120243,10.700243,0.0,2.279758,On Premise,Winery,30175.0,Package,Package,Core,IPA,Fire Maker Perfect Match IPA,Fire Maker Brewing Co
7,1000237487041964405,1837429607327399565,2022-11-19 23:59:59.999999999,2.0,38.400002,36.120243,10.700243,0.0,4.559516,On Premise,Winery,30175.0,Package,Package,Core,IPA,Fire Maker Perfect Match IPA,Fire Maker Brewing Co
8,1000237487041964405,4038588102284338370,2022-04-30 23:59:59.999999999,1.0,38.400001,36.120243,10.870243,0.0,2.279758,On Premise,Winery,30175.0,Package,Package,Core,Ale,Jekyll Cooter Brown,Jekyll Brewing
9,1000237487041964405,5429216175252037173,2022-02-12 23:59:59.999999999,2.0,38.400002,36.120243,10.870243,0.0,4.559516,On Premise,Winery,30175.0,Package,Package,In&Out,IPA,Jekyll Southern Juice IPA,Jekyll Brewing


In [15]:
for c in ["price_gross","price_net","margin","disc","taxes","zipcode"]:
    if c in weekly.columns:
        weekly[c] = pd.to_numeric(weekly[c], errors="coerce").fillna(0)

feature engineering é ESSENCIAL para a boa performance do modelo

In [16]:
def add_feats(df):
    df = df.sort_values(["pdv","produto","week_end"]).copy()
    g = df.groupby(["pdv","produto"])
    df["lag1"] = g["quantidade"].shift(1)
    df["lag2"] = g["quantidade"].shift(2)
    df["lag3"] = g["quantidade"].shift(3)
    df["lag4"] = g["quantidade"].shift(4)
    df["rmean4"]  = g["quantidade"].rolling(4, min_periods=1).mean().reset_index(level=[0,1], drop=True).shift(1)
    df["rmean12"] = g["quantidade"].rolling(12, min_periods=1).mean().reset_index(level=[0,1], drop=True).shift(1)
    # price/margin lags
    for c in ["price_gross","price_net","margin","disc","taxes"]:
        if c in df.columns:
            df[f"{c}_lag1"] = g[c].shift(1)
    # context fallbacks if shit happens
    df["store_rmean4"] = df.groupby("pdv")["quantidade"].rolling(4, min_periods=1).mean().reset_index(level=0, drop=True).shift(1)
    df["prod_rmean4"]  = df.groupby("produto")["quantidade"].rolling(4, min_periods=1).mean().reset_index(level=0, drop=True).shift(1)
    return df

weekly = add_feats(weekly)

# target = next week's qty
weekly["target_next"] = weekly.groupby(["pdv","produto"])["quantidade"].shift(-1)

train_df = weekly.dropna(subset=["lag1","target_next"]).copy()

target para next week, sutil porém muito importante

In [17]:
cat_cols = ["pdv","produto","premise","categoria_pdv","categoria","tipos","label","subcategoria","marca","fabricante"]
for c in cat_cols:
    if c in train_df.columns:
        train_df[c] = train_df[c].astype("category")

# encode time as numeric (no holidays)
train_df["week_ord"] = pd.to_datetime(train_df["week_end"]).view("int64") // 10**9

  train_df["week_ord"] = pd.to_datetime(train_df["week_end"]).view("int64") // 10**9


In [18]:
feature_cols = [
    "pdv","produto","week_ord",
    "lag1","lag2","lag3","lag4","rmean4","rmean12",
    "price_gross","price_net","margin","disc","taxes",
    "price_gross_lag1","price_net_lag1","margin_lag1","disc_lag1","taxes_lag1",
    "store_rmean4","prod_rmean4",
    "premise","categoria_pdv","categoria","tipos","label","subcategoria","marca","fabricante",
    "zipcode",
]
X_train = train_df[feature_cols].copy()
y_train = train_df["target_next"].astype(float).values

# downcast to save RAM
for c in X_train.select_dtypes(include=["float64"]).columns:
    X_train[c] = pd.to_numeric(X_train[c], downcast="float")
for c in X_train.select_dtypes(include=["int64"]).columns:
    X_train[c] = pd.to_numeric(X_train[c], downcast="integer")

In [19]:
# quick checks
print("Rows in weekly:", len(weekly))
print("Rows in train_df:", len(train_df))
print("Sum(y_train):", y_train.sum(), " Var:", np.var(y_train))

Rows in weekly: 6252324
Rows in train_df: 4578688
Sum(y_train): 18237305.43721736  Var: 79.19768630494345


# 3. Treinando o Modelo

In [21]:
train_df.head(20)

Unnamed: 0,pdv,produto,week_end,quantidade,price_gross,price_net,margin,disc,taxes,premise,...,rmean12,price_gross_lag1,price_net_lag1,margin_lag1,disc_lag1,taxes_lag1,store_rmean4,prod_rmean4,target_next,week_ord
1,1000237487041964405,1837429607327399565,2022-02-19 23:59:59.999999999,2.0,35.200001,32.920242,9.420242,0.0,4.559516,On Premise,...,1.0,35.200001,32.920242,9.420242,0.0,2.279758,1.0,1.0,1.0,1645315199
2,1000237487041964405,1837429607327399565,2022-05-28 23:59:59.999999999,1.0,38.400001,36.120243,10.700243,0.0,2.279758,On Premise,...,1.5,35.200001,32.920242,9.420242,0.0,4.559516,1.5,1.5,2.0,1653782399
3,1000237487041964405,1837429607327399565,2022-06-25 23:59:59.999999999,2.0,38.400002,36.120243,10.700243,0.0,4.559516,On Premise,...,1.333333,38.400001,36.120243,10.700243,0.0,2.279758,1.333333,1.333333,2.0,1656201599
4,1000237487041964405,1837429607327399565,2022-07-16 23:59:59.999999999,2.0,38.400002,36.120243,10.700243,0.0,4.559516,On Premise,...,1.5,38.400002,36.120243,10.700243,0.0,4.559516,1.5,1.5,2.0,1658015999
5,1000237487041964405,1837429607327399565,2022-08-27 23:59:59.999999999,2.0,38.400002,36.120243,10.700243,0.0,4.559516,On Premise,...,1.6,38.400002,36.120243,10.700243,0.0,4.559516,1.75,1.75,1.0,1661644799
6,1000237487041964405,1837429607327399565,2022-10-01 23:59:59.999999999,1.0,38.400001,36.120243,10.700243,0.0,2.279758,On Premise,...,1.666667,38.400002,36.120243,10.700243,0.0,4.559516,1.75,1.75,2.0,1664668799
10,1000237487041964405,5429216175252037173,2022-02-19 23:59:59.999999999,2.0,38.400002,36.120243,10.870243,0.0,4.559516,On Premise,...,2.0,38.400002,36.120243,10.870243,0.0,4.559516,1.5,2.0,2.0,1645315199
11,1000237487041964405,5429216175252037173,2022-03-26 23:59:59.999999999,2.0,38.400002,36.120243,10.870243,0.0,4.559516,On Premise,...,2.0,38.400002,36.120243,10.870243,0.0,4.559516,1.75,2.0,1.0,1648339199
12,1000237487041964405,5429216175252037173,2022-05-28 23:59:59.999999999,1.0,38.400001,36.120243,10.870243,0.0,2.279758,On Premise,...,2.0,38.400002,36.120243,10.870243,0.0,4.559516,1.75,2.0,2.0,1653782399
13,1000237487041964405,5429216175252037173,2022-06-25 23:59:59.999999999,2.0,38.400002,36.120243,10.870243,0.0,4.559516,On Premise,...,1.75,38.400001,36.120243,10.870243,0.0,2.279758,1.75,1.75,1.0,1656201599


In [23]:
lgbm = LGBMRegressor(
    objective="poisson" if y_train.sum() > 0 else "regression",
    metric="rmse",
    force_col_wise=True,
    n_estimators=2500,
    learning_rate=0.05,
    num_leaves=255,
    max_depth=-1,
    max_bin=255,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    min_child_weight=1e-3,
    feature_pre_filter=False,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
)

cat_used = [c for c in cat_cols if c in X_train.columns]

In [24]:
lgbm.fit(X_train, y_train, categorical_feature=cat_used)

[LightGBM] [Info] Total Bins 14618
[LightGBM] [Info] Number of data points in the train set: 4578688, number of used features: 30
[LightGBM] [Info] Start training from score 1.382057


In [25]:
print("Top features:")
print(pd.Series(lgbm.feature_importances_, index=X_train.columns).sort_values(ascending=False).head(20))

Top features:
pdv             213661
produto          88733
week_ord         28114
lag2             24538
taxes            21782
lag3             21519
disc             20679
lag4             20445
lag1             19791
marca            19301
prod_rmean4      16694
rmean4           16509
rmean12          16052
store_rmean4     15163
disc_lag1        13788
taxes_lag1       12290
zipcode          11883
margin            8906
margin_lag1       8555
price_gross       7046
dtype: int32


## 4. Salvando o Modelo e Resultados

In [26]:
import joblib

filename = 'lgbm_model.pkl'
joblib.dump(lgbm, filename)

['lgbm_model.pkl']

In [27]:
jan_we = pd.to_datetime(["2023-01-07","2023-01-14","2023-01-21","2023-01-28","2023-02-04"])
history = weekly.copy()

def next_inputs(hist, week_end):
    # recompute lags from current history (observed + prior preds)
    hist = hist.sort_values(["pdv","produto","week_end"]).copy()
    hist = add_feats(hist)
    last = (hist.sort_values("week_end").groupby(["pdv","produto"], as_index=False).tail(1)).copy()
    # set future week
    last["week_end"] = pd.to_datetime(week_end)
    last["week_ord"] = last["week_end"].view("int64") // 10**9
    # fill numeric NaNs
    for c in ["lag1","lag2","lag3","lag4","rmean4","rmean12",
              "price_gross","price_net","margin","disc","taxes",
              "price_gross_lag1","price_net_lag1","margin_lag1","disc_lag1","taxes_lag1",
              "store_rmean4","prod_rmean4","zipcode","week_ord"]:
        if c in last.columns:
            last[c] = pd.to_numeric(last[c], errors="coerce").fillna(0)
    # categoricals
    for c in cat_cols:
        if c in last.columns:
            last[c] = last[c].astype("category")
    return last

fcsts = []
for we in jan_we:
    base = next_inputs(history, we)
    X_next = base[[c for c in feature_cols if c in base.columns]].copy()
    yhat = lgbm.predict(X_next)
    yhat = np.maximum(yhat, 0.0)
    add = base[["pdv","produto","week_end"]].copy()
    add["quantidade"] = yhat
    # carry attributes forward (no holiday nonsense needed)
    for c in ["price_gross","price_net","margin","disc","taxes",
              "premise","categoria_pdv","zipcode","categoria","tipos","label","subcategoria","marca","fabricante"]:
        if c in base.columns:
            add[c] = base[c].values
    history = pd.concat([history, add], ignore_index=True)
    fcsts.append(add)

jan_fcst = pd.concat(fcsts, ignore_index=True)

  last["week_ord"] = last["week_end"].view("int64") // 10**9
  last["week_ord"] = last["week_end"].view("int64") // 10**9
  last["week_ord"] = last["week_end"].view("int64") // 10**9
  last["week_ord"] = last["week_end"].view("int64") // 10**9
  last["week_ord"] = last["week_end"].view("int64") // 10**9


In [28]:
week_map = {
    pd.Timestamp("2023-01-07"): 1,
    pd.Timestamp("2023-01-14"): 2,
    pd.Timestamp("2023-01-21"): 3,
    pd.Timestamp("2023-01-28"): 4,
    pd.Timestamp("2023-02-04"): 5,
}
jan_fcst["semana"] = jan_fcst["week_end"].map(week_map)

submission = (
    jan_fcst.assign(quantidade=lambda d: np.rint(d["quantidade"]).astype(int))
            .rename(columns={"pdv":"pdv", "produto":"produto"})
            [["semana","pdv","produto","quantidade"]]
            .sort_values(["semana","pdv","produto"])
            .reset_index(drop=True)
)

# keep only pairs seen in 2022
seen_pairs = set(map(tuple, weekly[["pdv","produto"]].drop_duplicates().values))
submission = submission[submission[["pdv","produto"]].apply(tuple, axis=1).isin(seen_pairs)]

submission.to_parquet("submission.parquet", index=False)
submission.to_csv("submission.csv", sep=";", index=False, encoding="utf-8")
print("Saved:", len(submission), "rows")


Saved: 5221550 rows


## 5. Filtrando Resultado Final por Prioridade

In [29]:
MAX_ROWS = 1_500_000
WEEKS = 5
MAX_PAIRS = MAX_ROWS // WEEKS

In [30]:
last_week = pd.to_datetime(weekly["week_end"].max())
recent_cut = last_week - pd.Timedelta(weeks=12)

weekly_recent = weekly[weekly["week_end"] >= recent_cut]

pair_total = (
    weekly.groupby(["pdv","produto"], as_index=False)["quantidade"]
          .sum().rename(columns={"quantidade":"total_2022"})
)
pair_recent = (
    weekly_recent.groupby(["pdv","produto"], as_index=False)["quantidade"]
                 .sum().rename(columns={"quantidade":"recent_12w"})
)
pair_last_sale = (
    weekly.groupby(["pdv","produto"], as_index=False)["week_end"]
          .max().rename(columns={"week_end":"last_sale"})
)

pair_stats = (
    pair_total.merge(pair_recent, on=["pdv","produto"], how="left")
              .merge(pair_last_sale, on=["pdv","produto"], how="left")
              .fillna({"recent_12w":0})
)

In [31]:
# prioritize recent_12w, then total_2022, then recency of last sale
pair_stats = pair_stats.sort_values(
    ["recent_12w", "total_2022", "last_sale"],
    ascending=[False, False, False]
).reset_index(drop=True)

# and pick top pairs so that total rows <= 1.5M (for whatever reason)
n_pairs = min(len(pair_stats), MAX_PAIRS)
top_pairs = pair_stats.iloc[:n_pairs, :][["pdv","produto"]]
top_set = set(map(tuple, top_pairs.values))

In [32]:
submission_capped = (
    submission[submission[["pdv","produto"]].apply(tuple, axis=1).isin(top_set)]
      .copy()
)

In [33]:
submission_capped = submission_capped.drop_duplicates(["semana","pdv","produto"], keep="last")

In [34]:
for col in ["semana","pdv","produto","quantidade"]:
    submission_capped[col] = pd.to_numeric(submission_capped[col], errors="coerce")

# If any NA popped due to bad parsing, fill with -1 or raise:
if submission_capped[["semana","pdv","produto","quantidade"]].isnull().any().any():
    bad = submission_capped[submission_capped[["semana","pdv","produto","quantidade"]].isnull().any(axis=1)]
    print("WARNING: Some IDs/quantities could not be parsed to int. Showing first few bad rows:")
    print(bad.head())

In [35]:
submission_capped = submission_capped.astype({"semana":"int64","pdv":"int64","produto":"int64","quantidade":"int64"})

In [36]:
rows_per_pair = submission_capped.groupby(["pdv","produto"]).size().value_counts()
print("Rows per (pdv, produto):")

Rows per (pdv, produto):


In [37]:
rows_per_pair.head()

Unnamed: 0,count
5,300000


In [38]:
submission_capped.to_parquet("submission_capped.parquet", index=False)
submission_capped.to_csv("submission_capped.csv", sep=";", index=False, encoding="utf-8")

In [40]:
submission_capped.describe()

Unnamed: 0,semana,pdv,produto,quantidade
count,1500000.0,1500000.0,1500000.0,1500000.0
mean,3.0,4.664732e+18,4.366918e+18,7.538432
std,1.414214,2.681782e+18,2.718441e+18,17.80778
min,1.0,1833564000000000.0,7798075000000000.0,1.0
25%,2.0,2.357589e+18,1.835194e+18,2.0
50%,3.0,4.689504e+18,4.101834e+18,4.0
75%,4.0,7.062864e+18,6.760021e+18,9.0
max,5.0,9.22299e+18,9.221123e+18,6971.0
