In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import lightgbm as lgb
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Read the sales and online CSVs; the first column of each file becomes the index.
sales, online = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/sales.csv', '../data/online.csv')
)
# Stack both frames row-wise and renumber the rows from 0.
all_sales = pd.concat([sales, online], ignore_index=True)

In [4]:
# Collapse to one row per (date, store_id, item_id):
# quantities are summed and base prices are averaged within each group.
all_sales = (
    all_sales
    .groupby(['date', 'store_id', 'item_id'])
    .agg(
        quantity=('quantity', 'sum'),
        price_base=('price_base', 'mean'),
    )
    .reset_index()
)
all_sales.head()

Unnamed: 0,date,store_id,item_id,quantity,price_base
0,2022-08-28,1,001829cb707d,7.0,134.76
1,2022-08-28,1,0022b986c8f0,2.0,59.9
2,2022-08-28,1,002f51c34a7a,4.0,83.11
3,2022-08-28,1,0052403cd09c,1.0,45.7
4,2022-08-28,1,005addd8096b,2.0,77.25


In [5]:
# Parse the date column into real datetimes first ...
all_sales = all_sales.assign(date=pd.to_datetime(all_sales['date']))
# ... then derive the calendar features from the parsed dates.
all_sales = all_sales.assign(
    year=all_sales['date'].dt.year,
    month=all_sales['date'].dt.month,
    # Day-of-month feature is currently disabled:
    # day=all_sales['date'].dt.day,
    day_of_week=all_sales['date'].dt.dayofweek,  # pandas convention: Monday == 0
)

In [6]:
# Cast the date, store and calendar columns to strings in one pass.
# Presumably this is so the DictVectorizer used for training treats them as
# categorical values rather than numbers — TODO confirm.
all_sales = all_sales.astype({
    'date': str,
    'store_id': str,
    'year': str,
    'month': str,
    # 'day': str,  (day-of-month feature is currently disabled)
    'day_of_week': str,
})

In [7]:
# Keep only rows with a strictly positive quantity, then replace every remaining
# missing value with the placeholder string 'unknown'.
# NOTE(review): fillna('unknown') is applied to all columns, so a missing
# numeric price_base would become a string as well — confirm this is intended.
all_sales = all_sales.loc[all_sales['quantity'] > 0].fillna('unknown')
# Per-column count of missing values left after the fill.
all_sales.isna().sum()

date           0
store_id       0
item_id        0
quantity       0
price_base     0
year           0
month          0
day_of_week    0
dtype: int64

In [8]:
def train_val_test_split(df, target, train_size, val_size, test_size, random_state):
    """Split ``df`` into train / validation / test parts and separate the target.

    The test part is split off first using ``test_size``; the remainder (the
    "full train" frame) is then split again so that the validation part takes
    ``val_size / (train_size + val_size)`` of it. Both splits use
    ``train_test_split`` with the same ``random_state``.

    Parameters
    ----------
    df : DataFrame to split.
    target : name of the target column.
    train_size, val_size, test_size : fractions of ``df`` for each part.
        ``train_size`` is only used to compute the validation share.
    random_state : seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(df_full_train, df_train, df_test, df_val, y_train, y_val, y_test)``.
        Note the order: ``df_test`` comes *before* ``df_val``.
        ``df_full_train`` is returned as produced by the first split (it still
        contains the target column and its original index). The other three
        frames have a fresh 0..n-1 index and no target column; the ``y_*``
        values are the corresponding target arrays.

    NOTE(review): rows are assigned to the parts by ``train_test_split`` with
    its default settings, not by date — confirm that is appropriate for this
    dated sales data.
    """
    # First cut: hold out the test rows; everything else is "full train".
    df_full_train, test_part = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # Second cut: express the validation share relative to the remainder.
    val_share = val_size / (train_size + val_size)
    train_part, val_part = train_test_split(
        df_full_train, test_size=val_share, random_state=random_state
    )

    feature_frames = []
    target_arrays = []
    for part in (train_part, val_part, test_part):
        part = part.reset_index(drop=True)
        # pop() removes the target column from the frame and hands it back.
        target_arrays.append(part.pop(target).values)
        feature_frames.append(part)

    df_train, df_val, df_test = feature_frames
    y_train, y_val, y_test = target_arrays

    return df_full_train, df_train, df_test, df_val, y_train, y_val, y_test

In [9]:
# 60 / 20 / 20 split with 'quantity' as the target.
# The helper returns df_test before df_val, so the unpacking order matters.
(
    df_full_train,
    df_train,
    df_test,
    df_val,
    y_train,
    y_val,
    y_test,
) = train_val_test_split(
    df=all_sales,
    target='quantity',
    train_size=0.6,
    val_size=0.2,
    test_size=0.2,
    random_state=1,
)

In [10]:
def train_lgbm(df_train, y_train, params=None, num_boost_round=50):
    """Fit a DictVectorizer and a LightGBM booster on the training data.

    Parameters
    ----------
    df_train : DataFrame of features; each row is turned into a dict record.
    y_train : target values passed to LightGBM as the dataset label.
    params : optional dict of LightGBM parameters. When ``None`` a small
        regression/RMSE configuration (defined below) is used.
    num_boost_round : number of boosting rounds passed to ``lgb.train``.
        Defaults to 50, the value that was previously hard-coded.

    Returns
    -------
    (dv, model) : the fitted ``DictVectorizer`` and the trained booster.
    """
    if params is None:
        # Simplified parameters for faster training
        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'num_leaves': 20,  # reduced from 31
            'learning_rate': 0.1,  # increased from 0.05
            'feature_fraction': 0.8,
            'num_threads': -1,  # Use all CPU cores
            'verbosity': -1  # Reduce verbosity
        }

    # One dict per row; the vectorizer is fitted here and must be reused
    # unchanged when transforming validation / test rows.
    records = df_train.to_dict(orient='records')
    dv = DictVectorizer(sparse=True)
    X_train = dv.fit_transform(records)

    train_data = lgb.Dataset(X_train, label=y_train)

    # Only a start banner is printed. No validation set is passed to
    # lgb.train, so the log_evaluation callback has nothing to report
    # (the recorded run of this notebook shows only the banner).
    print("Training LightGBM model...")
    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        callbacks=[lgb.log_evaluation(period=10)]
    )

    return dv, model

In [11]:
def predict(df, dv, model):
    """Return ``model``'s predictions for the rows of ``df``.

    Each row of ``df`` is converted to a dict record, the records are passed
    through ``dv.transform`` and the result is handed to ``model.predict``.

    Parameters
    ----------
    df : DataFrame whose rows are to be scored.
    dv : already-fitted vectorizer exposing ``transform``.
    model : trained model exposing ``predict``.
    """
    records = df.to_dict(orient='records')
    return model.predict(dv.transform(records))

def rmse(y, y_pred):
    """Root mean squared error between ``y`` and ``y_pred``.

    Both inputs are converted to NumPy arrays first, so plain lists are
    accepted and pandas Series are compared position by position instead of
    being aligned on their index labels.

    Parameters
    ----------
    y : array-like of true values.
    y_pred : array-like of predicted values, same length as ``y``.

    Returns
    -------
    The square root of the mean squared difference.
    """
    error = np.asarray(y_pred) - np.asarray(y)
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [12]:
# Fit the vectorizer and the LightGBM model on the training split
# (default parameters from train_lgbm).
dv, lgb_model = train_lgbm(df_train=df_train, y_train=y_train)

Training LightGBM model...


In [13]:
# Score the validation split and report its RMSE rounded to 3 decimals.
y_pred = predict(df=df_val, dv=dv, model=lgb_model)
print('LightGBM RMSE:', round(rmse(y=y_val, y_pred=y_pred), 3))

LightGBM RMSE: 13.717


In [14]:
# Score the held-out test split and report its RMSE rounded to 3 decimals.
# Note: y_pred is rebound here and no longer holds the validation predictions.
y_pred = predict(df=df_test, dv=dv, model=lgb_model)
print('LightGBM RMSE:', round(rmse(y=y_test, y_pred=y_pred), 3))

LightGBM RMSE: 12.487


In [15]:
# The submission template is semicolon-separated, unlike the training CSVs.
test = pd.read_csv('../data/test.csv', delimiter=';')
test.head(n=5)

Unnamed: 0,row_id,item_id,store_id,date
0,0,c578da8e8841,1,27.09.2024
1,1,c578da8e8841,1,28.09.2024
2,2,c578da8e8841,1,29.09.2024
3,3,c578da8e8841,1,30.09.2024
4,4,c578da8e8841,1,01.10.2024


In [16]:
def prepare_test_data(test_df):
    """Return a copy of ``test_df`` with the date-derived string features added.

    The input frame is left untouched, so the function (and the notebook cell
    calling it) can be run repeatedly. Previously the caller's frame was
    modified in place and a second call failed while re-parsing the already
    reformatted ``date`` column.

    Parameters
    ----------
    test_df : DataFrame with a ``date`` column formatted as ``DD.MM.YYYY``
        and a ``store_id`` column.

    Returns
    -------
    A new DataFrame in which ``date`` and ``store_id`` are strings and the
    string columns ``year``, ``month`` and ``day_of_week`` have been added.

    NOTE(review): no ``price_base`` column is added here, while the training
    rows do carry one — confirm how the vectorizer/model should treat that.
    """
    prepared = test_df.copy()

    # Parse the day-first dates once and derive every feature from the result.
    parsed_dates = pd.to_datetime(prepared['date'], format='%d.%m.%Y')

    # Calendar features, stored as strings like their training counterparts.
    prepared['year'] = parsed_dates.dt.year.astype(str)
    prepared['month'] = parsed_dates.dt.month.astype(str)
    prepared['day_of_week'] = parsed_dates.dt.dayofweek.astype(str)

    # Existing columns are converted to strings in place (column order kept).
    prepared['date'] = parsed_dates.astype(str)
    prepared['store_id'] = prepared['store_id'].astype(str)

    return prepared

In [17]:
# Add the same date-derived string features that the training frame received.
processed_test = prepare_test_data(test_df=test)

In [18]:
# Score the submission rows with the vectorizer and model fitted above.
test_predictions = predict(df=processed_test, dv=dv, model=lgb_model)

In [19]:
# Pair every row_id with its predicted quantity.
# NOTE(review): predictions are written as produced by the model (no rounding
# or clipping) — confirm that matches the expected submission format.
submission = pd.DataFrame(
    dict(row_id=test['row_id'], quantity=test_predictions)
)

In [20]:
# Write the submission without the DataFrame index column.
submission.to_csv(path_or_buf='submission_lgbm_20250109.csv', index=False)