In [42]:
import pandas as pd
import numpy as np
from math import pi
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error
import torch
import torch.nn as nn
from scipy import stats
import optuna
import xgboost as xgb
import catboost as cb
import lightgbm
from sklearn.model_selection import cross_val_score

In [27]:
# Use the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Read the training table and its calendar from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/train.csv')
train_calendar = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/train_calendar.csv')

# Read the test table and its calendar from the same directory.
test = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/test.csv')
test_calendar = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/test_calendar.csv')

In [4]:
def align_train_data(train_df, test_df, target_column=None):
    """
    Align columns of the training DataFrame to match the test DataFrame.

    Only columns that exist in both frames are kept, in the order in which
    they appear in the test DataFrame. When a target column is requested and
    present in the training data it is placed last, exactly once (even if the
    test DataFrame happens to contain a column with the same name).

    Parameters:
    - train_df: DataFrame containing training data.
    - test_df: DataFrame containing test data.
    - target_column: Optional; target column to keep in the training data
      even though it is not a test feature.

    Returns:
    - A new DataFrame (a copy, so later edits do not touch train_df) holding
      the shared columns in test order, followed by the target column if kept.
    """
    train_column_set = set(train_df.columns)

    # The target is only carried over when it was requested and really exists.
    keep_target = bool(target_column) and target_column in train_column_set

    # Walk the test columns so the result follows the test column order; the
    # target is skipped here so that it cannot end up in the result twice.
    selected_columns = [
        col for col in test_df.columns
        if col in train_column_set and not (keep_target and col == target_column)
    ]

    if keep_target:
        selected_columns.append(target_column)

    # Copy so callers can add or modify columns without pandas
    # chained-assignment warnings and without altering train_df.
    return train_df[selected_columns].copy()

def merge_with_calendar(calendar_df, data_df):
    """
    Join calendar columns onto the data rows.

    This is a right join on 'date' and 'warehouse': every row of data_df is
    kept, and rows without a matching calendar entry get missing values in
    the calendar columns.

    Parameters:
    - calendar_df: DataFrame with calendar information (left side of the join).
    - data_df: DataFrame whose rows are preserved (right side of the join).

    Returns:
    - The merged DataFrame.
    """
    join_keys = ['date', 'warehouse']
    return pd.merge(calendar_df, data_df, on=join_keys, how='right')

# Preprocess function
def preprocess(merged_df):
    """
    Fill missing values and derive calendar features from the 'date' column.

    The input frame is not modified; all work happens on a new frame, so the
    function can be re-run on the same input and always gives the same result.

    Parameters:
    - merged_df: DataFrame with a 'date' column that pd.to_datetime can parse.

    Returns:
    - A new DataFrame with missing values replaced by 0, 'date' converted to
      datetime, and these added columns: day_of_week, month, is_weekend,
      year, day_of_month, week_of_year, quarter, is_start_of_month,
      is_end_of_month, is_quarter_start, is_quarter_end, month_sin,
      month_cos, day_sin, day_cos.
    """
    # fillna without inplace returns a new frame, leaving merged_df untouched.
    df = merged_df.fillna(0)

    # Convert 'date' to datetime
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt

    # Generate time-based features
    df['day_of_week'] = dates.dayofweek
    df['month'] = dates.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['year'] = dates.year
    df['day_of_month'] = dates.day
    # isocalendar() yields a nullable UInt32 column; store a plain integer so
    # this feature has the same kind of dtype as the other calendar features.
    df['week_of_year'] = dates.isocalendar().week.astype('int64')
    df['quarter'] = dates.quarter
    df['is_start_of_month'] = dates.is_month_start.astype(int)
    df['is_end_of_month'] = dates.is_month_end.astype(int)
    df['is_quarter_start'] = dates.is_quarter_start.astype(int)
    df['is_quarter_end'] = dates.is_quarter_end.astype(int)

    # Generate cyclical features: month is scaled by 12 and day of month by 31
    # before being mapped onto the unit circle.
    month_angle = 2 * np.pi * (df['month'] / 12)
    day_angle = 2 * np.pi * (df['day_of_month'] / 31)
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)
    df['day_sin'] = np.sin(day_angle)
    df['day_cos'] = np.cos(day_angle)

    return df

# Restrict the training tables to the columns that also exist in the
# corresponding test tables; 'orders' is kept when the table contains it.
# Note that `train` and `train_calendar` are rebound to the aligned frames.
train = align_train_data(train, test, target_column='orders')
train_calendar = align_train_data(train_calendar, test_calendar, target_column='orders')

# Merge and preprocess train data
train_merged = merge_with_calendar(train_calendar, train)
train_preprocessed = preprocess(train_merged)

# Merge and preprocess test data
test_merged = merge_with_calendar(test_calendar, test)
test_preprocessed = preprocess(test_merged)

In [5]:
# Label-encode every object-typed column except 'id'. Each encoder is fitted
# on the train and test values together so both frames share one mapping, and
# is stored in le_dict under the column name.
object_cols = [
    name
    for name, dtype in train_preprocessed.dtypes.items()
    if dtype == 'object' and name != 'id'
]
le_dict = {}

for name in object_cols:
    # Values are compared as strings so mixed-type columns encode consistently.
    train_as_str = train_preprocessed[name].astype(str)
    test_as_str = test_preprocessed[name].astype(str)

    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_as_str, test_as_str], axis=0))

    train_preprocessed[name] = encoder.transform(train_as_str)
    test_preprocessed[name] = encoder.transform(test_as_str)
    le_dict[name] = encoder

In [8]:
def remove_outliers_zscore(df, group_col, value_col, threshold=3):
    """
    Drop rows whose value is a z-score outlier within its own group.

    Z-scores use the population standard deviation (ddof=0) of each group.
    Groups with no spread (constant values or a single row) cannot contain
    outliers by this measure and are kept as they are, instead of being
    dropped because their z-scores are undefined. Rows with a missing value
    are dropped.

    Parameters:
    - df: DataFrame to filter; it is not modified.
    - group_col: Column whose values define the groups.
    - value_col: Numeric column tested for outliers.
    - threshold: Rows are kept when |z-score| < threshold.

    Returns:
    - A new DataFrame with the surviving rows, ordered group by group (in
      sorted group order) and with a fresh 0..n-1 index. For input without
      any groups an empty DataFrame with the original columns is returned.
    """
    kept_groups = []

    for _, group in df.groupby(group_col):
        values = group[value_col]
        spread = values.std(ddof=0)

        if spread > 0:
            z_scores = (values - values.mean()).abs() / spread
            # Missing values give NaN z-scores, which fail this comparison.
            kept_groups.append(group[z_scores < threshold])
        else:
            # Zero (or undefined) spread: nothing can be an outlier.
            kept_groups.append(group[values.notna()])

    if not kept_groups:
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

# Filter 'orders' outliers per warehouse with the z-score rule (default threshold).
train_filtered_zscore = remove_outliers_zscore(train_preprocessed, 'warehouse', 'orders')

def remove_outliers_iqr(df, group_col, value_col):
    """
    Drop rows whose value lies outside the 1.5 * IQR fences of its group.

    For every group the first and third quartiles of value_col are computed;
    rows are kept when Q1 - 1.5 * IQR <= value <= Q3 + 1.5 * IQR (both bounds
    inclusive).

    Parameters:
    - df: DataFrame to filter; it is not modified.
    - group_col: Column whose values define the groups.
    - value_col: Numeric column tested for outliers.

    Returns:
    - A new DataFrame with the surviving rows, ordered group by group (in
      sorted group order) and with a fresh 0..n-1 index. For input without
      any groups an empty DataFrame with the original columns is returned.
    """
    kept_groups = []

    for _, group in df.groupby(group_col):
        values = group[value_col]

        # Quartiles and the fences derived from them
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        kept_groups.append(group[(values >= lower_bound) & (values <= upper_bound)])

    if not kept_groups:
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)

# Apply the per-warehouse IQR rule to the rows that passed the z-score filter.
train_filtered_iqr = remove_outliers_iqr(train_filtered_zscore, 'warehouse', 'orders')


In [9]:
# Separate features and target in train data
feature_frame = train_filtered_iqr.drop(columns=['orders', 'id', 'date'])
feature_columns = feature_frame.columns.tolist()
y = train_filtered_iqr['orders']

# Scale the features (X becomes a NumPy array of values scaled with the
# per-column min/max learned from the training features)
scaler = MinMaxScaler()
X = scaler.fit_transform(feature_frame)

# Prepare test data for submission
test_ids = test_preprocessed['id']
# Select the test columns by the training feature names so both matrices have
# the same columns in the same order; a missing feature raises a KeyError here
# instead of silently misaligning columns.
test = test_preprocessed.drop(columns=['id', 'date'])[feature_columns]
test = scaler.transform(test)

In [None]:
def catboost_params(trial):
    """
    Sample a CatBoost hyper-parameter set from an Optuna trial.

    Floating-point parameters are drawn with trial.suggest_float (log=True
    for the log-scaled ranges), which replaces the deprecated
    suggest_uniform / suggest_loguniform helpers.

    'bagging_temperature' is only sampled when the Bayesian bootstrap is
    chosen, because it is a setting of that bootstrap type. No
    'scale_pos_weight' is sampled: CatBoost documents it as a class weight
    for binary classification, so it does not belong in a regressor's
    parameter set.

    Parameters:
    - trial: optuna.trial.Trial (or compatible object) used for sampling.

    Returns:
    - dict of keyword arguments for a CatBoost model.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 100, 10000),
        'depth': trial.suggest_int('depth', 1, 16),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-4, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'random_strength': trial.suggest_float('random_strength', 0.01, 10.0, log=True),
        'rsm': trial.suggest_float('rsm', 0.1, 1.0),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 20),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
    }

    # Conditional search space: only the Bayesian bootstrap takes a temperature.
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float(
            'bagging_temperature', 0.01, 10.0, log=True
        )

    return params

def lgbm_params(trial):
    """
    Sample a LightGBM hyper-parameter set from an Optuna trial.

    Floating-point parameters are drawn with trial.suggest_float (log=True
    for the log-scaled ranges), which replaces the deprecated
    suggest_uniform / suggest_loguniform helpers. Names and ranges are
    unchanged.

    Parameters:
    - trial: optuna.trial.Trial (or compatible object) used for sampling.

    Returns:
    - dict of keyword arguments for a LightGBM model.
    """
    return {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', -1, 128),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-4, 10.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 10.0, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
        # NOTE(review): scale_pos_weight is a class-imbalance weight; it is
        # presumably a no-op when these parameters feed a regressor - confirm
        # and consider removing it from the search space.
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.01, 10.0, log=True)
    }

def xgb_params(trial):
    """
    Sample an XGBoost hyper-parameter set from an Optuna trial.

    Floating-point parameters are drawn with trial.suggest_float (log=True
    for the log-scaled ranges), which replaces the deprecated
    suggest_uniform / suggest_loguniform helpers. Names and ranges are
    unchanged.

    Parameters:
    - trial: optuna.trial.Trial (or compatible object) used for sampling.

    Returns:
    - dict of keyword arguments for an XGBoost model.
    """
    return {
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'max_depth': trial.suggest_int('max_depth', 1, 16),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-4, 10.0, log=True),
        # NOTE(review): scale_pos_weight is a class-imbalance weight; it is
        # presumably irrelevant when these parameters feed a regressor -
        # confirm and consider removing it from the search space.
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.01, 10.0, log=True)
    }


In [None]:
def objective_catboost_cv(trial):
    """
    Optuna objective: mean 5-fold cross-validated MSE of a CatBoost regressor.

    Parameters:
    - trial: optuna.trial.Trial used to sample the hyper-parameters.

    Returns:
    - Mean squared error averaged over the 5 folds (lower is better),
      computed on the notebook-level X and y.
    """
    params = catboost_params(trial)
    # The notebook imports the package as `cb`; the bare class name is not in scope.
    model = cb.CatBoostRegressor(**params, silent=True)
    # scikit-learn reports the negated MSE so that larger is better; flip the sign back.
    fold_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    return -fold_scores.mean()

# Search for the CatBoost parameters that minimise the value returned by
# objective_catboost_cv, using 100 Optuna trials, then report the best set.
study_catboost = optuna.create_study(direction='minimize')
study_catboost.optimize(objective_catboost_cv, n_trials=100)
print("Best CatBoost params:", study_catboost.best_params)

In [None]:
def objective_lgbm_cv(trial):
    """
    Optuna objective: mean 5-fold cross-validated MSE of a LightGBM regressor.

    Parameters:
    - trial: optuna.trial.Trial used to sample the hyper-parameters.

    Returns:
    - Mean squared error averaged over the 5 folds (lower is better),
      computed on the notebook-level X and y.
    """
    params = lgbm_params(trial)
    # The notebook does `import lightgbm`; the bare class name is not in scope.
    model = lightgbm.LGBMRegressor(**params)
    # scikit-learn reports the negated MSE so that larger is better; flip the sign back.
    fold_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    return -fold_scores.mean()

# Search for the LightGBM parameters that minimise the value returned by
# objective_lgbm_cv, using 100 Optuna trials, then report the best set.
study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(objective_lgbm_cv, n_trials=100)
print("Best LGBM params:", study_lgbm.best_params)

In [None]:
def objective_xgb_cv(trial):
    """
    Optuna objective: mean 5-fold cross-validated MSE of an XGBoost regressor.

    Parameters:
    - trial: optuna.trial.Trial used to sample the hyper-parameters.

    Returns:
    - Mean squared error averaged over the 5 folds (lower is better),
      computed on the notebook-level X and y.
    """
    params = xgb_params(trial)
    # The notebook imports the package as `xgb`; the bare class name is not in scope.
    model = xgb.XGBRegressor(**params)
    # scikit-learn reports the negated MSE so that larger is better; flip the sign back.
    fold_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    return -fold_scores.mean()

# Search for the XGBoost parameters that minimise the value returned by
# objective_xgb_cv, using 100 Optuna trials, then report the best set.
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb_cv, n_trials=100)
print("Best XGB params:", study_xgb.best_params)