In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import make_column_selector, ColumnTransformer
import seaborn as sns
from pandas.plotting import autocorrelation_plot
import holidays
import datetime
import wbdata
import pycountry

In [None]:
# Load both competition files and parse the `date` column to datetime64 right away,
# so every later `.dt` accessor works on either frame.
train = pd.read_csv("train.csv").assign(date=lambda frame: pd.to_datetime(frame["date"]))
test = pd.read_csv("test.csv").assign(date=lambda frame: pd.to_datetime(frame["date"]))

# Last expression of the cell: rich display of the training frame.
train

In [None]:
#Now we do feature engineering on the test set and training set
# Function to check if a date is a holiday for that country
# One holiday calendar per country value, built lazily. `None` marks a country the
# `holidays` package does not support, so the failed lookup is not retried per row.
_HOLIDAY_CALENDARS = {}


def is_holiday(row):
    """Return True when ``row['date']`` is a public holiday in ``row['country']``.

    Intended for ``DataFrame.apply(is_holiday, axis=1)``.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide ``'country'`` (an identifier accepted by
        ``holidays.country_holidays``, such as ``'PH'`` or ``'US'``) and ``'date'``.

    Returns
    -------
    bool
        False when the date is not a holiday, and also (best effort) when the
        country is unsupported or the row lacks a usable country/date.
    """
    try:
        country_code = row['country']  # make sure this matches ISO format like 'PH', 'US', etc.
        date = row['date']
        if country_code not in _HOLIDAY_CALENDARS:
            # Building a calendar is far more expensive than a membership test,
            # so do it once per country instead of once per row.
            try:
                _HOLIDAY_CALENDARS[country_code] = holidays.country_holidays(country_code)
            except NotImplementedError:
                # Raised by `holidays` for a country it does not know.
                _HOLIDAY_CALENDARS[country_code] = None
        calendar = _HOLIDAY_CALENDARS[country_code]
        return calendar is not None and date in calendar
    except (LookupError, TypeError, ValueError):
        # Missing key, unhashable/non-string country, or an unparseable date:
        # default to "not a holiday" without hiding unrelated failures such as
        # KeyboardInterrupt, which the previous bare `except:` also swallowed.
        return False

# --- Economic indicators to fetch ---
# Maps each World Bank indicator code to the short column name used for it here.
indicators = dict([
    ('NY.GDP.MKTP.CD', 'gdp'),
    ('SL.UEM.TOTL.ZS', 'unemployment'),
    ('FP.CPI.TOTL.ZG', 'inflation'),
    ('FR.INR.LEND', 'interest_rate'),
])

# --- Get country codes ---
def get_country_code(name):
    """Look `name` up with pycountry and return the match's `alpha_3` code.

    Returns None when pycountry raises LookupError for the given name.
    """
    try:
        match = pycountry.countries.lookup(name)
    except LookupError:
        return None
    return match.alpha_3
test['num_sold'] = np.nan
combined = pd.concat([train, test], sort=False)
combined.sort_values(by=['country', 'store', 'product', 'date'], inplace=True)

# Step 2: Feature Engineering (same as before)
combined['month'] = combined['date'].dt.to_period('M').dt.to_timestamp()
combined['day_of_week'] = combined["date"].dt.dayofweek.astype(np.int8)
combined['day'] = combined["date"].dt.day.astype(np.int8)
combined['year'] = combined["date"].dt.year.astype(np.int16)
combined['is_weekend'] = combined['day_of_week'].isin([5, 6]).astype(np.int8)
combined['weekofyear'] = combined["date"].dt.isocalendar().week.astype(np.int8)

# --- Holiday Feature ---
combined['is_holiday'] = combined.apply(is_holiday, axis=1).astype(np.int8)

# --- World Bank Economic Data ---
combined['country_code'] = combined['country'].apply(get_country_code)
combined = combined.merge(econ_data, how='left', on=['country_code', 'month'])

# --- Lag Features ---
for lag in [1, 7]:
    combined[f'lag_{lag}'] = combined.groupby(['country', 'store', 'product'])['num_sold'].shift(lag)

# --- Rolling Mean ---
combined['rolling_7'] = (
    combined.groupby(['country', 'store', 'product'])['num_sold']
    .shift(1)
    .rolling(window=7, min_periods=1)
    .mean()
)

# --- Momentum Feature ---
combined['sales_momentum'] = (combined['lag_1'] - combined['rolling_7']) / (combined['rolling_7'] + 1e-5)

# --- Expanding Mean ---
combined['expanding_mean'] = (
    combined.groupby(['country', 'store', 'product'])['num_sold']
    .transform(lambda x: x.shift(1).expanding(min_periods=1).mean())
)

# --- Sales Difference & Growth ---
combined['sales_diff'] = combined['lag_1'] - combined['lag_7']
combined['sales_growth'] = combined['lag_1'] / (combined['lag_7'] + 1e-5)

# --- Categorical Encoding (use existing encoders) ---
for col in ['country', 'store', 'product']:
    le = label_encoders[col]
    combined[col] = le.transform(combined[col])

# --- Combined Categorical Features ---
combined['country_product'] = (combined['country'].astype(str) + '_' + combined['product'].astype(str)).astype('category').cat.codes
combined['store_product'] = (combined['store'].astype(str) + '_' + combined['product'].astype(str)).astype('category').cat.codes
combined['country_store'] = (combined['country'].astype(str) + '_' + combined['store'].astype(str)).astype('category').cat.codes

# --- Time Encoding (Cyclic) ---
combined['month_sin'] = np.sin(2 * np.pi * combined['month'].dt.month / 12)
combined['month_cos'] = np.cos(2 * np.pi * combined['month'].dt.month / 12)
combined['day_of_week_sin'] = np.sin(2 * np.pi * combined['day_of_week'] / 7)
combined['day_of_week_cos'] = np.cos(2 * np.pi * combined['day_of_week'] / 7)

# --- Final Cleanup ---
combined.drop(columns=["date_x", "date_y"], inplace=True, errors='ignore')
combined.fillna(0, inplace=True)

# --- Downcast to save memory ---
for col in combined.select_dtypes(include=['float64']).columns:
    combined[col] = pd.to_numeric(combined[col], downcast='float')
for col in combined.select_dtypes(include=['int64']).columns:
    combined[col] = pd.to_numeric(combined[col], downcast='integer')

# Step 3: Drop temporary column and fill NA
combined.fillna(0, inplace=True)

In [None]:
# Step 4: Re-split
X_train = combined.loc[combined.index.isin(train.index)].copy()
X_test = combined.loc[combined.index.isin(test.index)].copy()

# Step 5: Drop target and predict
X_train_final = X_train.drop(columns=['num_sold'])
y_train = train['num_sold']
X_test_final = X_test.drop(columns=['num_sold'])


In [None]:
#Now to do the machine learning
def _month_to_int(month_frame):
    """Cast the selected `month` column to int64 and return it as an (n, 1) array."""
    return month_frame.astype('int64').values.reshape(-1, 1)


month_transformer = FunctionTransformer(_month_to_int)

# Identify numeric and categorical columns from the frame that is actually fitted.
# (The column lists were previously read from `X`, which is never defined.)
numeric_cols = X_train_final.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train_final.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove 'month' from numeric if it's detected there, to process separately
if 'month' in numeric_cols:
    numeric_cols.remove('month')

# Define preprocessing pipeline: one branch per column family. Columns not listed
# in any branch are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    # Numeric features: mean-impute, then standardise.
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy='mean')),
        ("scaler", StandardScaler())
    ]), numeric_cols),

    # `month`: convert to integers first, then standardise.
    ("month", Pipeline([
        ("month_int", month_transformer),
        ("scaler", StandardScaler())
    ]), ['month']),

    # Categorical features: constant-impute, then one-hot encode; categories unseen
    # during fit are ignored at transform time.
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy='constant', fill_value='missing')),
        ("onehot", OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_cols)
])


ridge_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("ridge", Ridge())
])

In [None]:
# Hyperparameter tuning
param_grid = {
    "ridge__alpha": [0.01, 0.1, 1.0, 10.0, 100.0]
}

# Hold out 20% of the training rows for the validation score printed below
# (`X_val` / `y_val` were previously never defined). This is a random hold-out;
# NOTE(review): switch to a chronological split if the score must reflect
# forecasting forward in time.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_final, y_train, test_size=0.2, random_state=42
)

search = GridSearchCV(ridge_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
search.fit(X_fit, y_fit)

# Best model
best_model = search.best_estimator_
y_pred = best_model.predict(X_val)
# mean_squared_error returns MSE; take the square root so the value matches its label.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"✅ Best Alpha: {search.best_params_['ridge__alpha']}")
print(f"✅ Validation RMSE: {rmse:.4f}")

# Refit the selected pipeline on every training row so later predictions also use
# the rows that were held out for validation.
best_model.fit(X_train_final, y_train)

In [None]:
# Predict from the feature frame without the target column; `X_test` still carries
# the placeholder `num_sold`.
test_preds = best_model.predict(X_test_final)

# `submission_dates` was previously never defined. Look each date up in `test` by
# the index labels of the rows that were scored.
# NOTE(review): assumes X_test_final's index labels identify rows of `test` —
# confirm against the re-split cell.
submission_dates = test.loc[X_test_final.index, 'date'].to_numpy()

submission = pd.DataFrame({
    'date': submission_dates,
    # Round to the nearest unit (astype(int) alone truncates toward zero) and floor
    # at 0, since a linear model can predict negative sales.
    'num_sold': np.clip(np.rint(test_preds), 0, None).astype(int)  # If your competition requires integers
})

In [None]:
# Write the final submission file, leaving the DataFrame index out of the CSV.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)