In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import make_column_selector, ColumnTransformer
import seaborn as sns
from pandas.plotting import autocorrelation_plot
import holidays
import datetime
import wbdata
import pycountry

2025-06-04 07:45:38.002078: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748994338.428011    2223 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748994338.537497    2223 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748994339.441522    2223 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748994339.441574    2223 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748994339.441576    2223 computation_placer.cc:177] computation placer alr

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train["date"] = pd.to_datetime(train["date"])
test["date"] = pd.to_datetime(test["date"])
train

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
...,...,...,...,...,...,...
230125,230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [10]:
#Now we do feature engineering on the test set and training set
# Function to check if a date is a holiday for that country
def is_holiday(row):
    try:
        country_code = row['country']  # make sure this matches ISO format like 'PH', 'US', etc.
        date = row['date']
        return date in holidays.country_holidays(country_code)
    except:
        return False  # if country is invalid, default to False

# --- Economic indicators to fetch ---
indicators = {
    'NY.GDP.MKTP.CD': 'gdp',
    'SL.UEM.TOTL.ZS': 'unemployment',
    'FP.CPI.TOTL.ZG': 'inflation',
    'FR.INR.LEND': 'interest_rate'
}

# --- Get country codes ---
def get_country_code(name):
    try:
        return pycountry.countries.lookup(name).alpha_3
    except LookupError:
        return None

test['num_sold'] = np.nan
combined = pd.concat([train, test], sort=False)
combined.sort_values(by=['country', 'store', 'product', 'date'], inplace=True)

# Step 2: Feature Engineering (same as before)
combined['month'] = combined['date'].dt.to_period('M').dt.to_timestamp()
combined['day_of_week'] = combined["date"].dt.dayofweek.astype(np.int8)
combined['day'] = combined["date"].dt.day.astype(np.int8)
combined['year'] = combined["date"].dt.year.astype(np.int16)
combined['is_weekend'] = combined['day_of_week'].isin([5, 6]).astype(np.int8)
combined['weekofyear'] = combined["date"].dt.isocalendar().week.astype(np.int8)

# --- Holiday Feature ---
combined['is_holiday'] = combined.apply(is_holiday, axis=1).astype(np.int8)

# --- Lag Features ---
for lag in [1, 7]:
    combined[f'lag_{lag}'] = combined.groupby(['country', 'store', 'product'])['num_sold'].shift(lag)

# --- Rolling Mean ---
combined['rolling_7'] = (
    combined.groupby(['country', 'store', 'product'])['num_sold']
    .shift(1)
    .rolling(window=7, min_periods=1)
    .mean()
)

# --- World Bank Economic Data ---
# Fetch World Bank data (note: data is annual)
combined['country_code'] = combined['country'].apply(get_country_code)
countries = combined['country_code'].dropna().unique().tolist()

min_year = combined['month'].min().year
max_year = combined['month'].max().year
start_date = datetime.datetime(min_year, 1, 1)
end_date = datetime.datetime(max_year, 12, 31)

econ_frames = []
for country in countries:
    df = wbdata.get_dataframe(
        indicators,
        country=country,
        date=(start_date, end_date),  
    )
    df['country'] = country
    econ_frames.append(df)

# Combine all country data
econ_data = pd.concat(econ_frames).reset_index()

# Fix column names and extract 'month' and 'country_code'
econ_data.rename(columns={'date': 'month', 'country': 'econ_country'}, inplace=True)
econ_data['month'] = pd.to_datetime(econ_data['month']).dt.to_period('M').dt.to_timestamp()
econ_data['country_code'] = econ_data['econ_country']
combined = combined.merge(econ_data, how='left', on=['country_code', 'month'])

# --- Momentum Feature ---
combined['sales_momentum'] = (combined['lag_1'] - combined['rolling_7']) / (combined['rolling_7'] + 1e-5)



required_cols = ['country', 'store', 'product', 'num_sold']
missing = [col for col in required_cols if col not in combined.columns]
if missing:
    raise KeyError(f"Missing columns in `combined`: {missing}")
assert 'country' in combined.columns, "country column missing before merge"
combined = combined.merge(econ_data, how='left', on=['country_code', 'month'])
assert 'country' in combined.columns, "country column missing after merge"
# --- Expanding Mean ---
combined['expanding_mean'] = (
    combined.groupby(['country', 'store', 'product'])['num_sold']
    .transform(lambda x: x.shift(1).expanding(min_periods=1).mean())
)

label_encoders = {}
for col in ['country', 'store', 'product']:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))
    label_encoders[col] = le

# --- Categorical Encoding (use existing encoders) ---
def safe_label_encode(series, le):
    mapping = {cls: i for i, cls in enumerate(le.classes_)}
    return series.map(lambda x: mapping.get(x, -1)).astype(int)

for col in ['country', 'store', 'product']:
    le = label_encoders[col]
    combined[col] = safe_label_encode(combined[col], le)

# --- Sales Difference & Growth ---
combined['sales_diff'] = combined['lag_1'] - combined['lag_7']
combined['sales_growth'] = combined['lag_1'] / (combined['lag_7'] + 1e-5)


# --- Combined Categorical Features ---
combined['country_product'] = (combined['country'].astype(str) + '_' + combined['product'].astype(str)).astype('category').cat.codes
combined['store_product'] = (combined['store'].astype(str) + '_' + combined['product'].astype(str)).astype('category').cat.codes
combined['country_store'] = (combined['country'].astype(str) + '_' + combined['store'].astype(str)).astype('category').cat.codes

# --- Time Encoding (Cyclic) ---
combined['month_sin'] = np.sin(2 * np.pi * combined['month'].dt.month / 12)
combined['month_cos'] = np.cos(2 * np.pi * combined['month'].dt.month / 12)
combined['day_of_week_sin'] = np.sin(2 * np.pi * combined['day_of_week'] / 7)
combined['day_of_week_cos'] = np.cos(2 * np.pi * combined['day_of_week'] / 7)

# --- Final Cleanup ---
combined.drop(columns=["date_x", "date_y"], inplace=True, errors='ignore')
combined.fillna(0, inplace=True)

# --- Downcast to save memory ---
for col in combined.select_dtypes(include=['float64']).columns:
    combined[col] = pd.to_numeric(combined[col], downcast='float')
for col in combined.select_dtypes(include=['int64']).columns:
    combined[col] = pd.to_numeric(combined[col], downcast='integer')

combined.fillna(0, inplace=True)

  econ_data = pd.concat(econ_frames).reset_index()


In [13]:
#splitting it for pretraining
X = combined.loc[combined.index.isin(train.index)].copy()
y = train.loc[X.index, 'num_sold'].fillna(0)  # Make sure indices align

# Drop target from features
X_final = X.drop(columns=['num_sold'])

# Split into train and validation sets (e.g., 80/20 split)
X_train_final, X_val_final, y_train, y_val = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)

In [17]:
#Now to do the machine learning
month_transformer = FunctionTransformer(lambda x: x.astype('int64').values.reshape(-1, 1))

# Use the correct dataset without the target column
numeric_cols = X_train_final.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train_final.select_dtypes(include=['object', 'category']).columns.tolist()

# Remove 'month' from numeric if present
if 'month' in numeric_cols:
    numeric_cols.remove('month')

# Define preprocessing pipeline
preprocessor = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy='mean')),
        ("scaler", StandardScaler())
    ]), numeric_cols),
    
    ("month", Pipeline([
        ("month_int", month_transformer),
        ("scaler", StandardScaler())
    ]), ['month']),
    
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy='constant', fill_value='missing')),
        ("onehot", OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_cols)
])

ridge_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("ridge", Ridge())
])

In [19]:
# Hyperparameter tuning
param_grid = {
    "ridge__alpha": [0.01, 0.1, 1.0, 10.0, 100.0]
}

search = GridSearchCV(ridge_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
X_train_final[categorical_cols] = X_train_final[categorical_cols].astype(str)
X_test_final[categorical_cols] = X_test_final[categorical_cols].astype(str)
search.fit(X_train_final, y_train)

# Best model
best_model = search.best_estimator_
y_pred = best_model.predict(X_val_final)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"Best Alpha: {search.best_params_['ridge__alpha']}")
print(f"Validation RMSE: {rmse:.4f}")

Best Alpha: 100.0
Validation RMSE: 692.6372


In [21]:
test_preds = best_model.predict(X_test_final)

submission = pd.DataFrame({
    'id': test['id'],            # Use 'id' from test, not 'date'
    'num_sold': test_preds.round().astype(int)  # Round predictions before converting to int
})

In [22]:
#The Final File
submission.to_csv('submission.csv', index=False)