In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import lightgbm as lgb

In [9]:
train = pd.read_csv('../data/train.csv', parse_dates=['date'])
test = pd.read_csv('../data/test.csv', parse_dates=['date'])
stores = pd.read_csv('../data/stores.csv')
oil = pd.read_csv('../data/oil.csv', parse_dates=['date'])
holidays = pd.read_csv('../data/holidays_events.csv', parse_dates=['date'])
transactions = pd.read_csv('../data/transactions.csv', parse_dates=['date'])

In [10]:
def prepare_data(train_df, stores_df, oil_df, holidays_df):
    df = train_df.merge(stores_df, on='store_nbr', how='left')
    
    oil_df['dcoilwtico'] = oil_df['dcoilwtico'].ffill()
    df = df.merge(oil_df, on='date', how='left')
    
    nat_holidays = holidays_df[(holidays_df['locale'] == 'National') & 
                               (holidays_df['transferred'] == False)]
    nat_holidays = nat_holidays.drop_duplicates('date')[['date', 'type']]
    nat_holidays = nat_holidays.rename(columns={'type': 'holiday_type'})
    
    df = df.merge(nat_holidays, on='date', how='left')
    df['is_holiday'] = df['holiday_type'].notnull().astype(int)
    
    return df

df = prepare_data(train, stores, oil, holidays)

In [11]:
df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,dcoilwtico,holiday_type,is_holiday
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,,Holiday,1
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,D,13,,Holiday,1
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,D,13,,Holiday,1
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,D,13,,Holiday,1
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,D,13,,Holiday,1


In [12]:
def create_features(df):
    # 1. Basic Time Features
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_of_month'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    
    # 2. Fourier Features (Captures the 7-day weekly cycle perfectly)
    df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    
    # 3. The Earthquake Fix (April 16, 2016)
    # We tell the model how many weeks have passed since the disaster 
    # to help it "forget" the artificial spike in sales.
    earthquake_date = pd.to_datetime('2016-04-16')
    df['weeks_since_earthquake'] = (df['date'] - earthquake_date).dt.days // 7
    df['weeks_since_earthquake'] = df['weeks_since_earthquake'].clip(lower=0) 

    # 4. Better Lags & Rolling windows
    # We'll stick to lag_16 for safety, but add a 14-day rolling mean
    df['lag_16'] = df.groupby(['store_nbr', 'family'])['sales'].transform(lambda x: x.shift(16))
    df['rolling_mean_14'] = df.groupby(['store_nbr', 'family'])['lag_16'].transform(lambda x: x.rolling(14).mean())

    return df

df = create_features(df)

In [14]:
# We'll train on everything before August 2017
# And validate on the first 15 days of August 2017
train_set = df[df['date'] < '2017-08-01'].dropna()
val_set = df[(df['date'] >= '2017-08-01') & (df['date'] <= '2017-08-15')]

# Define features and target
features = ['store_nbr', 'onpromotion', 'dcoilwtico', 'day_of_week', 
            'day_sin', 'day_cos', 'weeks_since_earthquake', 'lag_16', 'rolling_mean_14']
target = 'sales'

X_train, y_train = train_set[features], train_set[target]
X_val, y_val = val_set[features], val_set[target]

In [15]:
# Apply Log Transformation to the target
# This helps the model focus on percentage errors (better for RMSLE)
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

print("Target transformed to log scale.")

Target transformed to log scale.


In [16]:
# 1. Prepare Test set similarly to Train set
# We need to concatenate them to calculate lags correctly for the test period
full_df = pd.concat([train, test], axis=0).reset_index(drop=True)

# 2. Re-apply the merging and feature logic
# (Using the functions we defined earlier)
full_df = prepare_data(full_df, stores, oil, holidays)
full_df = create_features(full_df)

# 3. Separate them back out
train_final = full_df[full_df['sales'].notnull()]
test_final = full_df[full_df['sales'].isnull()]

print(f"Final training rows: {len(train_final)}")
print(f"Final test rows (should be 28512): {len(test_final)}")

Final training rows: 3000888
Final test rows (should be 28512): 28512


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor


families = train_final['family'].unique()

print(f"Starting Per-Family Training for {len(families)} families...")
import lightgbm as lgb

features = ['store_nbr', 'onpromotion', 'dcoilwtico', 'day_of_week', 
            'day_sin', 'day_cos', 'weeks_since_earthquake', 'lag_16', 'rolling_mean_14']

all_test_preds = []
test_ids = []

print("Training 33 specialized LightGBM models...")

lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
}

for fam in families:
    train_fam = train_final[train_final['family'] == fam]
    test_fam = test_final[test_final['family'] == fam]
    
    # Prepare datasets for LightGBM
    dtrain = lgb.Dataset(train_fam[features], label=np.log1p(train_fam['sales']))
    
    # Train
    model = lgb.train(lgb_params, dtrain, num_boost_round=200)
    
    # Predict
    preds = np.expm1(model.predict(test_fam[features]))
    
    all_test_preds.extend(preds)
    test_ids.extend(test_fam['id'])
    print(f"âœ… LightGBM Finished: {fam}")

Starting Per-Family Training for 33 families...
Training 33 specialized LightGBM models...
âœ… LightGBM Finished: AUTOMOTIVE
âœ… LightGBM Finished: BABY CARE
âœ… LightGBM Finished: BEAUTY
âœ… LightGBM Finished: BEVERAGES
âœ… LightGBM Finished: BOOKS
âœ… LightGBM Finished: BREAD/BAKERY
âœ… LightGBM Finished: CELEBRATION
âœ… LightGBM Finished: CLEANING
âœ… LightGBM Finished: DAIRY
âœ… LightGBM Finished: DELI
âœ… LightGBM Finished: EGGS
âœ… LightGBM Finished: FROZEN FOODS
âœ… LightGBM Finished: GROCERY I
âœ… LightGBM Finished: GROCERY II
âœ… LightGBM Finished: HARDWARE
âœ… LightGBM Finished: HOME AND KITCHEN I
âœ… LightGBM Finished: HOME AND KITCHEN II
âœ… LightGBM Finished: HOME APPLIANCES
âœ… LightGBM Finished: HOME CARE
âœ… LightGBM Finished: LADIESWEAR
âœ… LightGBM Finished: LAWN AND GARDEN
âœ… LightGBM Finished: LINGERIE
âœ… LightGBM Finished: LIQUOR,WINE,BEER
âœ… LightGBM Finished: MAGAZINES
âœ… LightGBM Finished: MEATS
âœ… LightGBM Finished: PERSONAL CARE
âœ… LightGBM Finished: PET

In [19]:
# 5. Create the final submission file
submission_v2 = pd.DataFrame({
    'id': np.array(test_ids).astype(int),
    'sales': np.array(all_test_preds)
})

# Sort by ID to make sure it's in the right order for Kaggle
submission_v2 = submission_v2.sort_values('id')

submission_v2.to_csv('../submissions/submission_per_family.csv', index=False)
print("\nðŸš€ New submission file saved: ../submissions/submission_per_family.csv")


ðŸš€ New submission file saved: ../submissions/submission_per_family.csv
