### IMPORTING LIBRARIES

In [1]:
from future_engineering import merge_dataframe, future_engineering

from modeling import train_model

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Exploratory Data Analysis Libraries
import pandas as pd
import numpy as np

In [4]:
# Evaluation metric (used below to compute the validation RMSLE)
from sklearn.metrics import mean_squared_log_error

### IMPORTING DATA AND RUNNING FUNCTIONS

In [5]:
def _load_dated(path):
    """Read a CSV file, parsing its 'date' column into datetimes."""
    return pd.read_csv(path, parse_dates=['date'])


# Tables that carry a 'date' column
train = _load_dated('train.csv')
test = _load_dated('test.csv')
oil = _load_dated('oil.csv')
holidays = _load_dated('holidays_events.csv')
transactions = _load_dated('transactions.csv')

# Store table is read without date parsing
stores = pd.read_csv('stores.csv')

In [6]:
# Combine the six raw tables into one frame using the project helper `merge_dataframe`
# (imported from the local `future_engineering` module).
df = merge_dataframe(train, test, oil, holidays, transactions, stores)

Dataset shape after merges: (3029400, 13)


In [7]:
# Run the project helper `future_engineering` on the merged frame
# (presumably feature engineering — verify against the module).
# NOTE(review): `df` is rebound to the helper's output, so re-running this cell alone
# feeds already-processed data back in; restart the kernel and run all cells in order.
df = future_engineering(df)

In [8]:
# Train via the project helper `train_model`. The cells below use its four return values as:
# a fitted estimator (`model.predict`), validation features/targets (X_val, y_val),
# and the features used to predict sales for the test rows (X_test).
model, X_val, y_val, X_test = train_model(df)

### SUBMISSION

In [9]:
# --- Validation performance ---
# Predictions are floored at zero before scoring: mean_squared_log_error rejects negatives.
val_pred = model.predict(X_val)
val_msle = mean_squared_log_error(y_val, np.maximum(0, val_pred))
rmsle = np.sqrt(val_msle)
print("Validation RMSLE:", rmsle)

# --- Submission ---
# Score the test features, again flooring predictions at zero.
# NOTE(review): assumes X_test rows are in the same order as `test` — confirm in train_model.
test_pred = model.predict(X_test)
test['sales'] = np.maximum(0, test_pred)

# Write only the id / prediction pair, without the index column.
submission = test[['id', 'sales']]
submission.to_csv('submission.csv', index=False)

Validation RMSLE: 1.2386956788053445


In [10]:
def merge_features(df, oil, holidays, transactions, stores):
    """Left-join oil prices, a holiday flag, transactions and store metadata onto ``df``.

    None of the input frames are modified; all intermediate work happens on
    derived copies.

    Parameters
    ----------
    df : pandas.DataFrame
        Base frame with 'date' and 'store_nbr' columns. 'date' is merged
        against datetime-converted holiday/transaction dates, so it should
        already be datetime-typed.
    oil : pandas.DataFrame
        Needs 'date' and 'dcoilwtico'. 'dcoilwtico' is forward-filled within
        this table and then every column of the table is merged on 'date'.
        The table is not de-duplicated, so repeated dates here multiply rows.
    holidays : pandas.DataFrame
        Needs 'date'. Any date present becomes ``is_holiday == 1``.
    transactions : pandas.DataFrame
        Needs 'date', 'store_nbr' and 'transactions'. The first row per
        (date, store_nbr) pair is used.
    stores : pandas.DataFrame
        Needs 'store_nbr', 'city', 'state' and 'type'. The first row per
        store is used.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the oil columns, 'is_holiday', 'transactions', 'city',
        'state' and 'type' added. Missing holiday/transaction matches are
        filled with 0. 'dcoilwtico' stays NaN for dates absent from ``oil``
        (and for leading gaps), because the forward-fill runs before the merge.
    """
    # --- Oil prices ---
    # ``assign`` returns a new frame, so the caller's ``oil`` keeps its gaps.
    oil_filled = oil.assign(dcoilwtico=oil['dcoilwtico'].ffill())
    df = df.merge(oil_filled, on='date', how='left')

    # --- Holidays ---
    # Build the flag table from scratch instead of writing into a slice of
    # ``holidays`` (which mutated the input and triggered SettingWithCopyWarning).
    # No pre-filtering by date is needed: a left merge ignores unmatched rows.
    holiday_flags = (
        pd.DataFrame({'date': pd.to_datetime(holidays['date'])})
        .drop_duplicates(subset='date')
        .assign(is_holiday=1)
    )
    df = df.merge(holiday_flags, on='date', how='left')
    df['is_holiday'] = df['is_holiday'].fillna(0)

    # --- Transactions ---
    # Convert dates on a derived frame, then keep the first row per key.
    # As above, the left merge makes an explicit key pre-filter redundant.
    transactions_processed = (
        transactions[['date', 'store_nbr', 'transactions']]
        .assign(date=lambda t: pd.to_datetime(t['date']))
        .drop_duplicates(subset=['date', 'store_nbr'])
    )
    df = df.merge(transactions_processed, on=['date', 'store_nbr'], how='left')
    df['transactions'] = df['transactions'].fillna(0)

    # --- Store metadata ---
    stores_processed = (
        stores[['store_nbr', 'city', 'state', 'type']]
        .drop_duplicates(subset='store_nbr')
    )
    df = df.merge(stores_processed, on='store_nbr', how='left')

    print("Dataset shape after merges:", df.shape)

    return df