In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Load the data
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
holidays = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')
transactions = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/transactions.csv')
sample_submission = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv')

# Preprocessing - Keeping only data from 2016 onwards
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
train = train[train['date'] >= '2016-01-01']

# Adding features for year, month, and holidays
train['holiday'] = train['date'].isin(holidays['date'])
test['holiday'] = test['date'] == pd.to_datetime('2017-08-24')
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
train['weekday'] = train['date'].dt.weekday
test['year'] = test['date'].dt.year
test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day
test['weekday'] = test['date'].dt.weekday

# Drop date for training and test sets
train = train.drop(columns=['date'])
test = test.drop(columns=['date'])

# One-hot encoding for categorical columns
object_cols = train.select_dtypes(include=['object']).columns
train = pd.get_dummies(train, columns=object_cols, drop_first=True)
test = pd.get_dummies(test, columns=object_cols, drop_first=True)

# Aligning train and test sets
train, test = train.align(test, join='left', axis=1, fill_value=0)

# Features and target setup
X = train.drop(columns=['sales'])
y = train['sales']

# Log transformation of the target to reduce the effect of outliers
y_log = np.log1p(y)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.2, random_state=42)

# Scaling
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_val_scaled = scaler_X.transform(X_val)
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_val_scaled = scaler_y.transform(y_val.values.reshape(-1, 1))

# Reshaping data for LSTM input
X_train_scaled = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_val_scaled = X_val_scaled.reshape(X_val_scaled.shape[0], 1, X_val_scaled.shape[1])

# Converting data to PyTorch tensors 
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_scaled, dtype=torch.float32)

# Define the LSTM model in PyTorch
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)  
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)  
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# LSTM Model parameters
input_size = X_train_tensor.shape[2]
hidden_size = 50
num_layers = 2
output_size = 1

# Instantiate the LSTM model, define loss and optimizer
lstm_model = LSTMModel(input_size, hidden_size, num_layers, output_size)
loss_function = nn.MSELoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Training the LSTM model
num_epochs = 50
for epoch in range(num_epochs):
    lstm_model.train()
    optimizer.zero_grad()
    y_train_pred = lstm_model(X_train_tensor)
    loss = loss_function(y_train_pred, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Validation step
    lstm_model.eval()
    with torch.no_grad():
        y_val_pred = lstm_model(X_val_tensor)
        val_loss = loss_function(y_val_pred, y_val_tensor)
        y_val_pred_inverse = scaler_y.inverse_transform(y_val_pred.numpy())
        y_val_inverse = scaler_y.inverse_transform(y_val_tensor.numpy())
        val_rmsle = calculate_rmsle(np.expm1(y_val_inverse), np.expm1(y_val_pred_inverse))
    
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation RMSLE: {val_rmsle}')

# Extracting features from LSTM
with torch.no_grad():
    lstm_train_features = lstm_model(X_train_tensor).numpy()
    lstm_val_features = lstm_model(X_val_tensor).numpy()

# Combining LSTM features with original dataset
train_lstm_df = pd.DataFrame(lstm_train_features, columns=[f'lstm_feat_{i}' for i in range(lstm_train_features.shape[1])])
val_lstm_df = pd.DataFrame(lstm_val_features, columns=[f'lstm_feat_{i}' for i in range(lstm_val_features.shape[1])])

X_train_combined = pd.concat([X_train.reset_index(drop=True), train_lstm_df], axis=1)
X_val_combined = pd.concat([X_val.reset_index(drop=True), val_lstm_df], axis=1)

# Setting up CatBoost model with fixed parameters (iterations=1000, depth=8, learning_rate=0.1)
catboost_model = CatBoostRegressor(
    iterations=1000,
    depth=8,
    learning_rate=0.1,
    loss_function='RMSE',
    verbose=100
)

# Training CatBoost model
catboost_model.fit(X_train_combined, y_train, eval_set=(X_val_combined, y_val), early_stopping_rounds=50)

# Predictions and evaluation
y_val_pred_catboost = catboost_model.predict(X_val_combined)
val_rmsle_final = calculate_rmsle(np.expm1(y_val), np.expm1(y_val_pred_catboost))
print(f'Final Validation RMSLE: {val_rmsle_final}')

# Test data predictions
X_test_scaled = scaler_X.transform(test.drop(columns=['sales'], errors='ignore'))  
X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

# RMSLE function
def calculate_rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


# Extracting LSTM features for test data
with torch.no_grad():
    lstm_test_features = lstm_model(X_test_tensor).numpy()

# Combining LSTM features with the original test features
test_lstm_df = pd.DataFrame(lstm_test_features, columns=[f'lstm_feat_{i}' for i in range(lstm_test_features.shape[1])])
X_test_combined = pd.concat([test.reset_index(drop=True), test_lstm_df], axis=1)

# Making predictions on the test set with the CatBoost model
test_predictions_catboost = catboost_model.predict(X_test_combined)

# Converting predictions back from log1p scale
test['sales'] = np.expm1(test_predictions_catboost)

# Handling negative values in predictions (if any)
test['sales'] = np.where(test['sales'] < 0, 0, test['sales'])

# submission file
submission = test[['id', 'sales']]
submission.to_csv('submission_new.csv', index=False)
print("Submission file generated.")


  train['holiday'] = train['date'].isin(holidays['date'])


Epoch 1/50, Training Loss: 0.11507447063922882, Validation Loss: 0.11152812093496323, Validation RMSLE: 3.91860294342041
Epoch 2/50, Training Loss: 0.11169958114624023, Validation Loss: 0.10825351625680923, Validation RMSLE: 3.860646963119507
Epoch 3/50, Training Loss: 0.10842482000589371, Validation Loss: 0.1050691083073616, Validation RMSLE: 3.803440570831299
Epoch 4/50, Training Loss: 0.10524023324251175, Validation Loss: 0.10196571797132492, Validation RMSLE: 3.7468485832214355
Epoch 5/50, Training Loss: 0.10213664174079895, Validation Loss: 0.09893510490655899, Validation RMSLE: 3.690746784210205
Epoch 6/50, Training Loss: 0.0991058275103569, Validation Loss: 0.0959700495004654, Validation RMSLE: 3.6350207328796387
Epoch 7/50, Training Loss: 0.09614058583974838, Validation Loss: 0.09306420385837555, Validation RMSLE: 3.57956600189209
Epoch 8/50, Training Loss: 0.09323455393314362, Validation Loss: 0.09021230787038803, Validation RMSLE: 3.524292469024658
Epoch 9/50, Training Loss: 