In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


In [2]:
import os
import pandas as pd

# Function to read CSV normally
def read_csv_clean(file_path):
    df = pd.read_csv(file_path)  # no need to drop anything
    return df

# List of CSV files
csv_files = [
    "/Users/karma/Desktop/store-sales-time-series-forecasting/holidays_events.csv",
    "/Users/karma/Desktop/store-sales-time-series-forecasting/train.csv",
    "/Users/karma/Desktop/store-sales-time-series-forecasting/stores.csv"
]

# Dictionary to store DataFrames
dataframes = {}

# Loop to read each CSV and print info + head
for file in csv_files:
    name = os.path.basename(file).split('.')[0]
    df = read_csv_clean(file)
    dataframes[name] = df
    
    print(f"\n=== {name} ===")
    print("\n.info():")
    print(df.info())
    print("\n.head():")
    print(df.head())



=== holidays_events ===

.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         350 non-null    object
 1   type         350 non-null    object
 2   locale       350 non-null    object
 3   locale_name  350 non-null    object
 4   description  350 non-null    object
 5   transferred  350 non-null    bool  
dtypes: bool(1), object(5)
memory usage: 14.1+ KB
None

.head():
         date     type    locale locale_name                    description  \
0  2012-03-02  Holiday     Local       Manta             Fundacion de Manta   
1  2012-04-01  Holiday  Regional    Cotopaxi  Provincializacion de Cotopaxi   
2  2012-04-12  Holiday     Local      Cuenca            Fundacion de Cuenca   
3  2012-04-14  Holiday     Local    Libertad      Cantonizacion de Libertad   
4  2012-04-21  Holiday     Local    Riobamba      Cantonizacion de Riobam

In [3]:
#Type does not affect the sales so I will drop one to avoid complications.
dataframes['stores'] = dataframes['stores'].drop(columns=['type'])

In [4]:
dataframes["stores"].head()

Unnamed: 0,store_nbr,city,state,cluster
0,1,Quito,Pichincha,13
1,2,Quito,Pichincha,13
2,3,Quito,Pichincha,8
3,4,Quito,Pichincha,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,4


In [5]:
# Ensure date columns are datetime
for df_name in ['train', 'holidays_events']:
    dataframes[df_name]['date'] = pd.to_datetime(dataframes[df_name]['date'])

# Merge train with stores
df_combined = pd.merge(
    dataframes['train'], 
    dataframes['stores'], 
    on='store_nbr', 
    how='left'
)

# Merge with holidays_events
df_combined = pd.merge(
    df_combined, 
    dataframes['holidays_events'], 
    on='date', 
    how='left'
)



In [6]:
df_combined.head()


Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,cluster,type,locale,locale_name,description,transferred
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,13,Holiday,National,Ecuador,Primer dia del ano,False
1,1,2013-01-01,1,BABY CARE,0.0,0,Quito,Pichincha,13,Holiday,National,Ecuador,Primer dia del ano,False
2,2,2013-01-01,1,BEAUTY,0.0,0,Quito,Pichincha,13,Holiday,National,Ecuador,Primer dia del ano,False
3,3,2013-01-01,1,BEVERAGES,0.0,0,Quito,Pichincha,13,Holiday,National,Ecuador,Primer dia del ano,False
4,4,2013-01-01,1,BOOKS,0.0,0,Quito,Pichincha,13,Holiday,National,Ecuador,Primer dia del ano,False


In [7]:
df_combined.isnull().sum()

id                   0
date                 0
store_nbr            0
family               0
sales                0
onpromotion          0
city                 0
state                0
cluster              0
type           2551824
locale         2551824
locale_name    2551824
description    2551824
transferred    2551824
dtype: int64

In [8]:
#I want to work on type(working/holiday) because generally this affects a lot on sales.
#Transferred column makes my analysis a bit complicate, I will see how many True values are present and decide to keep it remove it.
# ----------------------------
# 1. Inspect 'transferred' column
# ----------------------------
print("Transferred value counts:")
print(df_combined["transferred"].value_counts(dropna=False))

# ----------------------------
# 2. Create 'day_off' based on holiday/event type
# ----------------------------
day_off_categories = ["Holiday", "Event", "Additional", "Bridge"]
df_combined["day_off"] = df_combined["type"].isin(day_off_categories)

# ----------------------------
# 3. Copy combined DataFrame
# ----------------------------
df_final = df_combined.copy()

# ----------------------------
# 4. Drop unnecessary columns
# ----------------------------
drop_cols = [
    "store_nbr", "family", "city", "state", "cluster",
    "type", "locale", "locale_name", "description", "transferred"
]
df_final.drop(columns=drop_cols, inplace=True, errors='ignore')  # errors='ignore' if column missing

# ----------------------------
# 5. Add date features
# ----------------------------
df_final['date'] = pd.to_datetime(df_final['date'])
df_final['day'] = df_final['date'].dt.day
df_final['month'] = df_final['date'].dt.month
df_final['weekday'] = df_final['date'].dt.weekday

# ----------------------------
# 6. Check final dataset
# ----------------------------
print("\nFinal dataset info:")
print(df_final.info())
print("\nFinal dataset preview:")
print(df_final.tail())



Transferred value counts:
transferred
NaN      2551824
False     486486
True       16038
Name: count, dtype: int64

Final dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3054348 entries, 0 to 3054347
Data columns (total 8 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   sales        float64       
 3   onpromotion  int64         
 4   day_off      bool          
 5   day          int32         
 6   month        int32         
 7   weekday      int32         
dtypes: bool(1), datetime64[ns](1), float64(1), int32(3), int64(2)
memory usage: 131.1 MB
None

Final dataset preview:
              id       date     sales  onpromotion  day_off  day  month  \
3054343  3000883 2017-08-15   438.133            0     True   15      8   
3054344  3000884 2017-08-15   154.553            1     True   15      8   
3054345  3000885 2017-08-15  2419.729          148     True   15      8   
305

In [9]:
# Aggregate sales, promotions, and day_off per id per date
df_daily = df_final.groupby(['date']).agg({
    'sales': 'sum',          # total sales per id per date
    'onpromotion': 'sum',    # total promotions per id per date
    'day_off': 'max',        # if any row is day_off, mark as 1
    'day': 'first',          # keep day
    'month': 'first',        # keep month
    'weekday': 'first'       # keep weekday
}).reset_index()

# Rename column
df_daily.rename(columns={'sales': 'total_sales'}, inplace=True)

# Convert day_off to integer
df_daily['day_off'] = df_daily['day_off'].astype(int)

# Sort by id and date
df_daily = df_daily.sort_values(['date']).reset_index(drop=True)

# Preview
print(df_daily.head(20))


         date    total_sales  onpromotion  day_off  day  month  weekday
0  2013-01-01    2511.618999            0        1    1      1        1
1  2013-01-02  496092.417944            0        0    2      1        2
2  2013-01-03  361461.231124            0        0    3      1        3
3  2013-01-04  354459.677093            0        0    4      1        4
4  2013-01-05  477350.121229            0        0    5      1        5
5  2013-01-06  519695.401088            0        0    6      1        6
6  2013-01-07  336122.801066            0        0    7      1        0
7  2013-01-08  318347.777981            0        0    8      1        1
8  2013-01-09  302530.809018            0        0    9      1        2
9  2013-01-10  258982.003049            0        0   10      1        3
10 2013-01-11  289737.685085            0        0   11      1        4
11 2013-01-12  403258.212011            0        0   12      1        5
12 2013-01-13  464638.547998            0        0   13      1  

In [10]:
df_daily.isnull().sum()

date           0
total_sales    0
onpromotion    0
day_off        0
day            0
month          0
weekday        0
dtype: int64

In [11]:
#Lets make sure to sort the dataframe on date, so that it can be fed to LSTM.
df_daily = df_daily.sort_values('date')

In [12]:
# Features and target
features = ['onpromotion', 'day_off', 'day', 'month', 'weekday']
target = 'total_sales'

In [13]:
# Encode boolean 0/1
df_daily['day_off'] = df_daily['day_off'].astype(int)

In [14]:
# Scale features and target to make sure biasness is avoided against smaller numbers. Also, activation fn used in LSTM outputs values between 0 to 1 or -1 to 1, so helps to converge faster by using scaler.

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(df_daily[features])
y_scaled = scaler_y.fit_transform(df_daily[[target]])

In [15]:
#Create a sequence that is acceptable to LSTM nn (3D input(samples, timesteps, features))
def create_sequences(X, y, seq_length=30):
    """
    Create sequences of features and target for LSTM
    seq_length: number of past days to use for prediction
    """
    X_seq, y_seq = [], []
    for i in range(len(X) - seq_length):
        X_seq.append(X[i:i+seq_length])
        y_seq.append(y[i+seq_length])
    return np.array(X_seq), np.array(y_seq)

SEQ_LENGTH = 30  # use past 30 days to predict next day
X_lstm, y_lstm = create_sequences(X_scaled, y_scaled, SEQ_LENGTH)

print("X_lstm shape:", X_lstm.shape)
print("y_lstm shape:", y_lstm.shape)

X_lstm shape: (1654, 30, 5)
y_lstm shape: (1654, 1)


In [16]:
#LSTMs learn from the ordered sequence of past values instead of treating each day as independent.
#Therefore, we have 1654 training sequences of 30 days each.

In [17]:
#Input:  30 past days of features → Output: sales on day 31


In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
import numpy as np
import os

# =============================
# Device setup
# =============================
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("✅ Using Apple MPS:", device)
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("✅ Using NVIDIA GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("💻 Using CPU only")

# =============================
# Example LSTM model
# =============================
class SalesLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(SalesLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]     # last timestep output
        out = self.fc(out)
        return out

# =============================
# Early Stopping
# =============================
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.0, save_path="best_model.pth"):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float("inf")
        self.early_stop = False
        self.save_path = save_path

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.save_path)  # Save best weights
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# =============================
# Hyperparameters
# =============================
input_size = 5      # number of features
hidden_size = 64
num_layers = 2
output_size = 1
num_epochs = 50
batch_size = 64
learning_rate = 0.001

# =============================
# Train/Validation split
# (Assumes X_lstm, y_lstm already created)
# =============================
X_train, X_val, y_train, y_val = train_test_split(X_lstm, y_lstm, test_size=0.2, shuffle=False)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
X_val_tensor   = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor   = torch.tensor(y_val, dtype=torch.float32).to(device)

train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=batch_size, shuffle=False)

# =============================
# Define model, loss, optimizer
# =============================
model = SalesLSTM(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

early_stopping = EarlyStopping(patience=7, min_delta=1e-4, save_path="best_lstm_model.pth")

# =============================
# TensorBoard Writer
# =============================
log_dir = "runs/sales_lstm"
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir)

# =============================
# Training Loop
# =============================
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Training
    for X_batch, y_batch in train_loader:
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    # ✅ Log to TensorBoard
    writer.add_scalars("Loss", {"Train": avg_train_loss, "Validation": avg_val_loss}, epoch)

    print(f"Epoch [{epoch+1}/{num_epochs}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # Early stopping
    early_stopping(avg_val_loss, model)
    if early_stopping.early_stop:
        print(f"⏹️ Early stopping at epoch {epoch+1}")
        break

# Close TensorBoard writer
writer.close()

# =============================
# Load Best Model After Training
# =============================
model.load_state_dict(torch.load("best_lstm_model.pth"))
print("✅ Best model loaded from checkpoint")


✅ Using Apple MPS: mps
Epoch [1/50] | Train Loss: 0.0157 | Val Loss: 0.0162
Epoch [2/50] | Train Loss: 0.0059 | Val Loss: 0.0087
Epoch [3/50] | Train Loss: 0.0054 | Val Loss: 0.0091
Epoch [4/50] | Train Loss: 0.0051 | Val Loss: 0.0076
Epoch [5/50] | Train Loss: 0.0049 | Val Loss: 0.0062
Epoch [6/50] | Train Loss: 0.0045 | Val Loss: 0.0055
Epoch [7/50] | Train Loss: 0.0041 | Val Loss: 0.0078
Epoch [8/50] | Train Loss: 0.0041 | Val Loss: 0.0086
Epoch [9/50] | Train Loss: 0.0039 | Val Loss: 0.0084
Epoch [10/50] | Train Loss: 0.0037 | Val Loss: 0.0081
Epoch [11/50] | Train Loss: 0.0036 | Val Loss: 0.0089
Epoch [12/50] | Train Loss: 0.0036 | Val Loss: 0.0061
Epoch [13/50] | Train Loss: 0.0036 | Val Loss: 0.0060
⏹️ Early stopping at epoch 13
✅ Best model loaded from checkpoint


In [20]:
#model structure
model = SalesLSTM(input_size, hidden_size, num_layers, output_size).to(device)

# Load saved weights
model.load_state_dict(torch.load("best_lstm_model.pth"))
model.eval()  # set to evaluation mode
print("✅ Model loaded for forecasting")


✅ Model loaded for forecasting


In [21]:
#Lets load the saved model for forecasting
#first prepare the input as expected by the model
print("X_lstm shape:", X_lstm.shape)
print("y_lstm shape:", y_lstm.shape)

X_lstm shape: (1654, 30, 5)
y_lstm shape: (1654, 1)


In [25]:
#take the last sequence from X_lstm
last_sequence = X_lstm[-1]
print(last_sequence.shape)
last_sequence = torch.tensor(last_sequence, dtype=torch.float32).to(device)
# Add batch dimension: (1, seq_length, input_size)
last_sequence = last_sequence.unsqueeze(0)

print(last_sequence.shape)

(30, 5)
torch.Size([1, 30, 5])


In [26]:
with torch.no_grad():
    next_day_scaled = model(last_sequence)
print("Next day scaled prediction:", next_day_scaled.item())


Next day scaled prediction: 0.22807924449443817


In [27]:
# Convert prediction to numpy
next_day_scaled_np = next_day_scaled.cpu().numpy()  # shape (1,1)

# Inverse transform using scaler_y
next_day = scaler_y.inverse_transform(next_day_scaled_np)

print("Next day forecast (original scale):", next_day[0][0])



Next day forecast (original scale): 749544.3
