<a href="https://colab.research.google.com/github/helinatefera/10xWeek4/blob/task-2/notebooks/task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
import datetime as datetime

In [5]:
clean_data = pd.read_csv('/content/clean_data.csv')

In [6]:
clean_data.head()

Unnamed: 0,Date,Store,Store_Type,Store_Status,Promo,Promo2,School_Holiday,Customers,Sales,DayOfWeek,is_holiday
0,2021-01-01,1,supermarket,1,0,0,0,124,0,4,1
1,2021-01-02,1,pharmacy,1,0,0,0,87,0,5,0
2,2021-01-03,1,supermarket,1,1,0,0,74,778,6,0
3,2021-01-04,1,supermarket,0,1,0,0,0,0,0,0
4,2021-01-05,1,pharmacy,1,1,0,0,76,1002,1,0


In [9]:
X = clean_data.drop("Sales", axis=1)
y = clean_data["Sales"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor(n_estimators=50, random_state=42)),
    ]
)

pipeline.fit(X_train, y_train)

cross_val_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
print(f"Cross-validated MAE: {-cross_val_scores.mean()}")

Cross-validated MAE: 50.11510273972603


In [10]:
clean_data['Date'] = pd.to_datetime(clean_data['Date'])
clean_data['Year'] = clean_data['Date'].dt.year
clean_data['Month'] = clean_data['Date'].dt.month
clean_data['WeekOfYear'] = clean_data['Date'].dt.isocalendar().week
clean_data['DayOfMonth'] = clean_data['Date'].dt.day
clean_data['IsWeekend'] = clean_data['DayOfWeek'].isin([6, 7]).astype(int)  # 1 for Saturday and Sunday
clean_data['IsMonthStart'] = clean_data['Date'].dt.is_month_start.astype(int)
clean_data['IsMonthEnd'] = clean_data['Date'].dt.is_month_end.astype(int)
clean_data['PromoDuration'] = clean_data.groupby('Store')['Promo'].cumsum()  # Count consecutive promo days
clean_data['PromoOverlap'] = ((clean_data['Promo'] == 1) & (clean_data['Promo2'] == 1)).astype(int)

In [11]:
clean_data = clean_data.sort_values("Date")
clean_data = clean_data[["Date", "Sales"]].set_index("Date")

In [37]:
clean_data["Sales"] = pd.to_numeric(clean_data["Sales"], errors="coerce")
clean_data = clean_data.dropna()

In [38]:
result = adfuller(clean_data["Sales"])
print(f"ADF Statistic: {result[0]}")
print(f"p-value: {result[1]}")
if result[1] > 0.05:
    print("clean_data is not stationary. Differencing the clean_data.")
    clean_data["Sales_diff"] = clean_data["Sales"].diff().dropna()
else:
    print("clean_data is stationary.")

ADF Statistic: -105.63828297796654
p-value: 0.0
clean_data is stationary.


In [39]:
def create_supervised_data(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

In [43]:
window_size = 30
scaler = MinMaxScaler(feature_range=(-1, 1))
data_scaled = scaler.fit_transform(clean_data["Sales"].values.reshape(-1, 1))

In [44]:
X, y = create_supervised_data(data_scaled.flatten(), window_size)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32)
X_test, y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)

In [52]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [53]:
input_size = 1
hidden_size = 50
num_layers = 2
output_size = 1

model = LSTMModel(input_size, hidden_size, num_layers, output_size)

In [73]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

batch_size = 64
epochs = 10

train_data = DataLoader(TensorDataset(X_train.unsqueeze(2), y_train), batch_size=batch_size, shuffle=True)
test_data = DataLoader(TensorDataset(X_test.unsqueeze(2), y_test), batch_size=batch_size)

In [74]:
for epoch in range(epochs):
    model.train()
    for batch_X, batch_y in train_data:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.unsqueeze(1))
        loss.backward()
        optimizer.step()

    model.eval()
    test_loss = 0
    with torch.no_grad():
        for batch_X, batch_y in test_data:
            outputs = model(batch_X)
            test_loss += criterion(outputs, batch_y.unsqueeze(1)).item()

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {loss.item():.4f}, Test Loss: {test_loss/len(test_data):.4f}")

Epoch 1/10, Train Loss: 0.2916, Test Loss: 0.2625
Epoch 2/10, Train Loss: 0.2534, Test Loss: 0.2622
Epoch 3/10, Train Loss: 0.1480, Test Loss: 0.2626
Epoch 4/10, Train Loss: 0.3879, Test Loss: 0.2640
Epoch 5/10, Train Loss: 0.2459, Test Loss: 0.2636
Epoch 6/10, Train Loss: 0.2719, Test Loss: 0.2632
Epoch 7/10, Train Loss: 0.2687, Test Loss: 0.2627
Epoch 8/10, Train Loss: 0.1946, Test Loss: 0.2634
Epoch 9/10, Train Loss: 0.2411, Test Loss: 0.2633
Epoch 10/10, Train Loss: 0.2647, Test Loss: 0.2627


In [75]:
model.eval()
with torch.no_grad():
    predictions = model(X_test.unsqueeze(2)).squeeze().numpy()
    print("Predictions:", predictions)
    print("True Values:", y_test)
    true_values = y_test.numpy()

Predictions: [-0.6728376  -0.6364746  -0.65823865 ... -0.6490074  -0.6247119
 -0.6805071 ]
True Values: tensor([-1.0000, -1.0000, -0.7881,  ...,  0.1076,  0.0809, -1.0000])


In [77]:
input_features = [0.1, 0.2, 0.15, 0.25, 0.3]

features = np.array(input_features, dtype=np.float32)
features_tensor = torch.tensor(features).view(1, -1, 1)
with torch.no_grad():
    prediction = model(features_tensor)
print(f"Predicted sales: {prediction.item()}")

Predicted sales: -0.6936050653457642
