In [None]:
import pickle
from google.colab import drive

# Mount Google Drive so files under /content/drive are reachable from this runtime.
drive.mount('/content/drive')

# Load the saved EDA data.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open('/content/drive/MyDrive/EDA_Data.pkl', 'rb') as eda_file:
    store_train = pickle.load(eda_file)

print("EDA data loaded successfully for ML pipeline!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
EDA data loaded successfully for ML pipeline!


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler ,LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
import joblib
import requests
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import logging
import warnings

In [None]:
# Logging setup: INFO level, each record rendered as "<timestamp> - <level> - <message>".
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# getLogger() without a name returns the root logger.
logger = logging.getLogger()
logger.info("Libraries loaded and logging initialized.")

In [None]:
# Handle missing values: replace NaNs in every numeric column with that column's median.
# NOTE(review): the imputer is fit on the whole frame, before the train/test split
# performed later in this notebook - confirm that this is intended.
numeric_frame = store_train.select_dtypes(include=np.number)
num_cols = numeric_frame.columns

# Single-step pipeline wrapping the median imputer.
median_imputer = SimpleImputer(strategy='median')
imputer_pipeline = Pipeline(steps=[('imputer', median_imputer)])

imputed_values = imputer_pipeline.fit_transform(store_train[num_cols])
store_train[num_cols] = imputed_values
logger.info("Missing values handled using median imputation.")


In [None]:
# Parse 'Date' into datetime values so the .dt accessor can be used below.
store_train['Date'] = pd.to_datetime(store_train['Date'])
date_parts = store_train['Date'].dt

# Plain calendar components, written in the same column order as before.
# 'DayOfWeek' uses the pandas convention (Monday=0 ... Sunday=6); assigning it
# overwrites any column of that name already present in store_train.
simple_components = (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('DayOfWeek', 'dayofweek'),
)
for feature_name, dt_attribute in simple_components:
    store_train[feature_name] = getattr(date_parts, dt_attribute)

# Saturday (5) and Sunday (6) are flagged as weekend.
weekend_mask = store_train['DayOfWeek'].isin([5, 6])
store_train['IsWeekend'] = weekend_mask.astype(int)

# ISO-8601 week number, cast to a plain integer dtype.
iso_calendar = date_parts.isocalendar()
store_train['WeekOfYear'] = iso_calendar.week.astype(int)

# Boolean month-boundary flags stored as 0/1 integers.
store_train['IsMonthStart'] = date_parts.is_month_start.astype(int)
store_train['IsMonthEnd'] = date_parts.is_month_end.astype(int)

logger.info("Datetime features extracted from 'Date'.")

In [None]:
# Fill missing competition distances with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# triggers a pandas FutureWarning and, under Copy-on-Write (pandas 3.0),
# only modifies a temporary copy and leaves store_train unchanged.
competition_median = store_train['CompetitionDistance'].median()
store_train['CompetitionDistance'] = store_train['CompetitionDistance'].fillna(competition_median)

# Encode categorical columns as indicator columns; drop_first=True omits the
# first level of each variable.
store_train = pd.get_dummies(
    store_train,
    columns=['StoreType', 'Assortment', 'StateHoliday', 'PromoInterval'],
    drop_first=True,
)

# Drop unnecessary columns. Rebinding the result (rather than inplace=True)
# matches the get_dummies call above and avoids in-place mutation.
store_train = store_train.drop(columns=['Date', 'Customers'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  store_train['CompetitionDistance'].fillna(store_train['CompetitionDistance'].median(), inplace=True)


In [None]:
# Prepare features and target #

In [None]:


# Separate the target ('Sales') from the remaining columns, which become the features.
y = store_train['Sales']
X = store_train.drop(columns='Sales')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Scale the features using StandardScaler #
# The scaler is fit on the training split only; the same fitted transform is
# then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Preprocessing: route columns to a transformer according to their dtype.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=np.number).columns

# Numeric columns are standardized; object columns are one-hot encoded, with
# categories unseen during fit ignored at transform time.
# NOTE(review): columns that are neither numeric nor object (e.g. bool) are not
# listed here and ColumnTransformer drops unlisted columns by default - confirm
# the dtype of the indicator columns in X.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:

# Sanity check: dimensions and container types of the feature matrix and the target.
shape_summary = (X.shape, y.shape, type(X), type(y))
print(*shape_summary)

(1017209, 31) (1017209,) <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [None]:
# Build Sklearn Pipeline #

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Create Random Forest model
rf_params = {
    'n_estimators': 100,    # number of trees in the forest
    'max_depth': 12,        # cap on the depth of each tree
    'min_samples_leaf': 5,  # minimum number of samples required at a leaf
    'random_state': 42,     # fixed seed for repeatable fits
    'n_jobs': -1,           # use all available processors
}
model = RandomForestRegressor(**rf_params)




In [None]:
# Train the configured Random Forest on the unscaled training split.
model.fit(X=X_train, y=y_train)

In [None]:

import datetime

# Chain standardization with a 50-tree Random Forest so both steps are fit
# together on the training split. This forest is a separate estimator from the
# standalone `model` created in an earlier cell.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=50, random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)
pipeline.fit(X_train, y_train)

# Evaluate on the held-out split: RMSE is the square root of the mean squared error.
preds = pipeline.predict(X_test)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print("RMSE:", rmse)



In [None]:
# Coefficient of determination of the pipeline's predictions on the held-out
# split, printed to two decimal places.
r2 = r2_score(y_true=y_test, y_pred=preds)
print(f"R^2 Score: {r2:.2f}")


NameError: name 'preds' is not defined

In [None]:
# Choose and Defend Loss Function (Task 2.3) #
# RMSE (Root Mean Squared Error) is chosen because:
# - It penalizes large errors more heavily than small ones, which is useful for business-sensitive tasks like sales forecasting.
# - It gives more weight to extreme mispredictions, which better reflects risk.
# - It is in the same units as the target (Sales), making it easier to interpret.




In [None]:
#Serialize Model #

In [None]:
# Timestamped filename (second resolution) for the serialized artifact.
saved_at = datetime.datetime.now()
timestamp = saved_at.strftime("%Y-%m-%d-%H-%M-%S")
model_Ml = f'model-{timestamp}.pkl'

# Persist the whole fitted pipeline (scaler and model steps) with joblib; the
# relative filename resolves against the current working directory.
joblib.dump(value=pipeline, filename=model_Ml)
print(f"Serialized model saved as {model_Ml}")
