<a href="https://colab.research.google.com/github/kaisarfardin6620/Sales-Forecasting-Regression/blob/main/Sales_Forecasting_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pycaret[full]
!pip install pycaret[mlops]
!pip install pycaret[time-series]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import StandardScaler
from pycaret.regression import setup, compare_models, create_model, tune_model, evaluate_model, finalize_model

In [None]:
# Colab-only: mount the user's Google Drive under /content/drive so the dataset
# path used by the next cell resolves. This cell will fail outside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the raw sales data from the mounted Drive.
# NOTE(review): the path is hard-coded to one user's Drive layout; anyone else
# must adjust it before running the notebook.
df =  pd.read_csv('/content/drive/MyDrive/Dataset/sales.csv')

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.head(3)

In [None]:
df.tail(3)

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Parse the date column once and derive every calendar feature from it.
df['Date'] = pd.to_datetime(df['Date'])
date_col = df['Date']

df['Month'] = date_col.dt.month
df['Quarter'] = date_col.dt.quarter

# Days elapsed since the earliest date present in the data.
reference_date = date_col.min()
df['DaysSince'] = (date_col - reference_date).dt.days

# Cyclical encoding of the month: sine/cosine of the month's angle on a
# 12-step circle.
month_angle = 2 * np.pi * df['Month'] / 12
df['Month_sin'] = np.sin(month_angle)
df['Month_cos'] = np.cos(month_angle)

# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
df['DayOfWeek'] = date_col.dt.dayofweek
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)

In [None]:
# The raw timestamp is no longer needed once the calendar features exist.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps the
# cell idempotent: re-running it no longer raises KeyError on the missing column.
df = df.drop(columns='Date', errors='ignore')

In [None]:
df.head(3)

In [None]:
# Histograms of the DataFrame's columns, drawn as a grid on one 10x8 figure.
df.hist(figsize=(10, 8))
plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter-plot matrix of the DataFrame.
sns.pairplot(df)
# y=1.02 places the title slightly above the top of the figure.
plt.suptitle("Scatter Plot Matrix", y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# One box per column, all drawn on a shared axis.
fig_box, ax_box = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax_box)
ax_box.set_title("Box Plots of Numerical Features")
# Slanted, right-anchored tick labels keep long column names legible.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# One violin per column, all drawn on a shared axis.
fig_violin, ax_violin = plt.subplots(figsize=(12, 6))
sns.violinplot(data=df, ax=ax_violin)
ax_violin.set_title("Violin Plots of Numerical Features")
# Slanted, right-anchored tick labels keep long column names legible.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Kernel-density plot per numeric column. The subplot grid is sized from the
# number of columns actually plotted, so adding features can no longer overflow
# a fixed 4x3 layout (pandas raises ValueError when the layout is too small).
density_df = df.select_dtypes(include=np.number)
density_grid_cols = 3
density_grid_rows = max(1, -(-density_df.shape[1] // density_grid_cols))  # ceil division

density_df.plot(
    kind='density',
    subplots=True,
    layout=(density_grid_rows, density_grid_cols),
    figsize=(15, 10),
    sharex=False,
)
plt.suptitle("Density Plots of Numerical Features", y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only.
corr_matrix = df.select_dtypes(include=np.number).corr()

fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
# Each cell is annotated with its coefficient rounded to two decimals.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title("Correlation Heatmap")
plt.show()

In [None]:
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

# Three plots per row. Ceil division gives exactly as many rows as needed;
# `len // 3 + 1` reserved a spare empty row whenever the column count was a
# multiple of three.
n_grid_cols = 3
n_grid_rows = -(-len(numerical_cols) // n_grid_cols)

plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(y=df[col])
    plt.title(f'Box Plot of {col}')
plt.tight_layout()
plt.show()

In [None]:
def remove_outliers_iqr_all(df, columns=None, factor=1.5):
    """Return a copy of ``df`` without rows that fall outside the IQR fences.

    For each selected column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``; rows outside them (or NaN in that column) are dropped.
    Columns are processed one after another, so the quartiles of a later column
    are computed on the rows that survived the earlier columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    columns : iterable of str, optional
        Columns to filter on. Defaults to every numeric column of ``df``.
    factor : float, optional
        Multiplier applied to the IQR when building the fences (default 1.5).

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``df`` (original index labels are kept).

    Notes
    -----
    A column whose IQR is 0 (for example a heavily imbalanced 0/1 flag) has
    both fences equal to its quartile value, so every row with a different
    value is removed. Pass ``columns`` explicitly to exclude such columns.
    """
    df_cleaned = df.copy()
    if columns is None:
        columns = df.select_dtypes(include=np.number).columns.tolist()

    for col in columns:
        q1, q3 = df_cleaned[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # `between` is inclusive on both ends, matching `>= lower & <= upper`.
        df_cleaned = df_cleaned[df_cleaned[col].between(lower_bound, upper_bound)]

    return df_cleaned

# Build an outlier-filtered frame; `df` itself is left untouched.
df_cleaned_all = remove_outliers_iqr_all(df.copy())

# Report how many rows the filter removed.
shape_before, shape_after = df.shape, df_cleaned_all.shape
print(f"Original DataFrame shape: {shape_before}")
print(f"DataFrame shape after removing outliers from all numerical columns: {shape_after}")

In [None]:
# Features are every column except the target; the target is 'Profit'.
# NOTE(review): both are taken from `df`, not from the outlier-filtered
# `df_cleaned_all` built in the previous cell - confirm that modelling on the
# unfiltered data is intended.
x = df.drop('Profit', axis=1)
y = df['Profit']

In [None]:
# 'Product' is the only column that gets one-hot encoded, and only when present.
categorical_cols = ['Product'] if 'Product' in x.columns else []
# Select every numeric dtype, not just float64/int64. On pandas 2.x the
# `.dt.month`, `.dt.quarter` and `.dt.dayofweek` accessors yield int32, so the
# narrower filter left Month/Quarter/DayOfWeek out of this list and they reached
# the models unscaled through the transformer's passthrough remainder.
numerical_cols = x.select_dtypes(include=np.number).columns.tolist()

In [None]:
# Categorical columns are one-hot encoded: the first level is dropped and
# categories unseen during fit are ignored at transform time.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
# Numeric columns are standardised.
numerical_step = ('num', StandardScaler(), numerical_cols)

# Columns in neither list are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[categorical_step, numerical_step],
    remainder='passthrough',
)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): this is a random split although several features are derived
# from the date - confirm that a chronological hold-out is not required.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
def evaluate_model(name, model, x_test, y_test):
    """Score a fitted regressor on held-out data, print a report, return the scores.

    Parameters
    ----------
    name : str
        Label used in the printed report and stored under the "Model" key.
    model : object
        Fitted estimator or pipeline exposing ``predict``.
    x_test, y_test :
        Held-out features and the matching true target values.

    Returns
    -------
    dict
        Keys "Model", "MSE", "RMSE", "MAE" and "R2".

    Notes
    -----
    This definition rebinds the name ``evaluate_model`` that the import cell
    also pulls in from ``pycaret.regression``.
    """
    predicted = model.predict(x_test)

    mse = mean_squared_error(y_test, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)

    report_lines = (
        f"Metrics for {name}",
        f"Mean Squared Error (MSE)      : {mse:.4f}",
        f"Root Mean Squared Error (RMSE) : {rmse:.4f}",
        f"Mean Absolute Error (MAE)     : {mae:.4f}",
        f"R-squared (R2)                : {r2:.4f}",
        "------------------------",
    )
    for line in report_lines:
        print(line)

    return {"Model": name, "MSE": mse, "RMSE": rmse, "MAE": mae, "R2": r2}

# Collects one score dict per evaluated model; re-running this cell empties it.
metrics = []

In [None]:
# Baseline: preprocessing followed by ordinary least squares.
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('Linearregressor', LinearRegression()),
])
pipeline_lr.fit(x_train_raw, y_train)

# Score on the hold-out split and record the result for the comparison chart.
lr_scores = evaluate_model("Linear Regression", pipeline_lr, x_test_raw, y_test)
metrics.append(lr_scores)

In [None]:
# Residuals = actual minus predicted on the hold-out split.
y_pred_lr = pipeline_lr.predict(x_test_raw)
residuals_lr = y_test - y_pred_lr

# Histogram (with KDE overlay) of the residuals.
fig_hist, ax_hist = plt.subplots(figsize=(8, 6))
sns.histplot(residuals_lr, kde=True, ax=ax_hist)
ax_hist.set_title('Residuals Distribution for Linear Regression')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
plt.show()

# Residuals against predictions; the dashed red line marks zero error.
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
ax_scatter.scatter(y_pred_lr, residuals_lr, alpha=0.7)
ax_scatter.axhline(y=0, color='r', linestyle='--')
ax_scatter.set_xlabel('Predicted Profit')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residuals vs. Predicted for Linear Regression')
plt.show()

In [None]:
# Untuned decision tree baseline. The seed is fixed so the "before tuning"
# scores are reproducible across runs and match the seeded estimator used in
# the tuning grid.
pipeline_dt = Pipeline([('preprocessor', preprocessor), ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42))])
pipeline_dt.fit(x_train_raw, y_train)
metrics.append(evaluate_model("DecisionTreeRegressor (Before Tuning)", pipeline_dt, x_test_raw, y_test))

In [None]:
# Residuals = actual minus predicted on the hold-out split.
y_pred_dt = pipeline_dt.predict(x_test_raw)
residuals_dt = y_test - y_pred_dt

# Histogram (with KDE overlay) of the residuals.
fig_hist, ax_hist = plt.subplots(figsize=(8, 6))
sns.histplot(residuals_dt, kde=True, ax=ax_hist)
ax_hist.set_title('Residuals Distribution for DecisionTreeRegressor')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
plt.show()

# Residuals against predictions; the dashed red line marks zero error.
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
ax_scatter.scatter(y_pred_dt, residuals_dt, alpha=0.7)
ax_scatter.axhline(y=0, color='r', linestyle='--')
ax_scatter.set_xlabel('Predicted Profit')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residuals vs. Predicted for DecisionTreeRegressor')
plt.show()

In [None]:
# Untuned random forest baseline. The seed is fixed so the "before tuning"
# scores are reproducible across runs and match the seeded estimator used in
# the tuning grid.
pipeline_rf = Pipeline([('preprocessor', preprocessor), ('RandomForestRegressor', RandomForestRegressor(random_state=42))])
pipeline_rf.fit(x_train_raw, y_train)
metrics.append(evaluate_model("RandomForestRegressor (Before Tuning)", pipeline_rf, x_test_raw, y_test))

In [None]:
# Residuals = actual minus predicted on the hold-out split.
y_pred_rf = pipeline_rf.predict(x_test_raw)
residuals_rf = y_test - y_pred_rf

# Histogram (with KDE overlay) of the residuals.
fig_hist, ax_hist = plt.subplots(figsize=(8, 6))
sns.histplot(residuals_rf, kde=True, ax=ax_hist)
ax_hist.set_title('Residuals Distribution for RandomForestRegressor')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
plt.show()

# Residuals against predictions; the dashed red line marks zero error.
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
ax_scatter.scatter(y_pred_rf, residuals_rf, alpha=0.7)
ax_scatter.axhline(y=0, color='r', linestyle='--')
ax_scatter.set_xlabel('Predicted Profit')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residuals vs. Predicted for RandomForestRegressor')
plt.show()

In [None]:
# Untuned support-vector regressor with library default hyper-parameters.
pipeline_svr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('SVR', SVR()),
])
pipeline_svr.fit(x_train_raw, y_train)

# Score on the hold-out split and record the result for the comparison chart.
svr_scores = evaluate_model("SVR (Before Tuning)", pipeline_svr, x_test_raw, y_test)
metrics.append(svr_scores)

In [None]:
# Residuals = actual minus predicted on the hold-out split.
y_pred_svr = pipeline_svr.predict(x_test_raw)
residuals_svr = y_test - y_pred_svr

# Histogram (with KDE overlay) of the residuals.
fig_hist, ax_hist = plt.subplots(figsize=(8, 6))
sns.histplot(residuals_svr, kde=True, ax=ax_hist)
ax_hist.set_title('Residuals Distribution for SVR')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
plt.show()

# Residuals against predictions; the dashed red line marks zero error.
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
ax_scatter.scatter(y_pred_svr, residuals_svr, alpha=0.7)
ax_scatter.axhline(y=0, color='r', linestyle='--')
ax_scatter.set_xlabel('Predicted Profit')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residuals vs. Predicted for SVR')
plt.show()

In [None]:
# Untuned k-nearest-neighbours regressor with library default hyper-parameters.
pipeline_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('KNeighborsRegressor', KNeighborsRegressor()),
])
pipeline_knn.fit(x_train_raw, y_train)

# Score on the hold-out split and record the result for the comparison chart.
knn_scores = evaluate_model("KNeighborsRegressor (Before Tuning)", pipeline_knn, x_test_raw, y_test)
metrics.append(knn_scores)

In [None]:
# Residuals = actual minus predicted on the hold-out split.
y_pred_knn = pipeline_knn.predict(x_test_raw)
residuals_knn = y_test - y_pred_knn

# Histogram (with KDE overlay) of the residuals.
fig_hist, ax_hist = plt.subplots(figsize=(8, 6))
sns.histplot(residuals_knn, kde=True, ax=ax_hist)
ax_hist.set_title('Residuals Distribution for KNeighborsRegressor')
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
plt.show()

# Residuals against predictions; the dashed red line marks zero error.
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
ax_scatter.scatter(y_pred_knn, residuals_knn, alpha=0.7)
ax_scatter.axhline(y=0, color='r', linestyle='--')
ax_scatter.set_xlabel('Predicted Profit')
ax_scatter.set_ylabel('Residuals')
ax_scatter.set_title('Residuals vs. Predicted for KNeighborsRegressor')
plt.show()

In [None]:
# Grouped bar chart: one group per model, one bar per metric.
# NOTE(review): all four metrics share a single y-axis; R2 cannot exceed 1, so
# it may be barely visible next to MSE when errors are large.
df_metrics = pd.DataFrame(metrics)
regression_metrics = ["MSE", "RMSE", "MAE", "R2"]
bar_width = 0.15
bar_positions = list(range(len(df_metrics["Model"])))

fig, ax = plt.subplots(figsize=(12, 8))

# Each metric is shifted right by one bar width relative to the previous one.
for i, metric in enumerate(regression_metrics):
    shifted_positions = [p + bar_width * i for p in bar_positions]
    ax.bar(shifted_positions, df_metrics[metric], width=bar_width, label=metric)

# Centre each tick under its group of bars.
tick_positions = [p + bar_width * (len(regression_metrics) - 1) / 2 for p in bar_positions]
ax.set_xticks(tick_positions)
ax.set_xticklabels(df_metrics["Model"], rotation=45, ha='right')

ax.set_xlabel('Models')
ax.set_ylabel('Scores')
ax.set_title('Performance Metrics for Different Models')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Candidate estimators for hyper-parameter tuning. Each entry maps a display
# name to the estimator instance ('model') and the grid of values to search
# ('params'). Parameter names are bare here; the tuning loop prefixes them with
# the pipeline step name.
models = {
    # Empty grid: nothing to tune, so only the default configuration is fitted.
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {}
    },
    # 4 x 3 = 12 combinations.
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            'max_depth': [None, 3, 5, 10],
            'min_samples_split': [2, 5, 10]
        }
    },
    # 3 x 3 x 2 = 18 combinations.
    'RandomForestRegressor': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5]
        }
    },
    # 2 x 3 x 3 = 18 combinations.
    'SVR': {
        'model': SVR(),
        'params': {
            'kernel': ['linear', 'rbf'],
            'C': [0.1, 1, 10],
            'epsilon': [0.1, 0.2, 0.5]
        }
    },
    # 3 x 2 x 2 = 12 combinations.
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    }
}

In [None]:
# Tune every candidate with 5-fold grid search, keep the refitted best
# pipeline, then report hold-out metrics and an actual-vs-predicted plot.
best_models = {}

for name, mp in models.items():
    print(f"\nTraining and tuning {name}...")

    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', mp['model']),
    ])

    # Route each hyper-parameter to the 'regressor' step of the pipeline.
    param_grid = {'regressor__' + key: val for key, val in mp['params'].items()}

    grid = GridSearchCV(
        pipe,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid.fit(x_train_raw, y_train)

    best_models[name] = grid.best_estimator_
    print(f"Best parameters for {name}: {grid.best_params_}")

    # Hold-out evaluation of the best configuration.
    y_pred = grid.predict(x_test_raw)

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    print(f"{name} Performance on Test Data:")
    print(f"R²: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")

    # The dashed diagonal marks perfect predictions; it spans the range of the
    # full target `y`, not only the test split.
    target_range = [y.min(), y.max()]
    plt.figure(figsize=(6, 4))
    plt.scatter(y_test, y_pred, alpha=0.7)
    plt.plot(target_range, target_range, 'r--')
    plt.xlabel('Actual Profit')
    plt.ylabel('Predicted Profit')
    plt.title(f'{name} - Actual vs Predicted Profit')
    plt.show()

In [None]:
# Initialise a PyCaret regression experiment on the full engineered frame with
# 'Profit' as the target.
# NOTE(review): `df` (not the train split above) is passed in, so PyCaret
# presumably performs its own train/test split - confirm. session_id is
# presumably the experiment seed - verify against the PyCaret docs.
reg_setup = setup(data = df, target = 'Profit', session_id=123)

In [None]:
# Let PyCaret compare its candidate regressors and keep what it returns as
# `best_model` for the evaluation cell further down.
best_model = compare_models()

In [None]:
# Re-import PyCaret's evaluate_model: the notebook's own
# evaluate_model(name, model, x_test, y_test) helper defined earlier rebound the
# name that the top import cell brought in. After this cell runs, that helper is
# no longer reachable under this name.
from pycaret.regression import evaluate_model

In [None]:
# Call PyCaret's evaluate_model through an explicit alias. The bare name
# `evaluate_model` is also used by this notebook's own 4-argument scoring
# helper, so which function it refers to depends on cell execution order; the
# alias makes this cell work regardless of that order.
from pycaret.regression import evaluate_model as pycaret_evaluate_model

pycaret_evaluate_model(best_model)