In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import VotingRegressor, BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

In [30]:
# Read the raw sales records from data.csv (relative path, resolved against the working directory)
data = pd.read_csv(filepath_or_buffer='data.csv')

In [32]:
# Preview the first five rows to sanity-check the loaded columns
data.head(n=5)

Unnamed: 0,Date,Product Name,Category,Units Sold,Price,Revenue,Discount,Units Returned,Location,Platform
0,2020-01-06,Whey Protein,Protein,143,31.98,4573.14,0.03,2,Canada,Walmart
1,2020-01-06,Vitamin C,Vitamin,139,42.51,5908.89,0.04,0,UK,Amazon
2,2020-01-06,Fish Oil,Omega,161,12.91,2078.51,0.25,0,Canada,Amazon
3,2020-01-06,Multivitamin,Vitamin,140,16.07,2249.8,0.08,0,Canada,Walmart
4,2020-01-06,Pre-Workout,Performance,157,35.47,5568.79,0.25,3,Canada,iHerb


In [34]:
# Derive calendar features (Year, Month) from the sale date.
# The guard keeps this cell re-runnable: after the first execution 'Date' has already
# been dropped, so a second run skips the derivation instead of raising KeyError.
if 'Date' in data.columns:
    sale_dates = pd.to_datetime(data['Date'])
    data = data.assign(Year=sale_dates.dt.year, Month=sale_dates.dt.month)

# Drop columns that are not used as model inputs.
# errors='ignore' makes the drop a no-op when the columns are already gone (re-run case).
data = data.drop(columns=['Product Name', 'Date'], errors='ignore')

# Define features and target
numerical_cols = ['Units Sold', 'Price', 'Discount', 'Units Returned', 'Year', 'Month']
categorical_cols = ['Category', 'Location', 'Platform']
features = numerical_cols + categorical_cols
target = 'Revenue'

# NOTE(review): in the previewed rows Revenue equals Units Sold × Price
# (e.g. 143 × 31.98 = 4573.14), so the target looks like a deterministic function of
# two of the features. Confirm this is intended before reading much into the R² scores.

# Split features and target; selecting by explicit column list ignores any other columns
X = data[features]
y = data[target]

In [36]:
# Preprocessing: standardize the numerical columns and one-hot encode the categorical ones.
# drop='first' omits one indicator column per categorical feature;
# sparse_output=False makes the encoder return a dense array.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

# Hold out 20% of the rows for testing; the raw splits keep the original row index
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the transformers on the training split only, then apply them to the test split
train_matrix = preprocessor.fit_transform(X_train_raw)
test_matrix = preprocessor.transform(X_test_raw)

# Rebuild column names: numerical columns first, then the encoder's generated names,
# matching the order in which the transformers are listed above
cat_encoder = preprocessor.named_transformers_['cat']
cat_feature_names = cat_encoder.get_feature_names_out(categorical_cols)
feature_names = numerical_cols + list(cat_feature_names)

# Wrap the arrays in DataFrames. Passing the original index keeps X_train/X_test
# label-aligned with y_train/y_test; without it the frames get a fresh 0..n-1 index
# and any later index-based pandas operation against y would silently misalign rows.
X_train = pd.DataFrame(train_matrix, columns=feature_names, index=X_train_raw.index)
X_test = pd.DataFrame(test_matrix, columns=feature_names, index=X_test_raw.index)

In [38]:
# Baseline: plain linear regression fitted on the preprocessed training split
base_lr = LinearRegression().fit(X_train, y_train)

# Score the baseline on the held-out split
y_pred_lr = base_lr.predict(X_test)
mse_lr, r2_lr = mean_squared_error(y_test, y_pred_lr), r2_score(y_test, y_pred_lr)
print(f" Base LR - MSE: {mse_lr:.2f}, R²: {r2_lr:.4f}")



 Base LR - MSE: 28927.00, R²: 0.9937


In [40]:
# Voting ensemble over three linear models: the baseline, Ridge and Lasso
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.1)

voting_model = VotingRegressor(
    estimators=[('base_lr', base_lr), ('ridge', ridge), ('lasso', lasso)]
).fit(X_train, y_train)

# Evaluate the ensemble on the held-out split
y_pred_voting = voting_model.predict(X_test)
mse_voting, r2_voting = (
    mean_squared_error(y_test, y_pred_voting),
    r2_score(y_test, y_pred_voting),
)

print(f"📊 Voting Regressor - MSE: {mse_voting:.2f}, R²: {r2_voting:.4f}")


📊 Voting Regressor - MSE: 28923.44, R²: 0.9937


In [42]:
# Bagging: 10 linear regressions, each fitted on a sample of 80% of the training rows
bagging_model = BaggingRegressor(
    estimator=LinearRegression(), n_estimators=10, max_samples=0.8, random_state=42
).fit(X_train, y_train)

# Evaluate on the held-out split
y_pred_bagging = bagging_model.predict(X_test)
mse_bagging, r2_bagging = (
    mean_squared_error(y_test, y_pred_bagging),
    r2_score(y_test, y_pred_bagging),
)

print(f"📦 Bagging Regressor - MSE: {mse_bagging:.2f}, R²: {r2_bagging:.4f}")

📦 Bagging Regressor - MSE: 29009.94, R²: 0.9937


In [44]:
# Boosting: AdaBoost with a linear regression as the base estimator
# (up to 50 boosting rounds, learning rate 0.5)
ada_lin = AdaBoostRegressor(
    estimator=LinearRegression(), n_estimators=50, learning_rate=0.5, random_state=42
).fit(X_train, y_train)

# Evaluate on the held-out split
y_pred_ada_lin = ada_lin.predict(X_test)
mse_ada_lin, r2_ada_lin = (
    mean_squared_error(y_test, y_pred_ada_lin),
    r2_score(y_test, y_pred_ada_lin),
)

print(f"⚡ AdaBoost (Linear Regression base) - MSE: {mse_ada_lin:.2f}, R²: {r2_ada_lin:.4f}")

⚡ AdaBoost (Linear Regression base) - MSE: 29078.31, R²: 0.9937


In [46]:
# Side-by-side comparison of the test-set predictions from the four models fitted above
models = dict([
    ('Linear Regression', y_pred_lr),
    ('Bagging', y_pred_bagging),
    ('AdaBoost', y_pred_ada_lin),
    ('Voting', y_pred_voting),
])

# R² per model, keyed by display name (same insertion order as `models`)
r2_scores = {label: r2_score(y_test, preds) for label, preds in models.items()}

# Print MSE and R² for each model
for name, y_pred in models.items():
    mse, r2 = mean_squared_error(y_test, y_pred), r2_scores[name]
    print(f"{name}:")
    print(f"  Mean Squared Error: {mse:.2f}")
    print(f"  R² Score: {r2:.2f}")

# --- Generate Data for Figures ---


Linear Regression:
  Mean Squared Error: 28927.00
  R² Score: 0.99
Bagging:
  Mean Squared Error: 29009.94
  R² Score: 0.99
AdaBoost:
  Mean Squared Error: 29078.31
  R² Score: 0.99
Voting:
  Mean Squared Error: 28923.44
  R² Score: 0.99


In [48]:
# Heterogeneous ensemble: combine the linear baseline with a decision tree
dt_model = DecisionTreeRegressor(random_state=42)

ensemble_model = VotingRegressor(
    estimators=[('base_lr', base_lr), ('dt', dt_model)]
).fit(X_train, y_train)

# Evaluate on the held-out split
y_pred_ensemble = ensemble_model.predict(X_test)
mse_ensemble, r2_ensemble = (
    mean_squared_error(y_test, y_pred_ensemble),
    r2_score(y_test, y_pred_ensemble),
)

# Report the metrics at full precision
print("Ensemble Model Performance:")
print("MSE:", mse_ensemble)
print("R² Score:", r2_ensemble)

Ensemble Model Performance:
MSE: 9260.438736271759
R² Score: 0.997995267472155
