In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve
from xgboost import XGBClassifier
import lightgbm as lgb
from imblearn.over_sampling import SMOTE  # For Handling Class Imbalance

In [2]:
# Read the raw e-commerce table from disk.
# The path is relative to the notebook's working directory; update it if needed.
file_path = "diversified_ecommerce_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Define target variable (Market Fit: 1 if Popularity Index > 85, else 0)
POPULARITY_THRESHOLD = 85
# Guarded so the cell can be re-run after "Popularity Index" has been dropped.
if "Popularity Index" in df.columns:
    df["Market Fit"] = (df["Popularity Index"] > POPULARITY_THRESHOLD).astype(int)

# Drop columns that are not used as features, including the raw score the
# target was derived from (keeping it would leak the label into the features).
df = df.drop(
    columns=["Product ID", "Product Name", "Supplier ID", "Popularity Index"],
    errors="ignore",  # already-dropped columns are fine on a re-run
)

# Define categorical and numerical features
categorical_features = ["Category", "Customer Age Group", "Customer Location", "Customer Gender", "Shipping Method", "Seasonality"]
numerical_features = ["Price", "Discount", "Tax Rate", "Stock Level", "Shipping Cost", "Return Rate"]

# Reduce dataset size to avoid memory issues (use at most 20,000 rows)
SAMPLE_SIZE = 20000
# min(...) keeps the cell working when the file has fewer rows than SAMPLE_SIZE.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
# Impute missing numeric values with the per-column median of the sample;
# non-numeric columns are left as they are.
df_sample = df_sample.fillna(df_sample.median(numeric_only=True))

In [4]:
# Feature Engineering - New Features
# Each ratio adds 1 to the denominator so that a zero value (e.g. Discount == 0)
# does not cause a division by zero.
df_sample["Price_Discount_Ratio"] = df_sample["Price"] / (df_sample["Discount"] + 1)
df_sample["Stock_Shipping_Ratio"] = df_sample["Stock Level"] / (df_sample["Shipping Cost"] + 1)
df_sample["Price_Tax_Ratio"] = df_sample["Price"] / (df_sample["Tax Rate"] + 1)
df_sample["Stock_Return_Ratio"] = df_sample["Stock Level"] / (df_sample["Return Rate"] + 1)

# Add new numerical features.
# Only names not already present are appended, so re-running this cell does not
# duplicate entries in numerical_features (duplicates would select the same
# column twice when the list is later used to index the frame).
engineered_features = ["Price_Discount_Ratio", "Stock_Shipping_Ratio", "Price_Tax_Ratio", "Stock_Return_Ratio"]
numerical_features.extend(
    name for name in engineered_features if name not in numerical_features
)

# Define features and target
X = df_sample.drop("Market Fit", axis=1)
y = df_sample["Market Fit"]

In [5]:
# One-hot encode every categorical column; numeric columns pass through unchanged.
X_encoded = pd.get_dummies(data=X, columns=categorical_features)

# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Oversample the minority class in the training split only; the test split
# is left untouched.
# NOTE(review): SMOTE is applied to the one-hot columns as well as the numeric
# ones - confirm that is intended.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [6]:
# Normalize numerical features using Power Transformer
# NOTE: this cell overwrites X_train_resampled and X_test, so re-run the
# split/SMOTE cell before re-running it; otherwise the data is transformed twice.
scaler = PowerTransformer()

# X_test is a slice produced by train_test_split; take an explicit copy so the
# column assignment below does not trigger pandas' chained-assignment warning.
X_test = X_test.copy()

# The scaler is fit on the (resampled) training data only and then applied to
# the test data. The 1e-6 offset is kept from the original workflow; the default
# Yeo-Johnson method does not require strictly positive input, but any code that
# reuses the fitted scaler should apply the same offset.
X_train_resampled[numerical_features] = scaler.fit_transform(X_train_resampled[numerical_features] + 1e-6)
X_test[numerical_features] = scaler.transform(X_test[numerical_features] + 1e-6)

# Feature Selection - Use Random Forest Feature Importance
TOP_N_FEATURES = 20  # number of highest-importance columns to keep
rf_temp = RandomForestClassifier(n_estimators=100, random_state=42)
rf_temp.fit(X_train_resampled, y_train_resampled)
feature_importances = pd.Series(rf_temp.feature_importances_, index=X_train_resampled.columns)
top_features = feature_importances.nlargest(TOP_N_FEATURES).index

# Restrict both splits to the same selected columns, in the same order.
X_train_resampled = X_train_resampled[top_features]
X_test = X_test[top_features]

In [7]:
# Candidate hyperparameters: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[100, 300, 500],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.03, 0.1],
)

# Base XGBoost classifier; only the seed is fixed, the grid supplies the rest.
xgb_model = XGBClassifier(random_state=42)

# Exhaustive search with 3-fold cross-validation, ranked by accuracy and run
# on all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_resampled, y_train_resampled)

# Report the winning combination.
print("Best hyperparameters:", grid_search.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best hyperparameters: {'learning_rate': 0.03, 'max_depth': 7, 'n_estimators': 500}


In [8]:
# Retrieve the best model found.
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full resampled training set with the best hyperparameters;
# calling .fit() again would only repeat that work.
best_xgb = grid_search.best_estimator_
best_xgb

In [9]:
# Predict hard class labels for the held-out test split.
y_pred = best_xgb.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / F1 table.
# NOTE: in the recorded run the recall for class 1 is 0.00, so the headline
# accuracy mostly reflects the majority class - read the per-class rows.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.848
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      3397
           1       0.22      0.00      0.01       603

    accuracy                           0.85      4000
   macro avg       0.54      0.50      0.46      4000
weighted avg       0.75      0.85      0.78      4000



In [10]:
# Preview the test features after power-transform scaling and top-feature selection.
X_test

Unnamed: 0,Seasonality_No,Seasonality_Yes,Shipping Method_Overnight,Shipping Method_Express,Customer Gender_Female,Customer Gender_Non-Binary,Shipping Method_Standard,Customer Gender_Male,Price_Discount_Ratio,Price,Shipping Cost,Price_Tax_Ratio,Return Rate,Stock Level,Stock_Shipping_Ratio,Stock_Return_Ratio,Tax Rate,Discount,Category_Home Appliances,Customer Age Group_25-34
687578,True,False,False,False,False,True,True,False,1.831637,0.431534,1.185275,0.099999,-0.574217,-0.649971,-0.928590,-0.213762,0.665855,-1.668823,False,False
505006,False,True,False,True,False,True,False,False,1.999059,1.012697,-1.300738,0.743079,-1.204089,0.166460,1.020207,0.909684,0.059643,-1.668823,False,True
381859,False,True,False,True,True,False,False,False,-0.739189,-0.348964,-0.801511,-0.474219,-0.830681,0.381880,0.675094,0.721710,0.665855,1.337627,False,True
742738,False,True,False,True,True,False,False,False,-0.071259,0.857259,-1.492121,1.716348,-1.768847,1.363504,1.747764,2.449505,-1.569862,0.865647,True,False
43836,True,False,False,True,False,False,False,True,-1.633103,-1.529289,-0.006346,-1.574702,-1.358917,1.531299,0.657070,1.893427,0.665855,0.865647,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11897,False,True,True,False,False,True,False,False,-0.797002,-0.701267,0.817513,0.098303,0.864932,1.181285,0.176379,0.231395,-1.569862,0.865647,True,False
575699,False,True,True,False,False,True,False,False,-1.428186,-1.688721,0.519413,-1.685288,-0.941169,-0.711743,-0.747641,-0.031275,0.059643,-0.193103,False,False
693196,True,False,False,False,True,False,True,False,-0.291057,-0.184321,0.450054,-0.346284,-1.579303,0.903069,0.211707,1.852558,0.665855,0.360231,False,True
14062,False,True,True,False,False,True,False,False,-0.106793,0.221151,0.002356,-0.047370,-0.183563,-0.895541,-0.686205,-0.639224,0.665855,0.360231,False,False


In [10]:
import joblib
from sklearn.pipeline import Pipeline

# Save model
joblib.dump(best_xgb, 'market_fit_model.pkl')

# Save preprocessing components.
# Besides the fitted scaler and the selected columns, the bundle records which
# columns the scaler was fit on and the full one-hot column layout, so that
# inference code can rebuild the same design matrix: encode, reindex to
# 'encoded_columns', scale 'numerical_features', then select 'top_features'.
# The three original keys are unchanged, so existing loaders keep working.
preprocessing_assets = {
    'scaler': scaler,
    'top_features': top_features,
    'categorical_features': categorical_features,
    'numerical_features': list(numerical_features),
    'encoded_columns': list(X_encoded.columns),
}
joblib.dump(preprocessing_assets, 'preprocessing.pkl')

['preprocessing.pkl']

In [24]:
# List the column names of the one-hot encoded training split.
X_train.columns

Index(['Price', 'Discount', 'Tax Rate', 'Stock Level', 'Shipping Cost',
       'Return Rate', 'Price_Discount_Ratio', 'Stock_Shipping_Ratio',
       'Price_Tax_Ratio', 'Stock_Return_Ratio', 'Category_Apparel',
       'Category_Books', 'Category_Electronics', 'Category_Footwear',
       'Category_Home Appliances', 'Customer Age Group_18-24',
       'Customer Age Group_25-34', 'Customer Age Group_35-44',
       'Customer Age Group_45-54', 'Customer Age Group_55+',
       'Customer Location_Berlin, Germany',
       'Customer Location_Cape Town, South Africa',
       'Customer Location_Chicago, USA', 'Customer Location_Dubai, UAE',
       'Customer Location_Houston, USA', 'Customer Location_London, UK',
       'Customer Location_Los Angeles, USA', 'Customer Location_Mumbai, India',
       'Customer Location_New York, USA', 'Customer Location_Paris, France',
       'Customer Location_Phoenix, USA', 'Customer Location_Singapore',
       'Customer Location_Sydney, Australia', 'Customer Locati

In [25]:
# Preview the training split as returned by train_test_split; in the cells shown
# here it is not resampled or scaled (those steps act on X_train_resampled).
X_train

Unnamed: 0,Price,Discount,Tax Rate,Stock Level,Shipping Cost,Return Rate,Price_Discount_Ratio,Stock_Shipping_Ratio,Price_Tax_Ratio,Stock_Return_Ratio,...,"Customer Location_Tokyo, Japan","Customer Location_Toronto, Canada",Customer Gender_Female,Customer Gender_Male,Customer Gender_Non-Binary,Shipping Method_Express,Shipping Method_Overnight,Shipping Method_Standard,Seasonality_No,Seasonality_Yes
249364,72.02,25,5,72,48.28,12.90,2.770000,1.461039,12.003333,5.179856,...,False,False,True,False,False,True,False,False,False,True
443780,733.82,5,15,427,7.22,2.48,122.303333,51.946472,45.863750,122.701149,...,False,False,False,False,True,False,True,False,False,True
585678,1803.63,5,8,248,40.40,2.69,300.605000,5.990338,200.403333,67.208672,...,False,False,True,False,False,False,False,True,True,False
625769,1165.45,10,5,412,20.05,9.00,105.950000,19.572447,194.241667,41.200000,...,False,True,False,False,True,True,False,False,False,True
997730,1380.74,20,15,74,24.28,8.36,65.749524,2.927215,86.296250,7.905983,...,False,False,False,True,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357941,974.76,15,8,196,17.93,17.03,60.922500,10.353936,108.306667,10.870771,...,False,False,True,False,False,False,True,False,False,True
394399,1893.75,20,8,218,21.69,3.82,90.178571,9.607757,210.416667,45.228216,...,False,False,False,False,True,False,True,False,False,True
714497,1805.10,0,12,192,43.13,6.85,1805.100000,4.350782,138.853846,24.458599,...,False,False,False,True,False,False,True,False,True,False
852873,582.27,10,15,72,23.66,8.80,52.933636,2.919708,36.391875,7.346939,...,False,False,False,False,True,True,False,False,False,True
