In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error
import joblib

In [3]:
# Load the synthetic dataset and inspect its schema, summary statistics and first rows
df = pd.read_csv("synthetic_data.csv")
df.info()
# Only the final expression of a cell is rendered automatically, so the summary
# table must be displayed explicitly; a bare `df.describe()` here would be
# computed and then silently thrown away. `display` is provided by the
# IPython/Jupyter kernel without an import.
display(df.describe())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Customer_ID                  200 non-null    object 
 1   Name                         200 non-null    object 
 2   Order_Frequency              200 non-null    int64  
 3   Average_Time_Between_Orders  140 non-null    float64
 4   Lowest_Time_Between_Orders   140 non-null    float64
 5   has_peak                     200 non-null    int64  
 6   has_knife                    200 non-null    int64  
 7   has_pivot                    200 non-null    int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 12.6+ KB


Unnamed: 0,Customer_ID,Name,Order_Frequency,Average_Time_Between_Orders,Lowest_Time_Between_Orders,has_peak,has_knife,has_pivot
0,C0001,John Smith,1,,,1,0,0
1,C0002,Jane Doe,2,200.0,200.0,1,1,0
2,C0003,Robert Johnson,1,,,0,0,1
3,C0004,Lucy Hernandez,3,130.0,60.0,1,1,1
4,C0005,Michael Brown,2,220.0,180.0,1,0,1


In [4]:
# Clean the dataset.
# Any literal "N/A" strings become NaN so the timing columns can be treated as numbers.
df = df.replace("N/A", np.nan)

# Force both timing columns to a numeric dtype; values that cannot be parsed are coerced to NaN.
for timing_col in ('Average_Time_Between_Orders', 'Lowest_Time_Between_Orders'):
    df[timing_col] = pd.to_numeric(df[timing_col], errors='coerce')

# Lowest_Time_Between_Orders is the regression target, so rows missing it are removed.
df = df.dropna(subset=['Lowest_Time_Between_Orders'])
df.describe()

Unnamed: 0,Order_Frequency,Average_Time_Between_Orders,Lowest_Time_Between_Orders,has_peak,has_knife,has_pivot
count,140.0,140.0,140.0,140.0,140.0,140.0
mean,2.957143,155.035714,102.214286,0.814286,0.664286,0.671429
std,0.829938,50.177729,77.113556,0.390272,0.473935,0.47138
min,2.0,60.0,20.0,0.0,0.0,0.0
25%,2.0,110.0,40.0,1.0,0.0,0.0
50%,3.0,150.0,60.0,1.0,1.0,1.0
75%,4.0,200.0,190.0,1.0,1.0,1.0
max,4.0,240.0,240.0,1.0,1.0,1.0


In [5]:
# Timing (regression) model: two order-history columns are the inputs,
# Lowest_Time_Between_Orders is the target.
features = ['Order_Frequency', 'Average_Time_Between_Orders']
X, y_reg = df[features], df['Lowest_Time_Between_Orders']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train_reg, y_test_reg = train_test_split(
    X,
    y_reg,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Baseline for comparison: a plain linear regression fitted on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
baseline_model = LinearRegression().fit(X_train, y_train_reg)

# Score the baseline on the held-out test rows using mean absolute error.
y_pred_reg = baseline_model.predict(X_test)
baseline_mae = mean_absolute_error(y_true=y_test_reg, y_pred=y_pred_reg)
print("Baseline MAE:", baseline_mae)

Baseline MAE: 17.517824557397258


In [7]:
# Candidate improvement over the baseline: a 100-tree random forest with a fixed seed,
# fitted on the same training split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_reg)

# Evaluate the forest on the held-out test rows with the same metric as the baseline.
y_pred_rf = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_true=y_test_reg, y_pred=y_pred_rf)
print("Random Forest MAE:", rf_mae)

Random Forest MAE: 4.687635990002069


In [8]:
# Print each regression feature next to the importance the random forest assigned to it
importances = rf_model.feature_importances_
print(*(f"{feat}: {imp:.4f}" for feat, imp in zip(features, importances)), sep="\n")

Order_Frequency: 0.8699
Average_Time_Between_Orders: 0.1301


In [9]:
# Search space for tuning the random forest regressor (3 x 3 x 2 = 18 candidates)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5],
)

# Exhaustive grid search scored by negative MAE (higher, i.e. closer to zero, is better)
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,  # 3 fold cross validation
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split only, then report the winning configuration
grid_search.fit(X_train, y_train_reg)
print("Best parameters:", grid_search.best_params_)
print("Best score (negative MAE):", grid_search.best_score_)


Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best score (negative MAE): -8.484423839900813


In [10]:
# Pull the best-scoring random forest out of the finished grid search
best_rf_model = grid_search.best_estimator_

# Score the tuned model on the held-out test rows, using the same MAE metric as before
y_pred_best = best_rf_model.predict(X_test)
final_mae = mean_absolute_error(y_true=y_test_reg, y_pred=y_pred_best)
print("Final MAE after tuning:", final_mae)

Final MAE after tuning: 4.845719320362176


In [11]:
# Print each regression feature next to its importance in the tuned model
importances = best_rf_model.feature_importances_
for position, feat in enumerate(features):
    print(f"{feat}: {importances[position]:.4f}")

Order_Frequency: 0.8663
Average_Time_Between_Orders: 0.1337


In [12]:
# Persist the tuned random forest to disk (joblib writes a pickle; only load it from trusted sources)
joblib.dump(value=best_rf_model, filename="rf_model.pkl")

['rf_model.pkl']

In [13]:
# Start again from the raw CSV for the classification task (the regression cells dropped rows),
# normalising literal "N/A" strings to NaN as before.
df = pd.read_csv("synthetic_data.csv").replace("N/A", np.nan)

# For each timing column: coerce to numeric (unparseable values -> NaN),
# then fill the gaps with that column's own mean.
for timing_col in ('Average_Time_Between_Orders', 'Lowest_Time_Between_Orders'):
    as_numeric = pd.to_numeric(df[timing_col], errors='coerce')
    df[timing_col] = as_numeric.fillna(as_numeric.mean())
df.head()

Unnamed: 0,Customer_ID,Name,Order_Frequency,Average_Time_Between_Orders,Lowest_Time_Between_Orders,has_peak,has_knife,has_pivot
0,C0001,John Smith,1,155.035714,102.214286,1,0,0
1,C0002,Jane Doe,2,200.0,200.0,1,1,0
2,C0003,Robert Johnson,1,155.035714,102.214286,0,0,1
3,C0004,Lucy Hernandez,3,130.0,60.0,1,1,1
4,C0005,Michael Brown,2,220.0,180.0,1,0,1


In [14]:
def determine_next_product(row):
    '''Recommend the next product to advertise from the products a customer already owns.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing the
            0/1 ownership flags 'has_peak', 'has_knife' and 'has_pivot'.

    Returns:
        str: One of "pivot", "knife" or "peak".

    Rules, in priority order:
        * owns nothing                                  -> "pivot"
        * no knife, but owns peak and/or pivot          -> "knife"
        * owns knife and pivot, but not peak            -> "peak"
        * anything else (knife without pivot, or all 3) -> "pivot"
    '''
    has_peak = row['has_peak']
    has_knife = row['has_knife']
    has_pivot = row['has_pivot']

    if has_knife == 0:
        # A customer with no products at all is started on pivot.
        if has_peak == 0 and has_pivot == 0:
            return "pivot"
        # Any knife-less customer who owns peak or pivot is offered the knife.
        if has_peak == 1 or has_pivot == 1:
            return "knife"

    # Knife and pivot owners who lack peak are offered peak.
    if has_knife == 1 and has_pivot == 1 and has_peak == 0:
        return "peak"

    # Knife owners without pivot get pivot. Customers who own all three also
    # land here: there is nothing new to offer, so pivot is the default.
    return "pivot"

In [15]:
# Label every customer by running the rule-based recommender once per row
next_products = df.apply(determine_next_product, axis="columns")
df['Next_Product'] = next_products
df.head()

Unnamed: 0,Customer_ID,Name,Order_Frequency,Average_Time_Between_Orders,Lowest_Time_Between_Orders,has_peak,has_knife,has_pivot,Next_Product
0,C0001,John Smith,1,155.035714,102.214286,1,0,0,knife
1,C0002,Jane Doe,2,200.0,200.0,1,1,0,pivot
2,C0003,Robert Johnson,1,155.035714,102.214286,0,0,1,knife
3,C0004,Lucy Hernandez,3,130.0,60.0,1,1,1,pivot
4,C0005,Michael Brown,2,220.0,180.0,1,0,1,knife


In [16]:
# Classification model: the three ownership flags are the inputs, Next_Product is the label
features = ['has_peak', 'has_knife', 'has_pivot']
X, y = df[features], df['Next_Product']

# The label is a string category, so map it to integer codes; the fitted encoder
# is kept so predictions can be mapped back to product names later.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Next-product classifier: a 100-tree random forest with a fixed seed, fitted on the training split
forest_settings = {"n_estimators": 100, "random_state": 42}
product_model = RandomForestClassifier(**forest_settings)
product_model.fit(X=X_train, y=y_train)

In [18]:
# Classify the held-out test rows
y_pred = product_model.predict(X_test)

# Share of test rows whose predicted label matches the true one.
# Next_Product was derived from these same three ownership flags by
# determine_next_product, so a very high score is to be expected here.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Classification Accuracy:", acc)

Classification Accuracy: 1.0


In [19]:
# Print each ownership flag next to the importance the classifier assigned to it
importances = product_model.feature_importances_
report_lines = [f"{feat}: {imp:.4f}" for feat, imp in zip(features, importances)]
print("\n".join(report_lines))

has_peak: 0.2330
has_knife: 0.6714
has_pivot: 0.0956


In [20]:
# Try the classifier on one hand-built customer who owns peak and knife but not pivot
new_customer = pd.DataFrame([{'has_peak': 1, 'has_knife': 1, 'has_pivot': 0}])

# The model returns an encoded class label for the single row
new_pred = product_model.predict(new_customer)

# Map the encoded label back to its product name with the fitted label encoder
predicted_product = le.inverse_transform(new_pred)
print("Predicted next product to advertise:", predicted_product[0])

Predicted next product to advertise: pivot


In [None]:
# Persist the classifier together with its label encoder, which is needed to decode predictions
# (joblib writes pickles; only load them from trusted sources)
joblib.dump(value=product_model, filename="product_model.pkl")
joblib.dump(value=le, filename="label_encoder.pkl")

['label_encoder.pkl']