### 1. Import Libraries


In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

### 2. Data Preparation & Cleaning

In [34]:
# Read the raw dataset and drop the 'Gender' column (removed based on the
# EDA findings) in a single step, so re-running this cell always starts from
# the file on disk.
df = pd.read_csv('dataset (1).csv').drop(columns='Gender')

# Target: the cake price. Features: every remaining column.
y = df['Price']
X = df.drop(columns='Price')

### 3. Preprocessing Setup

In [35]:
# 'Size' is ordinal (small < medium < large), so it is encoded as increasing
# integers instead of being one-hot encoded.
size_mapping = {'small': 0, 'medium': 1, 'large': 2}

# Encode from the untouched strings in `df` rather than from X itself.
# Mapping X['Size'] in place is not idempotent: re-running the cell would look
# up the already-encoded integers in the mapping and turn the column into NaN.
size_encoded = df['Size'].map(size_mapping)

# Series.map() silently turns any label missing from the mapping into NaN
# (e.g. a differently-cased or misspelled size), so fail loudly instead.
unmapped_sizes = df.loc[size_encoded.isna() & df['Size'].notna(), 'Size'].unique()
if len(unmapped_sizes) > 0:
    raise ValueError(
        f"Unrecognised 'Size' values (expected one of {list(size_mapping)}): "
        f"{sorted(map(str, unmapped_sizes))}"
    )

# assign() aligns on the index, which X shares with df.
X = X.assign(Size=size_encoded)

# Identify column types for the preprocessor.
# 'Size' is now numeric and is scaled together with the other numerical features.
numerical_features = ['Ingredients_Cost', 'Time_Taken', 'Amount', 'Size']
categorical_features = ['Sold_On', 'Design_Complexity']

# Apply a different transformation to each group of columns:
# - StandardScaler for the numerical features
# - OneHotEncoder for the categorical features; handle_unknown='ignore' encodes
#   a category not seen during fit as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])


### 4. Model Definition & Training





In [36]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest regressor: 100 trees, fixed seed, using all available CPU cores.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Chain preprocessing and the regressor so that fit/predict apply both steps
# with a single call.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model),
])

# Fit the whole pipeline (preprocessing + model) on the training split only.
print("Training the RandomForestRegressor model...")
pipeline.fit(X_train, y_train)
print("Model training complete.")
print("-" * 40)

Training the RandomForestRegressor model...
Model training complete.
----------------------------------------


### 5. Evaluate the Model

In [37]:
# Predict prices for the held-out test split.
y_pred = pipeline.predict(X_test)

# Error metrics: MAE and MSE come straight from scikit-learn; RMSE is the
# square root of MSE, which puts it back in the same units as the price.
mae, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error)
)
rmse = np.sqrt(mse)

# Assemble the report first, then emit it in one go.
report_lines = [
    "Model Evaluation Results on Test Data:",
    f"Mean Absolute Error (MAE): ${mae:.2f}",
    f"Root Mean Squared Error (RMSE): ${rmse:.2f}",
    "-" * 40,
    f"On average, the model's price predictions are off by about ${mae:.2f}.",
]
print("\n".join(report_lines))

Model Evaluation Results on Test Data:
Mean Absolute Error (MAE): $9.76
Root Mean Squared Error (RMSE): $12.61
----------------------------------------
On average, the model's price predictions are off by about $9.76.


### 6. Prediction on a New Cake

In [38]:
# Describe a new, hypothetical cake using the same raw labels as the dataset.
new_cake_raw = pd.DataFrame({
    'Sold_On': ['Friday'],
    'Size': ['medium'],
    'Ingredients_Cost': [75],
    'Design_Complexity': ['complex'],
    'Time_Taken': [4],
    'Amount': [2]
})

# Encode 'Size' with the `size_mapping` defined in the preprocessing step.
# Reusing that single definition (instead of a copy-pasted duplicate) ensures
# the new data is encoded exactly like the training data.
new_cake_data = new_cake_raw.assign(Size=new_cake_raw['Size'].map(size_mapping))

# Series.map() silently yields NaN for a label missing from the mapping;
# stop here rather than feed a NaN size to the model.
if new_cake_data['Size'].isna().any():
    raise ValueError(
        f"Unrecognised 'Size' value(s) {new_cake_raw['Size'].tolist()}; "
        f"expected one of {list(size_mapping)}"
    )

# The pipeline applies the fitted scaling and one-hot encoding before predicting.
predicted_price = pipeline.predict(new_cake_data)

print("\n--- Prediction for a New Cake ---")
print("Cake Features:")
print(new_cake_data)
print(f"\nPredicted Price: ${predicted_price[0]:.2f}")


--- Prediction for a New Cake ---
Cake Features:
  Sold_On  Size  Ingredients_Cost Design_Complexity  Time_Taken  Amount
0  Friday     1                75           complex           4       2

Predicted Price: $111.67
