In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
# Read the raw CSV and discard the "Brand" column in a single chained
# expression, so re-running this cell always rebuilds `df` from the file.
df = (
    pd.read_csv("clothes_price_prediction_data.csv")
    .drop(columns=["Brand"])
)

In [43]:
# Separate the "Price" target from the predictor columns.
y = df["Price"]
X = df.drop(columns="Price")

# Partition predictor columns by dtype so each group gets its own transformer.
numeric_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)

# Preprocessing: standardise the numeric columns, one-hot encode the
# categorical ones (handle_unknown="ignore" keeps prediction from failing on
# categories not seen during fit), and forward any column matched by neither
# dtype filter unchanged via remainder='passthrough'.
numeric_step = ("num", StandardScaler(), numeric_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

# Chain preprocessing with a seeded random-forest regressor so the whole
# thing can be fitted and tuned as one estimator.
pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("regressor", RandomForestRegressor(random_state=42)),
])

# Candidate hyperparameters. Keys follow the "<step>__<param>" convention so
# the search routes each value to the "regressor" step of the pipeline.
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_split=[2, 5],
    regressor__min_samples_leaf=[1, 2],
)

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Exhaustive search over param_grid with 5-fold cross-validation, ranked by
# negated RMSE (higher is better) and parallelised across all available cores.
search_settings = dict(
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
grid_search = GridSearchCV(pipeline, **search_settings)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search (refit on the training split by default).
best_model = grid_search.best_estimator_

# Score the selected model on the held-out rows.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# An R² at or below zero means the model does no better than always
# predicting the mean of the test target.
print(
    f"Best parameters: {grid_search.best_params_}",
    f"RMSE: {rmse:.2f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

# Feature importance analysis
preprocess_step = best_model['preprocess']
regressor_step = best_model['regressor']

if hasattr(regressor_step, 'feature_importances_'):
    # Take the output column names from the fitted ColumnTransformer itself
    # instead of rebuilding them as numeric_cols + one-hot names. The
    # transformer is configured with remainder='passthrough', so any column
    # matched by neither dtype filter is appended to the feature matrix; a
    # hand-built list would then be shorter than the importance vector and
    # indexing it with the sorted importance indices could raise IndexError.
    transformer_prefixes = {
        f"{name}__" for name, _, _ in preprocess_step.transformers_
    }

    def _strip_transformer_prefix(name):
        """Drop a leading '<transformer>__' so labels read like 'Color_Green'."""
        head, sep, tail = name.partition("__")
        return tail if sep and f"{head}__" in transformer_prefixes else name

    feature_names = np.array([
        _strip_transformer_prefix(name)
        for name in preprocess_step.get_feature_names_out()
    ])

    # Get feature importances
    importances = regressor_step.feature_importances_

    # Fail loudly rather than print mislabelled importances if the two ever
    # disagree in length.
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names for "
            f"{len(importances)} feature importances"
        )

    # Sort feature importances, most important first
    indices = np.argsort(importances)[::-1]

    # Print top 10 features (fewer if the model has fewer than 10 inputs)
    print("\nTop 10 most important features:")
    for rank, idx in enumerate(indices[:10], start=1):
        print(f"{rank}. {feature_names[idx]}: {importances[idx]:.4f}")

Best parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}
RMSE: 54.79
R² Score: -0.0244

Top 10 most important features:
1. Color_Green: 0.0511
2. Size_M: 0.0490
3. Category_Jacket: 0.0469
4. Category_Shoes: 0.0465
5. Material_Nylon: 0.0450
6. Size_L: 0.0448
7. Material_Wool: 0.0442
8. Color_Yellow: 0.0434
9. Size_XXL: 0.0429
10. Material_Polyester: 0.0422
