In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
import optuna

In [12]:
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [13]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [14]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [15]:
le = LabelEncoder()
for col in ['cut', 'color', 'clarity']:
    df[col] = le.fit_transform(df[col])

## Split Features and Target

In [16]:
X = df.drop('price', axis=1)
y = df['price']

## Standardize for SVM (Crucial for distance-based models)

In [17]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Train/Test Split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [19]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((43152, 9), (10788, 9), (43152,), (10788,))

## Small subset for SVM to save time (SVR scales poorly O(n^2))

In [20]:
X_train_small = X_train[:5000]
y_train_small = y_train[:5000]

## Dictionary to store results

In [21]:
model_performance = {}

def screen_model(model, name):
    start_time = time.time()
    
    # SVR is too slow on 50k+ rows; we use the small subset defined earlier
    if name == "SVR":
        model.fit(X_train_small, y_train_small)
    else:
        model.fit(X_train, y_train)
        
    duration = time.time() - start_time
    predictions = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    model_performance[name] = {
        'MAE': round(mae, 2),
        'R2': round(r2, 4),
        'Time_Sec': round(duration, 2)
    }
    print(f"Finished {name} in {duration:.2f}s")

## Define the models

In [26]:
models_to_test = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42, n_jobs=-1),
    "XGBoost": XGBRegressor(random_state=42),
    "SVR": SVR()
}

# Execute screening
for name, model in models_to_test.items():
    screen_model(model, name)

# Display results as a sorted DataFrame for better visibility
perf_df = pd.DataFrame(model_performance).T.sort_values(by='R2', ascending=False)
print("\n--- Model Screening Results ---")
print(perf_df)

Finished Linear Regression in 0.19s
Finished Decision Tree in 0.27s
Finished Random Forest in 8.51s
Finished XGBoost in 0.31s
Finished SVR in 1.58s

--- Model Screening Results ---
                       MAE      R2  Time_Sec
Random Forest       268.09  0.9815      8.51
XGBoost             277.94  0.9813      0.31
Decision Tree       355.79  0.9663      0.27
Linear Regression   858.71  0.8851      0.19
SVR                2340.18  0.0298      1.58


In [28]:
tuning_comparison = {}
# Grid Search will test 2 * 2 * 2 = 8 combinations
param_grid = {
    'n_estimators': [100, 300],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 6]
}

def update_comparison(technique, best_score, duration, best_params):
    tuning_comparison[technique] = {
        'Best_R2': round(best_score, 4),
        'Time_Sec': round(duration, 2),
        'Best_Params': best_params
    }

## TECHNIQUE 1: GRID SEARCH

In [29]:
print("Running Grid Search...")
start = time.time()
gs = GridSearchCV(XGBRegressor(random_state=42), param_grid, cv=3, n_jobs=-1)
gs.fit(X_train, y_train)
update_comparison('Grid Search', gs.best_score_, time.time()-start, gs.best_params_)

Running Grid Search...


## TECHNIQUE 2: RANDOM SEARCH

In [30]:
print("Running Random Search...")
start = time.time()
# n_iter=4 means we only test 50% of the grid, aiming for speed
rs = RandomizedSearchCV(XGBRegressor(random_state=42), param_grid, n_iter=4, cv=3, n_jobs=-1, random_state=42)
rs.fit(X_train, y_train)
update_comparison('Random Search', rs.best_score_, time.time()-start, rs.best_params_)

Running Random Search...


## TECHNIQUE 3: OPTUNA (Bayesian)

In [31]:
print("Running Optuna...")
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }
    model = XGBRegressor(**params, random_state=42)
    score = cross_val_score(model, X_train, y_train, cv=3, n_jobs=-1).mean()
    return score

start = time.time()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15) # Optuna explores 15 different smarter combinations
update_comparison('Optuna', study.best_value, time.time()-start, study.best_params)

[I 2025-12-18 10:40:35,665] A new study created in memory with name: no-name-d82162ab-776a-41df-b3a9-7bdceb563d75


Running Optuna...


[I 2025-12-18 10:40:36,532] Trial 0 finished with value: 0.9757645913317071 and parameters: {'n_estimators': 325, 'learning_rate': 0.06363931386220371, 'max_depth': 3}. Best is trial 0 with value: 0.9757645913317071.
[I 2025-12-18 10:40:37,455] Trial 1 finished with value: 0.9803290387098031 and parameters: {'n_estimators': 263, 'learning_rate': 0.15056641678077354, 'max_depth': 4}. Best is trial 1 with value: 0.9803290387098031.
[I 2025-12-18 10:40:42,102] Trial 2 finished with value: 0.9780450877753765 and parameters: {'n_estimators': 235, 'learning_rate': 0.26889102002093745, 'max_depth': 10}. Best is trial 1 with value: 0.9803290387098031.
[I 2025-12-18 10:40:46,442] Trial 3 finished with value: 0.9786085719894421 and parameters: {'n_estimators': 475, 'learning_rate': 0.2235514750992825, 'max_depth': 8}. Best is trial 1 with value: 0.9803290387098031.
[I 2025-12-18 10:40:48,147] Trial 4 finished with value: 0.9801024756250148 and parameters: {'n_estimators': 391, 'learning_rate': 0

## Final Result Display

In [34]:
comparison_df = pd.DataFrame(tuning_comparison).T
comparison_df

Unnamed: 0,Best_R2,Time_Sec,Best_Params
Grid Search,0.9813,11.88,"{'learning_rate': 0.05, 'max_depth': 6, 'n_est..."
Random Search,0.981,4.44,"{'n_estimators': 300, 'max_depth': 6, 'learnin..."
Optuna,0.9808,33.14,"{'n_estimators': 298, 'learning_rate': 0.07202..."
