In [52]:
import pandas as pd
import numpy as np
import time
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error
from xgboost import XGBRegressor
import optuna

In [53]:
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [54]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [55]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [56]:
X = df.drop('price', axis=1)
y = df['price']

In [57]:
numeric_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
categorical_features = ['cut', 'color', 'clarity']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

## TECHNIQUE 1 - Grid Search

In [60]:
print("Running Grid Search...")
grid_params = {
    'n_estimators': [100, 300],
    'max_depth': [3, 6],
    'learning_rate': [0.05, 0.1]
}
gs = GridSearchCV(XGBRegressor(random_state=42), grid_params, cv=3, n_jobs=-1)
start = time.time()
gs.fit(X_train_pre, y_train)
results.append({'Method': 'Grid Search', 'R2': r2_score(y_test, gs.predict(X_test_pre)), 'Time': time.time()-start})

Running Grid Search...


In [61]:
print(results)

[{'Model': 'Linear Regression', 'CV R2 Mean': 0.6469452898841019, 'CV R2 Std': 0.036393335311928766, 'Test R2': 0.6494754192267795, 'Test RMSE': 1331071.4167895124}, {'Model': 'Ridge Regression', 'CV R2 Mean': 0.6474089162581866, 'CV R2 Std': 0.0364366802058925, 'Test R2': 0.6485802570327498, 'Test RMSE': 1332769.9639418828}, {'Model': 'Lasso Regression', 'CV R2 Mean': 0.6469453222416697, 'CV R2 Std': 0.03639359585442679, 'Test R2': 0.6494746214108793, 'Test RMSE': 1331072.9315896071}, {'Model': 'Random Forest', 'CV R2 Mean': 0.600100429569008, 'CV R2 Std': 0.03135952309936972, 'Test R2': 0.6115321143409216, 'Test RMSE': 1401263.0789821919}, {'Model': 'Gradient Boosting', 'CV R2 Mean': 0.5712970499434329, 'CV R2 Std': 0.029786693588798388, 'Test R2': 0.6646855642239725, 'Test RMSE': 1301871.871671099}, {'Method': 'Grid Search', 'R2': 0.9814138968372863, 'Time': 14.922741889953613}]


## TECHNIQUE 2: Random Search (Stochastic)

In [62]:
print("Running Random Search...")
rs_params = {
    'n_estimators': np.arange(100, 1000, 100),
    'max_depth': np.arange(3, 10),
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}
rs = RandomizedSearchCV(XGBRegressor(random_state=42), rs_params, n_iter=15, cv=3, n_jobs=-1, random_state=42)
start = time.time()
rs.fit(X_train_pre, y_train)
results.append({'Method': 'Random Search', 'R2': r2_score(y_test, rs.predict(X_test_pre)), 'Time': time.time()-start})

Running Random Search...


In [63]:
results

[{'Model': 'Linear Regression',
  'CV R2 Mean': 0.6469452898841019,
  'CV R2 Std': 0.036393335311928766,
  'Test R2': 0.6494754192267795,
  'Test RMSE': 1331071.4167895124},
 {'Model': 'Ridge Regression',
  'CV R2 Mean': 0.6474089162581866,
  'CV R2 Std': 0.0364366802058925,
  'Test R2': 0.6485802570327498,
  'Test RMSE': 1332769.9639418828},
 {'Model': 'Lasso Regression',
  'CV R2 Mean': 0.6469453222416697,
  'CV R2 Std': 0.03639359585442679,
  'Test R2': 0.6494746214108793,
  'Test RMSE': 1331072.9315896071},
 {'Model': 'Random Forest',
  'CV R2 Mean': 0.600100429569008,
  'CV R2 Std': 0.03135952309936972,
  'Test R2': 0.6115321143409216,
  'Test RMSE': 1401263.0789821919},
 {'Model': 'Gradient Boosting',
  'CV R2 Mean': 0.5712970499434329,
  'CV R2 Std': 0.029786693588798388,
  'Test R2': 0.6646855642239725,
  'Test RMSE': 1301871.871671099},
 {'Method': 'Grid Search',
  'R2': 0.9814138968372863,
  'Time': 14.922741889953613},
 {'Method': 'Random Search',
  'R2': 0.9817410364287652,

## TECHNIQUE 3: Optuna (Bayesian / TPE)

In [64]:
print("Running Optuna Optimization...")
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0)
    }
    model = XGBRegressor(**params, random_state=42)
    model.fit(X_train_pre, y_train)
    return r2_score(y_test, model.predict(X_test_pre))

start = time.time()
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)
results.append({'Method': 'Optuna (TPE)', 'R2': study.best_value, 'Time': time.time()-start})

[I 2025-12-17 20:49:57,859] A new study created in memory with name: no-name-c4780acc-5273-4d70-a16b-5ee96a918fe2


Running Optuna Optimization...


[I 2025-12-17 20:50:07,276] Trial 0 finished with value: 0.982037960347009 and parameters: {'n_estimators': 370, 'max_depth': 12, 'learning_rate': 0.016278065815288685, 'subsample': 0.5257942762665099}. Best is trial 0 with value: 0.982037960347009.
[I 2025-12-17 20:50:11,028] Trial 1 finished with value: 0.9674399099811106 and parameters: {'n_estimators': 884, 'max_depth': 3, 'learning_rate': 0.012358265417399232, 'subsample': 0.9361110218649838}. Best is trial 0 with value: 0.982037960347009.
[I 2025-12-17 20:50:23,304] Trial 2 finished with value: 0.9803984861408199 and parameters: {'n_estimators': 693, 'max_depth': 10, 'learning_rate': 0.09078137417549816, 'subsample': 0.8063659773178105}. Best is trial 0 with value: 0.982037960347009.
[I 2025-12-17 20:50:23,914] Trial 3 finished with value: 0.9777680719980072 and parameters: {'n_estimators': 132, 'max_depth': 4, 'learning_rate': 0.12092850359492918, 'subsample': 0.6862068986917194}. Best is trial 0 with value: 0.982037960347009.
[

In [65]:
resultsltresultsts

[{'Model': 'Linear Regression',
  'CV R2 Mean': 0.6469452898841019,
  'CV R2 Std': 0.036393335311928766,
  'Test R2': 0.6494754192267795,
  'Test RMSE': 1331071.4167895124},
 {'Model': 'Ridge Regression',
  'CV R2 Mean': 0.6474089162581866,
  'CV R2 Std': 0.0364366802058925,
  'Test R2': 0.6485802570327498,
  'Test RMSE': 1332769.9639418828},
 {'Model': 'Lasso Regression',
  'CV R2 Mean': 0.6469453222416697,
  'CV R2 Std': 0.03639359585442679,
  'Test R2': 0.6494746214108793,
  'Test RMSE': 1331072.9315896071},
 {'Model': 'Random Forest',
  'CV R2 Mean': 0.600100429569008,
  'CV R2 Std': 0.03135952309936972,
  'Test R2': 0.6115321143409216,
  'Test RMSE': 1401263.0789821919},
 {'Model': 'Gradient Boosting',
  'CV R2 Mean': 0.5712970499434329,
  'CV R2 Std': 0.029786693588798388,
  'Test R2': 0.6646855642239725,
  'Test RMSE': 1301871.871671099},
 {'Method': 'Grid Search',
  'R2': 0.9814138968372863,
  'Time': 14.922741889953613},
 {'Method': 'Random Search',
  'R2': 0.9817410364287652,

## Final Comparison Table

In [68]:
comparison_df = pd.DataFrame(results)
print("\n--- FINAL COMPARISON ---")
print(comparison_df.sort_values(by='R2', ascending=False))


--- FINAL COMPARISON ---
               Model  CV R2 Mean  CV R2 Std   Test R2     Test RMSE  \
7                NaN         NaN        NaN       NaN           NaN   
8                NaN         NaN        NaN       NaN           NaN   
6                NaN         NaN        NaN       NaN           NaN   
5                NaN         NaN        NaN       NaN           NaN   
0  Linear Regression    0.646945   0.036393  0.649475  1.331071e+06   
1   Ridge Regression    0.647409   0.036437  0.648580  1.332770e+06   
2   Lasso Regression    0.646945   0.036394  0.649475  1.331073e+06   
3      Random Forest    0.600100   0.031360  0.611532  1.401263e+06   
4  Gradient Boosting    0.571297   0.029787  0.664686  1.301872e+06   

          Method        R2        Time  
7   Optuna (TPE)  0.982038  208.488969  
8   Optuna (TPE)  0.981982  147.818002  
6  Random Search  0.981741   63.883991  
5    Grid Search  0.981414   14.922742  
0            NaN       NaN         NaN  
1            NaN 

In [69]:
comparison_df.sort_values(by='R2', ascending=False)

Unnamed: 0,Model,CV R2 Mean,CV R2 Std,Test R2,Test RMSE,Method,R2,Time
7,,,,,,Optuna (TPE),0.982038,208.488969
8,,,,,,Optuna (TPE),0.981982,147.818002
6,,,,,,Random Search,0.981741,63.883991
5,,,,,,Grid Search,0.981414,14.922742
0,Linear Regression,0.646945,0.036393,0.649475,1331071.0,,,
1,Ridge Regression,0.647409,0.036437,0.64858,1332770.0,,,
2,Lasso Regression,0.646945,0.036394,0.649475,1331073.0,,,
3,Random Forest,0.6001,0.03136,0.611532,1401263.0,,,
4,Gradient Boosting,0.571297,0.029787,0.664686,1301872.0,,,
