### XGBoost vs Random Forest

In [4]:

import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


# Load the data
# NOTE(review): assumes 'diamonds.csv' is in the working directory and that
# every column other than cut/color/clarity is already numeric — confirm.
df = pd.read_csv('diamonds.csv')

# Encode the categorical variables as integer codes.
# LabelEncoder numbers the labels in sorted order; the same encoder object is
# re-fitted for each column, so after this block `le` only holds the mapping
# for 'clarity'.
le = LabelEncoder()
df['cut'] = le.fit_transform(df['cut'])
df['color'] = le.fit_transform(df['color'])
df['clarity'] = le.fit_transform(df['clarity'])

# Feature matrix (everything except the target) and target vector
X = df.drop('price', axis=1)
y = df['price']

# Split the data into training and test sets (70% / 30%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Define the hyperparameters to search.
# 'auto' was removed from max_features in scikit-learn 1.3; for regressors it
# meant "use all features", which is spelled 1.0.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'max_features': [1.0, 'sqrt', 'log2'],
}

# Create the Random Forest regressor (seeded so the search is reproducible)
rf = RandomForestRegressor(random_state=42)

# Perform grid search with 5-fold cross-validation on the TRAINING split only,
# so the test split stays unseen until the final evaluation.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
# With the default refit=True, best_estimator_ has already been retrained on
# all of X_train with the best parameters; no extra .fit() call is needed.
best_model = grid_search.best_estimator_

# Report fit quality: training R² (optimistic) and held-out test R² (honest)
train_score = best_model.score(X_train, y_train)
score = r2_score(y_test, best_model.predict(X_test))
print('Best parameters:', best_params)
print('Train R-squared:', train_score)
print('Test R-squared:', score)


In [None]:
import xgboost as xgb


# Define the hyperparameters to search
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 200, 300],
}

# Create the XGBoost regressor
xgb_model = xgb.XGBRegressor(random_state=42)

# Perform grid search with 5-fold cross-validation on the training split only.
# X_train / X_test / y_train / y_test come from the train_test_split call in
# the Random Forest cell above; run that cell first.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
# With the default refit=True, best_estimator_ is already retrained on all of
# X_train with the best parameters, so no additional .fit() call is required.
best_model = grid_search.best_estimator_

# Evaluate on the held-out test split so the result is comparable with the
# Random Forest test score.
xgb_train_score = best_model.score(X_train, y_train)
xgb_test_score = r2_score(y_test, best_model.predict(X_test))
print('Best parameters:', best_params)
print('Train R-squared:', xgb_train_score)
print('Test R-squared:', xgb_test_score)
