In [None]:
import csv
import requests
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
import pandas as pd

# Direct-download URL for the rent-apartments CSV hosted on Google Drive
CSV_URL = 'https://drive.google.com/uc?export=download&id=1iyhG9KK2SpLS1BxP6axyTMZw7RMjWgwR'

try:
    # Read the CSV file directly into a DataFrame
    rent_apartments_dataset = pd.read_csv(CSV_URL)
except Exception as e:
    # Report the failure, then re-raise: if the load fails,
    # `rent_apartments_dataset` is never bound, and swallowing the error here
    # would only resurface as a confusing NameError in a later cell.
    print("Error reading the CSV file:", e)
    raise

In [None]:
    # Preview the first ten rows of the rent dataset as plain text
    print(rent_apartments_dataset.head(n=10))


In [None]:
# Show every column label of the rent dataset as a plain Python list
print(rent_apartments_dataset.columns.to_list())

rent_apartments_dataset
Exp 1 XGBoost regressor

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler


# Baseline XGBoost regression on the rent dataset: hold out 20% of the rows,
# standardize the features, fit a fixed-hyperparameter model and report
# MSE / MAE / R2 on the held-out rows.

# Prepare features and target
X = rent_apartments_dataset.drop('price', axis=1)  # use all other columns as features
y = rent_apartments_dataset['price']  # 'price' as the target variable

# Split the data into train and test sets BEFORE scaling, so that the scaler
# never sees test rows (fitting it on the full data leaks test-set statistics
# into preprocessing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: learn mean/std on the training split only and
# apply the same transform to the test split.
# NOTE(review): assumes every feature column is numeric - confirm.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the XGBoost regressor
xgb_regressor = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100)

# Fit the model
xgb_regressor.fit(X_train, y_train)

# Making predictions
y_pred = xgb_regressor.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation results
print(f'Mean Squared Error: {mse:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'R2 Score: {r2:.2f}')


Mean Squared Error: 0.26
Mean Absolute Error: 0.36
R2 Score: 0.75


In [None]:
# k- fold cross validation


import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler


# Fresh regressor with fixed hyperparameters for the cross-validation run
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100,
)

# Ten shuffled folds with a fixed seed, scored on the training split only
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_results = cross_val_score(
    xgb_regressor,
    X_train,
    y_train,
    cv=kf,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE: flip the sign for the mean (the standard
# deviation is unaffected by the sign)
mean_cv_mse = -cv_results.mean()
std_cv_mse = cv_results.std()

print(f'10-fold Cross-validated Mean Squared Error: {mean_cv_mse:.2f}')
print(f'10-fold Cross-validated Standard Deviation of MSE: {std_cv_mse:.2f}')

# Refit on the whole training split and predict the held-out rows
xgb_regressor.fit(X_train, y_train)
y_pred = xgb_regressor.predict(X_test)

# Held-out metrics, reported in the order MSE, MAE, R2
mse = mean_squared_error(y_test, y_pred)
print(f'Test Set Mean Squared Error: {mse:.2f}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Test Set Mean Absolute Error: {mae:.2f}')

r2 = r2_score(y_test, y_pred)
print(f'Test Set R2 Score: {r2:.2f}')

10-fold Cross-validated Mean Squared Error: 0.28
10-fold Cross-validated Standard Deviation of MSE: 0.03
Test Set Mean Squared Error: 0.26
Test Set Mean Absolute Error: 0.36
Test Set R2 Score: 0.75


rent_apartments_dataset Exp 2 hyperparameter tuning for the XGBoost regressor

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Assuming rent_apartments_dataset is loaded and available

# Grid-search XGBoost hyperparameters on the rent dataset (3-fold CV on the
# training split), then score the best model on the held-out 20%.

# Prepare features and target
X = rent_apartments_dataset.drop('price', axis=1)  # use all other columns as features
y = rent_apartments_dataset['price']  # 'price' as the target variable

# Split the data into train and test sets BEFORE scaling, so that the scaler
# never sees test rows (fitting it on the full data leaks test-set statistics
# into preprocessing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: fit on the training split only, reuse on test.
# NOTE(review): assumes every feature column is numeric - confirm.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Set up the parameter grid to tune
params = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 150, 200],
    'colsample_bytree': [0.3, 0.5, 0.7],
    'subsample': [0.6, 0.8, 1.0]
}

# Initialize the XGBoost regressor
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror')

# Setup the grid search; the scorer is negated MSE, so larger is better
grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=params, cv=3, scoring='neg_mean_squared_error', verbose=1)

# Fit grid search
grid_search.fit(X_train, y_train)

# Print best parameters and best score (sign flipped back to a positive MSE)
print("Best parameters found: ", grid_search.best_params_)
print("Best MSE score found: ", -grid_search.best_score_)

# Use the best estimator to make predictions
y_pred = grid_search.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation results
print(f'Mean Squared Error: {mse:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'R2 Score: {r2:.2f}')


rent_apartments_dataset Exp 3 GridSearchCV for XGBoost Regressor

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Assuming rent_apartments_dataset is already loaded and contains the data

# Wider GridSearchCV run on the rent dataset (seeded regressor, 3-fold CV on
# the training split), then score the best estimator on the held-out 20%.

# Prepare features and target
X = rent_apartments_dataset.drop('price', axis=1)  # use all other columns as features
y = rent_apartments_dataset['price']  # 'price' as the target variable

# Split the data into train and test sets BEFORE scaling, so that the scaler
# never sees test rows (fitting it on the full data leaks test-set statistics
# into preprocessing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: fit on the training split only, reuse on test.
# NOTE(review): assumes every feature column is numeric - confirm.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the parameter grid
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 150, 200],
    'colsample_bytree': [0.3, 0.5, 0.7],
    'subsample': [0.6, 0.8, 1.0]
}

# Initialize the XGBoost regressor
xgb_regressor = xgb.XGBRegressor(objective ='reg:squarederror', random_state=42)

# Setup the GridSearchCV; the scorer is negated MSE, so larger is better
grid_search = GridSearchCV(estimator=xgb_regressor, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Fit grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters and best MSE from GridSearchCV
best_params = grid_search.best_params_
best_mse = -grid_search.best_score_  # Convert from negative MSE to positive MSE

# Use the best estimator to make predictions
best_regressor = grid_search.best_estimator_
y_pred = best_regressor.predict(X_test)

# Evaluate the best model using regression metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation and best parameter results
print("Best parameters found: ", best_params)
print("Best MSE from CV: ", best_mse)
print(f'Mean Squared Error: {mse:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'R2 Score: {r2:.2f}')


rent_apartments_dataset Exp 4 Bayesian optimization for the XGBoost model

In [None]:
pip install bayesian-optimization


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from bayes_opt import BayesianOptimization

# Function to be optimized
def xgb_evaluate(max_depth, learning_rate, n_estimators, colsample_bytree, subsample):
    """Objective for Bayesian optimization: negated cross-validated MSE.

    Builds an XGBoost regressor from the proposed hyperparameters and scores
    it with 3-fold cross-validation on the module-level ``X_train`` /
    ``y_train``. The held-out test split is deliberately not used here:
    selecting hyperparameters on test data would bias the final test metrics.

    Parameters
    ----------
    max_depth, n_estimators : float
        Proposed values; truncated to ``int`` before use.
    learning_rate, colsample_bytree, subsample : float
        Passed to the regressor unchanged.

    Returns
    -------
    float
        Mean of the ``neg_mean_squared_error`` fold scores (higher is better,
        which suits a maximizer).
    """
    # Local import keeps the function usable regardless of which
    # sklearn.model_selection names the surrounding cell imported.
    from sklearn.model_selection import cross_val_score

    params = {
        'eval_metric': 'rmse',
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'objective': 'reg:squarederror',
        # `verbosity` supersedes the obsolete `silent` flag; 0 keeps the run quiet
        'verbosity': 0,
    }
    xgb_reg = xgb.XGBRegressor(**params)
    cv_scores = cross_val_score(xgb_reg, X_train, y_train, cv=3, scoring='neg_mean_squared_error')
    return cv_scores.mean()

# Bayesian optimization of XGBoost hyperparameters on the rent dataset,
# followed by a refit with the best parameters and a held-out evaluation.

# This cell's import block only brings in mean_squared_error; import the other
# two metrics used below so the cell does not depend on an earlier cell.
from sklearn.metrics import mean_absolute_error, r2_score

# Prepare features and target
X = rent_apartments_dataset.drop('price', axis=1)  # use all other columns as features
y = rent_apartments_dataset['price']  # 'price' as the target variable

# Split the data into train and test sets BEFORE scaling, so that the scaler
# never sees test rows (fitting it on the full data leaks test-set statistics
# into preprocessing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: fit on the training split only, reuse on test.
# NOTE(review): assumes every feature column is numeric - confirm.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Bayesian optimization over continuous bounds; the objective truncates
# max_depth and n_estimators to integers
xgb_bo = BayesianOptimization(xgb_evaluate, {
    'max_depth': (3, 10),
    'learning_rate': (0.01, 0.3),
    'n_estimators': (50, 300),
    'colsample_bytree': (0.3, 0.9),
    'subsample': (0.5, 1.0)
}, random_state=0)

xgb_bo.maximize(init_points=2, n_iter=10)

# Extract the best parameters; the optimizer reports floats, so cast the two
# integer-valued hyperparameters before handing them to XGBoost
best_params = xgb_bo.max['params']
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])
print("Best parameters: ", best_params)

# Re-train the model with the best parameters
best_xgb = xgb.XGBRegressor(**best_params)
best_xgb.fit(X_train, y_train)
best_y_pred = best_xgb.predict(X_test)

# Evaluating the model
best_mse = mean_squared_error(y_test, best_y_pred)
best_mae = mean_absolute_error(y_test, best_y_pred)
best_r2 = r2_score(y_test, best_y_pred)

# Print the evaluation results
print(f'Optimized Mean Squared Error: {best_mse:.2f}')
print(f'Optimized Mean Absolute Error: {best_mae:.2f}')
print(f'Optimized R2 Score: {best_r2:.2f}')


In [None]:
pip install matplotlib seaborn


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Metric values per experiment, entered here as literals
# NOTE(review): presumably copied by hand from the runs above - confirm they
# still match after any re-run.
data = {
    'Experiment': ['XGBoost Regressor', 'Hyperparameter Tuning', 'GridSearchCV', 'Bayesian Optimization'],
    'MSE': [0.26, 0.19, 0.19, 0.19],
    'MAE': [0.36, 0.30, 0.30, 0.30],
    'R2 Score': [0.75, 0.81, 0.82, 0.82]
}
df = pd.DataFrame(data)

# White-grid seaborn theme on a 10x6 inch figure
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))

# One line per metric, drawn in the order MSE, MAE, R2 Score
for metric_name, line_color in (('MSE', 'blue'), ('MAE', 'green'), ('R2 Score', 'red')):
    plt.plot(df['Experiment'], df[metric_name], marker='o', label=metric_name, color=line_color)

# Title, axis labels and legend
plt.title('Performance Comparison of XGBoost Models', fontsize=14)
plt.xlabel('Experiment', fontsize=12)
plt.ylabel('Values', fontsize=12)
plt.legend(title='Metrics')

# Render the figure
plt.show()


adult_dataset

In [None]:
# Direct-download URL for the adult-income CSV hosted on Google Drive
CSV_URL = 'https://drive.google.com/uc?export=download&id=1K2zmMuv0mj-vTdVmL7skUD_uFND0gg0U'

try:
    # Read the CSV file directly into a DataFrame
    adult_dataset = pd.read_csv(CSV_URL)
except Exception as e:
    # Report the failure, then re-raise: if the load fails, `adult_dataset`
    # is never bound, and swallowing the error here would only resurface as a
    # confusing NameError in a later cell.
    print("Error reading the CSV file:", e)
    raise




In [None]:
    # Preview the first ten rows of the adult dataset as plain text
    print(adult_dataset.head(n=10))


   age  fnlwgt  education-num  sex  capital-gain  capital-loss  \
0   22  231912             11    0             0             0   
1   79  165209              9    1             0             0   
2   43   47818             13    0             0             0   
3   29  565769              1    1             0             0   
4   57  222216             11    0             0             0   
5   51  136823              9    0             0             0   
6   52  203392             10    1          5013             0   
7   71  162297              9    0             0             0   
8   28  187479             10    1             0             0   
9   25  186294              9    0             0             0   

   hours-per-week  native-country  income  workclass_Other  ...  \
0              37              39       0                0  ...   
1              40              39       0                0  ...   
2              40              39       0                0  ...   
3    

In [None]:
# Show every column label of the adult dataset as a plain Python list
print(adult_dataset.columns.to_list())

Exp1 XGBoost - adult_dataset


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


# Baseline XGBoost classifier on the adult dataset: encode the target,
# one-hot encode any categorical features, hold out 20% of the rows and report
# accuracy plus a per-class report.

# Encode 'income' into integer labels WITHOUT writing back into adult_dataset:
# overwriting the shared DataFrame changes what earlier cells displayed and
# makes the notebook's state depend on which cells have already run.
le = LabelEncoder()
y = pd.Series(le.fit_transform(adult_dataset['income']), index=adult_dataset.index, name='income')

# Prepare features
X = adult_dataset.drop('income', axis=1)

# Handle categorical variables if any (example using get_dummies)
X = pd.get_dummies(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

# Fit the model
classifier.fit(X_train, y_train)

# Making predictions
y_pred = classifier.predict(X_test)

# Evaluating the model
# NOTE(review): target_names assumes encoded label 0 is '<=50K' and 1 is '>50K' - confirm.
accuracy = accuracy_score(y_test, y_pred)
classification_metrics = classification_report(y_test, y_pred, target_names=['<=50K', '>50K'])

print(f'Accuracy: {accuracy:.2f}')
print("Classification Report:")
print(classification_metrics)


Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

       <=50K       0.86      0.83      0.84      2342
        >50K       0.83      0.86      0.85      2333

    accuracy                           0.84      4675
   macro avg       0.85      0.84      0.84      4675
weighted avg       0.85      0.84      0.84      4675



In [None]:
# k-fold cross validation of the baseline classifier on the training split,
# followed by a refit and a held-out evaluation.

# Import the CV helpers here: the only other import of these names in the
# notebook sits in a rent-dataset cell, so this cell would otherwise depend on
# an unrelated experiment having been run first.
from sklearn.model_selection import KFold, cross_val_score

# Perform K-Fold cross-validation (10 shuffled folds, fixed seed)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_results = cross_val_score(classifier, X_train, y_train, cv=kf, scoring='accuracy')

# Calculate mean and standard deviation of the cross-validated accuracy
mean_cv_accuracy = cv_results.mean()
std_cv_accuracy = cv_results.std()

# Print cross-validation results
print(f'10-fold Cross-validated Accuracy: {mean_cv_accuracy:.2f}')
print(f'10-fold Cross-validated Standard Deviation of Accuracy: {std_cv_accuracy:.2f}')

# Fit the model on the full training set
classifier.fit(X_train, y_train)

# Making predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluating the model on the test set
accuracy = accuracy_score(y_test, y_pred)
classification_metrics = classification_report(y_test, y_pred, target_names=['<=50K', '>50K'])

# Print the evaluation results
print(f'Test Set Accuracy: {accuracy:.2f}')
print("Classification Report:")
print(classification_metrics)

10-fold Cross-validated Accuracy: 0.84
10-fold Cross-validated Standard Deviation of Accuracy: 0.01
Test Set Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

       <=50K       0.86      0.83      0.84      2342
        >50K       0.83      0.86      0.85      2333

    accuracy                           0.84      4675
   macro avg       0.85      0.84      0.84      4675
weighted avg       0.85      0.84      0.84      4675



Exp 2 Hyperparameter tuning for the XGBoost classifier - adult_dataset

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Grid-search XGBoost classifier hyperparameters on the adult dataset (3-fold
# CV on the training split), then score the best model on the held-out 20%.

# Encode 'income' into integer labels WITHOUT writing back into adult_dataset:
# overwriting the shared DataFrame changes what earlier cells displayed and
# makes the notebook's state depend on which cells have already run.
le = LabelEncoder()
y = pd.Series(le.fit_transform(adult_dataset['income']), index=adult_dataset.index, name='income')

# Prepare features
X = adult_dataset.drop('income', axis=1)

# Handle categorical variables if any (example using get_dummies)
X = pd.get_dummies(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'colsample_bytree': [0.3, 0.5, 0.7, 1],
    'subsample': [0.6, 0.8, 1.0]
}

# Setup the grid search
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)

# Fit grid search
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Use the best estimator to make predictions
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluating the model on the test set
# NOTE(review): target_names assumes encoded label 0 is '<=50K' and 1 is '>50K' - confirm.
accuracy = accuracy_score(y_test, y_pred)
classification_metrics = classification_report(y_test, y_pred, target_names=['<=50K', '>50K'])

print(f'Accuracy on Test Set: {accuracy:.2f}')
print("Classification Report on Test Set:")
print(classification_metrics)


Exp 3 GridSearchCV for XGBoost adult_dataset

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Second GridSearchCV run for the XGBoost classifier on the adult dataset
# (different grid, 3-fold CV on the training split), then a held-out evaluation.

# Encode 'income' into integer labels WITHOUT writing back into adult_dataset:
# overwriting the shared DataFrame changes what earlier cells displayed and
# makes the notebook's state depend on which cells have already run.
le = LabelEncoder()
y = pd.Series(le.fit_transform(adult_dataset['income']), index=adult_dataset.index, name='income')

# Prepare features
X = adult_dataset.drop('income', axis=1)

# Handle categorical variables if any (example using get_dummies)
X = pd.get_dummies(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 150, 200],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'subsample': [0.6, 0.8, 1.0]
}

# Setup the grid search
grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)

# Fit grid search
grid_search.fit(X_train, y_train)

# Print best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Use the best estimator to make predictions
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluating the model on the test set. The class labels match the other
# adult_dataset reports in this notebook so the tables can be compared.
# NOTE(review): assumes encoded label 0 is '<=50K' and 1 is '>50K' - confirm.
accuracy = accuracy_score(y_test, y_pred)
classification_metrics = classification_report(y_test, y_pred, target_names=['<=50K', '>50K'])

print(f'Accuracy on Test Set: {accuracy:.2f}')
print("Classification Report on Test Set:")
print(classification_metrics)


Fitting 3 folds for each of 432 candidates, totalling 1296 fits


Exp 4 Bayesian optimization for XGBoost model adult_dataset

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


# Bayesian hyperparameter search (BayesSearchCV, 3-fold CV on the training
# split) for the XGBoost classifier on the adult dataset, then a held-out
# accuracy check.

# Metric import placed with the rest of the setup instead of mid-cell
from sklearn.metrics import accuracy_score

# Encode 'income' into integer labels WITHOUT writing back into adult_dataset:
# overwriting the shared DataFrame changes what earlier cells displayed and
# makes the notebook's state depend on which cells have already run.
le = LabelEncoder()
y = pd.Series(le.fit_transform(adult_dataset['income']), index=adult_dataset.index, name='income')

# Prepare features
X = adult_dataset.drop('income', axis=1)

# Handle categorical variables if any (example using get_dummies)
X = pd.get_dummies(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define search spaces
search_spaces = {
    'max_depth': Integer(3, 10),
    'learning_rate': Real(0.01, 0.2),
    'n_estimators': Integer(50, 300),
    'colsample_bytree': Real(0.3, 1.0),
    'subsample': Real(0.6, 1.0)
}

# Initialize the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

# Initialize Bayesian optimization (32 sampled configurations, seeded)
opt = BayesSearchCV(xgb_classifier, search_spaces, n_iter=32, scoring='accuracy', cv=3, n_jobs=-1, random_state=0)

# Fit the model
opt.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters found: ", opt.best_params_)
print("Best score found: ", opt.best_score_)

# Use the best estimator to make predictions
y_pred = opt.best_estimator_.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on Test Set: {accuracy:.2f}')
