In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
import psycopg2

# Load environment variables from .env file
load_dotenv()

# Database connection parameters
db_params = {
    "host": os.getenv("LOCAL_HOST"),
    "user": os.getenv("LOCAL_USER"),
    "password": os.getenv("LOCAL_PW"),
    "port": os.getenv("LOCAL_PORT"),
    "dbname": os.getenv("LOCAL_DB")
}

try:
    # Establish a connection to the database
    conn = psycopg2.connect(**db_params)
    print("Database connection successful")

    # SQL query to select all data from the golden_table in the dev schema
    sql_query = "SELECT * FROM dev.golden_table;"

    # Load the data into a pandas DataFrame
    df = pd.read_sql_query(sql_query, conn)

    # Close the database connection
    conn.close()

    # Display the first few rows of the DataFrame
    print("Golden table loaded into DataFrame:")
    print(df.info())

except Exception as e:
    print(f"An error occurred: {e}")


Database connection successful


  df = pd.read_sql_query(sql_query, conn)


Golden table loaded into DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23038 entries, 0 to 23037
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   school_name             23038 non-null  object 
 1   school_type             23038 non-null  object 
 2   enrollment              22863 non-null  float64
 3   grade_eight_enrollment  21613 non-null  float64
 4   math_counts             22507 non-null  float64
 5   math_high_pct           22507 non-null  float64
 6   math_low_pct            19960 non-null  float64
 7   read_counts             22386 non-null  float64
 8   read_high_pct           22386 non-null  float64
 9   read_low_pct            19907 non-null  float64
 10  pct_hhi_150k_200k       23038 non-null  float64
 11  pct_hhi_220k_plus       23038 non-null  float64
 12  avg_natwalkind          23038 non-null  float64
 13  total_10_14             23038 non-null  int64  
 14  pc

In [5]:
from pycaret.regression import *

# Drop rows where the target variable is null
df= df.dropna()


# Setting up the PyCaret environment
# We will try to predict 'math_high_pct'
s = setup(data=df, 
          target='math_high_pct', 
          session_id=123,
          ignore_features=['school_name', 'ncessch', 'dupe_rank'],
          numeric_features=['enrollment', 'grade_eight_enrollment', 
                            'schools_in_zip',
                            'math_counts', 'math_low_pct', 'read_counts', 
                            'read_high_pct', 'read_low_pct',  
                            'avg_natwalkind', 'total_10_14', 'pct_10_14', 
                            'total_pop', 
                            'hhi_150k_200k', 'hhi_220k_plus',
                            'pct_hhi_150k_200k', 'pct_hhi_220k_plus'
                             ],
          categorical_features=['school_type'],
          use_gpu=True, # Set to True if you have a compatible GPU and environment
          normalize=True,
          transformation=True,
          remove_outliers=True,
          feature_selection=True)

# Compare baseline models
best_model = compare_models()

# Print the best model
print(best_model)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [I

Unnamed: 0,Description,Value
0,Session id,123
1,Target,math_high_pct
2,Target type,Regression
3,Original data shape,"(19191, 21)"
4,Transformed data shape,"(18519, 4)"
5,Transformed train set shape,"(12761, 4)"
6,Transformed test set shape,"(5758, 4)"
7,Ignore features,3
8,Numeric features,16
9,Categorical features,1


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce GTX 1660 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,6.7708,147.3247,12.1316,0.7013,0.5948,0.8114,18.066
lightgbm,Light Gradient Boosting Machine,6.8329,150.0175,12.2421,0.6958,0.5964,0.8109,1.825
gbr,Gradient Boosting Regressor,7.1292,152.7229,12.3517,0.6905,0.6064,0.8504,1.587
xgboost,Extreme Gradient Boosting,7.0257,163.0555,12.7628,0.6693,0.604,0.8151,1.308
rf,Random Forest Regressor,6.76,175.8357,13.2567,0.6431,0.6061,0.7885,1.488
knn,K Neighbors Regressor,6.9291,175.9811,13.2602,0.6431,0.615,0.8326,1.094
et,Extra Trees Regressor,6.7392,194.9646,13.9572,0.604,0.6321,0.7887,1.161
ada,AdaBoost Regressor,13.0443,259.2676,16.0846,0.4749,0.6697,0.9667,1.097
dt,Decision Tree Regressor,7.2042,280.6986,16.7402,0.4305,0.7714,0.798,0.984
lr,Linear Regression,13.462,331.405,18.1995,0.3279,0.7691,1.3685,1.039


<catboost.core.CatBoostRegressor object at 0x0000029D4C5D9EA0>


In [None]:
# Tune the best model with a more extensive search
tuned_best_model = tune_model(best_model, n_iter=50, search_library='optuna')

# Print tuned model
print(tuned_best_model)

### Advanced Tuning with a Custom Grid

For more control, you can define a custom search grid for Optuna. This allows you to specify the exact ranges and distributions for the hyperparameters you want to tune.

In [6]:
# Define a custom search grid for Optuna
# This is an example for RandomForestRegressor, adjust if your best model is different
custom_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Tune the model with the custom grid
tuned_with_grid = tune_model(best_model, custom_grid=custom_grid, n_iter=50)

# Print the model tuned with a custom grid
print(tuned_with_grid)

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 50 candidates, totalling 500 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041002 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2573
[LightGBM] [Info] Number of data points in the train set: 11484, number of used features: 16
[LightGBM] [Info] Start training from score 48.235371
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2572
[LightGBM] [Info] Number of data points in the train set: 11484, number of used features: 16
[LightGBM] [Info] Start training from score 48.423894
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000880 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total B

ValueError: 
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Leo\miniconda3\envs\milestone2\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Leo\miniconda3\envs\milestone2\lib\site-packages\pycaret\internal\pipeline.py", line 279, in fit
    clone(self.steps[-1][1]), X, y, **last_step_params["fit"]
  File "c:\Users\Leo\miniconda3\envs\milestone2\lib\site-packages\sklearn\base.py", line 91, in clone
    return _clone_parametrized(estimator, safe=safe)
  File "c:\Users\Leo\miniconda3\envs\milestone2\lib\site-packages\sklearn\base.py", line 125, in _clone_parametrized
    new_object = klass(**new_object_params)
TypeError: CatBoostRegressor.__init__() got an unexpected keyword argument 'min_samples_split'


In [None]:
# Finalize the model
# This trains the model on the entire dataset including the hold-out set
final_model = finalize_model(tuned_with_grid)
print(final_model)

In [None]:
# Save the final model pipeline
save_model(final_model, 'final_math_high_pct_model')

In [None]:
# Create an ensemble of the tuned model (bagging)
ensembled_model = ensemble_model(tuned_best_model)

# Print ensembled model
print(ensembled_model)

In [None]:
# Plot residuals
plot_model(ensembled_model, plot='residuals')

In [None]:
# Plot feature importance
plot_model(ensembled_model, plot='feature')