In [1]:
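# One-time environment setup: run these in a shell (or uncomment them here)
# before executing the rest of the notebook.
# pip install frozendict
# pip install pyarrow
# pip install -e ./modeling_package
# pip install "ray[default]"  # quoted so shells such as zsh do not glob the brackets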

In [2]:
# TODO: add p-value to feature selection

In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local packages take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from standard_modeling.utilities import StatsModelsRegressor, gen_scorer

In [5]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import numpy as np

# ---------------------------------------------------------------------------
# Data: diabetes regression set as (features, target) arrays, with 20% of the
# rows held out using a fixed seed so the split is repeatable.
# ---------------------------------------------------------------------------
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------------------------------------------
# Stand-alone fit: wrap statsmodels' OLS in the project's regressor wrapper
# and train it on the training rows only.
# ---------------------------------------------------------------------------
model = StatsModelsRegressor(model_class=sm.OLS)
model.fit(X_train, y_train)

# statsmodels side of the wrapper: the regression summary table.
print(model.summary)

# sklearn side of the wrapper: predictions and a score on the held-out rows.
# `y_pred` is kept at notebook scope because later cells reuse it.
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)
print(f"R^2 Score: {score:.4f}")

# ---------------------------------------------------------------------------
# Pipeline usage: standardise the features, then fit the same OLS wrapper.
# ---------------------------------------------------------------------------
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('regressor', StatsModelsRegressor(model_class=sm.OLS)),
    ]
)

# 5-fold cross-validation of the whole pipeline on the full dataset, using the
# estimator's default scoring.
cv_scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=5)
print(
    f"Cross-validation scores: {cv_scores}",
    f"Mean CV score: {cv_scores.mean():.4f}",
    sep="\n",
)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.528
Model:                            OLS   Adj. R-squared:                  0.514
Method:                 Least Squares   F-statistic:                     38.25
Date:                Mon, 17 Mar 2025   Prob (F-statistic):           5.41e-50
Time:                        21:52:21   Log-Likelihood:                -1906.1
No. Observations:                 353   AIC:                             3834.
Df Residuals:                     342   BIC:                             3877.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        151.3456      2.902     52.155      0.0

In [6]:
# Build a metric callable from the project helper and apply it to the held-out
# targets and the OLS predictions (`y_test`, `y_pred`) from the previous cell.
# NOTE(review): assumes gen_scorer('r2') returns a plain R^2 metric taking
# (y_true, y_pred) — confirm against standard_modeling.utilities.
scorer = gen_scorer('r2')
# Bare last expression so the notebook renders the value as the cell output.
scorer(y_test, y_pred)

0.45260276297192037