In [1]:
import numpy as np
import pymc as pm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression


In [17]:
import numpy as np
import pymc as pm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Prepare data
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
# Initialize and train models
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
probs_rfc = rfc.predict_proba(X_test)[:, 1]

xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
probs_xgb = xgb.predict_proba(X_test)[:, 1]

lgr = LogisticRegression(random_state=42)
lgr.fit(X_train, y_train)
probs_lgr = lgr.predict_proba(X_test)[:, 1]


In [19]:
with pm.Model() as model:
    # Priors for model weights
    weights = pm.Dirichlet('weights', a=np.array([1., 1., 1.]))  # Uniform priors for simplicity

    # Model predictions as deterministic functions of weights and individual model predictions
    model_prediction = pm.Deterministic('prediction', weights[0] * probs_rfc + 
                                                     weights[1] * probs_xgb + 
                                                     weights[2] * probs_lgr)

    # Likelihood (sampling distribution) of observations
    observed = pm.Bernoulli('obs', p=model_prediction, observed=y_test)

    # Posterior distribution
    # trace = pm.sample(2000, return_inferencedata=False)
    # Posterior distribution
    trace = pm.sample(2000, return_inferencedata=True)


# View the trace summary to inspect model performance and weight values
print(pm.summary(trace))


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [weights]


Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 2 seconds.


                  mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  \
prediction[0]    0.854  0.036   0.786    0.919      0.000    0.000    5614.0   
prediction[1]    0.906  0.027   0.855    0.954      0.000    0.000    4212.0   
prediction[2]    0.163  0.045   0.086    0.253      0.001    0.000    5450.0   
prediction[3]    0.938  0.016   0.908    0.967      0.000    0.000    5565.0   
prediction[4]    0.989  0.004   0.981    0.996      0.000    0.000    4204.0   
...                ...    ...     ...      ...        ...      ...       ...   
prediction[198]  0.079  0.019   0.048    0.115      0.000    0.000    4704.0   
prediction[199]  0.956  0.015   0.927    0.982      0.000    0.000    3788.0   
weights[0]       0.334  0.158   0.039    0.613      0.003    0.002    3427.0   
weights[1]       0.507  0.135   0.252    0.750      0.002    0.001    5148.0   
weights[2]       0.160  0.114   0.000    0.364      0.002    0.001    3592.0   

                 ess_tail  r_hat  
pred

In [46]:
with model:
    ppc = pm.sample_posterior_predictive(trace)

# print(ppc['posterior_predictive']['obs'])

# # Obtain the mean prediction from the posterior predictive distribution
mean_prediction = np.mean(ppc['posterior_predictive']['obs'], axis=0)
print(len(mean_prediction))
final_predictions = (mean_prediction > 0.5).astype(int)
len(final_predictions)


Sampling: [obs]


2000


2000

In [52]:
mp = np.mean(mean_prediction, axis=0)
len(mp)

final_predictions = (mp > 0.5).astype(int)
print(final_predictions)


<xarray.DataArray 'obs' (obs_dim_2: 200)> Size: 2kB
array([1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1])
Coordinates:
  * obs_dim_2  (obs_dim_2) int64 2kB 0 1 2 3 4 5 6 ... 194 195 196 197 198 199


In [53]:
from sklearn.metrics import accuracy_score, f1_score

print("Accuracy:", accuracy_score(y_test, final_predictions))
print("F1 Score:", f1_score(y_test, final_predictions))


Accuracy: 0.905
F1 Score: 0.9064039408866995
