In [25]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import GridSearchCV

In [26]:
# Read the dataset from the working directory and preview the first rows.
DATA_PATH = 'cosine_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,mode_substrt,mode_pool,median_esveg92,median_temp,median_do,median_current,median_depth,median_cond,snag_present,riprap_present,inout_present,flooded_present,cluster
0,2.0,4,3.0,24.8,6.0,0.02,1.6,287.0,1.0,0.0,0.0,1.0,5
1,2.0,4,2.0,28.1,12.8,0.06,0.45,507.0,0.0,0.0,0.0,0.0,4
2,1.0,4,2.0,25.3,7.2,0.04,0.5,561.0,0.0,0.0,0.0,0.0,4
3,1.0,4,1.0,26.7,5.8,0.02,0.6,327.0,1.0,0.0,1.0,0.0,4
4,2.0,4,1.0,21.7,7.3,0.03,0.9,470.0,0.0,0.0,1.0,0.0,4


In [27]:
# Separate the target label from the predictor columns.
# NOTE(review): the preview of `df` shows an 'Unnamed: 0' column; if that is a
# saved row index it remains in X as a feature -- confirm and drop it if so.
y = df['cluster']
X = df.drop(columns='cluster')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=5)
X_train, X_test, y_train, y_test = split_parts

In [28]:
# Columns treated as categorical. Each name is listed exactly once so the list can
# be used directly as a column selector without producing duplicate columns.
cat_cols = ['mode_pool', 'snag_present', 'riprap_present', 'inout_present', 'flooded_present', 'mode_substrt']
# Columns treated as numeric and standardised below.
num_cols = ['median_esveg92', 'median_temp', 'median_do', 'median_current', 'median_depth', 'median_cond']

# Work on explicit copies: the frames returned by train_test_split are derived from
# X by row indexing, so assigning columns on them can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only and reuse those statistics for the test
# rows, so nothing from the held-out data influences the preprocessing.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [29]:
# One-hot encode the two multi-level categorical columns. The first level of each
# is dropped, and the encoder returns a dense pandas frame so the result can be
# joined back onto the features by index.
ohe_cols = ['mode_substrt', 'mode_pool']
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.set_output(transform='pandas')

# Categories are learned from the training rows only and then applied to the test rows.
train_dummies = ohe.fit_transform(X_train[ohe_cols])
test_dummies = ohe.transform(X_test[ohe_cols])

# Attach the indicator columns and remove the raw categorical columns they replace.
X_train = X_train.join(train_dummies).drop(columns=ohe_cols)
X_test = X_test.join(test_dummies).drop(columns=ohe_cols)

## Stacked Decision Trees

In [30]:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier

In [31]:
# Five base decision trees named dt1..dt5 that differ only in their random seed.
estimators = [
    (f'dt{seed}', DecisionTreeClassifier(random_state=seed))
    for seed in range(1, 6)
]

# Stacking ensemble: a decision-tree meta-learner is trained on the base trees'
# predictions. passthrough=True additionally feeds it the original features.
# The meta-learner is seeded like every other stochastic component in this
# notebook; without a random_state its tie-breaking between equally good splits
# varies from run to run, so the fitted stack and its scores are not reproducible.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=DecisionTreeClassifier(random_state=0),
    passthrough=True
)

In [47]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter candidates applied identically to every decision tree in the stack.
shared_grid = {
    'max_depth': [None, 2, 5],
    'min_samples_split': [2, 3, 5],
    'min_impurity_decrease': [0.0, 0.01, 0.1]
}

# Parameter-name prefixes: the five base learners first, then the meta-learner.
tree_prefixes = [f'dt{idx}' for idx in range(1, 6)] + ['final_estimator']

# scikit-learn addresses nested estimator parameters as '<component>__<parameter>'.
param_distributions = {
    f'{prefix}__{name}': candidates
    for prefix in tree_prefixes
    for name, candidates in shared_grid.items()
}

In [48]:
# Display the assembled search space to check the generated parameter names.
param_distributions

{'dt1__max_depth': [None, 2, 5],
 'dt1__min_samples_split': [2, 3, 5],
 'dt1__min_impurity_decrease': [0.0, 0.01, 0.1],
 'dt2__max_depth': [None, 2, 5],
 'dt2__min_samples_split': [2, 3, 5],
 'dt2__min_impurity_decrease': [0.0, 0.01, 0.1],
 'dt3__max_depth': [None, 2, 5],
 'dt3__min_samples_split': [2, 3, 5],
 'dt3__min_impurity_decrease': [0.0, 0.01, 0.1],
 'dt4__max_depth': [None, 2, 5],
 'dt4__min_samples_split': [2, 3, 5],
 'dt4__min_impurity_decrease': [0.0, 0.01, 0.1],
 'dt5__max_depth': [None, 2, 5],
 'dt5__min_samples_split': [2, 3, 5],
 'dt5__min_impurity_decrease': [0.0, 0.01, 0.1],
 'final_estimator__max_depth': [None, 2, 5],
 'final_estimator__min_samples_split': [2, 3, 5],
 'final_estimator__min_impurity_decrease': [0.0, 0.01, 0.1]}

In [49]:
# Rank candidates by Cohen's kappa rather than the default classifier score.
kappa_scorer = make_scorer(cohen_kappa_score)

# Settings for the randomized hyperparameter search over the stacking ensemble.
search_options = {
    'param_distributions': param_distributions,
    'n_iter': 30,         # number of random hyperparameter combinations to try
    'scoring': kappa_scorer,
    'cv': 5,              # 5-fold cross-validation per combination
    'random_state': 42,   # fixes which combinations are sampled
    'verbose': 1,
    'n_jobs': -1,         # run fits in parallel on all available cores
}
random_search = RandomizedSearchCV(stack, **search_options)

random_search.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [50]:
from sklearn.metrics import confusion_matrix

# Predict on the TRAINING data with the best estimator found by the search.
# This is an in-sample check: compare it with the held-out results to judge
# how much the model overfits.
y_pred = random_search.best_estimator_.predict(X_train)

# Confusion matrix for the training set (rows: true cluster, columns: predicted).
cm = confusion_matrix(y_train, y_pred)
cm

array([[451,   6,   0,   1,   6],
       [  6, 236,  49,   3,  34],
       [  6,  61,  95,   3,   1],
       [ 12,  18,   5,  63,  34],
       [  4,  12,   0,  15,  61]], dtype=int64)

In [51]:
# Mean cross-validated score (Cohen's kappa, per the `scoring` argument) of the best candidate.
random_search.best_score_

0.6778317092933056

In [52]:
from sklearn.metrics import confusion_matrix

# Evaluate the tuned stack on the held-out test split.
best_stack = random_search.best_estimator_
y_pred = best_stack.predict(X_test)

# Test-set confusion matrix (rows: true cluster, columns: predicted cluster).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[188,   0,   1,   4,   3],
       [  2,  85,  31,   1,  14],
       [  2,  20,  40,   1,   1],
       [ 12,  13,   3,  18,  22],
       [  2,   5,   0,   6,  33]], dtype=int64)

In [53]:
import numpy as np

In [54]:
# Total number of samples
n = cm.sum()

# Observed agreement
po = np.trace(cm) / n

# Expected agreement
row_totals = cm.sum(axis=1)
col_totals = cm.sum(axis=0)
pe = np.sum(row_totals * col_totals) / n**2

# Cohen's kappa
kappa = (po - pe) / (1 - pe)

In [55]:
# Show the kappa value derived from the most recently computed confusion matrix `cm`.
kappa

0.6186486145300764