In [39]:
import pandas as pd

from sklearn.model_selection import test_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import cohen_kappa_score, make_scorer

In [40]:
df = pd.read_csv('cosine_data.csv')
df.head()

Unnamed: 0,mode_substrt,mode_pool,median_esveg92,median_temp,median_do,median_current,median_depth,median_cond,snag_present,riprap_present,inout_present,flooded_present,cluster
0,2.0,4,3.0,24.8,6.0,0.02,1.6,287.0,1.0,0.0,0.0,1.0,5
1,2.0,4,2.0,28.1,12.8,0.06,0.45,507.0,0.0,0.0,0.0,0.0,4
2,1.0,4,2.0,25.3,7.2,0.04,0.5,561.0,0.0,0.0,0.0,0.0,4
3,1.0,4,1.0,26.7,5.8,0.02,0.6,327.0,1.0,0.0,1.0,0.0,4
4,2.0,4,1.0,21.7,7.3,0.03,0.9,470.0,0.0,0.0,1.0,0.0,4


In [41]:
X = df.drop(columns = ['cluster'])
y = df['cluster'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=5)

In [42]:
ohe = OneHotEncoder(sparse_output = False).set_output(transform = 'pandas')
y_train = ohe.fit_transform(pd.DataFrame(y_train))
y_test = ohe.transform(pd.DataFrame(y_test))

In [43]:
cat_cols = ['mode_pool', 'snag_present', 'riprap_present', 'inout_present', 'flooded_present', 'snag_present', 'mode_substrt']
num_cols = ['median_esveg92', 'median_temp', 'median_do', 'median_current', 'median_depth', 'median_cond']

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [44]:
ohe = OneHotEncoder(drop = 'first', sparse_output = False).set_output(transform = 'pandas')
X_train = X_train.join(ohe.fit_transform(X_train[['mode_substrt', 'mode_pool']]))
X_test = X_test.join(ohe.transform(X_test[['mode_substrt', 'mode_pool']]))

X_train.drop(columns = ['mode_substrt', 'mode_pool'], inplace=True)
X_test.drop(columns = ['mode_substrt', 'mode_pool'], inplace=True)

## Tree 1

In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_1 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster1 = GridSearchCV(estimator=clf_1, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster1.fit(X_train, y_train["cluster_1"])

y_pred = best_dt_cluster1.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_1"], y_pred)
cm

array([[698,  20],
       [ 13, 451]], dtype=int64)

## Tree 2

In [46]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_2 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster2 = GridSearchCV(estimator=clf_2, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster2.fit(X_train, y_train["cluster_2"])

y_pred = best_dt_cluster2.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_2"], y_pred)
cm

array([[795,  59],
       [  0, 328]], dtype=int64)

## Tree 3

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_3 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster3 = GridSearchCV(estimator=clf_3, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster3.fit(X_train, y_train["cluster_3"])

y_pred = best_dt_cluster3.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_3"], y_pred)
cm

array([[859, 157],
       [ 40, 126]], dtype=int64)

## Tree 4

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_4 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster4 = GridSearchCV(estimator=clf_4, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster4.fit(X_train, y_train["cluster_4"])

y_pred = best_dt_cluster4.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_4"], y_pred)
cm

array([[1023,   27],
       [   0,  132]], dtype=int64)

## Tree 5

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_5 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster5 = GridSearchCV(estimator=clf_5, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster5.fit(X_train, y_train["cluster_5"])

y_pred = best_dt_cluster5.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_5"], y_pred)
cm

array([[945, 145],
       [ 16,  76]], dtype=int64)

## Meta-Classifier

In [50]:
X_train['cluster_1'] = best_dt_cluster1.predict(X_train)
X_train['cluster_2'] = best_dt_cluster2.predict(X_train.drop(columns = ['cluster_1']))
X_train['cluster_3'] = best_dt_cluster3.predict(X_train.drop(columns = ['cluster_1', 'cluster_2']))
X_train['cluster_4'] = best_dt_cluster4.predict(X_train.drop(columns = ['cluster_1', 'cluster_2', 'cluster_3']))
X_train['cluster_5'] = best_dt_cluster5.predict(X_train.drop(columns = ['cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']))
X_train

Unnamed: 0,median_esveg92,median_temp,median_do,median_current,median_depth,median_cond,snag_present,riprap_present,inout_present,flooded_present,...,mode_substrt_4.0,mode_pool_08,mode_pool_13,mode_pool_26,mode_pool_LG,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5
358,0.728222,0.328749,1.994878,-0.490111,-0.353596,-0.459650,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1081,-0.946405,-0.904116,-1.650241,-0.490111,-0.920211,-0.423405,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
820,-0.109092,-0.322908,-0.199633,1.944790,0.213020,-0.829339,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1545,-0.946405,0.328749,-0.162437,-0.490111,-0.353596,-0.118955,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
207,1.565536,0.540098,0.209513,-0.295319,0.213020,-0.872832,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1424,-0.946405,-1.555773,0.916220,-0.490111,0.071366,0.033270,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1142,-0.109092,-0.481419,-0.125242,-0.490111,-0.353596,-0.104458,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
998,-0.946405,-0.305296,-1.910607,-0.490111,-0.211942,2.555856,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
206,1.565536,0.046951,-1.166705,-0.490111,0.000539,-0.481396,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_meta = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster_meta = GridSearchCV(estimator=clf_meta, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster_meta.fit(X_train, y_train.idxmax(axis=1))

y_pred = best_dt_cluster_meta.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train.idxmax(axis=1), y_pred)
cm

array([[450,   2,   2,   5,   5],
       [  3, 316,   0,   9,   0],
       [  2,  42, 117,   2,   3],
       [  0,   0,   0, 132,   0],
       [  4,   8,   4,  11,  65]], dtype=int64)

## Testing

In [65]:
X_test['cluster_1'] = best_dt_cluster1.predict(X_test)
X_test['cluster_2'] = best_dt_cluster2.predict(X_test.drop(columns = ['cluster_1']))
X_test['cluster_3'] = best_dt_cluster3.predict(X_test.drop(columns = ['cluster_1', 'cluster_2']))
X_test['cluster_4'] = best_dt_cluster4.predict(X_test.drop(columns = ['cluster_1', 'cluster_2', 'cluster_3']))
X_test['cluster_5'] = best_dt_cluster5.predict(X_test.drop(columns = ['cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']))
X_test

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- cluster_1
- cluster_2
- cluster_3
- cluster_4
- cluster_5


In [66]:
y_pred = best_dt_cluster_meta.best_estimator_.predict(X_test)
cm = confusion_matrix(y_test.idxmax(axis=1), y_pred)
cm

array([[187,   0,   0,   8,   1],
       [  1,  91,  23,  12,   6],
       [  0,  29,  33,   1,   1],
       [  8,   6,   2,  42,  10],
       [  1,  10,   1,  15,  19]], dtype=int64)

In [67]:
import numpy as np

In [68]:
# Total number of samples
n = cm.sum()

# Observed agreement
po = np.trace(cm) / n

# Expected agreement
row_totals = cm.sum(axis=1)
col_totals = cm.sum(axis=0)
pe = np.sum(row_totals * col_totals) / n**2

# Cohen's kappa
kappa = (po - pe) / (1 - pe)

In [69]:
kappa

0.6389403218914684