In [41]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import cohen_kappa_score, make_scorer

In [42]:
df = pd.read_csv('cosine_data.csv')
df.head()

Unnamed: 0,mode_substrt,mode_pool,median_esveg92,median_temp,median_do,median_current,median_depth,median_cond,snag_present,riprap_present,inout_present,flooded_present,cluster
0,2.0,4,3.0,24.8,6.0,0.02,1.6,287.0,1.0,0.0,0.0,1.0,5
1,2.0,4,2.0,28.1,12.8,0.06,0.45,507.0,0.0,0.0,0.0,0.0,4
2,1.0,4,2.0,25.3,7.2,0.04,0.5,561.0,0.0,0.0,0.0,0.0,4
3,1.0,4,1.0,26.7,5.8,0.02,0.6,327.0,1.0,0.0,1.0,0.0,4
4,2.0,4,1.0,21.7,7.3,0.03,0.9,470.0,0.0,0.0,1.0,0.0,4


In [43]:
X = df.drop(columns = ['cluster'])
y = df['cluster'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=50)

In [44]:
ohe = OneHotEncoder(sparse_output = False).set_output(transform = 'pandas')
y_train = ohe.fit_transform(pd.DataFrame(y_train))
y_test = ohe.transform(pd.DataFrame(y_test))

In [45]:
cat_cols = ['mode_pool', 'snag_present', 'riprap_present', 'inout_present', 'flooded_present', 'snag_present', 'mode_substrt']
num_cols = ['median_esveg92', 'median_temp', 'median_do', 'median_current', 'median_depth', 'median_cond']

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [46]:
ohe = OneHotEncoder(drop = 'first', sparse_output = False).set_output(transform = 'pandas')
X_train = X_train.join(ohe.fit_transform(X_train[['mode_substrt', 'mode_pool']]))
X_test = X_test.join(ohe.transform(X_test[['mode_substrt', 'mode_pool']]))

X_train.drop(columns = ['mode_substrt', 'mode_pool'], inplace=True)
X_test.drop(columns = ['mode_substrt', 'mode_pool'], inplace=True)

## Tree 1

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_1 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster1 = GridSearchCV(estimator=clf_1, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster1.fit(X_train, y_train["cluster_1"])

y_pred = best_dt_cluster1.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_1"], y_pred)
cm

array([[695,  21],
       [ 15, 451]], dtype=int64)

## Tree 2

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_2 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster2 = GridSearchCV(estimator=clf_2, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster2.fit(X_train, y_train["cluster_2"])

y_pred = best_dt_cluster2.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_2"], y_pred)
cm

array([[795,  57],
       [  2, 328]], dtype=int64)

## Tree 3

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_3 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster3 = GridSearchCV(estimator=clf_3, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster3.fit(X_train, y_train["cluster_3"])

y_pred = best_dt_cluster3.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_3"], y_pred)
cm

array([[853, 176],
       [ 31, 122]], dtype=int64)

## Tree 4

In [50]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_4 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster4 = GridSearchCV(estimator=clf_4, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster4.fit(X_train, y_train["cluster_4"])

y_pred = best_dt_cluster4.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_4"], y_pred)
cm

array([[1007,   39],
       [   0,  136]], dtype=int64)

## Tree 5

In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_5 = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster5 = GridSearchCV(estimator=clf_5, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster5.fit(X_train, y_train["cluster_5"])

y_pred = best_dt_cluster5.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train["cluster_5"], y_pred)
cm

array([[943, 142],
       [ 15,  82]], dtype=int64)

## Meta-Classifier

In [52]:
X_train['cluster_1'] = best_dt_cluster1.predict(X_train)
X_train['cluster_2'] = best_dt_cluster2.predict(X_train.drop(columns = ['cluster_1']))
X_train['cluster_3'] = best_dt_cluster3.predict(X_train.drop(columns = ['cluster_1', 'cluster_2']))
X_train['cluster_4'] = best_dt_cluster4.predict(X_train.drop(columns = ['cluster_1', 'cluster_2', 'cluster_3']))
X_train['cluster_5'] = best_dt_cluster5.predict(X_train.drop(columns = ['cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']))
X_train

Unnamed: 0,median_esveg92,median_temp,median_do,median_current,median_depth,median_cond,snag_present,riprap_present,inout_present,flooded_present,...,mode_substrt_4.0,mode_pool_08,mode_pool_13,mode_pool_26,mode_pool_LG,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5
1270,-0.945861,1.328087,-0.238118,-0.486823,-0.769422,0.316987,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
895,-0.945861,1.224681,-0.349336,-0.186026,-0.066161,0.960325,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1560,-0.945861,0.793825,-0.757136,0.816631,-0.206813,2.100787,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
123,1.581662,0.104455,-2.499551,-0.286292,0.777752,-0.618777,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1627,-0.945861,1.104041,1.800879,-0.486823,2.887534,-0.158206,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.739154,-2.101528,-1.572734,-0.486823,-0.066161,-2.424509,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
132,1.581662,0.552546,-1.572734,-0.386558,1.481013,0.624034,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1313,0.739154,-1.291518,0.243826,-0.486823,-0.206813,-0.399457,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
109,1.581662,0.431906,-0.757136,-0.386558,-0.136487,-0.092410,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [53]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

clf_meta = DecisionTreeClassifier(class_weight="balanced")

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
best_dt_cluster_meta = GridSearchCV(estimator=clf_meta, param_grid=param_grid, scoring=kappa_scorer, cv=5)
best_dt_cluster_meta.fit(X_train, y_train.idxmax(axis=1))

y_pred = best_dt_cluster_meta.best_estimator_.predict(X_train)
cm = confusion_matrix(y_train.idxmax(axis=1), y_pred)
cm

array([[447,   1,   3,  11,   4],
       [  4, 316,   1,   8,   1],
       [  0,  37, 112,   1,   3],
       [  0,   0,   0, 136,   0],
       [  3,  10,   4,  19,  61]], dtype=int64)

In [54]:
pd.DataFrame(best_dt_cluster_meta.cv_results_).sort_values("rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_max_depth,param_min_impurity_decrease,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
65,0.005019,0.000591,0.002583,0.000383,0.0,10,0.01,2,"{'ccp_alpha': 0.0, 'max_depth': 10, 'min_impur...",0.849557,0.878221,0.882844,0.878327,0.871698,0.872129,0.011832,1
66,0.005435,0.000730,0.002770,0.000323,0.0,10,0.01,3,"{'ccp_alpha': 0.0, 'max_depth': 10, 'min_impur...",0.849557,0.878221,0.882844,0.878327,0.871698,0.872129,0.011832,1
37,0.005063,0.000585,0.003156,0.000511,0.0,5,0.01,5,"{'ccp_alpha': 0.0, 'max_depth': 5, 'min_impuri...",0.849557,0.878221,0.882844,0.878327,0.871698,0.872129,0.011832,1
38,0.004805,0.000995,0.002226,0.000595,0.0,5,0.01,10,"{'ccp_alpha': 0.0, 'max_depth': 5, 'min_impuri...",0.849557,0.878221,0.882844,0.878327,0.871698,0.872129,0.011832,1
39,0.002849,0.000365,0.001540,0.000494,0.0,5,0.01,20,"{'ccp_alpha': 0.0, 'max_depth': 5, 'min_impuri...",0.849557,0.878221,0.882844,0.878327,0.871698,0.872129,0.011832,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,0.002757,0.000310,0.001396,0.000503,0.5,2,0.01,3,"{'ccp_alpha': 0.5, 'max_depth': 2, 'min_impuri...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,151
245,0.002724,0.000410,0.001457,0.000451,0.5,2,0.01,2,"{'ccp_alpha': 0.5, 'max_depth': 2, 'min_impuri...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,151
244,0.002353,0.000375,0.001884,0.000283,0.5,2,0.0,20,"{'ccp_alpha': 0.5, 'max_depth': 2, 'min_impuri...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,151
242,0.002729,0.000597,0.001775,0.000393,0.5,2,0.0,5,"{'ccp_alpha': 0.5, 'max_depth': 2, 'min_impuri...",0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,151


## Testing

In [55]:
X_test['cluster_1'] = best_dt_cluster1.predict(X_test)
X_test['cluster_2'] = best_dt_cluster2.predict(X_test.drop(columns = ['cluster_1']))
X_test['cluster_3'] = best_dt_cluster3.predict(X_test.drop(columns = ['cluster_1', 'cluster_2']))
X_test['cluster_4'] = best_dt_cluster4.predict(X_test.drop(columns = ['cluster_1', 'cluster_2', 'cluster_3']))
X_test['cluster_5'] = best_dt_cluster5.predict(X_test.drop(columns = ['cluster_1', 'cluster_2', 'cluster_3', 'cluster_4']))
X_test

Unnamed: 0,median_esveg92,median_temp,median_do,median_current,median_depth,median_cond,snag_present,riprap_present,inout_present,flooded_present,...,mode_substrt_4.0,mode_pool_08,mode_pool_13,mode_pool_26,mode_pool_LG,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5
1061,-0.103353,0.087221,-0.275191,-0.486823,-0.488118,-0.677262,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1161,-0.945861,0.362969,1.837951,-0.486823,-0.488118,-0.267866,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1205,-0.945861,1.035104,-0.423482,-0.486823,-0.910074,3.950383,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1396,-0.945861,0.483609,2.245751,-0.486823,-0.910074,-0.465253,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
29,1.581662,0.966167,0.799917,-0.085760,-0.417792,0.236570,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,1.581662,-0.309167,-0.126900,0.616100,-0.741292,-0.903893,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
412,-0.103353,-0.515977,0.651626,2.320618,-0.066161,-0.231312,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
521,-0.103353,0.931699,1.430152,0.014505,-0.066161,-0.202070,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
733,-0.945861,0.535311,-0.015682,-0.486823,-0.066161,1.289304,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [56]:
y_pred = best_dt_cluster_meta.best_estimator_.predict(X_test)
cm = confusion_matrix(y_test.idxmax(axis=1), y_pred)
cm

array([[185,   0,   1,   6,   2],
       [  0, 103,  13,   8,   7],
       [  1,  43,  27,   6,   0],
       [  5,   9,   2,  35,  13],
       [  2,  13,   3,   9,  14]], dtype=int64)

In [57]:
import numpy as np

In [58]:
# Total number of samples
n = cm.sum()

# Observed agreement
po = np.trace(cm) / n

# Expected agreement
row_totals = cm.sum(axis=1)
col_totals = cm.sum(axis=0)
pe = np.sum(row_totals * col_totals) / n**2

# Cohen's kappa
kappa = (po - pe) / (1 - pe)

In [59]:
kappa

0.6153487014881821