In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import cohen_kappa_score, make_scorer

In [2]:
df = pd.read_csv('cosine_data.csv')
df.head()

Unnamed: 0,mode_substrt,mode_pool,median_esveg92,median_temp,median_do,median_current,median_depth,median_cond,snag_present,riprap_present,inout_present,flooded_present,cluster
0,2.0,4,3.0,24.8,6.0,0.02,1.6,287.0,1.0,0.0,0.0,1.0,5
1,2.0,4,2.0,28.1,12.8,0.06,0.45,507.0,0.0,0.0,0.0,0.0,4
2,1.0,4,2.0,25.3,7.2,0.04,0.5,561.0,0.0,0.0,0.0,0.0,4
3,1.0,4,1.0,26.7,5.8,0.02,0.6,327.0,1.0,0.0,1.0,0.0,4
4,2.0,4,1.0,21.7,7.3,0.03,0.9,470.0,0.0,0.0,1.0,0.0,4


In [3]:
X = df.drop(columns = ['cluster'])
y = df['cluster'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=5)

In [4]:
cat_cols = ['mode_pool', 'snag_present', 'riprap_present', 'inout_present', 'flooded_present', 'snag_present', 'mode_substrt']
num_cols = ['median_esveg92', 'median_temp', 'median_do', 'median_current', 'median_depth', 'median_cond']

scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [5]:
ohe = OneHotEncoder(drop = 'first', sparse_output = False).set_output(transform = 'pandas')
X_train = X_train.join(ohe.fit_transform(X_train[['mode_substrt', 'mode_pool']]))
X_test = X_test.join(ohe.transform(X_test[['mode_substrt', 'mode_pool']]))

X_train.drop(columns = ['mode_substrt', 'mode_pool'], inplace=True)
X_test.drop(columns = ['mode_substrt', 'mode_pool'], inplace=True)

In [6]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(class_weight="balanced")

In [7]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [None, 2, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_impurity_decrease': [0.0, 0.01, 0.1],
    'ccp_alpha': [0.0, 0.1, 0.2, 0.5, 0.7, 1.0] 
}

kappa_scorer = make_scorer(cohen_kappa_score)
grid_search_dt = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=kappa_scorer, cv=5)
grid_search_dt.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...ht='balanced')
,param_grid,"{'ccp_alpha': [0.0, 0.1, ...], 'max_depth': [None, 2, ...], 'min_impurity_decrease': [0.0, 0.01, ...], 'min_samples_split': [2, 3, ...]}"
,scoring,make_scorer(c...hod='predict')
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.01


In [8]:
grid_search_dt.best_score_

0.6729639375157078

In [9]:
from sklearn.metrics import confusion_matrix

# Use the best estimator to predict on the validation/test set
y_pred = grid_search_dt.best_estimator_.predict(X_test)

# Then compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[188,   0,   1,   3,   4],
       [  2,  85,  31,   1,  14],
       [  2,  20,  40,   1,   1],
       [ 12,  13,   3,  17,  23],
       [  2,   5,   0,   4,  35]], dtype=int64)

In [10]:
cohen_kappa_score(y_test, y_pred)

0.621490610081807