In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn import model_selection
from sklearn import linear_model
from sklearn import tree
plt.style.use('seaborn-v0_8')
%matplotlib inline

In [2]:
water_data = pd.read_csv('data/water_potability.csv')
water_data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [3]:
water_data['ph'] = water_data['ph'].fillna(water_data.groupby('Potability')['ph'].transform('median'))
water_data['Sulfate'] = water_data['Sulfate'].fillna(water_data.groupby('Potability')['Sulfate'].transform('median'))
water_data['Trihalomethanes'] = water_data['Trihalomethanes'].fillna(water_data.groupby('Potability')['Trihalomethanes'].transform('median'))


In [4]:
X = water_data.drop('Potability', axis=1)
y = water_data['Potability']

In [5]:
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
model_bad = tree.DecisionTreeClassifier(
    criterion='entropy',
    class_weight='balanced',
    random_state=42
)
model_bad.fit(X_train, y_train)

y_train_pred = model_bad.predict(X_train)
y_valid_pred = model_bad.predict(X_valid)

print('Train F1 score: {:.2f}'.format(metrics.f1_score(y_train, y_train_pred)))
print('Valid F1 score: {:.2f}'.format(metrics.f1_score(y_valid, y_valid_pred)))

Train F1 score: 1.00
Valid F1 score: 0.67


In [6]:
skf = model_selection.StratifiedKFold(n_splits=5)
cv_metrics = model_selection.cross_validate(
    estimator=model_bad,
    X=X,
    y=y,
    cv=skf,
    scoring='f1',
    return_train_score=True
)
display(cv_metrics)

{'fit_time': array([0.04046416, 0.02067685, 0.02029991, 0.01882195, 0.01920986]),
 'score_time': array([0.00278711, 0.00125003, 0.00108814, 0.0010941 , 0.00094318]),
 'test_score': array([0.61445783, 0.6805293 , 0.62813102, 0.63601533, 0.70259481]),
 'train_score': array([1., 1., 1., 1., 1.])}

In [7]:
print('Train k-fold mean f1: {:.2f}'.format(np.mean(cv_metrics['train_score'])))
print('Valid k-fold mean f1: {:.2f}'.format(np.mean(cv_metrics['test_score'])))

Train k-fold mean f1: 1.00
Valid k-fold mean f1: 0.65


In [8]:
print('Current depth:', model_bad.get_depth())

Current depth: 27


In [9]:
model = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    random_state=42,
    class_weight='balanced'
)
skf = model_selection.StratifiedGroupKFold(n_splits=5)
cv_metrics = model_selection.cross_validate(
    estimator=model,
    X=X,
    y=y,
    cv=skf,
    scoring='f1',
    return_train_score=True
)
print('Train k-fold mean f1: {:.2f}'.format(np.mean(cv_metrics['train_score'])))
print('Valid k-fold mean f1: {:.2f}'.format(np.mean(cv_metrics['test_score'])))

Train k-fold mean f1: 0.76
Valid k-fold mean f1: nan


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/sit