# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, precision_score
from itertools import product
from tqdm import tqdm
from joblib import Parallel, delayed

In [None]:
# Forward slashes work on Windows, Linux and macOS alike and avoid the
# invalid '\d' / '\p' escape sequences of the backslash-separated literal.
path = '../data/parkinsons.data'
data = pd.read_csv(path, sep=',')

In [None]:
# Target vector: the 'status' column.
y = data['status']
# Feature matrix: every column except the 'name' identifier and the target.
X = data.drop(['name', 'status'], axis=1)

K-FOLD CROSS-VALIDATION

In [None]:
# Hyper-parameter grid: 8 depths x 15 split sizes x 9 leaf sizes = 1080 raw
# combinations; 912 remain once those with
# min_samples_leaf >= min_samples_split are dropped below.
param_grid = {
    'max_depth': list(range(3, 11)),
    'min_samples_split': list(range(5, 20)),
    'min_samples_leaf': list(range(2, 11)),
}
n_estimators = 100  # 100 trees per forest
random_state_base = 42

# Keep only combinations whose leaf size is strictly below the split size,
# in the iteration order of product() (max_depth outermost).
combos = [
    {
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
    }
    for max_depth, min_samples_split, min_samples_leaf in product(
        param_grid['max_depth'],
        param_grid['min_samples_split'],
        param_grid['min_samples_leaf'],
    )
    if min_samples_leaf < min_samples_split
]

def evaluate_combo(params, *, rf_n_jobs=1):
    """Cross-validate one random-forest hyper-parameter combination.

    Fits a ``RandomForestClassifier`` on every split of a 5-fold stratified
    cross-validation repeated 20 times over the module-level ``X`` / ``y``
    and scores the predictions on each held-out fold.

    Args:
        params: Dict with the keys ``'max_depth'``, ``'min_samples_split'``
            and ``'min_samples_leaf'``.
        rf_n_jobs: ``n_jobs`` handed to each forest. Defaults to 1 because
            this function is dispatched through ``Parallel(n_jobs=-1)``;
            letting every worker also start one thread per core would
            oversubscribe the CPU. Pass ``-1`` when calling it standalone.
            The fitted models do not depend on this value.

    Returns:
        Dict holding the entries of ``params``, ``'n_estimators'`` and the
        mean and standard deviation (``np.std`` default, ddof=0) of the F1,
        precision and recall scores across all splits.
    """
    scorers = {
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    scores = {name: [] for name in scorers}

    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=42)
    for split_id, (train_idx, test_idx) in enumerate(rskf.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            min_samples_leaf=params['min_samples_leaf'],
            # A different but reproducible seed for every split.
            random_state=random_state_base + split_id,
            criterion='gini',
            max_features='sqrt',
            class_weight='balanced',
            n_jobs=rf_n_jobs,
        )

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        for name, scorer in scorers.items():
            scores[name].append(scorer(y_test, y_pred, average='binary'))

    # Key order matters: it becomes the column order of the results table.
    summary = {**params, 'n_estimators': n_estimators}
    for name, values in scores.items():
        summary[f'mean_{name}'] = np.mean(values)
        summary[f'std_{name}'] = np.std(values)
    return summary

# Compute: evaluate every combination in parallel; tqdm shows progress as
# the tasks are handed to the workers.
results = Parallel(n_jobs=-1)(map(delayed(evaluate_combo), tqdm(combos)))

# Results: one row per combination, highest mean F1 first.
df_result = pd.DataFrame(results)
df_result_sorted = df_result.sort_values('mean_f1', ascending=False)

In [None]:
# Display the results table (sorted by mean F1, best first).
df_result_sorted