# Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from lazypredict.Supervised import LazyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_colwidth', None)

# Import Dataset

In [2]:
# fetch dataset 
raw_df = pd.read_csv('dataset/Framingham Dataset.csv')

In [3]:
raw_df

Unnamed: 0,RANDID,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,...,CVD,HYPERTEN,TIMEAP,TIMEMI,TIMEMIFC,TIMECHD,TIMESTRK,TIMECVD,TIMEDTH,TIMEHYP
0,2448,1,195.00,39,106.00,70.00,0,0.00,26.97,0,...,1,0,8766,6438,6438,6438,8766,6438,8766,8766
1,2448,1,209.00,52,121.00,66.00,0,0.00,,0,...,1,0,8766,6438,6438,6438,8766,6438,8766,8766
2,6238,2,250.00,46,121.00,81.00,0,0.00,28.73,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766
3,6238,2,260.00,52,105.00,69.50,0,0.00,29.43,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766
4,6238,2,237.00,58,108.00,66.00,0,0.00,28.50,0,...,0,0,8766,8766,8766,8766,8766,8766,8766,8766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11622,9998212,1,173.00,46,126.00,82.00,0,0.00,19.17,0,...,0,1,8766,8766,8766,8766,8766,8766,8766,0
11623,9998212,1,153.00,52,143.00,89.00,0,0.00,25.74,0,...,0,1,8766,8766,8766,8766,8766,8766,8766,0
11624,9999312,2,196.00,39,133.00,86.00,1,30.00,20.91,0,...,0,1,8766,8766,8766,8766,8766,8766,8766,4201
11625,9999312,2,240.00,46,138.00,79.00,1,20.00,26.39,0,...,0,1,8766,8766,8766,8766,8766,8766,8766,4201


# EDA

In [4]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11627 entries, 0 to 11626
Data columns (total 39 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   RANDID    11627 non-null  int64  
 1   SEX       11627 non-null  int64  
 2   TOTCHOL   11218 non-null  float64
 3   AGE       11627 non-null  int64  
 4   SYSBP     11627 non-null  float64
 5   DIABP     11627 non-null  float64
 6   CURSMOKE  11627 non-null  int64  
 7   CIGPDAY   11548 non-null  float64
 8   BMI       11575 non-null  float64
 9   DIABETES  11627 non-null  int64  
 10  BPMEDS    11034 non-null  float64
 11  HEARTRTE  11621 non-null  float64
 12  GLUCOSE   10187 non-null  float64
 13  educ      11332 non-null  float64
 14  PREVCHD   11627 non-null  int64  
 15  PREVAP    11627 non-null  int64  
 16  PREVMI    11627 non-null  int64  
 17  PREVSTRK  11627 non-null  int64  
 18  PREVHYP   11627 non-null  int64  
 19  TIME      11627 non-null  int64  
 20  PERIOD    11627 non-null  in

In [5]:
# Initialize empty lists for categorical, numerical, and binary features
categorical = []
numerical = []

# Iterate through columns and append to appropriate list
for column in raw_df.columns:
    if raw_df[column].dtype == 'object':
        categorical.append(column)
    elif raw_df[column].dtype in ['float64', 'int64']:
        numerical.append(column)
        
print('Categorical:', categorical)
print('Numerical:', numerical)


Categorical: []
Numerical: ['RANDID', 'SEX', 'TOTCHOL', 'AGE', 'SYSBP', 'DIABP', 'CURSMOKE', 'CIGPDAY', 'BMI', 'DIABETES', 'BPMEDS', 'HEARTRTE', 'GLUCOSE', 'educ', 'PREVCHD', 'PREVAP', 'PREVMI', 'PREVSTRK', 'PREVHYP', 'TIME', 'PERIOD', 'HDLC', 'LDLC', 'DEATH', 'ANGINA', 'HOSPMI', 'MI_FCHD', 'ANYCHD', 'STROKE', 'CVD', 'HYPERTEN', 'TIMEAP', 'TIMEMI', 'TIMEMIFC', 'TIMECHD', 'TIMESTRK', 'TIMECVD', 'TIMEDTH', 'TIMEHYP']


In [6]:
# # Calculate number of rows and columns for the grid
# num_cols = 3  # Number of columns in the grid
# num_rows_numerical = (len(numerical) + num_cols - 1) // num_cols  # Calculate number of rows needed for numerical features
# num_rows_categorical = (len(categorical) + num_cols - 1) // num_cols  # Calculate number of rows needed for categorical features

# # Plot numerical features in a grid
# plt.figure(figsize=(20, 4 * num_rows_numerical))  # Adjust height based on number of rows

# for i, feature in enumerate(numerical):
#     plt.subplot(num_rows_numerical, num_cols, i + 1)
#     sns.histplot(data=raw_df, x=feature, bins=15, hue='CVD')
#     plt.title(f'Histogram of {feature}')
#     plt.xlabel(feature)
#     plt.ylabel('Frequency')

# plt.tight_layout()
# plt.show()


# # Plot categorical features in a grid
# plt.figure(figsize=(20, 4 * num_rows_categorical))  # Adjust height based on number of rows

# for i, feature in enumerate(categorical):
#     plt.subplot(num_rows_categorical, num_cols, i + 1)
#     sns.countplot(y=feature, data=raw_df, hue='CVD')
#     plt.title(f'Bar Plot of {feature}')
#     plt.xlabel('Count')
#     plt.ylabel(feature)

# plt.tight_layout()
# plt.show()

In [7]:
# # Get value counts for all columns
# value_counts = {column: raw_df[column].value_counts() for column in raw_df.columns}

# # Print value counts for all columns
# for column, counts in value_counts.items():
#     print(f"Value counts for column '{column}':")
#     print(counts)
#     print()

# Preprocessing
## Handling Missing Values

In [8]:
# counter percentage missing value in dataset (Function)
def missing_value(data_frame):
    total = data_frame.isnull().sum().sort_values(ascending=False)
    percentage = (data_frame.isnull().sum() / data_frame.isnull().count() * 100).sort_values(ascending=False)
    dtypes = data_frame.dtypes
    return pd.concat([total, percentage, dtypes], axis=1, keys=['Total', 'Percentage', 'Dtypes'])

In [9]:
missing_value(raw_df)

Unnamed: 0,Total,Percentage,Dtypes
LDLC,8601,73.97,float64
HDLC,8600,73.97,float64
GLUCOSE,1440,12.38,float64
BPMEDS,593,5.1,float64
TOTCHOL,409,3.52,float64
educ,295,2.54,float64
CIGPDAY,79,0.68,float64
BMI,52,0.45,float64
HEARTRTE,6,0.05,float64
TIMEAP,0,0.0,int64


In [10]:
def impute_median(data):
    return data.fillna(data.median())

raw_df['TOTCHOL'] = raw_df['TOTCHOL'].transform(impute_median)
raw_df['CIGPDAY'] = raw_df['CIGPDAY'].transform(impute_median)
raw_df['BMI'] = raw_df['BMI'].transform(impute_median)
raw_df['BPMEDS'] = raw_df['BPMEDS'].transform(impute_median)
raw_df['HEARTRTE'] = raw_df['HEARTRTE'].transform(impute_median)
raw_df['GLUCOSE'] = raw_df['GLUCOSE'].transform(impute_median)
raw_df['educ'] = raw_df['educ'].transform(impute_median)

In [11]:
missing_value(raw_df)

Unnamed: 0,Total,Percentage,Dtypes
LDLC,8601,73.97,float64
HDLC,8600,73.97,float64
CVD,0,0.0,int64
DEATH,0,0.0,int64
ANGINA,0,0.0,int64
HOSPMI,0,0.0,int64
MI_FCHD,0,0.0,int64
ANYCHD,0,0.0,int64
STROKE,0,0.0,int64
RANDID,0,0.0,int64


## Outliers

In [12]:
# # Assuming `raw_df` is your dataframe and `numerical` is a list of numerical column names
# fig, axes = plt.subplots(nrows=num_rows_numerical, ncols=num_cols, figsize=(15, 5 * num_rows_numerical))

# # Flatten the axes array to make it easier to iterate over
# axes = axes.flatten()

# # Create box plots for each selected feature
# for i, feature in enumerate(numerical):
#     sns.boxplot(raw_df[feature], color='blue', ax=axes[i])
#     axes[i].set_title(f'{feature} Distribution')
#     axes[i].set_xlabel(f'Amount Spent on {feature}')
#     axes[i].set_ylabel('Amount')

# # Remove any extra subplots if there are fewer features than subplots
# for j in range(i + 1, num_rows_numerical * num_cols):
#     fig.delaxes(axes[j])

# # Adjust layout
# plt.tight_layout()
# plt.show()


In [13]:
raw_df = raw_df[raw_df['CIGPDAY'] <= 40]
raw_df = raw_df[(raw_df['DIABP'] > 40) & (raw_df['DIABP'] < 140)]
raw_df = raw_df[(raw_df['BMI'] <= 50) & (raw_df['BMI'] >= 20)]
raw_df = raw_df[raw_df['HEARTRTE'] <= 200]
raw_df = raw_df[raw_df['GLUCOSE'] < 450]
raw_df = raw_df[raw_df['TOTCHOL'] < 500]

In [14]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# import numpy as np

# # Assuming `raw_df` is your dataframe and `numerical` is a list of numerical column names
# fig, axes = plt.subplots(nrows=num_rows_numerical, ncols=num_cols, figsize=(15, 5 * num_rows_numerical))

# # Flatten the axes array to make it easier to iterate over
# axes = axes.flatten()

# # Create box plots for each selected feature
# for i, feature in enumerate(numerical):
#     sns.boxplot(raw_df[feature], color='blue', ax=axes[i])
#     axes[i].set_title(f'{feature} Distribution')
#     axes[i].set_xlabel(f'Amount Spent on {feature}')
#     axes[i].set_ylabel('Amount')

# # Remove any extra subplots if there are fewer features than subplots
# for j in range(i + 1, num_rows_numerical * num_cols):
#     fig.delaxes(axes[j])

# # Adjust layout
# plt.tight_layout()
# plt.show()


## Drop Absurd data

In [15]:
raw_df.drop(columns=['LDLC', 'HDLC', 'RANDID'], inplace=True)

In [16]:
filtered_df = raw_df[(raw_df['STROKE'] == 0) & (raw_df['CVD'] == 1) ]

indices_to_drop = filtered_df.index
raw_df.drop(indices_to_drop, inplace=True)

In [17]:
raw_df.drop(columns=['TIMEHYP', 'TIMEDTH', 'TIMESTRK', 'TIMECHD', 'TIMEMI', 'TIME', 'TIMEAP','TIMECVD', 'TIMEMIFC', 'PERIOD'], inplace=True)

In [18]:
raw_df.drop(columns=['CVD','ANGINA', 'ANYCHD', 'HOSPMI', 'MI_FCHD', 'DEATH'], inplace=True)

# Features Correlations

In [19]:
# # Calculate correlations with 'Class_Mapped'
# correlations = raw_df.corr()['CVD'].drop('CVD')

# # Plotting the correlations with explicit hue assignment
# plt.figure(figsize=(10, 6))
# sns.barplot(x=correlations.index, y=correlations.values, hue=correlations.index, dodge=False, palette='viridis', legend=False)

# plt.xlabel('Features')
# plt.ylabel('Correlation with Class_Mapped')
# plt.title('Correlation between Features and Class_Mapped')
# plt.xticks(rotation=45)
# plt.grid(True, axis='y', linestyle='--', alpha=0.7)

# plt.tight_layout()
# plt.show()

In [20]:
raw_df

Unnamed: 0,SEX,TOTCHOL,AGE,SYSBP,DIABP,CURSMOKE,CIGPDAY,BMI,DIABETES,BPMEDS,...,GLUCOSE,educ,PREVCHD,PREVAP,PREVMI,PREVSTRK,PREVHYP,ANGINA,CVD,HYPERTEN
0,1,195.00,39,106.00,70.00,0,0.00,26.97,0,0.00,...,77.00,4.00,0,0,0,0,0,0,1,0
1,1,209.00,52,121.00,66.00,0,0.00,25.48,0,0.00,...,92.00,4.00,0,0,0,0,0,0,1,0
2,2,250.00,46,121.00,81.00,0,0.00,28.73,0,0.00,...,76.00,2.00,0,0,0,0,0,0,0,0
3,2,260.00,52,105.00,69.50,0,0.00,29.43,0,0.00,...,86.00,2.00,0,0,0,0,0,0,0,0
4,2,237.00,58,108.00,66.00,0,0.00,28.50,0,0.00,...,71.00,2.00,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11621,1,185.00,40,141.00,98.00,0,0.00,25.60,0,0.00,...,72.00,3.00,0,0,0,0,1,0,0,1
11623,1,153.00,52,143.00,89.00,0,0.00,25.74,0,0.00,...,72.00,3.00,0,0,0,0,1,0,0,1
11624,2,196.00,39,133.00,86.00,1,30.00,20.91,0,0.00,...,80.00,3.00,0,0,0,0,0,0,0,1
11625,2,240.00,46,138.00,79.00,1,20.00,26.39,0,0.00,...,83.00,3.00,0,0,0,0,0,0,0,1


# Split

In [21]:
def split_data(X, y, test_size, random_state=42):
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

# Imbalance

In [22]:
from imblearn.over_sampling import SMOTE

def oversample(df):
    X_sample = df.drop(columns=['STROKE'])
    y_sample = df['STROKE']
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_sample, y_sample)
    oversampled_df = pd.DataFrame(X_resampled, columns=X_sample.columns)
    oversampled_df['STROKE'] = y_resampled
    return oversampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

def undersample(df):
    minority_class = df[df['STROKE'] == 1]
    majority_class = df[df['STROKE'] == 0]
    
    # Determine the size of the minority class
    n_minority = len(minority_class)
    
    # Sample from the majority class to match the size of the minority class
    majority_class_undersampled = majority_class.sample(n=n_minority, random_state=42)
    
    # Concatenate minority and undersampled majority classes
    undersampled_df = pd.concat([minority_class, majority_class_undersampled])
    
    return undersampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Standardize

In [23]:
def scale_data(X_train, X_test):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

# Classifier

In [24]:
def train_classifier(classifier, X_train, y_train):
    if classifier == 'dt':
        param_grid = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 10, 20, 30, 40, 50],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
        clf = DecisionTreeClassifier()
    elif classifier == 'rf':
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        }
        clf = RandomForestClassifier()
    elif classifier == 'svm':
        param_grid = {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'degree': [2, 3, 4],
            'gamma': ['scale', 'auto'],
            'coef0': [0.0, 0.1, 0.5, 1.0]
        }
        clf = SVC()
    elif classifier == 'knn':
        param_grid = {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
        clf = KNeighborsClassifier()
    elif classifier == 'xgb':
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0]
        }
        clf = XGBClassifier()
    else:
        raise ValueError(f"Classifier '{classifier}' not supported.")
    
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
    
    grid_search.fit(X_train, y_train)

    return grid_search

In [25]:
def run_experiment(df, balance_method, test_size, classifier):
    if balance_method == 'oversample':
        df = oversample(df)
    elif balance_method == 'undersample':
        df = undersample(df)
    
    X = df.drop(columns=['STROKE'])
    y = df['STROKE']
    
    # Split the data
    X_train, X_test, y_train, y_test = split_data(X, y, test_size)
    
    # Scale the data
    X_train, X_test = scale_data(X_train, X_test)
    
     # Train classifier based on the classifier parameter
    clf = train_classifier(classifier, X_train, y_train)
    
    best_param = clf.best_params_
    best_est = clf.best_estimator_
    y_pred = best_est.predict(X_test)
    
    # Get the classification report as a dict
    report = classification_report(y_test, y_pred, output_dict=True)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return report, conf_matrix, best_param

In [26]:
# Define the scenarios
scenarios = [
    ('dt','imbalance', 0.1),
    ('dt','imbalance', 0.2),
    ('dt','imbalance', 0.3),
    ('dt','oversample', 0.1),
    ('dt','oversample', 0.2),
    ('dt','oversample', 0.3),
    ('dt','undersample', 0.1),
    ('dt','undersample', 0.2),
    ('dt','undersample', 0.3),
    ('rf','imbalance', 0.1),
    ('rf','imbalance', 0.2),
    ('rf','imbalance', 0.3),
    ('rf','oversample', 0.1),
    ('rf','oversample', 0.2),
    ('rf','oversample', 0.3),
    ('rf','undersample', 0.1),
    ('rf','undersample', 0.2),
    ('rf','undersample', 0.3),
    ('knn','imbalance', 0.1),
    ('knn','imbalance', 0.2),
    ('knn','imbalance', 0.3),
    ('knn','oversample', 0.1),
    ('knn','oversample', 0.2),
    ('knn','oversample', 0.3),
    ('knn','undersample', 0.1),
    ('knn','undersample', 0.2),
    ('knn','undersample', 0.3),
    ('xgb','imbalance', 0.1),
    ('xgb','imbalance', 0.2),
    ('xgb','imbalance', 0.3),
    ('xgb','oversample', 0.1),
    ('xgb','oversample', 0.2),
    ('xgb','oversample', 0.3),
    ('xgb','undersample', 0.1),
    ('xgb','undersample', 0.2),
    ('xgb','undersample', 0.3),
    ('svm','imbalance', 0.1),
    ('svm','imbalance', 0.2),
    ('svm','imbalance', 0.3),
    ('svm','oversample', 0.1),
    ('svm','oversample', 0.2),
    ('svm','oversample', 0.3),
    ('svm','undersample', 0.1),
    ('svm','undersample', 0.2),
    ('svm','undersample', 0.3),
]

# Create an empty dataframe to store the results
results_df = pd.DataFrame()

# Loop through each scenario
for classifier, balance_method, test_size in scenarios:
    # Run the experiment
    report, conf_matrix, best_cls = run_experiment(raw_df, balance_method, test_size, classifier)
    
    # Flatten the classification report dict
    report_flat = {}

     # Add the scenario details
    report_flat['scenario'] = f'{classifier}_{balance_method}_split_{int(test_size*100)}%'
    report_flat['best_hyperparameter'] = f'{best_cls}'

    for key_outer, value_outer in report.items():
        if isinstance(value_outer, dict):
            for key_inner, value_inner in value_outer.items():
                report_flat[f'{key_outer}-{key_inner}'] = value_inner
        else:
            report_flat[key_outer] = value_outer
    
   
    
    # Append the result to the dataframe
    # results_df = results_df.(report_flat, ignore_index=True)
    results_df = pd.concat([results_df, pd.DataFrame([report_flat])], ignore_index=True)


    # Print and plot the confusion matrix
    # print(f"\nConfusion Matrix for {balance_method} with test size {test_size*100}%:")
    # print(conf_matrix)
    # plt.figure(figsize=(8, 6))
    # sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
    #             xticklabels=['No CVD', 'CVD'], yticklabels=['No CVD', 'CVD'])
    # plt.xlabel('Predicted')
    # plt.ylabel('Actual')
    # plt.title(f'Confusion Matrix: {balance_method} - Test Size {int(test_size*100)}%')
    # plt.show()

pd.options.display.float_format = '{:.4f}'.format

desired_columns = [
    'scenario', 'best_hyperparameter', 'accuracy',
    '0-precision', '0-recall', '0-f1-score', '0-support',
    '1-precision', '1-recall', '1-f1-score', '1-support',
    'macro avg-precision', 'macro avg-recall', 'macro avg-f1-score', 'macro avg-support',
    'weighted avg-precision', 'weighted avg-recall', 'weighted avg-f1-score', 'weighted avg-support'
]

# Reorder columns in results_df
results_df = results_df[desired_columns]

results_df


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=2; total time=   0.1s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=5; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=4, min_samples_split=2; total time=   0.0s
[CV] END criterion=gini, max_depth=None, min_samples_leaf=4, min_samples_spli

Unnamed: 0,scenario,best_hyperparameter,accuracy,0-precision,0-recall,0-f1-score,0-support,1-precision,1-recall,1-f1-score,1-support,macro avg-precision,macro avg-recall,macro avg-f1-score,macro avg-support,weighted avg-precision,weighted avg-recall,weighted avg-f1-score,weighted avg-support
0,dt_imbalance_split_10%,"{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",0.8538,0.8508,0.9718,0.9073,745.0,0.8696,0.5243,0.6542,267.0,0.8602,0.7481,0.7807,1012.0,0.8557,0.8538,0.8405,1012.0
1,dt_imbalance_split_20%,"{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}",0.8522,0.8479,0.9715,0.9055,1475.0,0.8739,0.531,0.6606,548.0,0.8609,0.7513,0.7831,2023.0,0.855,0.8522,0.8392,2023.0
2,dt_imbalance_split_30%,"{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}",0.8438,0.8432,0.9662,0.9005,2220.0,0.8473,0.5104,0.6371,815.0,0.8452,0.7383,0.7688,3035.0,0.8443,0.8438,0.8298,3035.0
3,dt_oversample_split_10%,"{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10}",0.8289,0.8169,0.8541,0.8351,747.0,0.8425,0.803,0.8223,726.0,0.8297,0.8286,0.8287,1473.0,0.8295,0.8289,0.8288,1473.0
4,dt_oversample_split_20%,"{'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10}",0.8153,0.7988,0.8433,0.8205,1474.0,0.8338,0.7874,0.8099,1472.0,0.8163,0.8153,0.8152,2946.0,0.8163,0.8153,0.8152,2946.0
5,dt_oversample_split_30%,"{'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 10}",0.8086,0.7941,0.8316,0.8124,2203.0,0.8243,0.7856,0.8045,2216.0,0.8092,0.8086,0.8085,4419.0,0.8093,0.8086,0.8085,4419.0
6,dt_undersample_split_10%,"{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}",0.7405,0.7249,0.7943,0.758,282.0,0.7603,0.684,0.7202,269.0,0.7426,0.7392,0.7391,551.0,0.7422,0.7405,0.7395,551.0
7,dt_undersample_split_20%,"{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}",0.7611,0.7401,0.8108,0.7739,555.0,0.787,0.7106,0.7469,546.0,0.7636,0.7607,0.7604,1101.0,0.7634,0.7611,0.7605,1101.0
8,dt_undersample_split_30%,"{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}",0.7583,0.7463,0.7885,0.7668,832.0,0.772,0.7277,0.7492,819.0,0.7592,0.7581,0.758,1651.0,0.7591,0.7583,0.7581,1651.0
9,rf_imbalance_split_10%,"{'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}",0.8587,0.8436,0.9919,0.9118,745.0,0.9559,0.4869,0.6452,267.0,0.8997,0.7394,0.7785,1012.0,0.8732,0.8587,0.8414,1012.0
