In [None]:
!pip install ucimlrepo optuna tpot shap

Collecting shap
  Downloading shap-0.45.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.5/540.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.45.1 slicer-0.0.8


# 1. Early Stage Diabetes Risk Prediction

In [None]:
from ucimlrepo import fetch_ucirepo

# Download UCI repository dataset 529 (Early Stage Diabetes Risk Prediction).
early_stage_diabetes_risk_prediction = fetch_ucirepo(id=529)

# Feature table and target table, both delivered as pandas DataFrames.
X, y = (
    early_stage_diabetes_risk_prediction.data.features,
    early_stage_diabetes_risk_prediction.data.targets,
)

# Dataset-level metadata first, then the per-variable description.
print(
    early_stage_diabetes_risk_prediction.metadata,
    early_stage_diabetes_risk_prediction.variables,
    sep="\n",
)


{'uci_id': 529, 'name': 'Early Stage Diabetes Risk Prediction', 'repository_url': 'https://archive.ics.uci.edu/dataset/529/early+stage+diabetes+risk+prediction+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/529/data.csv', 'abstract': 'This dataset contains the sign and symptpom data of newly diabetic or would be diabetic patient. ', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 520, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Gender'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Mar 04 2024', 'dataset_doi': '10.24432/C5VG8H', 'creators': [], 'intro_paper': {'title': 'Likelihood Prediction of Diabetes at Early Stage Using Data Mining Techniques', 'authors': 'M. M. F. Islam, Rahatara Ferdousi, Sadikur Rahman, Humayra Yasmin Bushra', 'published_in': 

# 1A. Baseline: one-hot encoding, stratified split, and a default Random Forest

In [None]:
import pandas as pd

# Inspect the column dtypes of the feature frame X to see which
# columns are categorical (object) and which are numeric.
column_types = X.dtypes
print(column_types)

# One-hot encode the categorical columns. drop_first=True drops the first
# level of each encoded column, so a two-level feature becomes one indicator.
X_encoded = pd.get_dummies(data=X, drop_first=True)

# Preview the first rows to confirm the encoding took effect.
encoded_preview = X_encoded.head()
print(encoded_preview)


age                    int64
gender                object
polyuria              object
polydipsia            object
sudden_weight_loss    object
weakness              object
polyphagia            object
genital_thrush        object
visual_blurring       object
itching               object
irritability          object
delayed_healing       object
partial_paresis       object
muscle_stiffness      object
alopecia              object
obesity               object
dtype: object
   age  gender_Male  polyuria_Yes  polydipsia_Yes  sudden_weight_loss_Yes  \
0   40         True         False            True                   False   
1   58         True         False           False                   False   
2   41         True          True           False                   False   
3   45         True         False           False                    True   
4   60         True          True            True                    True   

   weakness_Yes  polyphagia_Yes  genital_thrush_Yes  visual

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on the target so both
# partitions keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shape of each partition.
for label, features in (("Training set:", X_train), ("Testing set:", X_test)):
    print(label, features.shape)


Training set: (416, 16)
Testing set: (104, 16)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline model: a Random Forest with default hyperparameters and a fixed seed.
rf_classifier = RandomForestClassifier(random_state=42)

# y_train is a single-column DataFrame, while scikit-learn expects a 1-D target.
# Flatten it so fit() does not have to convert a column vector (and warn about it).
rf_classifier.fit(X_train, y_train.values.ravel())

# Predict on the held-out test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9903846153846154
Classification Report:
               precision    recall  f1-score   support

    Negative       0.98      1.00      0.99        40
    Positive       1.00      0.98      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



  rf_classifier.fit(X_train, y_train)


# 1B. Model selection across seven classifiers with Optuna

In [None]:
import optuna
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

def objective(trial):
    """Optuna objective for joint model selection and hyperparameter tuning.

    Samples one of seven classifier families, then that family's
    hyperparameters, and scores the resulting estimator with 10-fold
    cross-validation on the training split.

    Every hyperparameter is registered under a '<classifier>__<name>' key.
    Optuna requires a parameter name to keep the same distribution kind for
    the whole study; sharing the bare name 'learning_rate' between the MLP
    branch (categorical) and the XGBoost branch (log-uniform float) raises
    "Cannot set different distribution kind to the same parameter name".

    Args:
        trial: The Optuna trial used to sample the search space.

    Returns:
        Mean of the 10 cross-validation scores (cross_val_score's default
        scorer, i.e. accuracy for these classifiers).

    Raises:
        ValueError: If the sampled classifier name has no matching branch.
    """
    classifier_name = trial.suggest_categorical(
        'classifier',
        ['MLP', 'RandomForest', 'XGBoost', 'LogisticRegression', 'NaiveBayes', 'SVC', 'kNN'],
    )

    if classifier_name == 'MLP':
        model = MLPClassifier(
            hidden_layer_sizes=tuple(
                trial.suggest_categorical('MLP__hidden_layers', [(50,), (100,), (50, 50), (100, 100)])
            ),
            activation=trial.suggest_categorical('MLP__activation', ['tanh', 'relu']),
            solver=trial.suggest_categorical('MLP__solver', ['sgd', 'adam']),
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
            alpha=trial.suggest_float('MLP__alpha', 1e-4, 1e-1, log=True),
            learning_rate=trial.suggest_categorical('MLP__learning_rate', ['constant', 'adaptive']),
            max_iter=1000,
            random_state=42,
        )
    elif classifier_name == 'RandomForest':
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int('RandomForest__n_estimators', 50, 300),
            max_depth=trial.suggest_int('RandomForest__max_depth', 2, 20),
            random_state=42,
        )
    elif classifier_name == 'XGBoost':
        model = XGBClassifier(
            n_estimators=trial.suggest_int('XGBoost__n_estimators', 50, 300),
            max_depth=trial.suggest_int('XGBoost__max_depth', 2, 10),
            learning_rate=trial.suggest_float('XGBoost__learning_rate', 0.01, 0.2, log=True),
            subsample=trial.suggest_float('XGBoost__subsample', 0.5, 1.0),
            # use_label_encoder is no longer passed: labels are integer-encoded below.
            eval_metric='logloss',
            random_state=42,
        )
    elif classifier_name == 'LogisticRegression':
        model = LogisticRegression(
            C=trial.suggest_float('LogisticRegression__C', 1e-4, 10, log=True),
            max_iter=1000,
            random_state=42,
        )
    elif classifier_name == 'NaiveBayes':
        model = GaussianNB()
    elif classifier_name == 'SVC':
        model = SVC(
            C=trial.suggest_float('SVC__C', 1e-4, 10, log=True),
            gamma=trial.suggest_categorical('SVC__gamma', ['scale', 'auto']),
            random_state=42,
        )
    elif classifier_name == 'kNN':
        model = KNeighborsClassifier(
            n_neighbors=trial.suggest_int('kNN__n_neighbors', 3, 20),
            weights=trial.suggest_categorical('kNN__weights', ['uniform', 'distance']),
            algorithm=trial.suggest_categorical('kNN__algorithm', ['ball_tree', 'kd_tree', 'brute']),
        )
    else:
        raise ValueError(f"Unknown classifier: {classifier_name!r}")

    # Encode the class labels as integers 0..K-1 in a flat 1-D array. Recent
    # XGBoost releases reject string labels, and a 1-D target avoids the
    # column-vector conversion warning from scikit-learn. Accuracy is unaffected
    # by the relabelling.
    _, y_codes = np.unique(np.ravel(y_train), return_inverse=True)

    # 10-fold cross-validation on the training split only: the test split is
    # used later to score the selected model, so it must not influence selection.
    scores = cross_val_score(model, X_train, y_codes, n_jobs=-1, cv=10)
    return scores.mean()


In [None]:
# Seed the sampler (TPE is Optuna's default) so a fresh-kernel re-run proposes
# the same sequence of trials instead of a different search every time.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)  # Adjust the number of trials based on your computational resources

print("Best trial:")
trial = study.best_trial

print(" Value: {:.3f}".format(trial.value))
print(" Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[I 2024-05-17 13:25:25,671] A new study created in memory with name: no-name-20ae7082-f899-4cbe-8c7b-09d1abd750eb
[I 2024-05-17 13:25:32,812] Trial 0 finished with value: 0.921153846153846 and parameters: {'classifier': 'kNN', 'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute'}. Best is trial 0 with value: 0.921153846153846.
[I 2024-05-17 13:25:33,164] Trial 1 finished with value: 0.8807692307692309 and parameters: {'classifier': 'NaiveBayes'}. Best is trial 0 with value: 0.921153846153846.
  C=trial.suggest_loguniform('C', 1e-4, 10),
[I 2024-05-17 13:25:33,943] Trial 2 finished with value: 0.7999999999999999 and parameters: {'classifier': 'SVC', 'C': 4.09176232993256, 'gamma': 'scale'}. Best is trial 0 with value: 0.921153846153846.
[I 2024-05-17 13:25:34,414] Trial 3 finished with value: 0.8807692307692309 and parameters: {'classifier': 'NaiveBayes'}. Best is trial 0 with value: 0.921153846153846.
  alpha=trial.suggest_loguniform('alpha', 1e-4, 1e-1),
[I 2024-05-17 13:25:

ValueError: Cannot set different distribution kind to the same parameter name.

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Rebuild the winning estimator from the best trial of the model-selection study.
best_classifier = trial.params['classifier']

# Keep only the estimator hyperparameters. A parameter name may carry a
# '<classifier>__' prefix; everything up to and including '__' is dropped so
# the remaining key matches the estimator's keyword argument.
hyperparams = {
    name.split('__', 1)[-1]: value
    for name, value in trial.params.items()
    if name != 'classifier'
}
# The search space calls this 'hidden_layers'; MLPClassifier expects 'hidden_layer_sizes'.
if 'hidden_layers' in hyperparams:
    hyperparams['hidden_layer_sizes'] = tuple(hyperparams.pop('hidden_layers'))

# Estimator class plus the fixed (non-tuned) settings used during the search.
estimator_specs = {
    'MLP': (MLPClassifier, {'max_iter': 1000, 'random_state': 42}),
    'RandomForest': (RandomForestClassifier, {'random_state': 42}),
    'XGBoost': (XGBClassifier, {'eval_metric': 'logloss', 'random_state': 42}),
    'LogisticRegression': (LogisticRegression, {'max_iter': 1000, 'random_state': 42}),
    'NaiveBayes': (GaussianNB, {}),
    'SVC': (SVC, {'random_state': 42}),
    'kNN': (KNeighborsClassifier, {}),
}
if best_classifier not in estimator_specs:
    raise ValueError(f"Unknown classifier: {best_classifier!r}")
estimator_cls, fixed_kwargs = estimator_specs[best_classifier]
model = estimator_cls(**hyperparams, **fixed_kwargs)

# Integer-encode the labels (sorted order, so with Negative/Positive classes
# 'Positive' becomes 1). XGBoost needs numeric classes, and f1_score with
# average='binary' uses pos_label=1 by default.
class_labels, y_train_codes = np.unique(np.ravel(y_train), return_inverse=True)
label_to_code = {label: code for code, label in enumerate(class_labels)}
y_test_codes = np.array([label_to_code[label] for label in np.ravel(y_test)])

# Train the model
model.fit(X_train, y_train_codes)

# Predict on test data
y_pred = model.predict(X_test)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_test_codes, y_pred)
f1 = f1_score(y_test_codes, y_pred, average='binary')

print("Accuracy on test data: {:.3f}".format(accuracy))
print("F1-score on test data: {:.3f}".format(f1))


# 1C. Random Forest hyperparameter tuning with Optuna (weighted F1)

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def rf_objective(trial):
    """Optuna objective that tunes a RandomForestClassifier on weighted F1.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        Mean weighted-F1 score over 10-fold cross-validation on the training split.
    """
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    # 'auto' is no longer offered: for classifiers it was an alias of 'sqrt',
    # and current scikit-learn releases reject it.
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # 10-fold cross-validation, focusing on F1 score (weighted to account for class imbalance).
    # Only the training split is used: the test split scores the tuned model
    # afterwards and must not influence the search. The target is flattened
    # to 1-D because y_train is a single-column DataFrame.
    scores = cross_val_score(
        model,
        X_train,
        y_train.values.ravel(),
        cv=10,
        scoring='f1_weighted',
        n_jobs=-1,
    )
    return scores.mean()

# Create the Optuna study object. The sampler is seeded (TPE is Optuna's
# default) so a fresh-kernel re-run proposes the same sequence of trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(rf_objective, n_trials=100)  # You can adjust the number of trials based on available computational resources

print("Best F1 score obtained:", study.best_value)
print("Best parameters:")
for key, value in study.best_trial.params.items():
    print(f"{key}: {value}")


In [None]:
from sklearn.metrics import f1_score

# Instantiate the best model from the study
best_params = study.best_trial.params
best_rf_model = RandomForestClassifier(**best_params, random_state=42)

# Fit the model on the entire training data. y_train is a single-column
# DataFrame; flatten it to the 1-D target scikit-learn expects so fit()
# does not have to convert a column vector.
best_rf_model.fit(X_train, y_train.values.ravel())

# Predict on the test set
y_pred = best_rf_model.predict(X_test)

# Weighted F1 averages the per-class F1 scores by class support.
f1 = f1_score(y_test, y_pred, average='weighted')  # Using 'weighted' to consider label imbalance if present
print("F1-score on the test data: {:.3f}".format(f1))
