In [1]:
from cleaning import data_new, X_train, X_test, y_train, y_test

In [41]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from scipy.stats import randint, uniform

# Define hyperparameter distributions for each model
knn_param_distributions = {
    'knn__n_neighbors': randint(1, 21),
    'knn__weights': ['uniform', 'distance']
}

rf_param_distributions = {
    'rf__n_estimators': randint(10, 200),
    'rf__max_depth': randint(3, 30)
}

dt_param_distributions = {
    'dt__max_depth': randint(1, 30),
    'dt__min_samples_split': randint(2, 10),
    'dt__min_samples_leaf': randint(1, 10)
}

nb_param_distributions = {
    'nb__var_smoothing': uniform(1e-9, 1e-3)
}

# Create pipelines for each classifier
knn_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('knn', KNeighborsClassifier())
])

rf_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('rf', RandomForestClassifier())
])

dt_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('dt', DecisionTreeClassifier())
])

nb_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('nb', GaussianNB())
])

# Set up RandomizedSearchCV for each model
knn_search = RandomizedSearchCV(
    estimator=knn_pipeline,
    param_distributions=knn_param_distributions,
    n_iter=10,  # Number of random samples to search
    cv=5,  # Number of cross-validation folds
    random_state=42
)

rf_search = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_param_distributions,
    n_iter=10,
    cv=5,
    random_state=42
)

dt_search = RandomizedSearchCV(
    estimator=dt_pipeline,
    param_distributions=dt_param_distributions,
    n_iter=10,
    cv=5,
    random_state=42
)

nb_search = RandomizedSearchCV(
    estimator=nb_pipeline,
    param_distributions=nb_param_distributions,
    n_iter=10,
    cv=5,
    random_state=42
)

# Fit each search object on the training data
knn_search.fit(X_train, y_train)
rf_search.fit(X_train, y_train)
dt_search.fit(X_train, y_train)
nb_search.fit(X_train, y_train)

# Make predictions on the test data for each model
y_pred_knn = knn_search.best_estimator_.predict(X_test)
y_pred_rf = rf_search.best_estimator_.predict(X_test)
y_pred_dt = dt_search.best_estimator_.predict(X_test)
y_pred_nb = nb_search.best_estimator_.predict(X_test)

# Calculate accuracy for each model
knn_accuracy = accuracy_score(y_test, y_pred_knn)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
nb_accuracy = accuracy_score(y_test, y_pred_nb)

# Print the best hyperparameters and accuracy for each model
print(f"Best parameters for KNeighborsClassifier: {knn_search.best_params_}")
print(f"Accuracy of KNeighborsClassifier: {knn_accuracy:.4f}")

print(f"Best parameters for RandomForestClassifier: {rf_search.best_params_}")
print(f"Accuracy of RandomForestClassifier: {rf_accuracy:.4f}")

print(f"Best parameters for DecisionTreeClassifier: {dt_search.best_params_}")
print(f"Accuracy of DecisionTreeClassifier: {dt_accuracy:.4f}")

print(f"Best parameters for GaussianNB: {nb_search.best_params_}")
print(f"Accuracy of GaussianNB: {nb_accuracy:.4f}")


Best parameters for KNeighborsClassifier: {'knn__n_neighbors': 7, 'knn__weights': 'distance'}
Accuracy of KNeighborsClassifier: 0.7107
Best parameters for RandomForestClassifier: {'rf__max_depth': 23, 'rf__n_estimators': 11}
Accuracy of RandomForestClassifier: 0.6942
Best parameters for DecisionTreeClassifier: {'dt__max_depth': 28, 'dt__min_samples_leaf': 5, 'dt__min_samples_split': 2}
Accuracy of DecisionTreeClassifier: 0.6736
Best parameters for GaussianNB: {'nb__var_smoothing': 0.0009507153064099162}
Accuracy of GaussianNB: 0.4959
