In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Machine learning models and tools
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, matthews_corrcoef, cohen_kappa_score
)
from sklearn.ensemble import RandomForestClassifier

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the ASD traits study data from a CSV file in the current working directory.
# Later cells standardize every non-target column, so categorical columns are
# presumably already numerically encoded in this file -- TODO confirm against the CSV.
df = pd.read_csv(filepath_or_buffer='ASD_Traits_Study_Data.csv')

# Preview the first five rows as a quick sanity check of the load
df.head(n=5)

In [None]:
# Target vector (y) is the 'ASD_traits' column; the feature matrix (X) is every other column
y = df['ASD_traits']
X = df.drop('ASD_traits', axis=1)

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# Stratifying on y keeps the class proportions the same in both splits, and the
# fixed random_state makes the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

# Sanity check: report (rows, columns) of each split
print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")

In [None]:
# Feature scaling (if necessary).
# The scaler learns its statistics from the training split only; the same fitted
# transform is then applied to both splits so no test-set information is used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Define hyperparameter ranges for RandomizedSearchCV as per the paper (Table V)

# Import scipy for distributions
# NOTE: `randint` is not referenced by the list-based grid below; the import is
# kept in place in case other cells rely on it.
from scipy.stats import randint

# Randomized Search parameter distributions.
# Each entry is a plain list of candidate values for one RandomForestClassifier
# hyperparameter; RandomizedSearchCV samples combinations from these lists.
random_grid = {
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    # 'auto' (listed in the paper) is intentionally omitted: scikit-learn
    # deprecated it in 1.1 and removed it in 1.3, where it makes every fit that
    # samples it fail. For classifiers 'auto' was an alias of 'sqrt', so the
    # effective search space is unchanged.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000],
    'min_samples_split': [2, 5, 10, 14],
    'min_samples_leaf': [1, 2, 4, 6, 8],
    'criterion': ['entropy', 'gini']
}

In [None]:
# Exhaustive grid for GridSearchCV as per the paper (Table V).
# max_features, max_depth and criterion are pinned to a single value each, so the
# grid only varies n_estimators, min_samples_split and min_samples_leaf
# (5 * 5 * 3 = 75 combinations).
grid_param = dict(
    n_estimators=list(range(600, 1001, 100)),   # 600, 700, ..., 1000
    max_features=['sqrt'],
    max_depth=[560],
    min_samples_split=list(range(3, 8)),        # 3, 4, 5, 6, 7
    min_samples_leaf=list(range(1, 6, 2)),      # 1, 3, 5
    criterion=['entropy'],
)

In [None]:
# Base estimator shared by the hyperparameter searches; the fixed seed keeps
# forest construction repeatable
rf = RandomForestClassifier(random_state=42)

# Randomized search: 100 sampled parameter settings drawn from `random_grid`,
# each scored by 5-fold cross-validated accuracy, with n_jobs=-1 for parallel fits
random_search = RandomizedSearchCV(
    rf,
    random_grid,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# Run the search on the standardized training data
random_search.fit(X_train_scaled, y_train)

# Keep the winning parameter setting and the estimator fitted with it
best_random_params = random_search.best_params_
best_random_model = random_search.best_estimator_

print("Best Parameters from Randomized Search:")
print(best_random_params)

In [None]:
# Score the best estimator found by RandomizedSearchCV on the held-out test split
print("Evaluating the best model from RandomizedSearchCV on the test set...")

# Hard class predictions feed the label-based metrics; column 1 of predict_proba
# is the score passed to ROC-AUC
y_test_pred_random = best_random_model.predict(X_test_scaled)
y_test_prob_random = best_random_model.predict_proba(X_test_scaled)[:, 1]

# Every label-based metric takes (y_true, y_pred), so evaluate them in one pass
(
    accuracy_random,
    precision_random,
    recall_random,
    f1_random,
    mcc_random,
    kappa_random,
) = (
    metric(y_test, y_test_pred_random)
    for metric in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
        cohen_kappa_score,
    )
)
# ROC-AUC is the only metric computed from the probability scores
roc_auc_random = roc_auc_score(y_test, y_test_prob_random)

# Report all test-set metrics, rounded to four decimals
print("Test Set Metrics (Randomized Search):")
print(f"Accuracy: {accuracy_random:.4f}, Precision: {precision_random:.4f}, Recall: {recall_random:.4f}")
print(f"F1-Score: {f1_random:.4f}, ROC-AUC: {roc_auc_random:.4f}")
print(f"MCC: {mcc_random:.4f}, Cohen's Kappa: {kappa_random:.4f}")
print("-" * 40)

In [None]:
# Exhaustive search over `grid_param`, reusing the `rf` base estimator; every
# combination is scored by 5-fold cross-validated accuracy with parallel fits
grid_search = GridSearchCV(
    rf,
    grid_param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the standardized training data
grid_search.fit(X_train_scaled, y_train)

# Keep the winning parameter setting and the estimator fitted with it
best_grid_params = grid_search.best_params_
best_grid_model = grid_search.best_estimator_

print("Best Parameters from Grid Search:")
print(best_grid_params)

In [None]:
# Score the best estimator found by GridSearchCV on the held-out test split
print("Evaluating the best model from GridSearchCV on the test set...")

# Hard class predictions feed the label-based metrics; column 1 of predict_proba
# is the score passed to ROC-AUC
y_test_pred_grid = best_grid_model.predict(X_test_scaled)
y_test_prob_grid = best_grid_model.predict_proba(X_test_scaled)[:, 1]

# Every label-based metric takes (y_true, y_pred), so evaluate them in one pass
(
    accuracy_grid,
    precision_grid,
    recall_grid,
    f1_grid,
    mcc_grid,
    kappa_grid,
) = (
    metric(y_test, y_test_pred_grid)
    for metric in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
        cohen_kappa_score,
    )
)
# ROC-AUC is the only metric computed from the probability scores
roc_auc_grid = roc_auc_score(y_test, y_test_prob_grid)

# Report all test-set metrics, rounded to four decimals
print("Test Set Metrics (Grid Search):")
print(f"Accuracy: {accuracy_grid:.4f}, Precision: {precision_grid:.4f}, Recall: {recall_grid:.4f}")
print(f"F1-Score: {f1_grid:.4f}, ROC-AUC: {roc_auc_grid:.4f}")
print(f"MCC: {mcc_grid:.4f}, Cohen's Kappa: {kappa_grid:.4f}")
print("-" * 40)

In [None]:
# Side-by-side table of test-set metrics: one row per metric, one column per
# search strategy. Start from the metric labels, then attach each strategy's
# scores in the same row order.
results = pd.DataFrame(
    {'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'MCC', "Cohen's Kappa"]}
).assign(**{
    'Randomized Search': [
        accuracy_random, precision_random, recall_random, f1_random,
        roc_auc_random, mcc_random, kappa_random,
    ],
    'Grid Search': [
        accuracy_grid, precision_grid, recall_grid, f1_grid,
        roc_auc_grid, mcc_grid, kappa_grid,
    ],
})

print("Comparison of Model Performance:")
print(results)

In [None]:
# Analyze if RandomizedSearchCV and GridSearchCV yielded similar performance.
# Rather than reprinting the same table, add the per-metric gap
# (Grid Search minus Randomized Search) so the similarity can be read directly.
# `assign` returns a new frame, leaving `results` untouched so this cell can be
# re-run safely.
comparison = results.assign(
    Difference=results['Grid Search'] - results['Randomized Search']
)

print("Performance comparison between RandomizedSearchCV and GridSearchCV:")
print(comparison)

# Single summary number: the largest gap (in either direction) across all metrics
print(f"Largest absolute difference across metrics: {comparison['Difference'].abs().max():.4f}")