In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning models and tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, matthews_corrcoef, cohen_kappa_score, classification_report
)

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the study data from CSV into `df`.
# NOTE(review): later cells feed these columns straight into scikit-learn, so the
# file is presumably already numerically encoded -- confirm against the data source.
df = pd.read_csv(filepath_or_buffer='ASD_Traits_Study_Data.csv')

# Preview the top five rows as the cell's output
df.head(n=5)

In [None]:
# Separate the label column from the predictors
y = df['ASD_traits']               # target variable
X = df.drop('ASD_traits', axis=1)  # every remaining column is used as a feature

In [None]:
# Hold-out split: 30% of the rows go to the test set, the rest to training.
# Stratifying on y keeps the class proportions the same in both parts, and the
# fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

# Sanity-check the (rows, columns) shape of each feature table
print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")

In [None]:
# Quick look at the training features: a sample of rows, the column
# dtypes / non-null counts, and summary statistics.

# Display first few rows of X_train
print("First few rows of X_train:")
print(X_train.head())

# Information about X_train.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\nInformation about X_train:")
X_train.info()

# Statistical summary of X_train
print("\nStatistical summary of X_train:")
print(X_train.describe())

In [None]:
# Pairwise correlation between the training features, drawn as a heatmap
# (no per-cell annotations; diverging colour map).
corr_matrix = X_train.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
# Rank the features by the importances of a 100-tree random forest fitted on
# the training split (seeded for repeatability), most important first.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
feature_names = X_train.columns
importances = rf.feature_importances_
feature_importance = pd.Series(data=importances, index=feature_names).sort_values(ascending=False)
print("\nFeature Importances:", feature_importance, sep="\n")

In [None]:
# Chi-square statistic of every training feature against the target, listed
# from highest to lowest. The p-values are returned alongside the scores but
# are not used in this cell.
# NOTE(review): sklearn's chi2 expects non-negative feature values -- confirm
# that holds for every column of X_train.
chi_scores, p_values = chi2(X_train, y_train)
chi_scores_df = pd.Series(chi_scores, index=X_train.columns).sort_values(ascending=False)
print("\nChi-Square Scores:", chi_scores_df, sep="\n")

In [None]:
# Mutual information between each training feature and the target (seeded),
# stored as a Series ordered from most to least informative. The descending
# order matters: the feature-selection step takes the leading entries.
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)
mi_scores_df = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=False)
print("\nMutual Information Scores:", mi_scores_df, sep="\n")

In [None]:
# Keep the names of the first 10 entries of the mutual-information ranking
# (the ranking is sorted in descending order, so these are the top 10).
selected_features = mi_scores_df.head(10).index
print("\nSelected Features based on Mutual Information:", selected_features, sep="\n")

# Restrict both splits to the chosen columns; the selection itself was derived
# from the training split only.
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

In [None]:
# Baseline: random forest (default settings, fixed seed) trained on every feature
rf_all_features = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard labels for the label-based metrics, plus the second probability column
# for ROC-AUC.
# NOTE(review): column 1 is taken to be the positive class -- this presumes a
# binary target; confirm.
y_test_pred = rf_all_features.predict(X_test)
y_test_prob = rf_all_features.predict_proba(X_test)[:, 1]

# Every label-based metric shares the (y_true, y_pred) call signature, so they
# are evaluated in one pass; ROC-AUC is scored from the probabilities instead.
accuracy, precision, recall, f1, mcc, kappa = (
    metric(y_test, y_test_pred)
    for metric in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
        cohen_kappa_score,
    )
)
roc_auc = roc_auc_score(y_test, y_test_prob)

# Report, followed by a separator line
print(
    "Test Set Metrics (All Features):",
    f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}, ROC-AUC: {roc_auc:.4f}",
    f"MCC: {mcc:.4f}, Cohen's Kappa: {kappa:.4f}",
    "-" * 40,
    sep="\n",
)

In [None]:
# Same model configuration as the all-features run, but trained only on the
# columns kept by the feature-selection step
rf_selected_features = RandomForestClassifier(random_state=42).fit(X_train_selected, y_train)

# Hard labels for the label-based metrics, plus the second probability column
# for ROC-AUC.
# NOTE(review): column 1 is taken to be the positive class -- this presumes a
# binary target; confirm.
y_test_pred_selected = rf_selected_features.predict(X_test_selected)
y_test_prob_selected = rf_selected_features.predict_proba(X_test_selected)[:, 1]

# Every label-based metric shares the (y_true, y_pred) call signature, so they
# are evaluated in one pass; ROC-AUC is scored from the probabilities instead.
accuracy_sel, precision_sel, recall_sel, f1_sel, mcc_sel, kappa_sel = (
    metric(y_test, y_test_pred_selected)
    for metric in (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        matthews_corrcoef,
        cohen_kappa_score,
    )
)
roc_auc_sel = roc_auc_score(y_test, y_test_prob_selected)

# Report, followed by a separator line
print(
    "Test Set Metrics (Selected Features):",
    f"Accuracy: {accuracy_sel:.4f}, Precision: {precision_sel:.4f}, Recall: {recall_sel:.4f}",
    f"F1-Score: {f1_sel:.4f}, ROC-AUC: {roc_auc_sel:.4f}",
    f"MCC: {mcc_sel:.4f}, Cohen's Kappa: {kappa_sel:.4f}",
    "-" * 40,
    sep="\n",
)

In [None]:
# Side-by-side table of the two runs: one row per metric, one column per
# feature set.
results = pd.DataFrame(
    [
        ('Accuracy', accuracy, accuracy_sel),
        ('Precision', precision, precision_sel),
        ('Recall', recall, recall_sel),
        ('F1-Score', f1, f1_sel),
        ('ROC-AUC', roc_auc, roc_auc_sel),
        ('MCC', mcc, mcc_sel),
        ("Cohen's Kappa", kappa, kappa_sel),
    ],
    columns=['Metric', 'All Features', 'Selected Features'],
)

print("Comparison of Model Performance:", results, sep="\n")