# Training model with multiclass output classifier

In [4]:
# Step 1: Install and import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib  # For saving the trained model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# from google.colab import files
# uploaded = files.upload()

In [5]:
# Step 2: Load the dataset
file_path = "/content/finalcsv_withlabels.csv"
df = pd.read_csv(file_path)

In [6]:
# Step 3: Check for missing values
df = df.dropna()  # Drop rows with missing values
df.head(10)

Unnamed: 0,laterality_index,color_vision_score,inconsistent_discipline_score,involvement_score,poor_supervision_score,positive_parenting_score,conduct_problem_scale,emotional_problem_scale,hyperactivity_scale,MRI_Track_Age_at_Scan,...,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn,ethnicity,ADHD_Outcome,Sex_F
count,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0,...,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0,627.0
mean,60.352313,13.157895,13.575758,39.127592,16.583732,25.15311,2.417863,2.54067,6.326954,11.297572,...,0.003584,0.115018,0.050125,0.059464,0.093961,0.090206,0.127231,0.457735,0.912281,0.304625
std,47.737876,2.771191,3.804141,6.16361,5.476694,3.963754,2.168066,2.217312,2.503663,3.19594,...,0.048111,0.053683,0.05236,0.056831,0.053262,0.058701,0.056994,0.710071,0.283112,0.460616
min,-100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.138048,-0.031929,-0.15354,-0.121943,-0.056282,-0.20416,-0.037689,0.0,0.0,0.0
25%,46.67,14.0,11.0,36.0,13.0,23.0,1.0,1.0,5.0,8.855692,...,-0.027125,0.082622,0.016591,0.021924,0.060292,0.054555,0.08759,0.0,1.0,0.0
50%,76.67,14.0,14.0,40.0,16.0,26.0,2.0,2.0,6.0,10.819187,...,0.002328,0.112855,0.052705,0.060346,0.093665,0.089836,0.126375,0.0,1.0,0.0
75%,96.67,14.0,16.0,43.0,20.0,28.0,4.0,4.0,8.0,13.344627,...,0.033008,0.148341,0.082817,0.094447,0.12731,0.12766,0.162581,1.0,1.0,1.0
max,100.0,14.0,28.0,50.0,35.0,30.0,10.0,10.0,10.0,21.564453,...,0.192015,0.375635,0.193278,0.322084,0.270674,0.267162,0.30615,3.0,1.0,1.0


## Step 4:
The participant_id column was converted into numerical values using Label Encoding, though it was not used as a feature in training.

In [8]:
# Step 4: Encode categorical variables
le = LabelEncoder()
df["participant_id"] = le.fit_transform(df["participant_id"])

## Step 5:

We loaded in labels from the data set (the column ADHD_Outcome)


In [9]:
# Step 5: Define features (X) and dummy target (y) for now
X = df.drop(columns=["participant_id"])  # Features: all numerical columns except ID
y = df["ADHD_Outcome"] # ADHD binary label

## Step 6:


1.   Here I used StandardScaler to scale all numerical features. This step ensures
that all features are on the same scale, preventing models from giving undue weight to larger values.
2.   Then split the data into 80% training and 20% testing using train_test_split() to evaluate the model's generalization performance.
3. trained a RandomForestClassifier



In [10]:
# Step 6: Standardize numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# # Step 7: Split into X and Y
# X_train = X_scaled
# y_train = y

# # Step 8: Train a Multi-Class Classifier (Random Forest)
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# Step 7: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 8: Train a Multi-Class Classifier (Random Forest)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 9: Evaluate the model
y_pred = rf_model.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy Score: 0.8888888888888888
Classification Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        14
         1.0       0.89      1.00      0.94       112

    accuracy                           0.89       126
   macro avg       0.44      0.50      0.47       126
weighted avg       0.79      0.89      0.84       126



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Step 10: Save the trained model
joblib.dump(rf_model, "trained_model.pkl")
print("Model saved as trained_model.pkl")

Model saved as trained_model.pkl


# Approach 2 (Optimized Model - Feature Selection + XGBoost)
## (not required but i just want to try to see if there is another way to improve)

In [14]:
# Install necessary libraries
!pip install xgboost

import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
file_path = "/content/finalcsv_withlabels.csv"
df = pd.read_csv(file_path).dropna()

# Encode participant_id (optional)
le = LabelEncoder()
df["participant_id"] = le.fit_transform(df["participant_id"])

# Separate features and generate dummy target
X = df.drop(columns=["participant_id"])
y = df["ADHD_Outcome"]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into x & y
X_train = X_scaled
y_train = y

# Feature selection using Random Forest importance
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
feature_importance = rf.feature_importances_

# Select top 50 features
# Since MRI matrix columns might contain redundant information, we used Random Forest feature importance to select the top 50 most important features.
top_n = 50
important_features = np.argsort(feature_importance)[-top_n:]
X_train_selected = X_train[:, important_features]
# X_test_selected = X_test[:, important_features]

# Train XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=10, learning_rate=0.1, random_state=42)   # more trees
xgb_model.fit(X_train_selected, y_train)

# Predict
y_pred = xgb_model.predict(X_test_selected)
print(y_pred)



array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

accuracy dropped to 30.2%, which suggests the model still isn't learning well =((((

In [13]:
# Feature selection using Random Forest importance
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
feature_importance = rf.feature_importances_

# Select top 50 features
# Since MRI matrix columns might contain redundant information, we used Random Forest feature importance to select the top 50 most important features.
top_n = 50
important_features = np.argsort(feature_importance)[-top_n:]
X_train_selected = X_train[:, important_features]
# X_test_selected = X_test[:, important_features]

# Train XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=10, learning_rate=0.1, random_state=42)   # more trees
xgb_model.fit(X_train_selected, y_train)

# Predict & Evaluate
y_pred = xgb_model.predict(X_test_selected)
