# Classify tabular data into three categories (Control vs. ASD) using multiple scikit-learn models and feature selection with SelectKBest

The steps:
<ol>
<li><b>Load the Data: </b>Load your tabular data into a pandas DataFrame.</li>
<li><b>Preprocess the Data:</b> Prepare the data for training, including handling missing values, encoding categorical variables, and splitting into features and labels.</li>
<li><b>Feature Selection: </b> Use SelectKBest to select the top k features that are most relevant for classification.</li>
<li><b>Model Training:</b> Train multiple scikit-learn models using the selected features.</li>
<li><b>Model Evaluation:</b> Evaluate the models using cross-validation or a separate validation set and select the best-performing model.</li>
<li><b>Model Saving:</b> Save the trained model for future use.</li></ol>

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pickle
import os
import xgboost as xgb
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.metrics import classification_report
from pathlib import Path



# The data:
Tabular data with the fields: <br>
<ol>
<li><b>Type -</b> Control / ASD (another type of autism).</li>
<li><b>Num. of active electrodes -</b> (0 - 60).</li>
<li><b>Mean of all mean firing rate [spikes/sec] -</b> the average number of spikes during the investigation period (1 second).</li>
<li><b>STD of mean firing rate -</b> the standard deviation of the number of spikes during the investigation period (1 second).</li>
<li><b>Mean of amplitudes -</b> Mean of amplitudes.
The amplitude of a periodic variable is a measure of its change in a single period (such as time).</li>
<li><b>Std of amplitudes -</b> Standard deviation of amplitudes.</li>
</ol>



In [2]:
# Download the training table (a CSV file) that the cells below read.
# NOTE: the variable names (image_url, custom_image_path, ...) come from an
# image-download snippet; in this notebook they refer to a CSV file.
import requests

# When downloading from GitHub, need to use the "raw" file link
image_url = "https://github.com/liatdavid2/my_custom_datasets/raw/main/Training_ASD_Table.csv"
image_name = image_url.split('/')[-1]
# Destination: the file is stored in the current working directory
data_path = Path("")
custom_image_path = data_path / image_name
print(custom_image_path)

# Download the file if it doesn't already exist
if not custom_image_path.is_file():
    print(f"Downloading {custom_image_path}...")
    # Fetch first and fail loudly on HTTP/network errors. Opening the
    # destination before a successful response would leave an empty (or
    # error-page) file behind, and the next run would skip the download
    # because the file "already exists".
    request = requests.get(image_url, timeout=30)
    request.raise_for_status()
    with open(custom_image_path, "wb") as f:
        f.write(request.content)
else:
    print(f"{custom_image_path} already exists, skipping download.")

Training_ASD_Table.csv
Downloading Training_ASD_Table.csv...


# Evaluate model accuracy, precision, recall, F1 and confusion matrix

In [3]:
# Description: evaluate model accuracy, precision, recall and F1;
#              optionally print the confusion matrix and per-class report
# Input: y_test, y_pred, model_name:str, average:str, verbose:bool
# Output: f1:float
def evaluate_model(y_test, y_pred, model_name, average='micro', verbose=False):
    """Print the classification metrics of a prediction and return its F1.

    Args:
        y_test: True labels, in any form accepted by the scikit-learn
            metric functions.
        y_pred: Predicted labels, aligned with ``y_test``.
        model_name: Name of the evaluated model. Not used by this function;
            kept so existing callers keep working.
        average: Averaging mode forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``. The default ``'micro'`` is the
            historical behaviour; for single-label problems it makes
            precision, recall and F1 all equal to the accuracy, so pass
            ``'macro'`` or ``'weighted'`` for a class-sensitive score.
        verbose: When true, also print the confusion matrix and the
            per-class classification report.

    Returns:
        The F1 score computed with the requested averaging mode.
    """
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average=average)
    recall = recall_score(y_test, y_pred, average=average)
    f1 = f1_score(y_test, y_pred, average=average)

    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    if verbose:
        # Per-class detail; the labels are taken from the data instead of
        # being hard-coded, so this works for any number of classes.
        print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))
    return f1

# Train one of the models from the model list with the data in the CSV file and save it with its F1 score

In [24]:
# Description: build model from data csv file
# Input: model (unfitted scikit-learn style estimator), model_name:str
#        and file:str (path of the training csv)
def train_test_evaluate_model(model, model_name, file="Training_ASD_Table.csv"):
    """Train ``model`` on the CSV table, print its test metrics and pickle it.

    The table must contain a ``type`` column holding the class label; every
    other column is used as a numeric feature.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place.
        model_name: Name used in the log banner and in the saved file name.
        file: Path of the training CSV file.

    Side effects:
        Creates the ``models`` directory if needed and writes the fitted
        estimator there with ``pickle``. Only the estimator is saved, not the
        scaler or the label encoder, so code that loads it has to standardize
        its inputs and decode the predicted labels itself.
    """
    # Step 1: Read the CSV file into a DataFrame
    df = pd.read_csv(file)

    # Step 2: Encode the categorical target column 'type' as integers
    label_encoder = LabelEncoder()
    df['type'] = label_encoder.fit_transform(df['type'])

    # Step 3: Extract features (every column except 'type') and labels
    X = df.drop(columns=['type']).values
    y = df['type'].values

    # Step 4: Split the data into training and testing sets (80-20 split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Step 5: Standardize the features. The scaler is fitted on the training
    # rows only; fitting it on the full table would leak test-set statistics
    # into training and make the reported scores optimistic.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Step 6: Train with model
    model.fit(X_train, y_train)

    # Step 7: Predictions
    y_pred = model.predict(X_test)
    print('\n--------------'+model_name+' classifier: -----------------')

    # Step 8: Evaluate the model
    f1 = evaluate_model(y_test, y_pred, model_name)

    # Step 9: Save the model to disk. The file-name pattern (including the
    # missing separator before 'file_') is kept unchanged so that paths to
    # previously saved models stay valid.
    os.makedirs("models", exist_ok=True)
    model_path = "models/F1_"+str(round(f1, 2))+'_'+model_name+'file_'+Path(file).stem
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    print('Save model to '+model_path)


# Model list to train

In [25]:
# Description: train & evaluate every candidate model on the data csv file
def train_models_from_data():
    """Fit, score and save each candidate classifier, one after the other.

    Creates the ``models`` output directory when it is missing, then hands
    every estimator below, together with the name used for its log banner
    and saved file, to ``train_test_evaluate_model``.
    """
    output_dir = "models"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Tree ensembles; these three are also the members of the voting ensemble
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    boosted = GradientBoostingClassifier(
        n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
    )
    hist_boosted = HistGradientBoostingClassifier(max_iter=100)

    # Soft-voting ensemble over the three tree models above
    voting = VotingClassifier(
        estimators=[('rf_classifier', forest), ('hgbc', hist_boosted), ('gbc', boosted)],
        voting='soft',
        weights=[2, 1, 2],
    )

    # K-nearest-neighbours on top of a Neighborhood Components Analysis step
    nca_knn = Pipeline([
        ('nca', NeighborhoodComponentsAnalysis(random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=3)),
    ])

    # (name, estimator) pairs, in the order they are trained and saved.
    # A RadiusNeighborsClassifier(radius=2.5) candidate named
    # 'neigh_radius_1_9' was tried at some point and is currently disabled.
    candidates = [
        ('Random_Forest', forest),
        ('GradientBoosting', boosted),
        ('HistGradientBoosting_Classifier', hist_boosted),
        ('Multi_layer_Perceptron_Classifier',
         MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15,), random_state=1)),
        ('Gaussian_naive_bayes', GaussianNB()),
        ('SVM_Kernel', SVC(kernel='rbf', random_state=0, probability=True)),
        ('Voting', voting),
        ('xgb_model', xgb.XGBClassifier(n_jobs=1)),
        ('KNeighbors', nca_knn),
    ]
    for label, estimator in candidates:
        train_test_evaluate_model(estimator, label)

train_models_from_data()


--------------Random_Forest classifier: -----------------
Accuracy: 0.9090909090909091
Precision: 0.9090909090909091
Recall: 0.9090909090909091
F1 Score: 0.9090909090909091
Save model to models/F1_0.91_Random_Forestfile_Training_ASD_Table

--------------GradientBoosting classifier: -----------------
Accuracy: 0.7272727272727273
Precision: 0.7272727272727273
Recall: 0.7272727272727273
F1 Score: 0.7272727272727273
Save model to models/F1_0.73_GradientBoostingfile_Training_ASD_Table

--------------HistGradientBoosting_Classifier classifier: -----------------
Accuracy: 0.8181818181818182
Precision: 0.8181818181818182
Recall: 0.8181818181818182
F1 Score: 0.8181818181818182
Save model to models/F1_0.82_HistGradientBoosting_Classifierfile_Training_ASD_Table

--------------Multi_layer_Perceptron_Classifier classifier: -----------------
Accuracy: 0.9090909090909091
Precision: 0.9090909090909091
Recall: 0.9090909090909091
F1 Score: 0.9090909090909091
Save model to models/F1_0.91_Multi_layer_Per

# Make predictions on new data in csv file and add column with prediction

In [26]:
# Input: model_path - path of the pickled model file
#        csv_data_path - the data file
# Description: make predictions with the model and save them to a new csv
def make_predictions(model_path, csv_data_path):
    """Predict the class of every row of a CSV file and save the result.

    The CSV must contain a ``type`` column: its values are used to fit the
    label encoder that turns the numeric predictions back into class names.
    All remaining columns are standardized and passed to the model as
    features.

    Args:
        model_path: Path of a model previously saved with ``pickle``.
        csv_data_path: Path of the CSV file to predict on.

    Side effects:
        Writes ``<csv name>_with_pred.csv`` next to the input file. It holds
        the input columns plus ``type_encoder`` (the encoded ``type``) and
        ``pred`` (the predicted class name).
    """
    # Step 1: load the model from disk
    # NOTE(security): unpickling can execute arbitrary code, so only load
    # model files that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    df = pd.read_csv(csv_data_path)

    # Step 2: Encode the categorical 'type' column.
    # NOTE(review): the encoder is re-fitted on this file, so decoding the
    # predictions is only right when this file contains the same set of
    # classes as the training data. Confirm for new datasets.
    label_encoder = LabelEncoder()
    df['type_encoder'] = label_encoder.fit_transform(df['type'])

    # Step 3: Extract and standardize the features.
    # NOTE(review): the scaler is also re-fitted here; confirm this matches
    # the scaling used when the model was trained.
    X = df.drop(columns=['type', 'type_encoder']).values
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Step 4: Predict labels
    y_pred = loaded_model.predict(X)

    # Step 5: save predicted labels to new column in csv.
    # The output name is derived from the file name only, so dots elsewhere
    # in the path (e.g. "./data/x.csv") no longer truncate it.
    df['pred'] = label_encoder.inverse_transform(y_pred)
    source_path = Path(csv_data_path)
    output_path = source_path.with_name(source_path.stem + '_with_pred.csv')
    df.to_csv(output_path)
    print('Save prediction to '+str(output_path))



# Path of the saved model to use; the name follows the pattern written by
# the training step. NOTE(review): '/content' is presumably the Google Colab
# working directory; adjust when running elsewhere.
best_model_path = '/content/models/F1_1.0_KNeighborsfile_Training_ASD_Table'
csv_data_path = "Training_ASD_Table.csv"  # Replace with the path to your csv file
make_predictions(best_model_path,csv_data_path)


Save prediction to Training_ASD_Table_with_pred.csv
