In [None]:
import pandas as pd
import numpy as np

# Locations of the pickled feature table and of its companion metadata table
data_file_path = '../Dataset/Dementia_paper_dataset_data.pkl'
info_file_path = '../Dataset/Dementia_paper_dataset_info.pkl'


def _read_pickle(path):
    """Open `path` in binary mode and return the object pandas unpickles from it."""
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as handle:
        return pd.read_pickle(handle)


# Both objects are indexed by split name further down (e.g. data['train'])
data = _read_pickle(data_file_path)
info = _read_pickle(info_file_path)

# Preview the first rows of the training split
data['train'].head()

In [None]:
# Import the pandas library
import pandas as pd

# Export every per-split table in `info` to a single Excel workbook, one sheet per split.
excel_file_path = '../Dataset/Dementia_paper_dataset_data.xlsx'

# NOTE(review): the workbook name ends in "_data" but the sheets are taken from
# `info`, not `data` -- confirm this is the intended source.
# The context manager closes (and thereby saves) the workbook on exit, including
# when a to_excel call raises, so the file handle is never left open.
with pd.ExcelWriter(excel_file_path, engine='openpyxl') as writer:
    for key, value in info.items():
        # The 'Info' entry is skipped; every other entry is written as its own sheet
        if key != 'Info':
            value.to_excel(writer, sheet_name=key, index=False)



Label encoding: HC = 0, MCI = 1, Dementia = 2

In [None]:
import numpy as np
# Convert every split's feature DataFrame into a NumPy array, keyed by split name.
data_ndarrays = {}
for key, value in data.items():
    # The 'Info' entry is passed through untouched
    if key == 'Info':
        data_ndarrays[key] = value
        continue
    # Drop the 'ID' and 'Task' columns when present. The drop is done in place on
    # purpose: later cells look up data['train'].columns by feature position, so the
    # DataFrame columns must stay aligned with the columns of the arrays built here.
    droppable = [name for name in ('ID', 'Task') if name in value.columns]
    value.drop(columns=droppable, inplace=True)
    data_ndarrays[key] = value.values

# Label arrays per split:
#   label_ndarrays         -> original labels taken from the 'Label' column
#   label_ndarrays_CInonCI -> binary labels, with label 2 merged into label 1
label_ndarrays = {}
label_ndarrays_CInonCI = {}
for key, value in info.items():
    if key == 'Info':
        # Pass the 'Info' entry through as-is. It is not a label array, so the
        # 2 -> 1 mapping must not be applied to it (np.where would wrap it in an array).
        label_ndarrays[key] = value
        label_ndarrays_CInonCI[key] = value
        continue
    labels = value['Label'].values
    label_ndarrays[key] = labels
    # np.where returns a new array, so the original labels above are left unchanged
    label_ndarrays_CInonCI[key] = np.where(labels == 2, 1, labels)

print(label_ndarrays['train'])
label_ndarrays_CInonCI['train']



In [None]:
# Per split: keep the 'Group' column as an array and count how many entries are
# 'SCD-MCI' and how many are 'SCD-Control'.
group_ndarrays = {}
n_SCD_MCI = {}
n_SCD_Control = {}
for split_name, split_info in info.items():
    if split_name == 'Info':
        # The 'Info' entry is stored unchanged and gets no counts
        group_ndarrays[split_name] = split_info
        continue
    groups = split_info['Group'].values
    group_ndarrays[split_name] = groups
    n_SCD_MCI[split_name] = np.sum(groups == 'SCD-MCI')
    n_SCD_Control[split_name] = np.sum(groups == 'SCD-Control')

# Counts for the training split
print(n_SCD_MCI['train'])
print(n_SCD_Control['train'])

In [None]:
# Data organizing and filter-based feature ranking (Fisher score)
import os
# The class gets its own alias so that the name `f_selection` only ever refers to
# the instance created below (other cells call methods on that instance).
from feature_related import feature_selection as FeatureSelection

# Train & valid splits are pooled into one development set (binary labels)
data_train_valid = np.concatenate([data_ndarrays['train'], data_ndarrays['valid']], axis=0)
label_train_valid = np.concatenate([label_ndarrays_CInonCI['train'], label_ndarrays_CInonCI['valid']], axis=0)

# Independent test split
data_test = data_ndarrays['test']
label_test = label_ndarrays_CInonCI['test']

# Feature selection - filter based.
# NOTE(review): the ranking is computed from all pooled train+valid samples, so any
# cross-validation run afterwards on these same samples evaluates features that were
# chosen with knowledge of its held-out folds -- CV scores may be optimistic.
f_selection = FeatureSelection()
fisher_scores = f_selection.fisher_score(data_train_valid, label_train_valid)
fisher_idx = np.argsort(fisher_scores)[::-1]  # feature indices, highest score first

### LDA

In [45]:
# Classify with LDA on the top-k Fisher-ranked features for k = 1..max_dim.
# For each k: 10-fold cross-validated grid search on the pooled train+valid data,
# then the refitted best model is scored on the independent test split and saved.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from tqdm import tqdm, trange
import os
# `feature_selection` is deliberately not re-imported as `f_selection` here: doing so
# would overwrite the `f_selection` instance created in the feature-selection cell
# with the class itself, and later cells call methods on that instance.
from joblib import dump, load
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Create the LDA classifier
lda = LinearDiscriminantAnalysis()

# Parameter grid for GridSearchCV (a single candidate)
param_grid = {'solver': ['svd']}

# Output locations; created once up front instead of on every loop iteration
folder_path = '../Results/Classification/LDA/10Fold_test/'
model_folder = os.path.join(folder_path, 'Models/')
os.makedirs(folder_path, exist_ok=True)
os.makedirs(model_folder, exist_ok=True)

cv_records = []  # one dict per feature count, turned into a DataFrame after the loop
max_dim = label_train_valid.shape[0] // 2  # largest feature count tried: half the sample count
for idx_AOFI in tqdm(range(1, max_dim + 1)):
    # Indices of the idx_AOFI best-ranked features
    top_features = fisher_idx[:idx_AOFI]
    data_CV = data_train_valid[:, top_features]

    # 10-fold cross-validated grid search
    grid_search = GridSearchCV(estimator=lda, param_grid=param_grid, cv=10, n_jobs=7)
    grid_search.fit(data_CV, label_train_valid)

    # Best parameters, their CV score, and the corresponding refitted model
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    best_model = grid_search.best_estimator_

    # Score the best model on the test split, using the same feature subset
    test_score = best_model.score(data_test[:, top_features], label_test)

    cv_records.append({'#features': idx_AOFI, 'best_CVscore': best_score, 'best_params': best_params, 'test_score': test_score})

    # Persist the model together with the feature indices it was trained on
    model_file = os.path.join(model_folder, 'best_model_'+str(idx_AOFI)+'.joblib')
    dump(best_model, model_file)
    pd.Series(top_features).to_csv(model_file+'_features_idx.csv', index=False)

# Save the per-feature-count results
df_accCV = pd.DataFrame(cv_records)
df_accCV.to_csv(os.path.join(folder_path, 'df_accCV.csv'), index=False)

# Save the full ranking with the matching column names. This relies on the columns
# of data['train'] being aligned with the columns of the feature arrays.
fisher_idx_DF = pd.DataFrame({'fisher_idx': fisher_idx, 'feature_type': data['train'].columns[fisher_idx]})
fisher_idx_DF.to_csv(os.path.join(folder_path, 'fisher_idx_series.csv'), index=False)

100%|██████████| 105/105 [00:03<00:00, 31.63it/s]


### SVM

In [43]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd
from tqdm import tqdm, trange
from joblib import dump, load

# Create the SVM classifier
svm = SVC()

# Parameter grid for GridSearchCV.
# Kernel widths: sigma = 1.05**k for k = -100, -95, ..., 95, 100 (41 values)
sigma_exponents = np.linspace(-100, 100, 41)
sigma_range = 1.05 ** sigma_exponents
# gamma = 1 / (2 * sigma**2). The parentheses matter: `1 / 2*(x)` evaluates as
# (1/2) * x. The values are sorted ascending so the candidate order runs from the
# smallest gamma to the largest.
gamma_range = np.sort(1.0 / (2.0 * np.square(sigma_range))).tolist()
C_range = [1, 10, 100, 500, 1000]
param_grid = {'C': C_range, 'gamma': gamma_range, 'kernel': ['linear','rbf']}

# Output locations; created once up front instead of on every loop iteration
folder_path = '../Results/Classification/SVM/AOFI_smallerGrid_10Fold/'
model_folder = os.path.join(folder_path, 'Models/')
os.makedirs(folder_path, exist_ok=True)
os.makedirs(model_folder, exist_ok=True)

cv_records = []  # one dict per feature count, turned into a DataFrame after the loop
max_dim = label_train_valid.shape[0] // 2  # largest feature count tried: half the sample count
for idx_AOFI in tqdm(range(1, max_dim + 1)):
    # Indices of the idx_AOFI best-ranked features
    top_features = fisher_idx[:idx_AOFI]
    data_CV = data_train_valid[:, top_features]

    # 10-fold cross-validated grid search over C, gamma and kernel
    grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10, n_jobs=7)
    grid_search.fit(data_CV, label_train_valid)

    # Best parameters, their CV score, and the corresponding refitted model
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    best_model = grid_search.best_estimator_

    # Score the best model on the test split, using the same feature subset
    test_score = best_model.score(data_test[:, top_features], label_test)

    cv_records.append({'#features': idx_AOFI, 'best_CVscore': best_score, 'best_params': best_params, 'test_score': test_score})

    # Persist the model together with the feature indices it was trained on
    model_file = os.path.join(model_folder, 'best_model_'+str(idx_AOFI)+'.joblib')
    dump(best_model, model_file)
    pd.Series(top_features).to_csv(model_file+'_features_idx.csv', index=False)

# Save the per-feature-count results
df_accCV = pd.DataFrame(cv_records)
df_accCV.to_csv(os.path.join(folder_path, 'df_accCV.csv'), index=False)

# Save the full ranking with the matching column names. This relies on the columns
# of data['train'] being aligned with the columns of the feature arrays.
fisher_idx_DF = pd.DataFrame({'fisher_idx': fisher_idx, 'feature_type': data['train'].columns[fisher_idx]})
fisher_idx_DF.to_csv(os.path.join(folder_path, 'fisher_idx_series.csv'), index=False)

100%|██████████| 105/105 [32:56<00:00, 18.82s/it]


## Feature configuration & decision values VS. Scales

# Regression part

In [26]:
# Collect the scale-score columns of every split
scales_names = ['GDS_total','MMSE_total','IADL_total', 'ADL_total', 'CDR_total', 'MoCA_total']

# Each split keeps only the scale columns; the 'Info' entry is carried over unchanged
scales_df = {
    split_name: (split_info if split_name == 'Info' else split_info[scales_names])
    for split_name, split_info in info.items()
}

# Train and valid rows are stacked; the test split stays separate
scales_train_valid = pd.concat([scales_df['train'], scales_df['valid']], axis=0)
scales_test = scales_df['test']

In [38]:
# Filter-based feature ranking for regression: Pearson correlation against one scale

# Column position of the regression target among the scale columns
# (0: GDS_total, 1: MMSE_total, 2: IADL_total, 3: ADL_total, 4: CDR_total, 5: MoCA_total)
scale_idx = 1
_scale_train_valid, _scale_test = (
    scales_train_valid.iloc[:, scale_idx],
    scales_test.iloc[:, scale_idx],
)

# Shape check: features and scale table
print(data_train_valid.shape, scales_train_valid.shape)

pearson_scores = f_selection.pearson_coef(data_train_valid, _scale_train_valid)
# NOTE(review): if pearson_coef returns signed coefficients, this ordering ranks
# strongly negative correlations last -- confirm whether |r| is intended.
descending = slice(None, None, -1)
pearson_idx = np.argsort(pearson_scores)[descending]  # highest score first

(211, 13050) (211, 6)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.nd

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import pandas as pd
from tqdm import tqdm, trange
from joblib import dump, load

# Create the regressors (only `svr` is used in the loop below)
svr = SVR()
linR = LinearRegression()

# Parameter grid for GridSearchCV.
# Kernel widths: sigma = 1.05**k for k = -100, -90, ..., 90, 100 (21 values)
sigma_exponents = np.linspace(-100, 100, 21)
sigma_range = 1.05 ** sigma_exponents
# gamma = 1 / (2 * sigma**2). The parentheses matter: `1 / 2*(x)` evaluates as
# (1/2) * x. The values are sorted ascending so the candidate order runs from the
# smallest gamma to the largest.
gamma_range = np.sort(1.0 / (2.0 * np.square(sigma_range))).tolist()
C_range = [1, 100,  1000]
param_grid = {'C': C_range, 'gamma': gamma_range, 'kernel': ['linear','rbf']}
param_grid_linR = {'fit_intercept': [True]}


max_dim = label_train_valid.shape[0] // 2  # largest feature count tried: half the sample count
for scale_name, scale in scales_train_valid.items():
    print(scale_name+':')
    _scale_train_valid = scale
    # Test targets must come from the SAME scale as the training targets. Without
    # this assignment `_scale_test` would keep whatever an earlier cell left in it,
    # and every scale would be scored against that one scale's test values.
    _scale_test = scales_test[scale_name]

    # Feature selection - filter based - Pearson correlation with the current scale
    # NOTE(review): if pearson_coef returns signed coefficients, this ordering ranks
    # strongly negative correlations last -- confirm whether |r| is intended.
    pearson_scores = f_selection.pearson_coef(data_train_valid, _scale_train_valid)
    pearson_idx = np.argsort(pearson_scores)[::-1]  # highest score first

    cv_records = []  # one dict per feature count, turned into a DataFrame after the loop
    for idx_AOFI in tqdm(range(1, max_dim + 1)):
        # Indices of the idx_AOFI best-ranked features
        top_features = pearson_idx[:idx_AOFI]
        data_CV = data_train_valid[:, top_features]

        # 10-fold cross-validated grid search over C, gamma and kernel
        grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=10, n_jobs=10)
        grid_search.fit(data_CV, _scale_train_valid)

        # Best parameters, their CV score, and the corresponding refitted model
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_

        # Score the best model on the test split, using the same feature subset
        test_score = best_model.score(data_test[:, top_features], _scale_test)

        cv_records.append({'#features': idx_AOFI, 'best_CVscore': best_score, 'best_params': best_params, 'test_score': test_score})

    # One output folder per scale
    folder_path = f'../Results/Regression/SVR/AOFI_10Fold/{scale_name}/'
    os.makedirs(folder_path, exist_ok=True)

    # Save the per-feature-count results
    df_scoreCV = pd.DataFrame(cv_records)
    df_scoreCV.to_csv(os.path.join(folder_path, 'df_scoreCV.csv'), index=False)

    # Save the full Pearson ranking with the matching column names. This relies on
    # the columns of data['train'] being aligned with the columns of the feature arrays.
    # NOTE(review): the output file name starts with 'fpearson' -- possibly a typo,
    # left unchanged in case other code reads this exact name.
    pearson_idx_DF = pd.DataFrame({'pearson_idx': pearson_idx, 'feature_type': data['train'].columns[pearson_idx]})
    pearson_idx_DF.to_csv(os.path.join(folder_path, 'fpearson_idx_series.csv'), index=False)

GDS_total:
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (21

100%|██████████| 105/105 [50:56<00:00, 29.11s/it]


MMSE_total:
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (2

100%|██████████| 105/105 [3:33:55<00:00, 122.24s/it] 


IADL_total:
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (2

100%|██████████| 105/105 [36:05<00:00, 20.62s/it]


ADL_total:
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (21

100%|██████████| 105/105 [37:41<00:00, 21.54s/it]


CDR_total:
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (21

100%|██████████| 105/105 [8:21:19<00:00, 286.47s/it] 


MoCA_total:
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (211,) <class 'pandas.core.series.Series'> (211,)
<class 'numpy.ndarray'> (2

100%|██████████| 105/105 [3:20:45<00:00, 114.72s/it] 
