In [15]:
import os
import sys
tools_path = os.path.join('..', 'tools')
sys.path.append(tools_path)
import file_tools


_DF_DIR = os.path.join('.', 'df_files')
_FINAL_DFS_DIR = os.path.join('.', 'final_dfs')
file_tools.ensure_dir(_FINAL_DFS_DIR)

In [16]:
import numpy as np
import pandas as pd

In [17]:
dfs = {}
for f in file_tools.list_files(_DF_DIR, '*.parquet'):
    dbf = os.path.join(_DF_DIR, f)
    df = pd.read_parquet(dbf, engine='pyarrow')
    basename = file_tools.get_filebasename(f)
    dfs[basename] = df
    print(basename, len(df), df['method_name'].unique())

results_cluster 23836 ['LinearRegression' 'SVR' 'MLP']
results_supervised_full_shuffle_backup 60 ['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean']
results_latest_moresvr 23836 ['LinearRegression' 'SVR' 'MLP']
optim_ablation 9793000 ['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean']
results_relabelling 319 ['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean']
results_clustersvr 340 ['BestSVR']
results_dimensionality_eeg 10346 ['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean']
results_bestsvreegnet 340 ['BestSVR']
results_dimensionality_face 10200 ['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean']
results_supervised_full_shuffle 5100 ['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean']
results 24592 ['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean' 'SVR'
 'RandomForest' 'MLP']
results_ablation 36720 ['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean']
optim 13

In [37]:
# some local regressors runs were test run incomplete to prepare for cluster runs
mask_results = ((dfs['results']['method_name'] != "SVR") &
        (dfs['results']['method_name'] != "MLP") &
        (dfs['results']['method_name'] != "RandomForest"))

result_df = dfs['results'][mask_results]
print(result_df['method_name'].unique())

# LinReg on cluster is a duplicate, we could keep cluster one or local run one
# we remove cluste one
mask_results_cluster = ((dfs['results_cluster']['method_name'] != "LinearRegression"))

results_cluster_df = dfs['results_cluster'][mask_results_cluster]
print(results_cluster_df['method_name'].unique())

results_best_cluster_df = dfs['results_best_cluster']


## SVR EEGNet
mask_results_bestsvreegnet = ((dfs['results_bestsvreegnet']['method_name'] == "BestSVR"))
results_bestsvreegnet_df = dfs['results_bestsvreegnet'][mask_results_bestsvreegnet]
print(results_bestsvreegnet_df['method_name'].unique())


final_result_df = pd.concat([result_df, results_cluster_df, results_best_cluster_df, results_bestsvreegnet_df], axis=0)
print(final_result_df['method_name'].unique())

['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean']
['SVR' 'MLP']
['BestSVR']
['LinearRegression' 'Shuffle_LinearRegression' 'DummyScoring_Mean' 'SVR'
 'MLP' 'BestSVR' 'BestMLP']


In [38]:
results_filename = os.path.join(_FINAL_DFS_DIR, 'results.parquet')
final_result_df.to_parquet(results_filename, engine='pyarrow')

In [13]:
results_ablation_filename = os.path.join(_FINAL_DFS_DIR, 'results_ablation.parquet')
dfs['results_ablation'].to_parquet(results_ablation_filename, engine='pyarrow')

In [18]:
results_ablation_filename = os.path.join(_FINAL_DFS_DIR, 'results_relabelling.parquet')
dfs['results_relabelling'].to_parquet(results_ablation_filename, engine='pyarrow')

In [40]:
results_supervised_filename = os.path.join(_FINAL_DFS_DIR, 'results_supervised.parquet')
dfs['results_supervised'].to_parquet(results_supervised_filename, engine='pyarrow')

results_supervised_filename = os.path.join(_FINAL_DFS_DIR, 'results_supervised_full_shuffle.parquet')
dfs['results_supervised_full_shuffle'].to_parquet(results_supervised_filename, engine='pyarrow')

KeyError: 'results_supervised'

In [41]:
results_dimensionality_eeg_filename = os.path.join(_FINAL_DFS_DIR, 'results_dimensionality_eeg.parquet')
dfs['results_dimensionality_eeg'].to_parquet(results_dimensionality_eeg_filename, engine='pyarrow')

results_dimensionality_face_filename = os.path.join(_FINAL_DFS_DIR, 'results_dimensionality_face.parquet')
dfs['results_dimensionality_face'].to_parquet(results_dimensionality_face_filename, engine='pyarrow')

In [45]:
# for optim, we cut it all to 1000 iterations
CUT_THRESHOLD = 999

df_optim = dfs['optim']

mask = ((df_optim['number'] <= CUT_THRESHOLD))
df_optim = df_optim[mask]

from collections import Counter
element_counts = Counter(df_optim['number'])
counts = list(element_counts.values())
assert len(counts) == 1000
assert all(count == counts[0] for count in counts)

optim_filename = os.path.join(_FINAL_DFS_DIR, 'optim.parquet')
df_optim.to_parquet(optim_filename, engine='pyarrow')

In [46]:
# optim_ablation is a a different beast as we will pick the 0 ablation from the optim results above
# and cut it all again to 1000 iteration

CUT_THRESHOLD = 999

df_optim_ablation = dfs['optim_ablation']

# trim to 1000 iterations
mask = ((df_optim_ablation['number'] <= CUT_THRESHOLD))
df_optim_ablation = df_optim_ablation[mask]

df_optim_ablation['method_name']

# remove all 0 runs 
mask = ((df_optim_ablation['ablation_distance'] != 0))
df_optim_ablation = df_optim_ablation[mask]
# replace them with df_optim which are the same for both ablation and control run_type
df_optim_copy = df_optim.copy()
df_optim_copy['ablation_distance'] = 0
df_optim_copy['run_type'] = 'control'
df_optim_ablation = pd.concat([df_optim_ablation, df_optim_copy], ignore_index=True)

df_optim_copy = df_optim.copy()
df_optim_copy['ablation_distance'] = 0
df_optim_copy['run_type'] = 'ablation'
df_optim_ablation = pd.concat([df_optim_ablation, df_optim_copy], ignore_index=True)

# mask = ((df_optim_ablation['ablation_distance'] != 0) &
#         (df['method_name'] != "BestMLP") &
#         (df['eeg_name'] == "EEG_Raw") & 
#         (df['test_name'] == "random"))


# sanity check
from collections import Counter
element_counts = Counter(df_optim_ablation['number'])
counts = list(element_counts.values())
assert len(counts) == 1000
assert all(count == counts[0] for count in counts)

optim_ablation_filename = os.path.join(_FINAL_DFS_DIR, 'optim_ablation.parquet')
df_optim_ablation.to_parquet(optim_ablation_filename, engine='pyarrow')

In [189]:
df_optim_ablation['ablation_distance'].unique()

array([25, 30,  5, 10, 35, 40, 15, 20,  0])

In [190]:
df_optim_ablation['run_type'].unique()

array(['ablation', 'control'], dtype=object)

In [191]:
df_optim_ablation

Unnamed: 0,method_name,number,value,euclidean_distance,ablation_distance,run_type
0,LinearRegression,0,1.001659,28.079715,25,ablation
1,LinearRegression,1,0.998253,12.823638,25,ablation
2,LinearRegression,2,0.996458,14.093707,25,ablation
3,LinearRegression,3,1.001572,8.796211,25,ablation
4,LinearRegression,4,1.004591,16.307747,25,ablation
...,...,...,...,...,...,...
8564995,DummyScoring_Mean,995,1.000000,24.501106,0,ablation
8564996,DummyScoring_Mean,996,1.000000,23.908035,0,ablation
8564997,DummyScoring_Mean,997,1.000000,25.577454,0,ablation
8564998,DummyScoring_Mean,998,1.000000,24.343669,0,ablation


In [192]:
# grouped_df = dfs['results_best_again'].groupby(['method_name', 'eeg_name'])
# grouped_df = dfs['results_clustersvr'].groupby(['method_name', 'eeg_name'])
grouped_df = dfs['results_best_cluster'].groupby(['method_name', 'eeg_name'])


# Calculate mean and std for each subtable
compiled_result = grouped_df.agg({
    'pearsonr_statistic': ['mean', 'std', 'sem']
})

# Flatten the column names
compiled_result.columns = ['_'.join(col).strip() for col in compiled_result.columns.values]

# Reset the index to display method_name and training_size as columns
compiled_result = compiled_result.reset_index()
compiled_result

Unnamed: 0,method_name,eeg_name,pearsonr_statistic_mean,pearsonr_statistic_std,pearsonr_statistic_sem
0,BestMLP,EEG_Net,-0.774051,0.048214,0.009455
1,BestMLP,EEG_Raw,-0.828877,0.028544,0.012765
2,BestSVR,EEG_Raw,-0.355963,0.146964,0.00797
