In [2]:
                                                                                                                                                                                                                              import pandas as pd
import numpy as np
import os
import shutil
import glob
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from itertools import combinations
from joblib import Parallel, delayed, dump
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from pathlib import Path
from joblib import load
from sklearn.model_selection import GridSearchCV
import pickle

In [3]:
folders = ['RELATIONAL', 'EMOTIONAL', 'GAMBLING', 'MOTOR', 'SOCIAL', 'WM', 'LANGUAGE']

# Creating extended vertices features

In [4]:
def extract_fmri_features(time_series):
    from scipy.stats import skew, kurtosis, linregress
    from statsmodels.tsa.stattools import acf
    from scipy.signal import find_peaks
    import numpy as np
    import pandas as pd

    n_timepoints = time_series.shape[1]

    autocorr_lag1 = []
    n_peaks = []
    mean_peak_amp = []           
    for ts in time_series:
        ts = np.asarray(ts).flatten()  
        peaks, properties = find_peaks(ts, height=np.mean(ts))
        n_peaks.append(len(peaks))
        if len(peaks) > 0:
            mean_peak_amp.append(np.mean(ts[peaks]))
        else:
            mean_peak_amp.append(np.nan)

        if len(peaks) > 1:
            intervals = np.diff(peaks)
            mean_interpeak_interval.append(np.mean(intervals))
        else:
            mean_interpeak_interval.append(np.nan)

        acf_vals = acf(ts, nlags=1)
            autocorr_lag1.append(acf_vals[1] if len(acf_vals) > 1 else np.nan)

    return pd.DataFrame({
        'mean': time_series.mean(axis=1),
        'std': time_series.std(axis=1),
        'skewness': skew(time_series, axis=1),
        'autocorr_lag1': autocorr_lag1,
        'n_peaks': n_peaks,
        'mean_peak_amp': mean_peak_amp,
        'mean_interpeak_interval': mean_interpeak_interval
    })


In [6]:
import joblib

def load(pickle_path):
    try:
        return joblib.load(pickle_path)
    except Exception as e:
        print(f"Error loading {pickle_path}: {e}")
        return None  
    
source_base = Path("split_mean")
destination_base = Path("vertices_mean_extended")
#folders = ['RELATIONAL', 'EMOTIONAL', 'GAMBLING', 'MOTOR', 'SOCIAL', 'WM', 'LANGUAGE']
folders = ['EMOTIONAL']

for folder in tqdm(folders):
    source_folder = source_base / folder
    for pickle_path in source_folder.glob("*.pickle"):
        try:
            data = load(pickle_path)
            if data is None:
                continue  
            data = np.hstack(data)
            df = pd.DataFrame(data)
            data_array = df.values
            features_df = extract_fmri_features(data_array)

            if "LR" in pickle_path.name:
                subfolder = "LR"
            elif "RL" in pickle_path.name:
                subfolder = "RL"
            else:
                print(f"Warning: File {pickle_path.name} does not contain 'LR' or 'RL'. Skipping.")
                continue

            destination_folder = destination_base / folder / subfolder
            destination_folder.mkdir(parents=True, exist_ok=True)

            output_path = destination_folder / pickle_path.name
            features_df.to_pickle(output_path)

        except Exception as e:
            print(f"Error processing {pickle_path}: {e}")
            traceback.print_exc()

100%|████████████████████████████████████████████| 1/1 [02:43<00:00, 163.85s/it]


In [None]:
base_dir = 'vertices_mean_extended'  
output_dir = 'pairwise_datasets'   
pattern = os.path.join(base_dir, '**', '*.pickle')
pickle_files = glob.glob(pattern, recursive=True)

os.makedirs(output_dir, exist_ok=True)

all_files = []
for file_path in pickle_files:
    data = pd.read_pickle(file_path)
    print(file_path)
    class_label = file_path.split('_')[4].replace('.pickle', '')
    data['class'] = class_label
    data['sample_id'] = (file_path.split('/')[3]).split('_')[0]
    all_files.append(data)
   

full_data = pd.concat(all_files)

In [20]:
full_data.reset_index(inplace=True)
full_data.rename(columns={'index': 'vertex'}, inplace=True)


In [21]:
full_data['vertex'] += 1
full_data

Unnamed: 0,vertex,mean,std,skewness,autocorr_lag1,n_peaks,mean_peak_amp,mean_interpeak_interval,class,sample_id
0,1,-0.147010,0.850654,-0.206136,0.328977,21,0.684999,3.750000,rnd,322224
1,2,0.204570,0.939359,-0.218530,0.643224,20,1.223812,3.947368,rnd,322224
2,3,-0.352507,0.993342,-0.282886,0.060360,26,0.676476,3.400000,rnd,322224
3,4,-0.096319,0.807683,-0.179952,0.358285,21,0.721222,4.000000,rnd,322224
4,5,-0.181171,0.933933,-0.020551,0.253191,25,0.736216,3.583333,rnd,322224
...,...,...,...,...,...,...,...,...,...,...
6165567,375,-0.238299,0.977066,-0.317467,0.299620,17,0.776802,3.875000,neut,990366
6165568,376,-0.349985,0.935441,0.108594,0.115772,17,0.765804,4.062500,neut,990366
6165569,377,-0.127553,0.962288,-0.129217,0.335510,20,0.896335,3.526316,neut,990366
6165570,378,-0.093324,0.969911,0.057978,0.338253,16,1.071996,4.333333,neut,990366


In [23]:
data = full_data.pivot_table(index=['sample_id', 'class'],
                              columns='vertex',
                              values=['mean', 'std', 'autocorr_lag1', 'n_peaks', 'mean_peak_amp', 'mean_interpeak_interval'])
data

Unnamed: 0_level_0,Unnamed: 1_level_0,autocorr_lag1,autocorr_lag1,autocorr_lag1,autocorr_lag1,autocorr_lag1,autocorr_lag1,autocorr_lag1,autocorr_lag1,autocorr_lag1,autocorr_lag1,...,std,std,std,std,std,std,std,std,std,std
Unnamed: 0_level_1,vertex,1,2,3,4,5,6,7,8,9,10,...,370,371,372,373,374,375,376,377,378,379
sample_id,class,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
100206,0bk,0.893819,0.464748,0.603468,0.851265,0.786838,0.909225,0.675615,0.695618,0.738377,0.692180,...,0.960313,0.935202,0.898046,1.023829,1.020021,0.991788,0.950959,1.041028,0.951495,0.968369
100206,2bk,0.861421,0.487011,0.484680,0.790721,0.769333,0.888625,0.603071,0.553682,0.624310,0.655591,...,0.957964,0.962293,0.957600,0.960843,0.924773,0.929355,0.972268,0.979947,0.975403,0.987517
100206,fear,0.787456,0.512411,0.583106,0.750800,0.637945,0.806604,0.628705,0.632117,0.541350,0.452887,...,1.018323,1.066575,1.043297,0.910316,0.977589,0.988025,1.001793,0.944066,0.993105,0.973372
100206,l,0.828420,0.447045,0.599733,0.779001,0.749867,0.697426,0.494070,0.672157,0.662432,0.654740,...,0.929404,0.994043,0.921388,0.944134,1.049938,1.028949,1.045922,0.959480,0.939064,0.869964
100206,loss,0.841642,0.583493,0.632507,0.794625,0.773274,0.822869,0.459484,0.604053,0.599757,0.810513,...,0.955303,0.976240,0.961123,0.988804,1.013960,1.038079,0.979294,0.943085,0.854481,1.037637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996782,r,0.727708,0.426522,0.565467,0.729745,0.741279,0.681953,0.512494,0.589251,0.580114,0.472755,...,1.052960,0.964462,0.968415,1.021673,0.966433,0.966144,1.033262,0.962946,0.927782,0.929914
996782,relation,0.792886,0.503521,0.550323,0.794583,0.793159,0.763781,0.657890,0.526877,0.735584,0.704892,...,1.120857,1.036813,0.975302,1.024739,0.927820,1.015248,0.927360,1.111079,1.036310,1.067334
996782,rnd,0.695992,0.790285,0.441623,0.634431,0.858011,0.865658,0.607152,0.557471,0.684295,0.758048,...,1.002791,0.872504,0.941104,1.008072,0.989134,0.865046,0.962983,0.826940,0.882297,0.896415
996782,story,0.759290,0.656334,0.785014,0.829183,0.825330,0.800928,0.453851,0.651768,0.789581,0.625014,...,1.043619,0.953864,0.975513,0.990321,1.042829,0.950362,1.015744,0.967266,0.929367,1.004862


In [24]:
data.reset_index(inplace=True)
data.columns = [f"{feat}_{vertex}" if vertex else feat for feat, vertex in data.columns]
data

Unnamed: 0,sample_id,class,autocorr_lag1_1,autocorr_lag1_2,autocorr_lag1_3,autocorr_lag1_4,autocorr_lag1_5,autocorr_lag1_6,autocorr_lag1_7,autocorr_lag1_8,...,std_370,std_371,std_372,std_373,std_374,std_375,std_376,std_377,std_378,std_379
0,100206,0bk,0.893819,0.464748,0.603468,0.851265,0.786838,0.909225,0.675615,0.695618,...,0.960313,0.935202,0.898046,1.023829,1.020021,0.991788,0.950959,1.041028,0.951495,0.968369
1,100206,2bk,0.861421,0.487011,0.484680,0.790721,0.769333,0.888625,0.603071,0.553682,...,0.957964,0.962293,0.957600,0.960843,0.924773,0.929355,0.972268,0.979947,0.975403,0.987517
2,100206,fear,0.787456,0.512411,0.583106,0.750800,0.637945,0.806604,0.628705,0.632117,...,1.018323,1.066575,1.043297,0.910316,0.977589,0.988025,1.001793,0.944066,0.993105,0.973372
3,100206,l,0.828420,0.447045,0.599733,0.779001,0.749867,0.697426,0.494070,0.672157,...,0.929404,0.994043,0.921388,0.944134,1.049938,1.028949,1.045922,0.959480,0.939064,0.869964
4,100206,loss,0.841642,0.583493,0.632507,0.794625,0.773274,0.822869,0.459484,0.604053,...,0.955303,0.976240,0.961123,0.988804,1.013960,1.038079,0.979294,0.943085,0.854481,1.037637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8129,996782,r,0.727708,0.426522,0.565467,0.729745,0.741279,0.681953,0.512494,0.589251,...,1.052960,0.964462,0.968415,1.021673,0.966433,0.966144,1.033262,0.962946,0.927782,0.929914
8130,996782,relation,0.792886,0.503521,0.550323,0.794583,0.793159,0.763781,0.657890,0.526877,...,1.120857,1.036813,0.975302,1.024739,0.927820,1.015248,0.927360,1.111079,1.036310,1.067334
8131,996782,rnd,0.695992,0.790285,0.441623,0.634431,0.858011,0.865658,0.607152,0.557471,...,1.002791,0.872504,0.941104,1.008072,0.989134,0.865046,0.962983,0.826940,0.882297,0.896415
8132,996782,story,0.759290,0.656334,0.785014,0.829183,0.825330,0.800928,0.453851,0.651768,...,1.043619,0.953864,0.975513,0.990321,1.042829,0.950362,1.015744,0.967266,0.929367,1.004862


In [25]:
vertex_columns = np.arange(1, 380)
vertex_pairs = list(combinations(vertex_columns, 2))
total_pairs = len(vertex_pairs)
print(f'Всего пар вершин: {total_pairs}')

Всего пар вершин: 71631


In [5]:
import os
import pandas as pd

def save_pairwise_dataset(pair,output_dir='pairwise_datasets'):
    v1, v2 = pair
    feature_types = ['mean', 'std', 'autocorr_lag1', 'n_peaks', 'mean_peak_amp', 'mean_interpeak_interval']

    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, f'dataset_v{v1}_v{v2}.pkl')
    if os.path.exists(filename):
        print(f"File already exists: {filename} - Skip")
        return False

    cols_to_extract = []
    for feat in feature_types:
        cols_to_extract.extend([f'{feat}_{v1}', f'{feat}_{v2}'])
    cols_to_extract.extend(['class', 'sample_id'])
    
    try:
        subset = data[cols_to_extract].copy()
        
        rename_dict = {}
        for feat in feature_types:
            rename_dict.update({
                f'{feat}_{v1}': f'{feat}_v1',
                f'{feat}_{v2}': f'{feat}_v2'
            })
        
        subset.rename(columns=rename_dict, inplace=True)
        subset.to_pickle(filename)
        #print(f"Successfully saved: {filename}")
        return True
        
    except Exception as e:
        print(f"Error processing pair ({v1}, {v2}): {str(e)}")
        return False

In [6]:
def process_pairs_in_parallel(pairs, num_processes=None):
    if num_processes is None:
        num_processes = max(1, cpu_count() - 1)  
    with Pool(processes=num_processes) as pool:
        results = list(tqdm(pool.imap(save_pairwise_dataset, pairs), total=len(pairs)))
    success_count = sum(results)
    print(f"Успешно сохранено датасетов: {success_count}/{len(pairs)}")
            

In [28]:
process_pairs_in_parallel(vertex_pairs)

100%|████████████████████████████████████| 71631/71631 [02:00<00:00, 592.17it/s]


Успешно сохранено датасетов: 71631/71631


# Create full correlation table

In [5]:
def pad_array(arr, target_length, padding_value=np.nan):
    if arr.shape[-1] < target_length:
        padding = np.full((arr.shape[0], arr.shape[1], target_length - arr.shape[2]), padding_value)
        arr = np.concatenate([arr, padding], axis=2)  
    return arr

In [6]:
def compute_correlation_with_nan_handling(arr1, arr2):
    valid_mask = ~np.isnan(arr1) & ~np.isnan(arr2)
    return np.corrcoef(arr1[valid_mask], arr2[valid_mask])[0, 1]

In [7]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

def extract_id_type_and_class(filename):
    base = os.path.basename(filename).replace('.pickle', '')
    sample_id, data_type, cls = base.split('_')
    return sample_id, data_type, cls

In [12]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

def build_correlation_graph_by_type(file_list, out_csv_path, desired_type="LR", batch_size=20):
    all_edges = []
    count = 0

    def save_batch(batch_data):
        df = pd.DataFrame(batch_data, columns=["type", "pair", "class", "vertices", "weight"])
        if not os.path.exists(out_csv_path):
            df.to_csv(out_csv_path, index=False, mode='w', header=True)
        else:
            df.to_csv(out_csv_path, index=False, mode='a', header=False)

    for file_path in tqdm(file_list):
        sample_id, data_type, cls = extract_id_type_and_class(file_path)
        if data_type != desired_type:
            continue

        with open(file_path, 'rb') as f:
            data = pickle.load(f)

        lengths = [arr.shape[1] for arr in data]
        max_length = max(max(lengths), 37)
        padded = []
        for arr in data:
            pad_width = max_length - arr.shape[1]
            arr_padded = np.pad(arr,
                                ((0, 0),           
                                 (0, pad_width)),
                                mode='constant',
                                constant_values=0)
            padded.append(arr_padded)

        series = np.stack(padded, axis=0)
        ts = series.transpose(1, 0, 2).reshape(series.shape[1], -1)
        corr_matrix = np.corrcoef(ts)
        n_vertices = ts.shape[0]

        for i in range(n_vertices):
            for j in range(i + 1, n_vertices):
                vertex_str = f"v{i}_v{j}"
                weight = corr_matrix[i, j]
                pair_id = sample_id  
                all_edges.append((data_type, pair_id, cls, vertex_str, weight))

        count += 1
        if count >= batch_size:
            save_batch(all_edges)
            all_edges.clear()  
            count = 0

    if all_edges:
        save_batch(all_edges)

    print(f"[{desired_type}] Сохранено: {out_csv_path}")


In [None]:
from glob import glob

files = glob("split_mean/**/*.pickle", recursive=True)
files = [file for file in files if 'LR' in os.path.basename(file)]

In [14]:
build_correlation_graph_by_type(files, "graph_LR.csv", desired_type="LR")

100%|███████████████████████████████████████| 8134/8134 [36:11<00:00,  3.75it/s]


[LR] Сохранено: graph_LR.csv


In [15]:
from glob import glob

emote_files = [file for file in files if ('fear' or 'neut') in os.path.basename(file)]

In [17]:
build_correlation_graph_by_type(emote_files, "graph_emotions_LR.csv", desired_type="LR")

100%|█████████████████████████████████████████| 581/581 [03:56<00:00,  2.46it/s]


[LR] Сохранено: graph_emotions_LR.csv


# Creating pairwise datasets

In [8]:
def create_pairwise_dataset(base_dir, features):
    pattern = os.path.join(base_dir, '**', '*.pickle')
    pickle_files = glob.glob(pattern, recursive=True)
    all_files = []
    for file_path in pickle_files:
        data = pd.read_pickle(file_path)
        class_label = file_path.split('_')[4].replace('.pickle', '')
        data['class'] = class_label
        data['sample_id'] = (file_path.split('/')[3]).split('_')[0]
        all_files.append(data)


    full_data = pd.concat(all_files)
    full_data.reset_index(inplace=True)
    full_data.rename(columns={'index': 'vertex'}, inplace=True)
    full_data['vertex'] += 1
    data = full_data.pivot_table(index=['sample_id', 'class'],
                              columns='vertex',
                              values=['mean', 'std'])
    data.reset_index(inplace=True)
    data.columns = [f"{feat}_{vertex}" if vertex else feat for feat, vertex in data.columns]
    return data


In [10]:
def save_pairwise_dataset(pair, output_dir, feature_types, data):
    v1, v2 = pair
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, f'dataset_v{v1}_v{v2}.pkl')
    if os.path.exists(filename):
        print(f"File already exists: {filename} - Skipping processing")
        return False

    cols_to_extract = []
    for feat in feature_types:
        cols_to_extract.extend([f'{feat}_{v1}', f'{feat}_{v2}'])
    cols_to_extract.extend(['class', 'sample_id'])
    
    try:
        subset = data[cols_to_extract].copy()
        
        rename_dict = {}
        for feat in feature_types:
            rename_dict.update({
                f'{feat}_{v1}': f'{feat}_v1',
                f'{feat}_{v2}': f'{feat}_v2'
            })
        
        subset.rename(columns=rename_dict, inplace=True)
        subset.to_pickle(filename)
        return True
        
    except Exception as e:
        print(f"Error processing pair ({v1}, {v2}): {str(e)}")
        return False
    
def process_pairs_in_parallel(pairs, output_dir, feature_types, data, num_processes=None):
    if num_processes is None:
        num_processes = max(1, cpu_count() - 1)

    args = [
        (pair, output_dir, feature_types, data)
        for pair in pairs
    ]

    with Pool(processes=num_processes) as pool:
        results = list(
            tqdm(pool.starmap(save_pairwise_dataset, args), total=len(args))
        )

    success_count = sum(results)
    print(f"Успешно сохранено датасетов: {success_count}/{len(pairs)}")


In [6]:
base_dir = 'vertices_mean_extended/SOCIAL/LR'  
output_dir = 'pairwise_social'
features = ['mean', 'std']
data = create_pairwise_dataset(base_dir, features)
number_of_vertices = 380
vertex_columns = np.arange(1, number_of_vertices)
vertex_pairs = list(combinations(vertex_columns, 2))
total_pairs = len(vertex_pairs)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 3232646.76it/s]

Успешно сохранено датасетов: 71631/71631





In [7]:
base_dir = 'vertices_mean_extended/RELATIONAL/LR'  
output_dir = 'pairwise_relational'
data = create_pairwise_dataset(base_dir, features)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 2259065.75it/s]

Успешно сохранено датасетов: 71631/71631





In [10]:
base_dir = 'vertices_mean_extended/MOTOR/LR'  
output_dir = 'pairwise_motor_LR'
data = create_pairwise_dataset(base_dir, features)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 3066080.78it/s]

Успешно сохранено датасетов: 71631/71631





In [11]:
base_dir = 'vertices_mean_extended/LANGUAGE/LR'  
output_dir = 'pairwise_language_LR'
data = create_pairwise_dataset(base_dir, features)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 2339710.22it/s]

Успешно сохранено датасетов: 71631/71631





In [13]:
base_dir = 'vertices_mean_extended/GAMBLING/LR'  
output_dir = 'pairwise_gambling_LR'
data = create_pairwise_dataset(base_dir, features)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 4787160.45it/s]

Успешно сохранено датасетов: 71631/71631





In [7]:
base_dir = 'vertices_mean_extended/EMOTIONAL/RL'  
output_dir = 'pairwise_emotional_RL'
features = ['mean', 'std']
data = create_pairwise_dataset(base_dir, features)
number_of_vertices = 380
vertex_columns = np.arange(1, number_of_vertices)
vertex_pairs = list(combinations(vertex_columns, 2))
total_pairs = len(vertex_pairs)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 3334948.66it/s]

Успешно сохранено датасетов: 71631/71631





In [None]:
base_dir = 'vertices_mean_extended/GAMBLING/RL'  
output_dir = 'pairwise_gambling_RL'
features = ['mean', 'std']
data = create_pairwise_dataset(base_dir, features)
number_of_vertices = 380
vertex_columns = np.arange(1, number_of_vertices)
vertex_pairs = list(combinations(vertex_columns, 2))
total_pairs = len(vertex_pairs)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

In [27]:
base_dir = 'vertices_mean_extended/LANGUAGE/RL'  
output_dir = 'pairwise_language_RL'
features = ['mean', 'std']
data = create_pairwise_dataset(base_dir, features)
number_of_vertices = 380
vertex_columns = np.arange(1, number_of_vertices)
vertex_pairs = list(combinations(vertex_columns, 2))
total_pairs = len(vertex_pairs)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 1847499.34it/s]

Успешно сохранено датасетов: 71631/71631





In [11]:
base_dir = 'vertices_mean_extended/MOTOR/RL'  
output_dir = 'pairwise_motor_RL'
features = ['mean', 'std']
data = create_pairwise_dataset(base_dir, features)
number_of_vertices = 380
vertex_columns = np.arange(1, number_of_vertices)
vertex_pairs = list(combinations(vertex_columns, 2))
total_pairs = len(vertex_pairs)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 4592653.24it/s]

Успешно сохранено датасетов: 71631/71631





In [12]:
base_dir = 'vertices_mean_extended/RELATIONAL/RL'  
output_dir = 'pairwise_relational_RL'
features = ['mean', 'std']
data = create_pairwise_dataset(base_dir, features)
number_of_vertices = 380
vertex_columns = np.arange(1, number_of_vertices)
vertex_pairs = list(combinations(vertex_columns, 2))
total_pairs = len(vertex_pairs)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 3544747.81it/s]

Успешно сохранено датасетов: 71631/71631





In [13]:
base_dir = 'vertices_mean_extended/SOCIAL/RL'  
output_dir = 'pairwise_social_RL'
features = ['mean', 'std']
data = create_pairwise_dataset(base_dir, features)
number_of_vertices = 380
vertex_columns = np.arange(1, number_of_vertices)
vertex_pairs = list(combinations(vertex_columns, 2))
total_pairs = len(vertex_pairs)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 3374655.34it/s]

Успешно сохранено датасетов: 71631/71631





In [14]:
base_dir = 'vertices_mean_extended/WM/RL'  
output_dir = 'pairwise_wm_RL'
features = ['mean', 'std']
data = create_pairwise_dataset(base_dir, features)
number_of_vertices = 380
vertex_columns = np.arange(1, number_of_vertices)
vertex_pairs = list(combinations(vertex_columns, 2))
total_pairs = len(vertex_pairs)
process_pairs_in_parallel(vertex_pairs, output_dir, features, data)

100%|████████████████████████████████| 71631/71631 [00:00<00:00, 3159788.71it/s]

Успешно сохранено датасетов: 71631/71631





# Build correlation graphs per file

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm

def extract_id_type_and_class(filename):
    base = os.path.basename(filename)
    name, _ = os.path.splitext(base)
    sample_id, data_type, cls = name.split('_')
    return sample_id, data_type, cls

def save_correlations_per_file(file_list, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    for file_path in tqdm(file_list, desc=f"Processing files"):
        base = os.path.basename(file_path)
        fname = os.path.splitext(base)[0]  
        out_path = os.path.join(output_dir, f"{fname}.pkl")
        if os.path.exists(out_path):
            continue
        sample_id, data_type, cls = extract_id_type_and_class(file_path)

        if file_path.lower().endswith('.csv'):
            df0 = pd.read_csv(file_path)
            ts = df0.iloc[:, 1:].to_numpy()
        else:
            data = pd.read_pickle(file_path)
            lengths = [arr.shape[1] for arr in data]
            L = max(max(lengths), 37)
            padded = []
            for arr in data:
                pad_len = L - arr.shape[1]
                arr_p = np.pad(arr,
                               ((0,0), (0,pad_len)),
                               mode='constant', constant_values=0)
                padded.append(arr_p)
            mat = np.stack(padded, axis=0)        
            ts = mat.transpose(1, 0, 2).reshape(mat.shape[1], -1)


        corr_mat = np.corrcoef(ts)
        n = corr_mat.shape[0]
        edges = []
        for i in range(n):
            for j in range(i+1, n):
                edges.append({
                    'vertex_1': f'v{i}',
                    'vertex_2': f'v{j}',
                    'weight': corr_mat[i, j]
                })

        corr_df = pd.DataFrame(edges, columns=['vertex_1', 'vertex_2', 'weight'])

        corr_df.to_pickle(out_path)

    print(f"Done: saved per-file PKLs into {output_dir}")

  

In [None]:
files = glob("split_mean/**/*.*", recursive=True)
save_correlations_per_file(files, output_dir="correlation_graphs")