# Исследование связности на кластерах

Первый вариант:

    (F3, F7, T3, C3);     (F4, F8, T4, C4);   (T5, P3, O1);  (T6, P4, O2);  (Fz, Cz, Pz)

Второй вариант:

    (F3, Fz, F4), (F7, T3), (F8, T4), (C3, Cz, C4), (P3, Pz, P4), (T5, O1), (T6, O2)

Третий вариант:

    (F3, F7, T3), (F4, F8, T4), (C3, Cz, C4, Pz), (T5, O1, P3), (T6, O2, P4)

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../eeg-research/')

from os.path import join

# import pipeline.features
# import pipeline.models
from pipeline.explore import feat_performance

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'


In [31]:
# Recording index: one row per file, filtered step by step.
#   1. drop rows labelled 'organic';
#   2. keep rows with 'seconds' of at least 30;
#   3. drop three explicitly listed files;
#   4. replace missing ages with 0.
path_df = (
    pd.read_csv('../../preproc_data/autists/path_file.csv')
    .loc[lambda d: d['target'] != 'organic']
    .loc[lambda d: d['seconds'] >= 30]
    .loc[lambda d: ~d['fn'].isin(['fedya_10_og.csv', 'hartonov-3_asd.csv',
                                  'voroncova_kristina_5_fon.csv'])]
    .assign(age=lambda d: d['age'].fillna(0))
)

In [32]:
# Age bands as (label, lowest age, highest age); both bounds are inclusive.
# The bands overlap on purpose ('7+', '7-8', '6-8', ...).
intervals = [
    ('All', 0, 100),
    ('2-4', 2, 4),
    ('5-6', 5, 6),
    ('7+', 7, 100),
    ('7-8', 7, 8),
    ('6-8', 6, 8),
    ('9+', 9, 100),
]

rows = []        # one summary record per age band
to_select = {}   # band label -> Series of file names ('fn') in that band
for label, age_min, age_max in intervals:
    band_df = path_df[path_df['age'].between(age_min, age_max)]
    is_asd = band_df['target'] == 'asd'
    is_typical = band_df['target'] == 'typical'

    to_select[label] = band_df['fn']
    rows.append({
        'Age': label,
        'N': len(band_df),
        'N autism': int(is_asd.sum()),
        'N hc': int(is_typical.sum()),
    })

age_summary_df = pd.DataFrame(rows)
age_summary_df

Unnamed: 0,Age,N,N autism,N hc
0,All,203,119,84
1,2-4,51,35,16
2,5-6,57,26,31
3,7+,90,53,37
4,7-8,45,31,14
5,6-8,69,39,30
6,9+,45,22,23


In [33]:
def get_binary_target(df):
    """Return a copy of ``df`` with a 0/1 integer ``target`` column.

    Rows whose ``target`` equals ``'organic'`` are removed.  In the remaining
    rows ``'asd'`` is replaced by 1 and ``'typical'`` by 0, and the column is
    cast to ``int``.  The input frame is not modified.
    """
    label_codes = {'asd': 1, 'typical': 0}
    not_organic = df['target'] != 'organic'

    binary_df = df.loc[not_organic].copy()
    binary_df['target'] = binary_df['target'].replace(label_codes).astype(int)
    return binary_df


In [42]:
def feat_performance_by_age(df, fns_to_select):
    """Run ``feat_performance`` on the rows of ``df`` listed in ``fns_to_select``.

    Rows are kept when their ``fn`` value is in ``fns_to_select``.  Every
    column except ``target`` and ``fn`` is passed on as a feature.  The
    result of ``feat_performance`` is returned unchanged.
    """
    non_feature_cols = ['target', 'fn']
    selected = df.loc[df['fn'].isin(fns_to_select)]

    features = []
    for col in selected.columns:
        if col not in non_feature_cols:
            features.append(col)

    return feat_performance(selected, features)

In [58]:
def get_merged_feat_scores(df, df_1, df_2, fns_to_select):
    """Score features on three tables and put their ROC AUC values side by side.

    Each table is restricted to the rows whose ``fn`` is in ``fns_to_select``
    and scored independently with ``feat_performance``.  The three score
    tables are inner-joined on ``feature``, so only features present in all
    of them are kept.

    Args:
        df: Table that supplies the base columns; its ``roc_auc`` column is
            reported as ``roc_auc_full``.
        df_1: Table whose ``roc_auc`` is reported as ``roc_auc_part1``.
        df_2: Table whose ``roc_auc`` is reported as ``roc_auc_part2``.
        fns_to_select: Collection of ``fn`` values to keep in every table.

    Returns:
        DataFrame sorted by ``roc_auc_full`` in descending order.  It holds
        the columns produced by ``feat_performance`` for ``df`` plus
        ``roc_auc_part1``, ``roc_auc_part2`` and ``diff``, the spread
        (max - min) of the three ROC AUC values of each feature.
    """
    def _score(table):
        # Restrict to the requested files before scoring.
        return feat_performance(table[table['fn'].isin(fns_to_select)])

    def _auc_only(scores, auc_name):
        # Keep only the join key and the ROC AUC under its final name.
        return scores[['feature', 'roc_auc']].rename(columns={'roc_auc': auc_name})

    feat_scores_full = _score(df)
    feat_scores_part1 = _score(df_1)
    feat_scores_part2 = _score(df_2)

    feat_scores_merged = (
        feat_scores_full
        .rename(columns={'roc_auc': 'roc_auc_full'})
        .merge(_auc_only(feat_scores_part1, 'roc_auc_part1'), on='feature')
        .merge(_auc_only(feat_scores_part2, 'roc_auc_part2'), on='feature')
        .sort_values('roc_auc_full', ascending=False)
    )

    # Vectorised spread across the three AUC columns.  Unlike a row-wise
    # ``apply``, this also works when the merge result is empty.  NaN values
    # are skipped by pandas' max/min.
    auc_values = feat_scores_merged[['roc_auc_full', 'roc_auc_part1', 'roc_auc_part2']]
    feat_scores_merged['diff'] = auc_values.max(axis=1) - auc_values.min(axis=1)

    return feat_scores_merged

## Кластеризация - версия 1

In [63]:
# Feature tables for cluster layout v1: the 'full' table and the two
# 'part_*' tables, each converted to a binary target right after loading.
path = '../own_data/200708_aut_clust_v1/'

df, df_1, df_2 = (
    get_binary_target(pd.read_csv(join(path, rel_name)))
    for rel_name in ('full/set_2.csv', 'part_1/set_2.csv', 'part_2/set_2.csv')
)

In [64]:
# Cluster layout v1, age band '2-4': first 20 rows, ordered by ROC AUC on the 'full' table.
get_merged_feat_scores(df, df_1, df_2, to_select['2-4']).head(20)

Unnamed: 0,feature,roc_auc_full,mean_difference,roc_auc_part1,roc_auc_part2,diff
0,coh_theta_clust_3_clust_4,0.755357,-0.094693,0.683929,0.728571,0.071429
1,coh_beta_clust_0_clust_4,0.701786,0.10445,0.6875,0.682143,0.019643
2,coh_alpha_clust_3_clust_4,0.694643,-0.079988,0.698214,0.689286,0.008929
3,coh_alpha_clust_2_clust_3,0.692857,-0.061507,0.691071,0.658929,0.033929
4,env_theta_clust_2_clust_3,0.683929,-0.09091,0.6875,0.65,0.0375
5,env_alpha_clust_3_clust_4,0.683929,-0.123784,0.655357,0.717857,0.0625
6,env_alpha_clust_1_clust_2,0.682143,-0.117284,0.641071,0.725,0.083929
7,env_beta_clust_3_clust_4,0.682143,-0.08639,0.655357,0.714286,0.058929
8,coh_beta_clust_3_clust_4,0.676786,-0.060882,0.703571,0.685714,0.026786
9,coh_alpha_clust_1_clust_2,0.676786,-0.054273,0.669643,0.691071,0.021429


In [65]:
# Cluster layout v1, age band '5-6': first 20 rows, ordered by ROC AUC on the 'full' table.
get_merged_feat_scores(df, df_1, df_2, to_select['5-6']).head(20)

Unnamed: 0,feature,roc_auc_full,mean_difference,roc_auc_part1,roc_auc_part2,diff
0,env_alpha_clust_0_clust_1,0.748139,-0.191638,0.760546,0.693548,0.066998
1,coh_theta_clust_0_clust_1,0.724566,-0.101126,0.734491,0.779156,0.054591
2,env_theta_clust_2_clust_3,0.700993,-0.105571,0.666253,0.67866,0.034739
3,coh_alpha_clust_0_clust_1,0.687345,-0.082841,0.703474,0.686104,0.01737
4,env_theta_clust_0_clust_1,0.686104,-0.131087,0.682382,0.67866,0.007444
5,coh_theta_clust_2_clust_3,0.684864,-0.044481,0.650124,0.755583,0.105459
6,env_alpha_clust_2_clust_3,0.677419,-0.101436,0.733251,0.614144,0.119107
7,env_theta_clust_1_clust_2,0.674938,-0.121589,0.698511,0.651365,0.047146
8,env_alpha_clust_1_clust_2,0.667494,-0.123707,0.686104,0.6067,0.079404
9,coh_alpha_clust_1_clust_2,0.66005,-0.055219,0.689826,0.662531,0.029777


In [66]:
# Cluster layout v1, age band '7-8': first 20 rows, ordered by ROC AUC on the 'full' table.
get_merged_feat_scores(df, df_1, df_2, to_select['7-8']).head(20)

Unnamed: 0,feature,roc_auc_full,mean_difference,roc_auc_part1,roc_auc_part2,diff
0,coh_beta_clust_2_clust_4,0.78341,-0.103023,0.725806,0.776498,0.057604
1,env_alpha_clust_2_clust_3,0.748848,-0.137377,0.675115,0.746544,0.073733
2,env_beta_clust_2_clust_4,0.730415,-0.151894,0.739631,0.698157,0.041475
3,coh_theta_clust_0_clust_1,0.718894,-0.087069,0.682028,0.71659,0.036866
4,env_beta_clust_0_clust_4,0.714286,-0.186815,0.682028,0.760369,0.078341
5,coh_beta_clust_0_clust_4,0.700461,-0.123719,0.691244,0.709677,0.018433
6,env_beta_clust_0_clust_2,0.695853,-0.142052,0.675115,0.677419,0.020737
7,env_beta_clust_1_clust_2,0.68894,-0.056433,0.633641,0.700461,0.06682
8,env_beta_clust_2_clust_3,0.686636,-0.104402,0.670507,0.684332,0.016129
9,env_theta_clust_3_clust_4,0.679724,0.117353,0.705069,0.640553,0.064516


## Кластеризация - версия 2

In [67]:
# Feature tables for cluster layout v2: the 'full' table and the two
# 'part_*' tables, each converted to a binary target right after loading.
path = '../own_data/200708_aut_clust_v2/'

df, df_1, df_2 = (
    get_binary_target(pd.read_csv(join(path, rel_name)))
    for rel_name in ('full/set_2.csv', 'part_1/set_2.csv', 'part_2/set_2.csv')
)

In [68]:
# Cluster layout v2, age band '2-4': first 20 rows, ordered by ROC AUC on the 'full' table.
get_merged_feat_scores(df, df_1, df_2, to_select['2-4']).head(20)

Unnamed: 0,feature,roc_auc_full,mean_difference,roc_auc_part1,roc_auc_part2,diff
0,coh_beta_clust_0_clust_1,0.778571,0.150415,0.773214,0.755357,0.023214
1,env_beta_clust_3_clust_5,0.753571,-0.160306,0.771429,0.771429,0.017857
2,env_beta_clust_0_clust_1,0.742857,0.211281,0.782143,0.710714,0.071429
3,env_beta_clust_0_clust_3,0.735714,0.202139,0.769643,0.694643,0.075
4,coh_beta_clust_1_clust_2,0.728571,0.111258,0.721429,0.703571,0.025
5,env_beta_clust_4_clust_5,0.717857,-0.124624,0.741071,0.721429,0.023214
6,env_beta_clust_1_clust_2,0.716071,0.185849,0.755357,0.667857,0.0875
7,coh_alpha_clust_0_clust_1,0.716071,0.073264,0.675,0.685714,0.041071
8,env_alpha_clust_3_clust_5,0.714286,-0.158053,0.735714,0.725,0.021429
9,coh_alpha_clust_4_clust_5,0.710714,-0.080435,0.683929,0.682143,0.028571


In [69]:
# Cluster layout v2, age band '5-6': first 20 rows, ordered by ROC AUC on the 'full' table.
get_merged_feat_scores(df, df_1, df_2, to_select['5-6']).head(20)

Unnamed: 0,feature,roc_auc_full,mean_difference,roc_auc_part1,roc_auc_part2,diff
0,env_alpha_clust_0_clust_2,0.705955,-0.146598,0.62531,0.666253,0.080645
1,env_theta_clust_1_clust_5,0.69727,-0.126607,0.671216,0.645161,0.052109
2,env_theta_clust_4_clust_5,0.691067,-0.116574,0.69603,0.699752,0.008685
3,env_alpha_clust_4_clust_5,0.686104,-0.098524,0.725806,0.638958,0.086849
4,coh_theta_clust_1_clust_5,0.684864,-0.065741,0.717122,0.688586,0.032258
5,coh_alpha_clust_4_clust_5,0.674938,-0.047525,0.739454,0.694789,0.064516
6,coh_beta_clust_0_clust_2,0.669975,-0.098607,0.617866,0.647643,0.052109
7,coh_alpha_clust_0_clust_1,0.667494,-0.082271,0.663772,0.627792,0.039702
8,env_beta_clust_0_clust_2,0.666253,-0.123583,0.600496,0.633995,0.065757
9,coh_alpha_clust_0_clust_2,0.666253,-0.081002,0.645161,0.66129,0.021092


In [70]:
# Cluster layout v2, age band '7-8': first 20 rows, ordered by ROC AUC on the 'full' table.
get_merged_feat_scores(df, df_1, df_2, to_select['7-8']).head(20)

Unnamed: 0,feature,roc_auc_full,mean_difference,roc_auc_part1,roc_auc_part2,diff
0,env_beta_clust_0_clust_1,0.751152,-0.181785,0.75576,0.769585,0.018433
2,env_beta_clust_4_clust_5,0.737327,-0.126349,0.730415,0.732719,0.006912
1,env_beta_clust_1_clust_5,0.737327,-0.118424,0.728111,0.714286,0.023041
3,coh_beta_clust_1_clust_2,0.735023,-0.101678,0.739631,0.739631,0.004608
4,env_alpha_clust_4_clust_5,0.732719,-0.140178,0.658986,0.746544,0.087558
5,env_beta_clust_0_clust_2,0.730415,-0.122001,0.679724,0.776498,0.096774
6,env_beta_clust_0_clust_3,0.728111,-0.123593,0.663594,0.781106,0.117512
7,env_beta_clust_1_clust_2,0.725806,-0.136186,0.74424,0.668203,0.076037
8,env_beta_clust_1_clust_3,0.723502,-0.139982,0.677419,0.700461,0.046083
9,env_beta_clust_3_clust_5,0.711982,-0.12757,0.675115,0.732719,0.057604


## Кластеризация - версия 3

In [71]:
# Feature tables for cluster layout v3: the 'full' table and the two
# 'part_*' tables, each converted to a binary target right after loading.
path = '../own_data/200708_aut_clust_v3/'

df, df_1, df_2 = (
    get_binary_target(pd.read_csv(join(path, rel_name)))
    for rel_name in ('full/set_2.csv', 'part_1/set_2.csv', 'part_2/set_2.csv')
)

In [72]:
# Cluster layout v3, age band '2-4': first 20 rows, ordered by ROC AUC on the 'full' table.
get_merged_feat_scores(df, df_1, df_2, to_select['2-4']).head(20)

Unnamed: 0,feature,roc_auc_full,mean_difference,roc_auc_part1,roc_auc_part2,diff
0,env_theta_clust_2_clust_4,0.710714,-0.154863,0.7,0.683929,0.026786
1,coh_theta_clust_2_clust_4,0.710714,-0.088952,0.660714,0.739286,0.078571
2,coh_beta_clust_0_clust_2,0.701786,0.102038,0.696429,0.6875,0.014286
3,coh_beta_clust_2_clust_4,0.696429,-0.089767,0.694643,0.708929,0.014286
4,coh_alpha_clust_3_clust_4,0.692857,-0.061507,0.691071,0.658929,0.033929
5,env_beta_clust_2_clust_4,0.691071,-0.123526,0.683929,0.739286,0.055357
6,coh_alpha_clust_1_clust_3,0.689286,-0.059244,0.6875,0.683929,0.005357
7,env_alpha_clust_1_clust_3,0.689286,-0.121242,0.628571,0.741071,0.1125
8,env_theta_clust_3_clust_4,0.683929,-0.09091,0.6875,0.65,0.0375
9,coh_alpha_clust_2_clust_4,0.682143,-0.08782,0.664286,0.673214,0.017857


In [73]:
# Cluster layout v3, age band '5-6': first 20 rows, ordered by ROC AUC on the 'full' table.
get_merged_feat_scores(df, df_1, df_2, to_select['5-6']).head(20)

Unnamed: 0,feature,roc_auc_full,mean_difference,roc_auc_part1,roc_auc_part2,diff
0,env_alpha_clust_0_clust_1,0.760546,-0.193151,0.789082,0.699752,0.08933
1,coh_theta_clust_0_clust_1,0.744417,-0.115575,0.776675,0.791563,0.047146
2,env_theta_clust_0_clust_1,0.744417,-0.156618,0.74938,0.708437,0.040943
3,env_theta_clust_1_clust_3,0.702233,-0.133579,0.736973,0.648883,0.088089
4,env_theta_clust_3_clust_4,0.700993,-0.105571,0.666253,0.67866,0.034739
5,env_alpha_clust_1_clust_2,0.698511,-0.135533,0.693548,0.624069,0.074442
6,env_theta_clust_1_clust_4,0.688586,-0.128376,0.621588,0.62531,0.066998
7,coh_theta_clust_3_clust_4,0.684864,-0.044481,0.650124,0.755583,0.105459
8,env_alpha_clust_1_clust_3,0.677419,-0.128445,0.708437,0.617866,0.090571
9,env_alpha_clust_3_clust_4,0.677419,-0.101436,0.733251,0.614144,0.119107


In [74]:
# Cluster layout v3, age band '7-8': first 20 rows, ordered by ROC AUC on the 'full' table.
get_merged_feat_scores(df, df_1, df_2, to_select['7-8']).head(20)

Unnamed: 0,feature,roc_auc_full,mean_difference,roc_auc_part1,roc_auc_part2,diff
0,env_alpha_clust_3_clust_4,0.748848,-0.137377,0.675115,0.746544,0.073733
1,coh_beta_clust_2_clust_3,0.728111,-0.088868,0.677419,0.74424,0.06682
2,coh_theta_clust_0_clust_1,0.705069,-0.093731,0.700461,0.725806,0.025346
3,env_beta_clust_2_clust_3,0.702765,-0.134491,0.642857,0.721198,0.078341
4,coh_beta_clust_0_clust_2,0.700461,-0.101062,0.702765,0.71659,0.016129
5,env_beta_clust_1_clust_3,0.695853,-0.072671,0.640553,0.705069,0.064516
6,env_beta_clust_0_clust_3,0.693548,-0.135181,0.670507,0.68894,0.023041
7,env_alpha_clust_0_clust_1,0.686636,-0.13593,0.654378,0.702765,0.048387
8,env_beta_clust_3_clust_4,0.686636,-0.104402,0.670507,0.684332,0.016129
9,env_theta_clust_0_clust_1,0.686636,-0.124902,0.668203,0.71659,0.048387
