# Построение моделей по данным аутизма по возрастам

Провели исследование предиктивной силы признаков. Использовали только записи длительностью 30 секунд и более и брали из каждой только первые 30 секунд, чтобы все записи были одинаковой длины. Так же, как и в предыдущем исследовании, разбили их на две равные части по 15 секунд.

Построили модели на всех возрастах и на разных возрастных группах

Модель для группы 9+ построить не получается. По остальным группам получены неплохие результаты. По некоторым группам ROC-AUC приблизился к 0.9 на вторых половинах записей.

In [25]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../eeg-research/')

from os.path import join

import pipeline.features
import pipeline.models


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.rcParams["font.family"] = "Times New Roman"


from sklearn.metrics import roc_auc_score

In [40]:
def get_feat_scores(df):
    """Rank every feature column by its single-feature ROC-AUC against ``target``.

    Each column except ``target`` and ``fn`` is used directly as a score
    (missing values are replaced with 0) and evaluated with
    ``roc_auc_score``.

    Returns a DataFrame sorted by ``roc_auc`` in descending order with columns:

    * ``feature`` - the column name;
    * ``roc_auc`` - ``max(auc, 1 - auc)``, i.e. the AUC regardless of the
      direction of the relationship;
    * ``sign`` - 1 when the raw AUC is greater than ``1 - auc``, otherwise 0.
    """
    skipped = ('target', 'fn')
    raw_auc = {
        name: roc_auc_score(df['target'], df[name].fillna(0))
        for name in df.columns
        if name not in skipped
    }

    scores = pd.Series(raw_auc).to_frame().reset_index()
    scores.columns = ['feature', 'roc_auc']
    # The direction flag has to be derived before roc_auc is folded around 0.5.
    scores['sign'] = scores['roc_auc'].apply(lambda auc: int(auc > 1 - auc))
    scores['roc_auc'] = scores['roc_auc'].apply(lambda auc: max(auc, 1 - auc))
    return scores.sort_values('roc_auc', ascending=False)

def get_binary_target(df):
    """Return a copy of ``df`` without 'organic' rows and with a 0/1 ``target``.

    'asd' is encoded as 1 and 'typical' as 0; the input frame is not modified.
    """
    kept = df.loc[df['target'] != 'organic'].copy()
    kept['target'] = kept['target'].replace({'asd': 1, 'typical': 0}).astype(int)
    return kept


def get_binary_target_2(df):
    """Return a copy of ``df`` with ``target`` encoded as ASD-vs-rest.

    'asd' becomes 1; both 'organic' and 'typical' become 0. Unlike
    ``get_binary_target`` no rows are dropped.

    The input frame is left untouched: the encoding is applied to a copy, so
    the caller's string labels are not silently overwritten.
    """
    encoded = df.copy()
    encoded['target'] = (
        encoded['target']
        .replace({'asd': 1, 'organic': 0, 'typical': 0})
        .astype(int)
    )
    return encoded

## Будем использовать записи по 30 секунд

Это позволяет не сильно уменьшить выборку, но получить более стабильные признаки.

In [75]:
base_path = '../own_data/200426_autists_features_v2_thr30/'

In [359]:
# Per-recording metadata; the code below relies on its `fn`, `target`,
# `seconds` and `age` columns.
path_df = pd.read_csv('../../preproc_data/autists/path_file.csv')

# Recordings dropped by file name.
# NOTE(review): the reason for excluding them is not recorded in this notebook.
excluded_fns = ['fedya_10_og.csv', 'hartonov-3_asd.csv',
                'voroncova_kristina_5_fon.csv']

keep_mask = (
    (path_df['target'] != 'organic')
    & (path_df['seconds'] >= 30)
    & ~path_df['fn'].isin(excluded_fns)
)
# Take an explicit copy so that the `age` assignment below writes to an
# independent frame rather than to a slice of the frame read from CSV
# (this is what triggers pandas' SettingWithCopyWarning).
path_df = path_df[keep_mask].copy()
# Missing ages become 0.
path_df['age'] = path_df['age'].fillna(0)

In [360]:
from pipeline.features import get_merged_df

In [361]:
# Feature sets to load; get_df passes this list to get_merged_df.
# Judging by the names these are envelope ("env") and coherence ("coh")
# features in the alpha/beta/theta bands plus per-band ("bands") features -
# TODO confirm against pipeline.features.
feature_methods = [
    'env-alpha',
    'env-beta',
    'env-theta',
    'coh-alpha',
    'coh-beta',
    'coh-theta',
    'bands'
]

def get_df(df_type):
    """Load the merged feature table stored under ``base_path/df_type``.

    The table is built by ``get_merged_df`` for the feature sets listed in
    ``feature_methods`` and then passed through ``get_binary_target``.
    """
    variant_dir = join(base_path, df_type)
    return get_binary_target(get_merged_df(variant_dir, feature_methods))

# Feature tables for the three stored variants: the full recordings and the
# two parts (per the notes at the top of the notebook, the 15-second halves).
df = get_df('full')
df_1 = get_df('part_1')
df_2 = get_df('part_2')

In [362]:
# Age groups as (label, min age, max age); both bounds are inclusive.
intervals = [
    ('All', 0, 100),
    ('2-4', 2, 4),
    ('5-6', 5, 6),
    ('7+', 7, 100),
    ('7-8', 7, 8),
    ('6-8', 6, 8),
    ('9+', 9, 100),
]

rows = []        # one summary row per age group
to_select = {}   # age-group label -> file names belonging to that group
for name, val_from, val_to in intervals:
    in_group = path_df['age'].between(val_from, val_to)
    cur_df = path_df[in_group]
    to_select[name] = cur_df['fn']

    n_asd = int((cur_df['target'] == 'asd').sum())
    n_typical = int((cur_df['target'] == 'typical').sum())
    rows.append({
        'Age': name,
        'N': cur_df.shape[0],
        'N autism': n_asd,
        'N hc': n_typical,
    })

age_summary_df = pd.DataFrame(rows)
age_summary_df

Unnamed: 0,Age,N,N autism,N hc
0,All,203,119,84
1,2-4,51,35,16
2,5-6,57,26,31
3,7+,90,53,37
4,7-8,45,31,14
5,6-8,69,39,30
6,9+,45,22,23


## Оцениваем потенциал признаков

**Выводы**

Разбиение 2-4, 5-6, 7+ в целом нормальное, но лучше разбить дополнительно на 7-8 и 9+

In [363]:
def get_merged_feat_scores(df, df_1, df_2, fns_to_select):
    """Compare single-feature ROC-AUC on full recordings and on both parts.

    Parameters
    ----------
    df, df_1, df_2 : pandas.DataFrame
        Feature tables for the full recordings, the first part and the second
        part. Each must contain ``fn`` and ``target`` columns.
    fns_to_select : list-like
        File names (values of ``fn``) to keep before scoring.

    Returns
    -------
    pandas.DataFrame
        One row per feature, sorted by ``roc_auc_full`` in descending order,
        with columns ``feature``, ``roc_auc_full``, ``sign`` (taken from the
        full-recording scores), ``roc_auc_part1``, ``roc_auc_part2`` and
        ``diff`` - the spread (max minus min) of the three ROC-AUC values.
    """
    def scores_for(part_df, auc_col):
        # Restrict to the requested files, score, and give the ROC-AUC column
        # its final name so the merges below need no suffix handling.
        selected = part_df[part_df['fn'].isin(fns_to_select)]
        return get_feat_scores(selected).rename(columns={'roc_auc': auc_col})

    scores_full = scores_for(df, 'roc_auc_full')
    scores_part1 = scores_for(df_1, 'roc_auc_part1')
    scores_part2 = scores_for(df_2, 'roc_auc_part2')

    merged = (
        scores_full
        .merge(scores_part1[['feature', 'roc_auc_part1']], on='feature')
        .merge(scores_part2[['feature', 'roc_auc_part2']], on='feature')
        .sort_values('roc_auc_full', ascending=False)
    )

    # Column-wise max/min instead of a Python-level row-by-row apply.
    auc_columns = ['roc_auc_full', 'roc_auc_part1', 'roc_auc_part2']
    merged['diff'] = merged[auc_columns].max(axis=1) - merged[auc_columns].min(axis=1)

    return merged

In [364]:
get_merged_feat_scores(df, df_1, df_2, to_select['2-4']).head(5)

Unnamed: 0,feature,roc_auc_full,sign,roc_auc_part1,roc_auc_part2,diff
0,bands_gamma_t6,0.855357,1,0.851786,0.835714,0.019643
1,bands_gamma_pz,0.833929,1,0.816071,0.807143,0.026786
2,bands_gamma_c3,0.816071,1,0.803571,0.805357,0.0125
3,coh_alpha_t6_o2,0.814286,0,0.789286,0.791071,0.025
4,bands_gamma_o2,0.8125,1,0.776786,0.807143,0.035714


In [365]:
get_merged_feat_scores(df, df_1, df_2, to_select['5-6']).head(5)

Unnamed: 0,feature,roc_auc_full,sign,roc_auc_part1,roc_auc_part2,diff
0,coh_theta_f7_f8,0.843672,0,0.82134,0.80397,0.039702
1,coh_theta_fp2_f7,0.813896,0,0.800248,0.812655,0.013648
2,env_theta_t5_t6,0.812655,0,0.774194,0.764268,0.048387
3,coh_theta_t5_t6,0.781638,0,0.744417,0.78536,0.040943
4,bands_gamma_pz,0.774194,0,0.811414,0.801489,0.037221


In [366]:
get_merged_feat_scores(df, df_1, df_2, to_select['7+']).head(5)

Unnamed: 0,feature,roc_auc_full,sign,roc_auc_part1,roc_auc_part2,diff
0,coh_alpha_f8_pz,0.712902,1,0.684855,0.643549,0.069352
1,env_beta_t5_o1,0.709842,0,0.716471,0.690974,0.025497
2,coh_beta_t5_o1,0.693524,0,0.678225,0.690974,0.015298
3,env_beta_t3_o1,0.693014,0,0.667517,0.645079,0.047935
4,env_beta_t3_t6,0.688934,0,0.670066,0.610403,0.078531


In [367]:
get_merged_feat_scores(df, df_1, df_2, to_select['6-8']).head(5)

Unnamed: 0,feature,roc_auc_full,sign,roc_auc_part1,roc_auc_part2,diff
0,env_beta_fp1_t3,0.786325,0,0.779487,0.789744,0.010256
1,env_beta_fp1_c3,0.786325,0,0.783761,0.763248,0.023077
2,coh_beta_fp1_f3,0.782051,0,0.779487,0.744444,0.037607
3,coh_alpha_fp1_f3,0.781197,0,0.745299,0.764957,0.035897
4,coh_theta_t3_t4,0.779487,0,0.752137,0.809402,0.057265


In [368]:
get_merged_feat_scores(df, df_1, df_2, to_select['7-8']).head(5)

Unnamed: 0,feature,roc_auc_full,sign,roc_auc_part1,roc_auc_part2,diff
0,env_beta_p4_o2,0.866359,0,0.776498,0.896313,0.119816
1,env_beta_t3_t4,0.852535,0,0.843318,0.806452,0.046083
2,coh_beta_pz_o1,0.834101,0,0.797235,0.799539,0.036866
3,env_alpha_p3_p4,0.829493,0,0.792627,0.705069,0.124424
4,env_beta_t5_o2,0.824885,0,0.788018,0.801843,0.036866


In [369]:
get_merged_feat_scores(df, df_1, df_2, to_select['9+']).head(5)

Unnamed: 0,feature,roc_auc_full,sign,roc_auc_part1,roc_auc_part2,diff
0,coh_alpha_f4_pz,0.8083,1,0.747036,0.695652,0.112648
1,coh_alpha_f8_pz,0.800395,1,0.737154,0.72332,0.077075
2,coh_alpha_fp2_pz,0.784585,1,0.774704,0.727273,0.057312
3,coh_theta_f7_f3,0.77668,1,0.778656,0.709486,0.06917
4,coh_alpha_f8_p3,0.766798,1,0.786561,0.717391,0.06917


## Строим модель

Для отбора признаков будем использовать порог 0.01/0.02 — по результатам моих предыдущих экспериментов. На всякий случай я проверил ещё несколько значений на возрастной группе 5-6: данные пороги можно считать оптимальными. Целевое количество признаков — 5-10.

In [370]:
model = pipeline.models.LRScaled()

In [395]:
def order_func(features, df):
    """Return up to 150 feature names with the highest single-feature ROC-AUC on ``df``.

    ``features`` is accepted but not used here - presumably it is part of the
    callback signature expected by ``select_features`` (TODO confirm).
    """
    ranked = get_feat_scores(df).sort_values('roc_auc', ascending=False)
    top_features = ranked['feature'].iloc[:150]
    return top_features.tolist()

features = [col for col in df.columns if col not in ['fn', 'target']]

### Все возраста

In [396]:
# Feature selection on all ages: df_1 (part_1) is the main frame and df_2
# (part_2) is passed as df_val; both are restricted to the files of the 'All'
# age interval.
# NOTE(review): the exact meaning of n_repeats / threshold is defined in
# pipeline.models.select_features, which is not visible here.
features_selected_all, best_score, hist = pipeline.models.select_features(
    df_1[df_1['fn'].isin(to_select['All'])], features, model, 
    df_val=df_2[df_2['fn'].isin(to_select['All'])], 
    n_repeats=20, threshold=0.01, order_func=order_func)
hist

Feature selection. Step 1


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))

Feature selection. Step 2


Unnamed: 0,feature,action,score,score_std,score_val,score_val_std
0,coh_theta_t3_t4,added,0.68192,0.003819,0.66967,0.005231
1,env_alpha_t6_o1,added,0.734986,0.005331,0.714433,0.005253
2,env_beta_t6_o1,added,0.746231,0.005037,0.703724,0.008356
3,bands_beta_t6,added,0.762147,0.004292,0.716594,0.007839
4,env_alpha_t3_o2,added,0.775473,0.005052,0.71849,0.007168
5,coh_theta_t4_o1,added,0.790774,0.006627,0.743215,0.007144
6,coh_theta_c3_t4,added,0.801248,0.005846,0.739678,0.008895
7,coh_beta_f4_c3,added,0.814068,0.004683,0.763223,0.007374


### 2-4

In [397]:

# Same selection procedure restricted to the '2-4' age group
# (df_1 as the main frame, df_2 as df_val).
features_selected_24, best_score, hist = pipeline.models.select_features(
    df_1[df_1['fn'].isin(to_select['2-4'])], features, model, 
    df_val=df_2[df_2['fn'].isin(to_select['2-4'])], 
    n_repeats=20, threshold=0.01, order_func=order_func)
hist

Feature selection. Step 1


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))

Feature selection. Step 2


Unnamed: 0,feature,action,score,score_std,score_val,score_val_std
0,bands_gamma_t6,added,0.767946,0.03694,0.752054,0.031122
1,env_alpha_p4_t6,added,0.830536,0.010338,0.815179,0.011562
2,bands_gamma_pz,added,0.869107,0.008382,0.802411,0.012583
3,env_beta_f7_c3,added,0.891071,0.008207,0.856429,0.009775
4,coh_alpha_t3_c3,added,0.909464,0.008151,0.880714,0.006259
5,env_theta_f8_t3,added,0.935268,0.005993,0.894732,0.00806
6,env_beta_f7_c3,removed,0.945804,0.006017,0.902232,0.009618


### 5-6

In [398]:
# Candidate features: every column of df except the file name and the target
# (same expression as the `features` list defined before the first selection cell).
features = [col for col in df.columns if col not in ['fn', 'target']]
# Same selection procedure restricted to the '5-6' age group
# (df_1 as the main frame, df_2 as df_val).
features_selected_56, best_score, hist = pipeline.models.select_features(
    df_1[df_1['fn'].isin(to_select['5-6'])], features, model, 
    df_val=df_2[df_2['fn'].isin(to_select['5-6'])], 
    n_repeats=20, threshold=0.01, order_func=order_func)
hist

Feature selection. Step 1


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))

Feature selection. Step 2


Unnamed: 0,feature,action,score,score_std,score_val,score_val_std
0,coh_theta_f7_f8,added,0.790757,0.009103,0.776985,0.007474
1,bands_gamma_pz,added,0.867122,0.008842,0.855397,0.008645
2,bands_gamma_f7,added,0.884677,0.007511,0.81737,0.009156
3,coh_theta_fp2_f7,added,0.898449,0.00858,0.843486,0.009966
4,coh_alpha_fp2_f7,added,0.909864,0.009254,0.830025,0.010163
5,coh_alpha_fp1_t4,added,0.922457,0.009797,0.837283,0.009811
6,coh_alpha_f3_f8,added,0.934926,0.009991,0.830645,0.009906


### 7-8

In [399]:
# Candidate features: every column of df except the file name and the target.
features = [col for col in df.columns if col not in ['fn', 'target']]
# Same selection procedure restricted to the '7-8' age group
# (df_1 as the main frame, df_2 as df_val).
features_selected_78, best_score, hist = pipeline.models.select_features(
    df_1[df_1['fn'].isin(to_select['7-8'])], features, model, 
    df_val=df_2[df_2['fn'].isin(to_select['7-8'])], 
    n_repeats=20, threshold=0.01, order_func=order_func)
hist

Feature selection. Step 1


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))

Feature selection. Step 2


Unnamed: 0,feature,action,score,score_std,score_val,score_val_std
0,env_beta_t3_t4,added,0.814286,0.013909,0.760599,0.013785
1,env_beta_fp1_t3,added,0.831567,0.008921,0.827074,0.01288
2,bands_beta_t3,added,0.888134,0.007045,0.865438,0.012502
3,env_beta_p3_t6,added,0.909562,0.004826,0.890207,0.01049
4,bands_beta_t6,added,0.956106,0.004278,0.875922,0.021716


### 9+

In [None]:
# Candidate features: every column of df except the file name and the target.
features = [col for col in df.columns if col not in ['fn', 'target']]
# Same selection procedure restricted to the '9+' age group
# (df_1 as the main frame, df_2 as df_val).
# NOTE: this cell uses threshold=0.02, whereas the other age groups use 0.01.
features_selected_9x, best_score, hist = pipeline.models.select_features(
    df_1[df_1['fn'].isin(to_select['9+'])], features, model, 
    df_val=df_2[df_2['fn'].isin(to_select['9+'])], 
    n_repeats=20, threshold=0.02, order_func=order_func)

In [402]:
hist

Unnamed: 0,feature,action,score,score_std,score_val,score_val_std
0,coh_alpha_f3_p4,added,0.769466,0.015777,0.555435,0.028299
1,coh_alpha_f8_p3,added,0.808399,0.013613,0.646937,0.015577
2,env_alpha_fz_c3,added,0.830336,0.013232,0.611166,0.016439
3,coh_alpha_fp1_p3,added,0.865711,0.013323,0.590613,0.023767


## Дополнительный раздел - сравниваю варианты обрезки записей

**Потом перенести в ноутбук с анализом данных**

Тут я показываю, что разброс ROC-AUC между частями в среднем одинаковый для различных вариантов данных (полные записи разной длины, первые 13 секунд, первые 30 секунд)

Однако чем короче запись, тем больше дисперсия этой разницы. Иными словами, дисперсия ROC-AUC зависит от длины записи: чем длиннее запись, тем ближе друг к другу признаки, посчитанные на первой и второй половинах.

Также есть проблема: ROC-AUC по полной записи может сильно отличаться от ROC-AUC по половинам. Например, 0.75 для полной записи и 0.7 для первой и второй половин. Непонятно, в чём причина. Это справедливо и для coherence, и для envelope correlation.

In [81]:
# Same list of feature sets as the one defined for get_df earlier in the notebook.
feature_methods = [
    'env-alpha',
    'env-beta',
    'env-theta',
    'coh-alpha',
    'coh-beta',
    'coh-theta',
    'bands'
]

# NOTE: this rebinds the global `to_select`. In the age-interval cell it is a
# dict (age-group label -> file names); here it becomes a single Series with
# the file names of recordings aged 5 to 6 inclusive, which
# get_merged_feat_scores_custom reads as a global. Cells that index
# `to_select` by group label will not work if re-run after this one.
to_select = path_df[(path_df['age'] >= 5) & (path_df['age'] <= 6)]['fn']

def get_merged_feat_scores_custom(base_path, fns_to_select=None):
    """Score features of an arbitrary feature directory on full / part_1 / part_2.

    Loads the ``full``, ``part_1`` and ``part_2`` feature tables from
    ``base_path``, binarizes their targets with ``get_binary_target`` and
    delegates filtering, scoring and merging to ``get_merged_feat_scores``
    (previously that logic was duplicated here line for line).

    Parameters
    ----------
    base_path : str
        Directory containing the ``full``, ``part_1`` and ``part_2``
        sub-directories with computed features.
    fns_to_select : list-like, optional
        File names to keep. When omitted, the module-level ``to_select`` is
        used, which is what this function always did before.

    Returns
    -------
    pandas.DataFrame
        Same table as returned by ``get_merged_feat_scores``.
    """
    if fns_to_select is None:
        fns_to_select = to_select

    def load(part):
        merged = pipeline.features.get_merged_df(join(base_path, part), feature_methods)
        return get_binary_target(merged)

    df_full = load('full')
    df_part1 = load('part_1')
    df_part2 = load('part_2')

    return get_merged_feat_scores(df_full, df_part1, df_part2, fns_to_select)

In [82]:
feat_scores_compl = get_merged_feat_scores_custom('../own_data/200401_autists_features/')
feat_scores_thr13 = get_merged_feat_scores_custom('../own_data/200426_autists_features_v2_thr13/')
feat_scores_thr30 = get_merged_feat_scores_custom('../own_data/200426_autists_features_v2_thr30/')

In [83]:
feat_scores_compl.head(30)['diff'].mean()

0.056262626262626264

In [84]:
feat_scores_compl.head(30)['diff'].std()

0.02655326414468246

In [85]:
feat_scores_thr13.head(30)['diff'].mean()

0.05589225589225588

In [86]:
feat_scores_thr13.head(30)['diff'].std()

0.04504231457920396

In [87]:
feat_scores_thr30.head(30)['diff'].mean()

0.050600961538461546

In [88]:
feat_scores_thr30.head(30)['diff'].std()

0.03105387278806327