In [1]:
import pandas as pd
import numpy as np

from scipy.stats import ttest_ind
from sklearn.metrics import roc_auc_score

from statsmodels.stats.multitest import multipletests

import mne

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
%matplotlib notebook

In [18]:
# Load the precomputed feature table that every later cell works on.
df = pd.read_csv(
    filepath_or_buffer='../own_data/210403_aut_bands_and_env_var/full/features.csv'
)

In [19]:
# Age groups as (label, min_age, max_age) in whole years.
# The upper bound of the last group is nominal: '7+' is open-ended.
groups = [
    ('2-4', 2, 4),
    ('5-6', 5, 6),
    ('7+', 7, 100),
]

# Bin on the lower bounds with half-open intervals [lower, next_lower), so that
# a non-integer age such as 4.5 lands in '2-4' instead of falling into the gap
# between 4 and 5 and being left without a group. For integer ages in 2..100
# the result is identical to matching the closed (min_age, max_age) ranges.
age_bin_edges = [lower for _, lower, _ in groups] + [float('inf')]
age_bin_labels = [label for label, _, _ in groups]

# Ages below the first lower bound and missing ages stay NaN. The column is
# converted back to plain objects (str / NaN) so that later cells see the same
# dtype as a column built by row-wise string assignment.
df['age_group'] = pd.cut(
    df['age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    right=False,
).astype(object)

In [20]:
# Count non-null `fn` entries per age group (rows) and target class (columns),
# with row/column totals in the 'All' margins.
pd.pivot_table(
    df,
    index='age_group',
    columns='target',
    values='fn',
    aggfunc='count',
    margins=True,
)

target,asd,organic,typical,All
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2-4,37,8,17,62
5-6,28,7,33,68
7+,55,20,48,123
All,120,35,98,253


Удаляем органиков и тех, по кому нет возраста. Кодируем целевую переменную: ASD = 1, HC = 0.

In [21]:
# Keep only rows that are not 'organic' and that were assigned an age group.
# `.notna()` states the "has an age group" condition explicitly instead of
# relying on the truthiness of an object column, and `.copy()` detaches the
# result from the original frame so the assignment below does not raise
# SettingWithCopyWarning.
df = df[(df['target'] != 'organic') & df['age_group'].notna()].copy()

# Binary label: ASD = 1, everything that is left (HC) = 0.
# Replacing the whole column (rather than writing through `.loc[:, ...]`)
# guarantees an integer dtype for the label.
# NOTE: this cell is not re-runnable on its own — once `target` is numeric a
# second run would map every row to 0. Re-load the CSV before re-running.
df['target'] = (df['target'] == 'asd').astype(int)

In [72]:
# Sample sizes per age group (rows) and encoded target (columns), counted on
# the non-null `fn` column, with 'All' totals on both axes.
df_number_of_samples = pd.pivot_table(
    df,
    index='age_group',
    columns='target',
    values='fn',
    aggfunc='count',
    margins=True,
)
df_number_of_samples

target,0,1,All
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2-4,17,37,54
5-6,33,28,61
7+,48,55,103
All,98,120,218


In [73]:
# Copy the sample-size table to the system clipboard in Excel-pasteable form,
# using a comma as the decimal separator.
df_number_of_samples.to_clipboard(
    decimal=',',
    excel=True,
)

In [23]:
def feat_performance(df, features=None):
    """Compare each feature between the two target classes.

    For every feature the function computes the class means, their difference
    and ratio, a Welch (unequal-variance) two-sample t-test, and a
    direction-independent ROC AUC, i.e. ``max(auc, 1 - auc)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``target`` column with values 0 (reported as ``hc``)
        and 1 (reported as ``asd``), plus one column per requested feature.
    features : iterable of str, optional
        Names of the columns to evaluate. When omitted, every numeric column
        other than ``target`` is evaluated.

    Returns
    -------
    pandas.DataFrame
        One row per feature, in the order given, with the columns
        ``feature``, ``ttest_pval``, ``ttest_stat``, ``mean_hc``, ``mean_asd``,
        ``roc_auc``, ``mean_diff`` (hc - asd) and ``mean_ratio`` (hc / asd).
        The columns are present even when no feature is evaluated.
    """
    if features is None:
        # Previously a missing `features` argument crashed on iteration;
        # fall back to all numeric columns except the label itself.
        features = [
            col for col in df.select_dtypes(include='number').columns
            if col != 'target'
        ]

    result_columns = [
        'feature',
        'ttest_pval',
        'ttest_stat',
        'mean_hc',
        'mean_asd',
        'roc_auc',
        'mean_diff',
        'mean_ratio',
    ]

    # Read-only views are enough here: nothing below modifies the subsets.
    df_0 = df[df['target'] == 0]
    df_1 = df[df['target'] == 1]

    rows = []
    for feat in features:
        mean_hc = df_0[feat].mean()
        mean_asd = df_1[feat].mean()

        # equal_var=False -> Welch's t-test.
        ttest_res = ttest_ind(df_0[feat], df_1[feat], equal_var=False)

        # A feature that is lower in class 1 gives AUC < 0.5; fold it around
        # 0.5 so the score measures separability regardless of direction.
        auc = roc_auc_score(df['target'], df[feat])
        roc_auc = max(auc, 1 - auc)

        rows.append({
            'feature': feat,
            'ttest_pval': ttest_res.pvalue,
            'ttest_stat': ttest_res.statistic,
            'mean_hc': mean_hc,
            'mean_asd': mean_asd,
            'roc_auc': roc_auc,
            'mean_diff': mean_hc - mean_asd,
            'mean_ratio': mean_hc / mean_asd,
        })

    return pd.DataFrame(rows, columns=result_columns)

In [26]:
# Every column except the label and the bookkeeping columns is a feature;
# the original column order is kept.
features = df.columns[
    ~df.columns.isin(['target', 'fn', 'age', 'age_group'])
].tolist()

In [28]:
# Number of candidate feature columns that are passed to feat_performance below.
len(features)

108

In [30]:
# Number of features times 0.01 — presumably the count of features expected to
# pass a p < 0.01 threshold by chance alone (TODO confirm intent). Derived from
# `features` so it stays correct if the feature set changes.
len(features) * 0.01

1.08

In [None]:
# NOTE(review): bare attribute reference — pd.set_option is not called here, so
# no display option is changed. Either pass an option name and value, or remove
# this cell.
pd.set_option

In [70]:
# Per-feature group comparison over all age groups, most significant first.
# To keep only features below a p-value threshold (e.g. 0.01), filter on
# 'ttest_pval' before sorting.
# Note that the stored table itself is rounded to 3 decimals, not just its display.
df_performance = (
    feat_performance(df, features=features)
    .sort_values(by='ttest_pval')
    .round(3)
)
df_performance

Unnamed: 0,feature,ttest_pval,ttest_stat,mean_hc,mean_asd,roc_auc,mean_diff,mean_ratio
79,bands_beta_t6,0.000,-5.432,0.135,0.225,0.687,-0.090,0.602
40,bands_4_6_c3,0.000,-4.619,0.182,0.227,0.684,-0.045,0.800
41,bands_6_8_c3,0.000,-4.619,0.182,0.227,0.684,-0.045,0.800
42,bands_8_10_c3,0.000,-4.619,0.182,0.227,0.684,-0.045,0.800
43,bands_10_12_c3,0.000,-4.619,0.182,0.227,0.684,-0.045,0.800
...,...,...,...,...,...,...,...,...
92,f7_env_var,0.929,0.089,0.718,0.711,0.572,0.008,1.011
91,fp2_env_var,0.960,0.051,0.713,0.709,0.531,0.004,1.006
101,t5_env_var,0.963,-0.047,0.728,0.732,0.503,-0.004,0.994
102,p3_env_var,0.970,0.038,0.710,0.707,0.509,0.003,1.004


In [68]:
# Same comparison restricted to the '7+' age group, most significant first.
# NOTE(review): this reuses the name `df_performance`, so it replaces the
# all-ages table; whichever of the two cells ran last is what gets exported.
df_performance = (
    feat_performance(df.loc[df['age_group'].eq('7+')], features=features)
    .sort_values(by='ttest_pval')
    .round(3)
)
df_performance

Unnamed: 0,feature,ttest_pval,ttest_stat,mean_hc,mean_asd,roc_auc,mean_diff,mean_ratio
89,bands_beta_o2,0.001,-3.435,0.169,0.253,0.710,-0.084,0.669
79,bands_beta_t6,0.001,-3.444,0.126,0.194,0.698,-0.068,0.648
84,bands_beta_o1,0.002,-3.177,0.171,0.252,0.666,-0.081,0.677
48,bands_10_12_c4,0.004,-2.936,0.173,0.218,0.638,-0.046,0.791
47,bands_8_10_c4,0.004,-2.936,0.173,0.218,0.638,-0.046,0.791
...,...,...,...,...,...,...,...,...
100,t4_env_var,0.751,-0.318,0.626,0.639,0.621,-0.013,0.979
99,c4_env_var,0.856,-0.182,0.661,0.671,0.524,-0.010,0.986
106,o1_env_var,0.944,0.070,0.654,0.651,0.552,0.003,1.004
92,f7_env_var,0.951,0.062,0.649,0.645,0.512,0.004,1.006


In [71]:
# Copy the current performance table (without its index) to the system
# clipboard in Excel-pasteable form, using a comma as the decimal separator.
df_performance.to_clipboard(
    decimal=',',
    index=False,
    excel=True,
)