## References

- Cluster-building idea: https://trendscenter.org/wp/wp-content/uploads/2019/09/frontiers_pub_pic.jpg
- Finding the optimal number of clusters: https://www.kaggle.com/mks2192/trends-cluster-sfnc-groups/notebook

## Todos
- deal with site 2 bias


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from pathlib import Path

from sklearn.cluster import KMeans

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Directory with the competition's input files (Kaggle's read-only input area).
KAGGLE_PATH = Path('/kaggle/input/trends-assessment-prediction')

# subject-levels: the seven network abbreviations, each with its full name.
SL = [
    'SCN',  # Sub-cortical Network
    'ADN',  # Auditory Network
    'SMN',  # Sensorimotor Network
    'VSN',  # Visual Network
    'CON',  # Cognitive-control Network
    'DMN',  # Default-mode Network
    'CBN',  # Cerebellar Network
]

In [3]:
# Load the FNC table, then separate the subject identifiers from the features.
sfnc = pd.read_csv(KAGGLE_PATH / 'fnc.csv')

# Keep the 'Id' column on its own and leave only feature columns in `sfnc`.
ids = sfnc['Id']
sfnc = sfnc.drop(columns='Id')

# Feature column names, used below to build the network-pair groups.
cols = sfnc.columns

sfnc.shape

(11754, 1378)

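Group the column names by network pair: every column named like `SCN(53)_vs_SCN(69)` is collected under the key formed by its two network abbreviations (here `SCN_SCN`).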

In [4]:
# Maps a network-pair key (e.g. 'SCN_SCN') to the list of columns that belong
# to that pair, in the order in which the columns appear in `cols`.
group_columns = {}

for c in cols:
    # A column is named like 'SCN(53)_vs_SCN(69)': split it into its two sides,
    # then keep only the network abbreviation that precedes each '('.
    left, right = c.split('_vs_')
    groupkey = left.split('(')[0] + '_' + right.split('(')[0]

    # setdefault creates the list on first sight of a key, so no None check
    # or write-back of the list is needed.
    group_columns.setdefault(groupkey, []).append(c)

# test
group_columns['SCN_SCN']

['SCN(53)_vs_SCN(69)',
 'SCN(98)_vs_SCN(69)',
 'SCN(99)_vs_SCN(69)',
 'SCN(45)_vs_SCN(69)',
 'SCN(98)_vs_SCN(53)',
 'SCN(99)_vs_SCN(53)',
 'SCN(45)_vs_SCN(53)',
 'SCN(99)_vs_SCN(98)',
 'SCN(45)_vs_SCN(98)',
 'SCN(45)_vs_SCN(99)']

# Build clusters

In [5]:
def gen_clusters(n_clusters=3, suffix='', random_state=2020):
    """Cluster the rows of ``sfnc`` once per column group and export the results.

    For every entry of the module-level ``group_columns`` mapping, a KMeans
    model is fitted on the corresponding columns of the module-level ``sfnc``
    frame. Two tables are built, each starting with the module-level ``ids``
    column:

    * cluster labels -- one column per group, named after the group key;
    * distances -- ``n_clusters`` columns per group, named ``<group>_c<k>``,
      holding the Euclidean distance of each row to the k-th cluster center.

    The head of the distance table is displayed, and both tables are written
    to the current working directory as ``sfnc_group_clusters<suffix>.csv``
    and ``sfnc_dist_to_cluster_center<suffix>.csv``.

    Parameters
    ----------
    n_clusters : int, default 3
        Number of clusters fitted for every group.
    suffix : str, default ''
        Text appended to both output file names (before ``.csv``).
    random_state : int, default 2020
        Seed handed to KMeans so that repeated runs give the same clusters.

    Returns
    -------
    None
    """
    label_columns = {}
    distance_columns = {}

    for gc, member_columns in group_columns.items():
        # Slice the group's feature matrix once; it is reused for the fit and
        # for every distance computation below.
        X = sfnc[member_columns].values

        # n_init is pinned to 10: scikit-learn 1.4 changed the default to
        # 'auto', which would change the fitted clusters between versions.
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state).fit(X)
        label_columns[gc] = kmeans.labels_

        ## euclidean distance to n cluster center
        for cc, center in enumerate(kmeans.cluster_centers_):
            distance_columns[gc + '_c' + str(cc)] = np.sqrt(((X - center) ** 2).sum(axis=1))

    # Assemble each table in one step instead of inserting columns one by one.
    id_frame = pd.DataFrame(ids)
    sfnc_group_clusters = pd.concat(
        [id_frame, pd.DataFrame(label_columns, index=id_frame.index)], axis=1
    )
    sfnc_dist_to_cluster_center = pd.concat(
        [id_frame, pd.DataFrame(distance_columns, index=id_frame.index)], axis=1
    )

    # Test
    display(sfnc_dist_to_cluster_center.head())

    sfnc_group_clusters.to_csv('sfnc_group_clusters'+suffix+'.csv',index=False)
    sfnc_dist_to_cluster_center.to_csv('sfnc_dist_to_cluster_center'+suffix+'.csv',index=False)

## 3 clusters (first version)

In [6]:
# Three clusters per group; writes sfnc_group_clusters.csv and
# sfnc_dist_to_cluster_center.csv (empty suffix).
gen_clusters(n_clusters=3, suffix='')

Unnamed: 0,Id,SCN_SCN_c0,SCN_SCN_c1,SCN_SCN_c2,ADN_SCN_c0,ADN_SCN_c1,ADN_SCN_c2,SMN_SCN_c0,SMN_SCN_c1,SMN_SCN_c2,...,CBN_CON_c2,DMN_DMN_c0,DMN_DMN_c1,DMN_DMN_c2,CBN_DMN_c0,CBN_DMN_c1,CBN_DMN_c2,CBN_CBN_c0,CBN_CBN_c1,CBN_CBN_c2
0,10001,0.871788,0.390716,0.385402,0.379821,0.516688,0.934726,0.856102,2.229552,0.921524,...,1.961355,0.799655,0.609774,0.514475,0.812328,1.008764,0.835154,1.026888,0.309202,0.563603
1,10002,1.036675,0.530807,0.254072,0.363862,0.765579,0.649249,1.218857,2.016922,0.950614,...,1.968841,0.885537,0.604468,0.945977,1.144277,1.7565,1.184483,0.509853,0.552877,0.224146
2,10003,0.617352,0.349032,0.586275,0.91386,1.427406,0.390556,2.375122,1.001784,1.508054,...,1.232811,0.730111,0.686342,1.107144,1.127908,0.765626,0.984706,0.324147,0.741272,0.375366
3,10004,0.841799,0.338555,0.314868,0.699953,0.400624,1.335371,0.675547,2.418559,1.034166,...,1.546244,0.869713,0.683294,0.730443,0.835201,1.225956,0.60105,0.832998,0.230621,0.342938
4,10005,0.659295,0.220484,0.391603,0.26505,0.57744,0.806089,1.051276,1.915216,0.777836,...,1.209711,0.650574,1.036489,0.797182,0.638312,0.81619,0.750403,0.786174,0.329378,0.339744


## 2 clusters (optimal)
See https://www.kaggle.com/mks2192/trends-cluster-sfnc-groups/notebook for how the optimal number of clusters was found.

In [7]:
# Two clusters per group; writes sfnc_group_clusters_2c.csv and
# sfnc_dist_to_cluster_center_2c.csv.
gen_clusters(n_clusters=2, suffix='_2c')

Unnamed: 0,Id,SCN_SCN_c0,SCN_SCN_c1,ADN_SCN_c0,ADN_SCN_c1,SMN_SCN_c0,SMN_SCN_c1,VSN_SCN_c0,VSN_SCN_c1,CON_SCN_c0,...,DMN_CON_c0,DMN_CON_c1,CBN_CON_c0,CBN_CON_c1,DMN_DMN_c0,DMN_DMN_c1,CBN_DMN_c0,CBN_DMN_c1,CBN_CBN_c0,CBN_CBN_c1
0,10001,0.327022,0.703847,0.699391,0.4036,0.753925,1.955025,1.107814,0.572425,1.701457,...,1.926791,1.471899,1.394921,1.72966,0.64276,0.554125,0.774757,0.945126,0.352743,0.869366
1,10002,0.325258,0.867152,0.441541,0.612393,1.054461,1.758555,1.730239,0.85233,1.786591,...,2.038782,1.960696,1.803384,2.273406,0.645151,0.916361,1.065385,1.673043,0.43189,0.364208
2,10003,0.47338,0.476762,0.527872,1.257158,2.108609,0.928525,1.222084,0.852539,1.879278,...,2.171499,2.186715,1.314675,1.345713,0.614549,1.018815,1.066457,0.756163,0.622682,0.243977
3,10004,0.250673,0.670627,1.092868,0.454336,0.665244,2.139911,1.354021,0.84429,1.601029,...,1.835515,1.692437,1.070897,1.361166,0.706626,0.741357,0.636016,1.139628,0.180334,0.668381
4,10005,0.277043,0.492615,0.563784,0.426462,0.873482,1.64787,0.946227,0.83885,1.241445,...,1.560583,1.853078,1.009328,1.642869,0.915636,0.703537,0.650595,0.744056,0.265985,0.62847
