In [1]:
import os
import numpy as np
import pandas as pd
from itertools import combinations

from general_class_balancer import *

# Generate Datasets
This notebook generates, for each binary classification task (cases vs. controls), a dataset drawn from the entire pool of controls that is as balanced as possible with respect to confounding factors.
- For idiopathic conditions, just use controls from the same sites.
- For CNVs use Matthew Leming's class balancing algorithm.

## Define functions

In [2]:
def save_ids(ids, path_out, case):
    """Write subject ids to ``<path_out>/<case>.txt``, one id per line.

    Parameters
    ----------
    ids : iterable
        Identifiers to store; each one is formatted with an f-string, so
        non-string values are written using their ``str()`` form.
    path_out : str
        Directory that receives the file (it is not created here).
    case : str
        Name of the dataset; used as the file stem.

    Any existing file with the same name is overwritten.
    """
    target = os.path.join(path_out, f'{case}.txt')
    with open(target, 'w') as handle:
        # Every id is terminated by a newline, including the last one.
        handle.writelines(f"{subject_id}\n" for subject_id in ids)

## Load data

In [3]:
# Columns treated as confounds when balancing cases against controls.
conf = ['AGE', 'SEX', 'SITE', 'mean_conn', 'FD_scrubbed']

# Input phenotype table and the directory that receives the id lists.
p_pheno = '/home/harveyaa/Documents/fMRI/data/ukbb_9cohorts/pheno_26-01-22.csv'
path_out = '/home/harveyaa/Documents/masters/MTL/conf_balancing/dataset_ids'

# The first CSV column becomes the index; it is what save_ids writes out later.
pheno = pd.read_csv(p_pheno, index_col=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


## IPC

In [4]:
# Phenotype columns of the idiopathic conditions processed in the next cell.
ipc = ['SZ', 'ASD', 'BIP']

In [5]:
# For each idiopathic condition, keep every case plus the CON_IPC controls
# that share a 'PI' value with at least one case, then save their ids.
# NOTE(review): 'PI' presumably identifies the acquisition site/study - confirm.
for case in ipc:
    control = 'CON_IPC'

    is_case = pheno[case] == 1
    case_pis = pheno.loc[is_case, 'PI'].unique()

    df_con = pheno[(pheno[control] == 1) & pheno['PI'].isin(case_pis)]
    df = pd.concat([df_con, pheno[is_case]])
    # Cast the label column to integers (controls first, cases last).
    df.loc[:, case] = df.loc[:, case].astype(int)

    # Save out those ids
    save_ids(df.index.to_list(), path_out, case)

## Most CNVs

In [6]:
# CNVs balanced by the generic loop in the next cell.
# 'DEL22q11_2' and 'DUP16p11_2' are intentionally left out: each has its own
# dedicated section further down in this notebook.
cnvs = ['DUP22q11_2', 'DEL16p11_2', 'DEL1q21_1', 'DUP1q21_1']

In [7]:
# For each CNV: pool the cases with the non-carrier controls that share a 'PI'
# value with a case, then re-run class_balance until a selection keeps every
# case, and save the ids of that selection.
for case in cnvs:
    control = 'non_carriers'

    is_case = pheno[case] == 1
    case_pis = pheno.loc[is_case, 'PI'].unique()

    df_con = pheno[(pheno[control] == 1) & pheno['PI'].isin(case_pis)]
    df = pd.concat([df_con, pheno[is_case]])
    df.loc[:, case] = df.loc[:, case].astype(int)

    # class_balance receives the confounds with one row per confound
    # (hence the transpose) and the 0/1 labels as an integer array.
    confounds = df[conf].T.values
    classes = df[case].values.astype(int)
    n_case = np.sum(classes)
    print('total cases: ', n_case)

    # Retry until no case is dropped by the balancing. Note that this loop has
    # no upper bound: it only ends once a selection retains all n_case cases.
    selected_case = 0
    while selected_case != n_case:
        selection = class_balance(classes, confounds)
        selected_case = np.sum(classes[selection])
        print(selected_case)

    # `selection` is used as a row mask on both `classes` and `df`.
    save_ids(df[selection].index.to_list(), path_out, case)

total cases:  22
22
total cases:  32
32
total cases:  25
25
total cases:  19
19


## DUP16p11_2
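Special case: the balancing algorithm cannot find a selection that keeps all 35 cases. Instead, take a balanced selection that keeps 34 of the 35 cases, then add the excluded case back together with one hand-selected control.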

In [8]:
# DUP16p11_2: collect several balanced selections that keep all but one case.
# The selections are stored in `dup16p_ids` (one list of ids per selection)
# and inspected / completed by hand in the following cells.
case = 'DUP16p11_2'
control = 'non_carriers'
dup16p_ids = []

is_case = pheno[case] == 1
case_pis = pheno.loc[is_case, 'PI'].unique()

# Cases plus the non-carrier controls sharing a 'PI' value with a case.
df_con = pheno[(pheno[control] == 1) & pheno['PI'].isin(case_pis)]
df = pd.concat([df_con, pheno[is_case]])
df.loc[:, case] = df.loc[:, case].astype(int)

confounds = df[conf].transpose().values
classes = df[case].values.astype(int)
n_case = np.sum(classes)
print('total cases: ', n_case)

# Accept selections that drop exactly one case. This used to be the literal
# 34, which is only right while the table holds exactly 35 cases; deriving it
# from n_case keeps the cell correct if the phenotype file changes.
n_target = n_case - 1
# The following cells index dup16p_ids[0] and iterate over range(5).
n_selections = 5

selected_case = 0
while selected_case != n_case:
    selection = class_balance(classes, confounds)
    selected_case = np.sum(classes[selection])
    print(selected_case)

    if selected_case == n_target:
        dup16p_ids.append(df[selection].index.to_list())

    # Stop as soon as enough near-complete selections have been gathered.
    if len(dup16p_ids) == n_selections:
        break

total cases:  35
32
32
34
33
34
32
34
34
33
34


In [9]:
# Which case is missing from each collected selection?
# (Observed: it is always the same subject.)
all_dup16 = pheno[pheno['DUP16p11_2'] == 1].index
every_case = set(all_dup16)
for i in range(5):
    in_selection = pheno.index.isin(dup16p_ids[i])
    sel_dup16 = pheno[in_selection & (pheno['DUP16p11_2'] == 1)].index
    print('Excluded case: ', every_case.difference(set(sel_dup16)))

Excluded case:  {'s14723xx17xFCAP1'}
Excluded case:  {'s14723xx17xFCAP1'}
Excluded case:  {'s14723xx17xFCAP1'}
Excluded case:  {'s14723xx17xFCAP1'}
Excluded case:  {'s14723xx17xFCAP1'}


In [10]:
# ARE THE CONTROLS ALWAYS THE SAME?
# Compare the controls of every pair (i, j) of collected selections and print
# how many controls of selection i are absent from selection j.
# Fix: sel_con_j was previously built from dup16p_ids[i], so each selection
# was compared with itself and the count was 0 by construction. Outputs
# recorded before this fix must be regenerated by re-running the cell.
is_control_row = pheno['DUP16p11_2'] == 0
for i, j in combinations(range(5), 2):
    sel_con_i = pheno[pheno.index.isin(dup16p_ids[i]) & is_control_row].index
    sel_con_j = pheno[pheno.index.isin(dup16p_ids[j]) & is_control_row].index
    print(f'Different controls {i},{j}:', len(set(sel_con_i).difference(set(sel_con_j))))

Different controls 0,1: 0
Different controls 0,2: 0
Different controls 0,3: 0
Different controls 0,4: 0
Different controls 1,2: 0
Different controls 1,3: 0
Different controls 1,4: 0
Different controls 2,3: 0
Different controls 2,4: 0
Different controls 3,4: 0


In [11]:
# Show the confound values of the case that every selection leaves out:
# is the subject too young to find a match?
pheno.loc['s14723xx17xFCAP1', conf]

AGE                 5.0
SEX                Male
SITE              Svip1
mean_conn      0.293059
FD_scrubbed    0.154738
Name: s14723xx17xFCAP1, dtype: object

In [12]:
# Pick the youngest 'matched' control (same site and sex as the excluded
# case, non-carrier) that is not already part of the first selection.
candidate_mask = (
    (pheno['SITE'] == 'Svip1')
    & (pheno['SEX'] == 'Male')
    & (pheno['non_carriers'] == 1)
    & ~pheno.index.isin(dup16p_ids[0])
)
youngest_first = pheno[candidate_mask][conf].sort_values('AGE')
handpick_con = [youngest_first.index[0]]

# The case that the balancing algorithm always drops.
excluded_case = ['s14723xx17xFCAP1']


In [13]:
# Final DUP16p11_2 dataset: first balanced selection, then the excluded case,
# then the hand-picked control (in that order).
dup16p_hand_selection = [*dup16p_ids[0], *excluded_case, *handpick_con]

save_ids(dup16p_hand_selection, path_out, case='DUP16p11_2')

## DEL22q11_2
Special case: take all cases and controls from a single site (UCLA_CB).

In [14]:
# DEL22q11_2: no balancing, just every non-carrier control and every case
# recorded at the UCLA_CB site (controls are written first).
at_ucla = pheno['SITE'] == 'UCLA_CB'
del22q_con_idx = pheno[at_ucla & (pheno['non_carriers'] == 1)].index.to_list()
del22q_case_idx = pheno[at_ucla & (pheno['DEL22q11_2'] == 1)].index.to_list()
save_ids(del22q_con_idx + del22q_case_idx, path_out, 'DEL22q11_2')
