In [14]:
import csv
from collections import Counter, defaultdict
import scipy.special
from os import listdir


In [16]:
# Load per-individual phenotype information from the SPARK individuals table.
# Builds three lookups keyed by the `subject_sp_id` column, holding string codes:
#   sample_to_sex  : '1' for 'Male', '2' for 'Female', '-1' for any other value
#   sample_to_asd  : '2' for 'TRUE', '1' for 'FALSE', '-1' for any other value
#   sample_to_role : the raw value of the `role` column
# family_to_individuals maps each `family_id` to its subject ids, in file order.
sample_to_sex, sample_to_role, sample_to_asd = dict(), dict(), dict()
family_to_individuals = defaultdict(list)

# Raw column value -> string code; anything not listed falls back to '-1'.
sex_codes = {'Male': '1', 'Female': '2'}
asd_codes = {'FALSE': '1', 'TRUE': '2'}

# newline='' hands newline handling to the csv module (required by its docs so
# that quoted fields containing line breaks are parsed correctly).
with open('../phenotypes/spark/individuals.csv', 'r', newline='') as f:
    reader = csv.reader(f)

    # Resolve column positions from the header row instead of hard-coding them.
    header = next(reader)
    ind_index = header.index('subject_sp_id')
    family_index = header.index('family_id')
    sex_index = header.index('sex')
    asd_index = header.index('asd')
    role_index = header.index('role')

    for pieces in reader:
        ind_id = pieces[ind_index]
        sample_to_sex[ind_id] = sex_codes.get(pieces[sex_index], '-1')
        sample_to_role[ind_id] = pieces[role_index]
        sample_to_asd[ind_id] = asd_codes.get(pieces[asd_index], '-1')
        family_to_individuals[pieces[family_index]].append(ind_id)


In [17]:
# Number of distinct individual ids loaded into the phenotype lookups.
print(len(sample_to_asd))

150064


In [34]:
# Write one tab-separated row per child (role 'Proband' or 'Sibling') to
# spark.ped: family id, child id, father id, mother id, sex code, ASD code.
# A parent id is '0' when the family has no such parent or more than one.
# While writing, tally how many children (and how many sibling pairs) fall
# into each parental-availability category.
no_mother, no_father, multiple_mothers, multiple_fathers = 0, 0, 0, 0
no_parents = 0
parents_ok = 0

no_mother_sibpairs, no_father_sibpairs, no_parents_sibpairs, parents_ok_sibpairs = 0, 0, 0, 0
# Plain 'w' is sufficient: the file is only written here, never read back.
with open('../../DATA/spark/spark.ped', 'w') as f:
    for fam, inds in family_to_individuals.items():
        mothers = [x for x in inds if sample_to_role[x]=='Mother']
        fathers = [x for x in inds if sample_to_role[x]=='Father']
        children = [x for x in inds if sample_to_role[x] in ('Proband', 'Sibling')]

        n_children = len(children)
        # exact=True returns an exact Python int, so the pair tallies stay
        # integers instead of accumulating as floats (e.g. 15043.0).
        n_sibpairs = scipy.special.comb(n_children, 2, exact=True)

        if len(mothers)==0 and len(fathers)==0:
            # Neither parent present: counted only here, not under
            # no_mother / no_father.
            no_parents += n_children
            no_parents_sibpairs += n_sibpairs
            mother_id, father_id = '0', '0'
        else:
            if len(mothers)==0:
                no_mother += n_children
                no_mother_sibpairs += n_sibpairs
                mother_id = '0'
            elif len(mothers)==1:
                mother_id = mothers[0]
            else:
                # Ambiguous: more than one mother listed, so leave it unset.
                multiple_mothers += n_children
                mother_id = '0'

            if len(fathers)==0:
                no_father += n_children
                no_father_sibpairs += n_sibpairs
                father_id = '0'
            elif len(fathers)==1:
                father_id = fathers[0]
            else:
                # Ambiguous: more than one father listed, so leave it unset.
                multiple_fathers += n_children
                father_id = '0'

        if mother_id != '0' and father_id != '0':
            parents_ok += n_children
            parents_ok_sibpairs += n_sibpairs

        for child in children:
            f.write('\t'.join([fam, child, father_id, mother_id, sample_to_sex[child], sample_to_asd[child]]) + '\n')

print('parents ok', parents_ok, 'no parents', no_parents, 'no mother', no_mother, 'no father', no_father, 'multiple mothers', multiple_mothers, 'multiple fathers', multiple_fathers )
print('parents ok', parents_ok_sibpairs, 'no parents', no_parents_sibpairs, 'no mother', no_mother_sibpairs, 'no father', no_father_sibpairs )

# Apparently output from an earlier run, kept for reference: no parents 39 no mother 187 no father 2442 multiple mothers 0 multiple fathers 0

parents ok 32478 no parents 4556 no mother 4225 no father 40096 multiple mothers 0 multiple fathers 0
parents ok 15043.0 no parents 703.0 no mother 1579.0 no father 16170.0


# From fam files

In [28]:
# Merge every *.fam file in ../data/spark_fam into one record per individual
# and write the result to sparkfam.ped.
#
# Each usable line has six whitespace-separated fields:
#   family id, individual id, father id, mother id, sex, phenotype
# A sex / phenotype of '-9' is back-filled from the phenotype lookups when the
# individual is known there. When an individual appears more than once, the
# fields are reconciled against the record stored so far:
#   * family id : keep the stored id if it starts with 'SF' and the new one
#                 does not; otherwise the new id wins
#   * parents   : a new '0' never overwrites the stored id; otherwise new wins
#   * sex/phen  : '-9' on either side yields the other value; two different
#                 non-'-9' values are unresolvable and the individual is
#                 excluded from the output (collected in `major_issue`)
# NOTE(review): back-filled values may be '-1' (unrecognised value in the
# phenotype table); they are compared here like real values, not as missing.
child_to_pieces = dict()
major_issue = set()

# listdir() returns entries in arbitrary order and the reconciliation below is
# order-dependent, so sort to make the result reproducible.
for fam_file in sorted(listdir('../data/spark_fam')):
    if fam_file.endswith('.fam'):
        print(fam_file)
        with open('../data/spark_fam/%s' % fam_file, 'r') as f:
            for line in f:
                pieces = line.strip().split()
                if len(pieces)==6:
                    fam_id, ind_id, father_id, mother_id, sex, phen = pieces
                    if sex == '-9' and ind_id in sample_to_sex:
                        sex = sample_to_sex[ind_id]
                    if phen == '-9' and ind_id in sample_to_asd:
                        phen = sample_to_asd[ind_id]

                    if ind_id in child_to_pieces:
                        fam_id_old, ind_id_old, father_id_old, mother_id_old, sex_old, phen_old = child_to_pieces[ind_id]
                        if fam_id != fam_id_old:
                            print('family mismatch', fam_id, fam_id_old)
                            if fam_id_old.startswith('SF') and not fam_id.startswith('SF'):
                                fam_id = fam_id_old
                            print('going with', fam_id)
                        if father_id != father_id_old:
                            print('father mismatch', father_id, father_id_old)
                            if father_id == '0':
                                father_id = father_id_old
                            print('going with', father_id)
                        if mother_id != mother_id_old:
                            print('mother mismatch', mother_id, mother_id_old)
                            if mother_id == '0':
                                mother_id = mother_id_old
                            print('going with', mother_id)
                        if sex != sex_old:
                            print('sex mismatch', sex, sex_old)
                            if sex == '-9':
                                sex = sex_old
                                print('going with', sex)
                            elif sex_old == '-9':
                                # Stored value was missing: the new value fills
                                # it in rather than counting as a conflict.
                                print('going with', sex)
                            else:
                                major_issue.add(ind_id)
                                print('unresolvable, removing')
                        if phen != phen_old:
                            print('phenotype mismatch', phen, phen_old)
                            if phen == '-9':
                                phen = phen_old
                                print('going with', phen)
                            elif phen_old == '-9':
                                # Stored value was missing: the new value fills
                                # it in rather than counting as a conflict.
                                print('going with', phen)
                            else:
                                major_issue.add(ind_id)
                                print('unresolvable, removing')

                    child_to_pieces[ind_id] = [fam_id, ind_id, father_id, mother_id, sex, phen]
                else:
                    # Not a six-field line: show it and skip it.
                    print(pieces)

# Plain 'w' is sufficient: the file is only written here, never read back.
with open('../../DATA/spark/sparkfam.ped', 'w') as outf:
    for child, pieces in child_to_pieces.items():
        if child not in major_issue:
            outf.write('\t'.join(pieces) + '\n')


# How many of the written individuals still have a missing ('-9') phenotype.
print(sum(1 for child, x in child_to_pieces.items() if (child not in major_issue) and (x[-1]=='-9')))

SPARK.WES1.release.2021_03.genotype.fam
SPARK.WES2.release.2021_03.genotype.fam
SPARK.WES3.genotype.2021_04.fam
SPARK_WGS3.fam
family mismatch 384 SF0199316
going with SF0199316
family mismatch 392 SF0199316
going with SF0199316
father mismatch 0 SP0199339
going with SP0199339
mother mismatch 0 SP0199315
going with SP0199315
family mismatch 776 SF0186959
going with SF0186959
father mismatch 0 SP0197135
going with SP0197135
mother mismatch 0 SP0186955
going with SP0186955
family mismatch 778 SF0186959
going with SF0186959
family mismatch 926 SF0154106
going with SF0154106
father mismatch 0 SP0154127
going with SP0154127
mother mismatch 0 SP0154105
going with SP0154105
family mismatch 1027 SF0159854
going with SF0159854
father mismatch 0 SP0166810
going with SP0166810
mother mismatch 0 SP0159852
going with SP0159852
family mismatch 1258 SF0226577
going with SF0226577
father mismatch 0 SP0226586
going with SP0226586
mother mismatch 0 SP0226576
going with SP0226576
family mismatch 1266 SF0