# Data analysis of UDN patients

### Connect to the UDN data resource using the HPDS Adapter

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import scipy.stats as st
import scipy.sparse as sp
import networkx as nx
from community import community_louvain
from scipy.stats import kruskal
import seaborn as sns
import collections as collec
import os
import xml.etree.ElementTree as ET
import collections
import operator
import pandas

In [None]:
import PicSureHpdsLib
import PicSureClient
# Connection to the PicSure Client w/ key
# token is the individual key given to connect to the resource
# NOTE(review): `token` is not assigned anywhere in the visible code; it has to
# be defined in the session before this cell runs.
connection = PicSureClient.Client.connect("https://udn.hms.harvard.edu/picsure", token)
# wrap the connection in the HPDS adapter and select the resource by its identifier
adapter = PicSureHpdsLib.Adapter(connection)
resource = adapter.useResource("8e8c7ed0-87ea-4342-b8da-f939e46bac26")

### General methods

In [None]:
def removekey(d, key):
    """Return a shallow copy of a dictionary with one key removed.

    Parameters: d : dictionary, left unmodified
                key: the key that must be deleted
    Returns: copy of dictionary d without the key
    Raises: KeyError when key is not present in d
    """
    trimmed = dict(d)
    # pop() without a default raises KeyError on a missing key, like `del`
    trimmed.pop(key)
    return trimmed

In [None]:
def get_CI(a):
    """Returns the 95% confidence interval (Student t) for the mean of a list/array a

    Missing values (NaN) are dropped before anything is computed, so that the
    mean, the standard error and the degrees of freedom all refer to the same
    observations.

    Parameters: a: list or array we want the CI for
    Returns: a tuple (lower bound, upper bound) with the 95% confidence interval
    """
    values = np.asarray(a, dtype=float)
    values = values[~np.isnan(values)]
    return st.t.interval(0.95, len(values)-1, loc=np.mean(values), scale=st.sem(values))

### Download data

In [None]:
def get_data_df(column_head):
    """Download data through the API as a pandas dataframe indexed by UDN IDs
    Parameters : column_head : string searched in the data dictionary of the resource. For example, if the columns
                                that should be selected contain "this string", then column_head="this string".
    Returns: dataframe indexed by UDN IDs holding the selected columns
    """
    udn_id_column='\\000_UDN ID\\'
    matching_entries=resource.dictionary().find(column_head)
    request=resource.query()
    # select every matching column plus the UDN ID used as index
    request.select().add(matching_entries.keys())
    request.select().add(udn_id_column)
    frame=request.getResultsDataFrame()
    frame.set_index(udn_id_column, inplace=True)
    request.select().clear()
    return frame

### Download phenotypic, status, genomic, primary symptom data and metadata

In [None]:
# download the columns found for the HPO clinical symptoms header, indexed by UDN ID
phenotypes = get_data_df("\\04_Clinical symptoms and physical findings (in HPO, from PhenoTips)\\")

In [None]:
# select only the phenotypes, and not the prenatal phenotypes: collect every column
# (the first one is skipped) whose path has a "Prenatal Phenotype" component
columns_to_del=[
    column for column in list(phenotypes.columns)[1:]
    if "Prenatal Phenotype" in column.split('\\')
]

In [None]:
# remove the columns listed in columns_to_del from the phenotype dataframe
phenotypes=phenotypes.drop(columns_to_del,axis=1)

In [None]:
# download the status columns, indexed by UDN ID
status = get_data_df("\\13_Status\\")

In [None]:
# download the candidate genes and candidate variants columns, indexed by UDN ID
genes=get_data_df("\\11_Candidate genes\\")
variants=get_data_df("\\12_Candidate variants\\")

In [None]:
# download the primary symptom category columns, indexed by UDN ID
primary_symptoms=get_data_df("\\01_Primary symptom category reported by patient or caregiver\\")

In [None]:
# download the clinical site columns, indexed by UDN ID
clinical_site=get_data_df('\\03_UDN Clinical Site\\')

In [None]:
# download the family history columns, indexed by UDN ID
family_history=get_data_df("\\08_Family history (from PhenoTips)\\")

In [None]:
# download the prenatal and perinatal history columns, indexed by UDN ID
natal_history=get_data_df("\\09_Prenatal and perinatal history (from PhenoTips)\\")

In [None]:
# download the demographics columns, indexed by UDN ID
demographics=get_data_df("\\00_Demographics\\")

In [None]:
# download the OMIM disorders columns, indexed by UDN ID
diagnostics=get_data_df('\\14_Disorders (in OMIM, from PhenoTips)\\')

### Age group separation

In [None]:
# break down the analysis in two groups based on the age at symptom onset:
# pediatric (<18 yo) and adults (>=18 yo)
age_at_onset=demographics["\\00_Demographics\\Age at symptom onset in years\\"]
adult_patients=list(age_at_onset[age_at_onset>=18.0].index)
pediatric_patients=list(age_at_onset[age_at_onset<18.0].index)


### HPO analysis

In [None]:
def get_patient_phenotypes(phenotypes):
    """Gets the list of unique phenotypes presented by the patients of the UDN
    Parameters: phenotypes : pandas dataframe with the phenotypes, one row per patient; a cell equal to
                             "Positive" or "Negative" marks the phenotype named by the column

    Returns : patient_phen : dictionary with patients as keys, with values being dictionaries with keys ("pos","neg")
                             with a list of the positive and negative phenotypes presented by each patient.
                             Patients with neither a positive nor a negative phenotype are left out.
    """
    columns=list(phenotypes)
    patient_phen={patient: {"pos": [], "neg": []} for patient in phenotypes.index.values}
    for patient,row in phenotypes.iterrows():
        terms=patient_phen[patient]
        for column,value in zip(columns,row):
            if value=="Positive":
                bucket=terms["pos"]
            elif value=="Negative":
                bucket=terms["neg"]
            else:
                continue
            # the phenotype name is the last component of the column path ("...\\name\\");
            # it is only extracted for flagged cells, so other columns need not be paths
            name=column.split("\\")[-2]
            # the same phenotype can appear under several columns: keep it once
            if name not in bucket:
                bucket.append(name)
    # single pass filter instead of copying the whole dictionary once per removed patient
    return {patient: terms for patient,terms in patient_phen.items() if terms["pos"] or terms["neg"]}

In [None]:
# build the per-patient lists of positive and negative phenotypes, then display them
patient_phen=get_patient_phenotypes(phenotypes)
patient_phen

In [None]:
# retrieve the evaluation date of the patients (eval date comes from JSON): on every
# line of the file, the 4th space-separated field is the patient and the 6th the date
patient_eval_date = {}
with open("patient_eval_date.txt","r") as evaldate:
    for record in evaldate.readlines():
        fields=record.split(" ")
        patient_eval_date[fields[3]]=fields[5].split("\n")[0]
# remove the information for patients not in the update (absent from patient_phen)
patient_eval_date={pat: when for pat,when in patient_eval_date.items() if pat in patient_phen}
print("Number of patients with no information on eval date : ",collec.Counter(patient_eval_date.values())["None"])

In [None]:
# get the list of patients evaluated in 2015 or earlier, and delete the negative terms for these patients
# (cf. paper on possible bias in the entry of negative terms before 2015)
list_pat_before_2015=[]
for pat in patient_eval_date:
    if patient_eval_date[pat]=="None":
        continue
    if int(patient_eval_date[pat].split("-")[0])<=2015:
        list_pat_before_2015.append(pat)
# copy the inner dictionaries and lists as well: dict.copy() is shallow, so clearing
# "neg" on a shallow copy would also wipe the negative terms kept in patient_phen
patient_phen_wo_2015={pat: {"pos": list(terms["pos"]), "neg": list(terms["neg"])} for pat,terms in patient_phen.items()}
for pat in list_pat_before_2015:
    patient_phen_wo_2015[pat]["neg"]=[]



In [None]:
# delete negative terms for patients with 50 or more negative terms (cut-off for bias)
for terms in patient_phen_wo_2015.values():
    if len(terms["neg"])>=50:
        terms["neg"]=[]

### Breakdown into pediatrics, diagnosed or undiagnosed, and adults, diagnosed or undiagnosed

In [None]:
# get the counts of positive, negative and total HPO terms per patient
HPO_terms_pos={patient: len(terms["pos"]) for patient,terms in patient_phen_wo_2015.items()}
HPO_terms_neg={patient: len(terms["neg"]) for patient,terms in patient_phen_wo_2015.items()}
HPO_terms={patient: HPO_terms_pos[patient]+HPO_terms_neg[patient] for patient in patient_phen_wo_2015}

In [None]:
# get the counts of positive, negative and total HPO terms for adult population
# adult_patients comes from the demographics table, while patient_phen_wo_2015 only holds
# patients with at least one HPO term: adults without any term are skipped instead of
# raising a KeyError
HPO_terms_pos_adult={patient: len(patient_phen_wo_2015[patient]["pos"]) for patient in adult_patients if patient in patient_phen_wo_2015}
HPO_terms_neg_adult={patient: len(patient_phen_wo_2015[patient]["neg"]) for patient in adult_patients if patient in patient_phen_wo_2015}
HPO_terms_adult={patient: HPO_terms_pos_adult[patient]+HPO_terms_neg_adult[patient] for patient in HPO_terms_pos_adult}

In [None]:
# get the counts of positive, negative and total HPO terms for pediatric population
# pediatric_patients comes from the demographics table, while patient_phen_wo_2015 only holds
# patients with at least one HPO term: patients without any term are skipped instead of
# raising a KeyError
HPO_terms_pos_pediatric={patient: len(patient_phen_wo_2015[patient]["pos"]) for patient in pediatric_patients if patient in patient_phen_wo_2015}
HPO_terms_neg_pediatric={patient: len(patient_phen_wo_2015[patient]["neg"]) for patient in pediatric_patients if patient in patient_phen_wo_2015}
HPO_terms_pediatric={patient: HPO_terms_pos_pediatric[patient]+HPO_terms_neg_pediatric[patient] for patient in HPO_terms_pos_pediatric}

In [None]:
# get the list of diagnosed (status "solved") and undiagnosed (any other status) patients
is_solved=status["\\13_Status\\"] == "solved"
list_diagnosed=status.loc[is_solved].index.values.tolist()
list_undiagnosed=status.loc[~is_solved].index.values.tolist()

In [None]:
# keep, among diagnosed and undiagnosed patients, those that are keys of patient_phen
# (i.e. that have at least one HPO term)
list_diagnosed_phen=list(filter(patient_phen.__contains__,list_diagnosed))
list_undiagnosed_phen=list(filter(patient_phen.__contains__,list_undiagnosed))

In [None]:
# get the lists for breakdown (adults diag and undiag, pediatric diag and undiag)
# sets give constant-time membership tests; the order of each result still follows patient_phen
adult_set,pediatric_set=set(adult_patients),set(pediatric_patients)
diagnosed_set,undiagnosed_set=set(list_diagnosed_phen),set(list_undiagnosed_phen)
list_adult_diagnosed=[patient for patient in patient_phen if patient in adult_set and patient in diagnosed_set]
list_adult_undiagnosed=[patient for patient in patient_phen if patient in adult_set and patient in undiagnosed_set]
list_pediatric_diagnosed=[patient for patient in patient_phen if patient in pediatric_set and patient in diagnosed_set]
list_pediatric_undiagnosed=[patient for patient in patient_phen if patient in pediatric_set and patient in undiagnosed_set]

In [None]:
# show the size of each of the four groups
for group_label,group in (
    ("# of adult diagnosed patients",list_adult_diagnosed),
    ("# of adult undiagnosed patients",list_adult_undiagnosed),
    ("# of pediatric diagnosed patients",list_pediatric_diagnosed),
    ("# of pediatric undiagnosed patients",list_pediatric_undiagnosed),
):
    print(group_label,len(group))

In [None]:
# get the dataframes with the phenotypes of diagnosed or undiagnosed patients that have at least one HPO term
# (rows are selected by label with the patient lists built earlier)
phenotypes_diagnosed=phenotypes.loc[list_diagnosed_phen]
phenotypes_undiagnosed=phenotypes.loc[list_undiagnosed_phen]

#### Best phenotypes 

In [None]:
def get_best_phenotypes(list_patients,patient_phen,nb_of_phen):
    """Shows the phenotypes the most represented in the UDN gateway for a given community of patients
        Parameters: list_patients: list of patients IDs that should be considered
                    patient_phen: dictionary with patients as keys, with values being dictionaries with keys ("pos","neg")
                                 with a list of the positive and negative phenotypes presented by each patient
                    nb_of_phen : int, number of best phen to represent
        Returns: list_neg_phen : list of the percentages of patients (in decreasing order) for the nb_of_phen most
                                 frequent negative associations
                 list_pos_phen : list of the percentages of patients (in decreasing order) for the nb_of_phen most
                                 frequent positive associations
        Prints the nb_of_phen best pos and neg HPO association with % of representation
    """
    def ranked(polarity):
        # (phenotype, % of list_patients presenting it), most frequent first, at most nb_of_phen entries
        counts=collections.Counter(
            phen for patient in list_patients for phen in patient_phen[patient][polarity]
        )
        return [(phen,count*100/len(list_patients)) for phen,count in counts.most_common(nb_of_phen)]

    list_neg_phen,list_pos_phen=[],[]
    print("Most highly ranked positive phenotypes")
    for phen,percentage in ranked("pos"):
        print(phen,percentage)
        list_pos_phen.append(percentage)
    print("----------------------------------------")
    print("Most highly ranked negative phenotypes")
    for phen,percentage in ranked("neg"):
        print(phen,percentage)
        list_neg_phen.append(percentage)
    return list_neg_phen,list_pos_phen

In [None]:
# best phenotypes for all patients (the returned lists are discarded, only the printout is used)
_,_=get_best_phenotypes(list(patient_phen.keys()),patient_phen,10)

In [None]:
def get_composition_phen(list_patients,patient_phen,list_phenotypes,pos_or_neg):
    """Print, for each phenotype of a list, the share of a population presenting it
    Parameters: list_patients: list of str, UDN IDs to take into consideration
                patient_phen: dictionary with patients as keys, with values being dictionaries with keys ("pos","neg")
                                 with a list of the positive and negative phenotypes presented by each patient
                list_phenotypes: list of str, phenotypes to search
                pos_or_neg: str, "pos" or "neg", type of phenotypic association to search for
    Returns: None
    Prints one line per phenotype with the % of patients of the population that present it
    """
    population_size=len(list_patients)
    for phen in list_phenotypes:
        carriers=sum(1 for pat in list_patients if phen in patient_phen[pat][pos_or_neg])
        print(phen," : ",carriers/population_size*100)

In [None]:
# search for composition for the most represented positive phenotypes, group by group
list_phenotypes_pos_to_find=["Global developmental delay","Seizures","Short stature","Generalized hypotonia","Microcephaly"]
for cohort_label,cohort in (
    ("Adult diagnosed",list_adult_diagnosed),
    ("Adult undiagnosed",list_adult_undiagnosed),
    ("Pediatric diagnosed",list_pediatric_diagnosed),
    ("Pediatric undiagnosed",list_pediatric_undiagnosed),
):
    print(cohort_label)
    get_composition_phen(cohort,patient_phen,list_phenotypes_pos_to_find,"pos")

In [None]:
# search for composition of the selected negative phenotypes, group by group
list_phenotypes_neg_to_find=["Abnormal echocardiogram","EMG abnormality","Hearing impairment","Abnormality of the eye","Abnormality of brain morphology"]
for cohort_label,cohort in (
    ("Adult diagnosed",list_adult_diagnosed),
    ("Adult undiagnosed",list_adult_undiagnosed),
    ("Pediatric diagnosed",list_pediatric_diagnosed),
    ("Pediatric undiagnosed",list_pediatric_undiagnosed),
):
    print(cohort_label)
    get_composition_phen(cohort,patient_phen,list_phenotypes_neg_to_find,"neg")

In [None]:
# transform the HPO counts into lists of values (one entry per patient)
HPO_list_pos=list(HPO_terms_pos.values())
HPO_list_neg=list(HPO_terms_neg.values())
HPO_list=list(HPO_terms.values())

In [None]:
# show the total number of positive, negative, and all HPO terms in the database
print(
    "# of positive HPO : ",np.sum(HPO_list_pos),
    "# of negative HPO : ",np.sum(HPO_list_neg),
    "# of total HPO : ",np.sum(HPO_list),
)

In [None]:
# HPO counts (positive, negative, total) as lists of values for adult diagnosed patients
HPO_list_pos_adult_diagnosed=[HPO_terms_pos_adult[pat] for pat in list_adult_diagnosed]
HPO_list_neg_adult_diagnosed=[HPO_terms_neg_adult[pat] for pat in list_adult_diagnosed]
HPO_list_adult_diagnosed=[HPO_terms_adult[pat] for pat in list_adult_diagnosed]

In [None]:
# HPO counts (positive, negative, total) as lists of values for adult undiagnosed patients
HPO_list_pos_adult_undiagnosed=[HPO_terms_pos_adult[pat] for pat in list_adult_undiagnosed]
HPO_list_neg_adult_undiagnosed=[HPO_terms_neg_adult[pat] for pat in list_adult_undiagnosed]
HPO_list_adult_undiagnosed=[HPO_terms_adult[pat] for pat in list_adult_undiagnosed]

In [None]:
# HPO counts (positive, negative, total) as lists of values for pediatric diagnosed patients
HPO_list_pos_pediatric_diagnosed=[HPO_terms_pos_pediatric[pat] for pat in list_pediatric_diagnosed]
HPO_list_neg_pediatric_diagnosed=[HPO_terms_neg_pediatric[pat] for pat in list_pediatric_diagnosed]
HPO_list_pediatric_diagnosed=[HPO_terms_pediatric[pat] for pat in list_pediatric_diagnosed]

In [None]:
# HPO counts (positive, negative, total) as lists of values for pediatric undiagnosed patients
HPO_list_pos_pediatric_undiagnosed=[HPO_terms_pos_pediatric[pat] for pat in list_pediatric_undiagnosed]
HPO_list_neg_pediatric_undiagnosed=[HPO_terms_neg_pediatric[pat] for pat in list_pediatric_undiagnosed]
HPO_list_pediatric_undiagnosed=[HPO_terms_pediatric[pat] for pat in list_pediatric_undiagnosed]

In [None]:
def show_stats_HPO_counts(HPO_list,HPO_list_pos,HPO_list_neg):
    """Print the average, 95% confidence interval and maximum of HPO term counts for a selected population
    Parameters: HPO_list: list of HPO # for selected population
                HPO_list_pos: list of positive HPO # for selected population
                HPO_list_neg: list of negative HPO # for selected population
    Returns: None
    Prints one line for the positive counts, one for the negative counts and one for the total counts
    """
    pos_mean,pos_ci,pos_max=np.average(HPO_list_pos),get_CI(HPO_list_pos),np.max(HPO_list_pos)
    print("HPO pos average : ",pos_mean,", CI 95% : ",pos_ci,", HPO pos max : ",pos_max)
    neg_mean,neg_ci,neg_max=np.average(HPO_list_neg),get_CI(HPO_list_neg),np.max(HPO_list_neg)
    print("HPO neg average : ",neg_mean,", CI 95% : ",neg_ci,", HPO neg max : ",neg_max)
    all_mean,all_ci,all_max=np.average(HPO_list),get_CI(HPO_list),np.max(HPO_list)
    print("HPO average : ",all_mean,", CI 95% : ",all_ci,", HPO max : ",all_max)

In [None]:
# HPO count statistics for all patients
show_stats_HPO_counts(HPO_list,HPO_list_pos,HPO_list_neg)

In [None]:
# HPO count statistics for adult diagnosed patients
show_stats_HPO_counts(HPO_list_adult_diagnosed,HPO_list_pos_adult_diagnosed,HPO_list_neg_adult_diagnosed)

In [None]:
# HPO count statistics for adult undiagnosed patients
show_stats_HPO_counts(HPO_list_adult_undiagnosed,HPO_list_pos_adult_undiagnosed,HPO_list_neg_adult_undiagnosed)

In [None]:
# HPO count statistics for pediatric diagnosed patients
show_stats_HPO_counts(HPO_list_pediatric_diagnosed,HPO_list_pos_pediatric_diagnosed,HPO_list_neg_pediatric_diagnosed)

In [None]:
# HPO count statistics for pediatric undiagnosed patients
show_stats_HPO_counts(HPO_list_pediatric_undiagnosed,HPO_list_pos_pediatric_undiagnosed,HPO_list_neg_pediatric_undiagnosed)

In [None]:
def show_distrib_HPO(HPO_list,name,filename="HPO_terms_log"):
    """Plots the distribution of count of HPO terms per patient
    Parameters : HPO_list: list of counts for each patient of HPO terms
                 name: string, title of the figure
                 filename: string, file the figure is saved to (default "HPO_terms_log", the name used so far)
    Returns : None
    Shows matplotlib plot of distribution of HPO (log-log axes) and saves it to filename
    """
    distrib=collec.Counter(HPO_list)
    X=list(distrib.keys())
    Y=list(distrib.values())
    plt.figure(figsize=(20,15))
    plt.plot(X,Y,"o")
    plt.xlabel("Number of HPO terms",fontsize=40)
    plt.ylabel("Count of patients",fontsize=40)
    plt.title(name,fontsize=50)
    plt.xticks(fontsize=40)
    plt.yticks(fontsize=40)
    plt.yscale("log")
    plt.xscale("log")
    # gca() returns the axes just drawn on; axes() would add a new, empty one
    plt.gca().set_ylim(None,200)
    # save before show(): once the figure has been shown there may be nothing left to save
    plt.savefig(filename)
    plt.show()

In [None]:
# plot the distribution of total, negative and positive HPO term counts
# NOTE(review): show_distrib_HPO saves to "HPO_terms_log" on every call, so each call writes to the same file
show_distrib_HPO(HPO_list,"Distribution of HPO terms")
show_distrib_HPO(HPO_list_neg,"Distribution of negative HPO terms")
show_distrib_HPO(HPO_list_pos,"Distribution of positive HPO terms")

### Negative HPO linked to eval date

In [None]:
from datetime import date
def plot_eval_date_neg_terms(patient_eval_date,patient_phen):
    """Plot the number of negative HPO terms of each patient against the evaluation date
    Parameters: patient_eval_date : dictionary with patients as keys and evaluation date in str (format "yyyy-mm-dd") as value;
                                    patients whose date is the string "None" are left out
                patient_phen :  dictionary with patients as keys, with values being dictionaries with keys ("pos","neg")
                             with a list of the positive and negative phenotypes presented by each patient
    Returns: None
    Shows the plot of # of negative HPO terms vs evaluation date; patients with more than 10 negative
    terms are drawn in red, the others in blue
    """
    def to_date(text):
        parts=text.split("-")
        return date(int(parts[0]),int(parts[1]),int(parts[2]))

    dated_patients=[pat for pat in patient_eval_date if patient_eval_date[pat]!="None"]
    neg_counts=pandas.Series(
        [len(patient_phen[pat]["neg"]) for pat in dated_patients],
        index=[to_date(patient_eval_date[pat]) for pat in dated_patients],
    )
    plt.figure(figsize=(20,15))
    many_terms=neg_counts[neg_counts>10]
    few_terms=neg_counts[neg_counts<=10]
    many_terms.plot(style=".",color="r",markersize=25)
    few_terms.plot(style=".",color="b",markersize=25)
    plt.title("# of negative HPO terms according to evaluation date")
    plt.xlabel("Evaluation date")
    plt.ylabel("# of HPO neg terms")
    plt.show()

In [None]:
# plot the number of negative HPO terms against the evaluation date
plot_eval_date_neg_terms(patient_eval_date,patient_phen)

In [None]:
# sort the patients into three lists:
#   pat_no_eval      : patients with no evaluation date
#   pat_prior_2015   : [patient, # of negative HPO] for patients evaluated in 2015 or earlier
#   pat_high_neg_HPO : [patient, # of negative HPO, eval date] for patients evaluated after 2015
#                      with more than 10 negative HPO associations
pat_prior_2015,pat_high_neg_HPO,pat_no_eval=[],[],[]
for patient in patient_phen:
    eval_date=patient_eval_date[patient]
    if eval_date=="None":
        pat_no_eval.append(patient)
    elif int(eval_date.split("-")[0])<=2015:
        pat_prior_2015.append([patient,HPO_terms_neg[patient]])
    elif HPO_terms_neg[patient]>10:
        pat_high_neg_HPO.append([patient,HPO_terms_neg[patient],eval_date])
pat_prior_2015,pat_high_neg_HPO

In [None]:
def show_neg_terms_by_cs(list_of_cs,patient_phen,patient_eval_date,pat_no_eval,clinical_site,year):
    """Shows the breakdown (max,min,avg,std) of negative HPO association within the different clinical sites for a given year
        Parameters: list_of_cs : list of string of clinical sites
                    patient_eval_date : dictionary with patients as keys and evaluation date in str (format "yyyy-mm-dd") as value
                    patient_phen :  dictionary with patients as keys, with values being dictionaries with keys ("pos","neg")
                                 with a list of the positive and negative phenotypes presented by each patient
                    pat_no_eval : list of patients with no evaluation date associated
                    clinical_site: dataframe with the associated clinical sites with patients as index
                    year: int, year to be shown
        Returns: None
        Shows the breakdown per site
        Note: the number of negative terms of each patient is read from the module-level dictionary
        HPO_terms_neg, not recomputed from patient_phen.
    """
    # the year filter does not depend on the site: compute it once, with a set for membership
    without_eval=set(pat_no_eval)
    patients_of_year=[
        patient for patient in patient_phen
        if patient not in without_eval and int(patient_eval_date[patient].split("-")[0])==year
    ]
    for site in list_of_cs:
        print("Clinical site ",site)
        neg_HPO_site=[
            HPO_terms_neg[patient] for patient in patients_of_year
            if clinical_site.loc[patient]["\\03_UDN Clinical Site\\"]==site
        ]
        if len(neg_HPO_site)>0:
            print(len(neg_HPO_site))
            print("Min : ",np.min(neg_HPO_site)," Max : ",np.max(neg_HPO_site)," Avg : ",np.average(neg_HPO_site)," Std : ",np.std(neg_HPO_site))

In [None]:
# year attribute can be changed to plot different years
# NOTE(review): list_of_cs is only assigned further down in this file (clinical site section),
# so that cell has to be run before this one
show_neg_terms_by_cs(list_of_cs,patient_phen,patient_eval_date,pat_no_eval,clinical_site,2016)

In [None]:
# write down list of patients with evaluation in 2015 or earlier, one line per patient
# (the with-block closes the file, no explicit close() is needed)
with open("list_UDNID_HPO_neg.txt","w") as out_file:
    for patient_id,neg_count in pat_prior_2015:
        out_file.write("Patient ID : "+patient_id+" # of negative HPO terms : "+str(neg_count)+"\n")

In [None]:
# write down list of patients with evaluation post 2015 and strictly more than 10 negative HPO terms,
# one line per patient (the with-block closes the file, no explicit close() is needed)
with open("list_UDNID_HPO_neg_post_2015.txt","w") as out_file:
    for patient_id,neg_count,eval_date in pat_high_neg_HPO:
        out_file.write("Patient ID : "+patient_id+" # of negative HPO terms : "+str(neg_count)+" Eval date : "+eval_date+"\n")

### HPO large group stats

In [None]:
# get the list of large groups in the HPO hierarchy: the 5th component of each phenotype
# column path, kept once and in order of first appearance (the first column is skipped)
header_phen=list(phenotypes)[1:]
large_groups_HPO=list(dict.fromkeys(column.split("\\")[4] for column in header_phen))
large_groups_HPO

In [None]:
# get the association between unique phenotypes and the large groups they are related to in the HPO hierarchy
# list_phenotypes_unique is a dictionary with the phenotypes as keys, and a list of associated large groups as value
list_phenotypes_unique={}
for column in header_phen:
    path=column.split("\\")
    # path[-2] is the phenotype name, path[4] its large group
    groups=list_phenotypes_unique.setdefault(path[-2],[])
    if path[4] not in groups:
        groups.append(path[4])
list_phenotypes_unique

In [None]:
def get_large_groups_HPO_count(large_groups_HPO,patient_phen,list_patients,phen_to_groups=None):
    """Returns the count of HPO terms that belong to a certain group of HPO terms
    Parameters: large_groups_HPO : list of large groups that belong to the HPO hierarchy
                patient_phen : dictionary with patients as keys, with values being dictionaries with keys ("pos","neg")
                               with a list of the positive and negative phenotypes presented by each patient
                list_patients : list of patients to take into consideration (all must be keys of patient_phen)
                phen_to_groups : dictionary with phenotypes as keys and the list of large groups they belong to
                                 as value; when omitted, the module-level list_phenotypes_unique is used

    Returns : group_count : dictionary with keys ("pos","neg") that counts the occurrences of positive or negative HPO terms
                            for each large group
    """
    if phen_to_groups is None:
        phen_to_groups=list_phenotypes_unique
    group_count={polarity: {lg: 0 for lg in large_groups_HPO} for polarity in ("pos","neg")}
    for patient in list_patients:
        for polarity in ("pos","neg"):
            for phen in patient_phen[patient][polarity]:
                # a phenotype linked to several large groups counts once in each of them
                for lg in phen_to_groups[phen]:
                    group_count[polarity][lg]+=1
    return group_count

In [None]:
# get the HPO occurrences per large group for all patients,
# print the totals over all groups, then display the detailed counts
large_groups_HPO_count=get_large_groups_HPO_count(large_groups_HPO,patient_phen_wo_2015,list(patient_phen_wo_2015.keys()))
print("Total : neg : ",np.sum(list(large_groups_HPO_count["neg"].values()))," pos : ",np.sum(list(large_groups_HPO_count["pos"].values())))
large_groups_HPO_count

In [None]:
# get the count of large groups for positive and negative terms of adult patients
# adult_patients comes from the demographics table; only the adults that are keys of
# patient_phen_wo_2015 (at least one HPO term) are passed on, the others would raise a KeyError
large_groups_HPO_count_adult=get_large_groups_HPO_count(large_groups_HPO,patient_phen_wo_2015,[patient for patient in adult_patients if patient in patient_phen_wo_2015])
print("Total : neg : ",np.sum(list(large_groups_HPO_count_adult["neg"].values()))," pos : ",np.sum(list(large_groups_HPO_count_adult["pos"].values())))
large_groups_HPO_count_adult

In [None]:
# get the count of large groups for positive and negative terms of pediatric patients
# pediatric_patients comes from the demographics table; only the patients that are keys of
# patient_phen_wo_2015 (at least one HPO term) are passed on, the others would raise a KeyError
large_groups_HPO_count_pediatric=get_large_groups_HPO_count(large_groups_HPO,patient_phen_wo_2015,[patient for patient in pediatric_patients if patient in patient_phen_wo_2015])
print("Total : neg : ",np.sum(list(large_groups_HPO_count_pediatric["neg"].values()))," pos : ",np.sum(list(large_groups_HPO_count_pediatric["pos"].values())))
large_groups_HPO_count_pediatric

### Comparison HPO and Primary Symptoms


In [None]:
def get_link_between_PS_HPO(patient_phen,primary_symptoms,list_phenotypes_unique):
    """Count, for each primary symptom, how often each HPO large group occurs among the patients reporting it
    Parameters : patient_phen :  dictionary with patients as keys, with values being dictionaries with keys ("pos","neg")
                                 with a list of the positive and negative phenotypes presented by each patient
                 primary_symptoms: dataframe with UDN IDs as index; the primary symptom is read from the
                                 second column of the row of each patient
                 list_phenotypes_unique: dictionary of link between phenotypes and the large groups they are linked
                 to in the HPO hierarchy
    Returns : dictionary with keys ("pos","neg") that contain a dictionary with the primary symptoms as keys and a dictionary
              with the count for every large group of HPO hierarchy of occurrences as value
    """
    link_PS_HPO={"pos": {}, "neg": {}}
    for patient,terms in patient_phen.items():
        symptom=list(primary_symptoms.loc[patient])[1]
        # every symptom gets an entry on both sides, even when no term is counted
        for polarity in ("pos","neg"):
            link_PS_HPO[polarity].setdefault(symptom,{})
        for polarity in ("pos","neg"):
            tally=link_PS_HPO[polarity][symptom]
            for phen in terms[polarity]:
                for lg in list_phenotypes_unique[phen]:
                    tally[lg]=tally.get(lg,0)+1
    return link_PS_HPO

In [None]:
# get the links between the primary symptoms and the HPO large groups (counts of positive and negative terms)
link_PS_HPO=get_link_between_PS_HPO(patient_phen,primary_symptoms,list_phenotypes_unique)

In [None]:
# Show, for each primary symptom, the HPO large groups ranked by decreasing count of positive terms
for ps in link_PS_HPO["pos"]:
    print("Primary symptom ",ps)
    print("-------------------------------------------")
    # a float key is skipped after its header is printed
    if type(ps)==float:
        continue
    group_counts=link_PS_HPO["pos"][ps]
    lg_list=list(group_counts)
    val=[group_counts[lg] for lg in lg_list]
    indsort=np.argsort(val)[::-1]
    lg_list=np.array(lg_list)[indsort]
    val=np.array(val)[indsort]
    for group_name,group_total in zip(lg_list,val):
        print(group_name,group_total)
    print("-------------------------------------------")

### Analysis of demographics and clinical site

In [None]:
# get the dataframes for patients with at least one phenotype, for adult or pediatric, diagnosed and undiagnosed
# (demographics and clinical_site are rebound to their restriction to the patients of patient_phen;
# the four sub-dataframes are then selected from the restricted frames)
demographics = demographics.loc[list(patient_phen)]
demographics_adult_diagnosed = demographics.loc[list_adult_diagnosed]
demographics_adult_undiagnosed = demographics.loc[list_adult_undiagnosed]
demographics_pediatric_diagnosed = demographics.loc[list_pediatric_diagnosed]
demographics_pediatric_undiagnosed = demographics.loc[list_pediatric_undiagnosed]
clinical_site = clinical_site.loc[list(patient_phen)]
clinical_site_adult_diagnosed = clinical_site.loc[list_adult_diagnosed]
clinical_site_adult_undiagnosed = clinical_site.loc[list_adult_undiagnosed]
clinical_site_pediatric_diagnosed = clinical_site.loc[list_pediatric_diagnosed]
clinical_site_pediatric_undiagnosed = clinical_site.loc[list_pediatric_undiagnosed]

In [None]:
# get count of clinical sites for patients with at least one phenotype, for adult or pediatric, diagnosed and undiagnosed
# (number of distinct 'Patient ID' values per clinical site)
cscount = clinical_site.groupby('\\03_UDN Clinical Site\\')['Patient ID'].nunique()
cscount_ad = clinical_site_adult_diagnosed.groupby('\\03_UDN Clinical Site\\')['Patient ID'].nunique()
cscount_and = clinical_site_adult_undiagnosed.groupby('\\03_UDN Clinical Site\\')['Patient ID'].nunique()
cscount_pd = clinical_site_pediatric_diagnosed.groupby('\\03_UDN Clinical Site\\')['Patient ID'].nunique()
cscount_pnd = clinical_site_pediatric_undiagnosed.groupby('\\03_UDN Clinical Site\\')['Patient ID'].nunique()

In [None]:
# show the clinical site counts for all patients, then for each of the four groups
for heading,site_counts in (
    ("Clinical site count general",cscount),
    ("Clinical site count adult diagnosed",cscount_ad),
    ("Clinical site count adult undiagnosed",cscount_and),
    ("Clinical site count pediatric diagnosed",cscount_pd),
    ("Clinical site count pediatric undiagnosed",cscount_pnd),
):
    print(heading)
    print(site_counts)

In [None]:
# show the count of each ethnicity value for all patients, then for each of the four groups
for label,cohort in (
    ("Count eth for general ",demographics),
    ("Count eth for adult diagnosed ",demographics_adult_diagnosed),
    ("Count eth for adult undiagnosed ",demographics_adult_undiagnosed),
    ("Count eth for pediatric diagnosed ",demographics_pediatric_diagnosed),
    ("Count eth for pediatric undiagnosed ",demographics_pediatric_undiagnosed),
):
    print(label,collec.Counter(cohort['\\00_Demographics\\Ethnicity\\']))

In [None]:
# show the count of each race value for all patients, then for each of the four groups
for label,cohort in (
    ("Count race for general ",demographics),
    ("Count race for adult diagnosed ",demographics_adult_diagnosed),
    ("Count race for adult undiagnosed ",demographics_adult_undiagnosed),
    ("Count race for pediatric diagnosed ",demographics_pediatric_diagnosed),
    ("Count race for pediatric undiagnosed ",demographics_pediatric_undiagnosed),
):
    print(label,collec.Counter(cohort["\\00_Demographics\\Race\\"]))

In [None]:
# get the statistics for demographics for all patients
demographics.describe()

In [None]:
# get the summary statistics for demographics, for adult diagnosed patients
demographics_adult_diagnosed.describe()

In [None]:
# get the summary statistics for demographics, for adult undiagnosed patients
demographics_adult_undiagnosed.describe()

In [None]:
# get the summary statistics for demographics, for pediatric diagnosed patients
demographics_pediatric_diagnosed.describe()

In [None]:
# get the summary statistics for demographics, for pediatric undiagnosed patients
demographics_pediatric_undiagnosed.describe()

In [None]:
def show_age_distrib(demographics):
    """Show the distribution of the age at symptom onset in the network
    Parameters: demographics: pd dataframe, with the column containing the age at symptom onset (in years)
    Returns: None
    Shows the number of patients per age at symptom onset as a line plot; patients with a missing age 
    are left out of the plot
    """
    onset_age=demographics["\\00_Demographics\\Age at symptom onset in years\\"]
    # a single tally feeds both axes, and it is sorted by age so that the line is drawn from left to right
    # instead of jumping between ages in order of first appearance
    counts=onset_age.dropna().value_counts().sort_index()
    X=list(counts.index)
    Y=list(counts.values)
    plt.figure(figsize=(20,20))
    plt.plot(X,Y)
    plt.title("Age at symptom onset (in y) distribution in UDN")
    plt.xlabel("Age at symptom onset (in y)")
    plt.ylabel("Count of patients")
    plt.show()

In [None]:
show_age_distrib(demographics)

In [None]:
# number of distinct patients per gender: whole cohort first, then adult/pediatric x diagnosed/undiagnosed
gender_count, gender_count_ad, gender_count_and, gender_count_pd, gender_count_pnd = (
    subset.groupby("\\00_Demographics\\Gender\\")['Patient ID'].nunique()
    for subset in (
        demographics,
        demographics_adult_diagnosed,
        demographics_adult_undiagnosed,
        demographics_pediatric_diagnosed,
        demographics_pediatric_undiagnosed,
    )
)

In [None]:
print("Gender count general")
print(gender_count)
print("Gender count adult diagnosed")
print(gender_count_ad)
print("Gender count adult undiagnosed")
print(gender_count_and)
print("Gender count pediatric diagnosed")
print(gender_count_pd)
print("Gender count pediatric undiagnosed")
print(gender_count_pnd)

### Distribution of HPO terms according to clinical site

In [None]:
# Shows the distribution of HPO terms within the clinical sites, with min, max, avg and std, as well as solved cases
# Missing sites are dropped first: NaN never compares equal to itself, so it would select no patient at all and
# make the statistics below fail on an empty list
list_of_cs=list(clinical_site["\\03_UDN Clinical Site\\"].dropna().unique())
# set copy of the diagnosed patients, for constant-time membership tests inside the loop
diagnosed_set=set(list_diagnosed_phen)
for site in list_of_cs:
    print("Clinical site ",site)
    pat_in_site=list(clinical_site[clinical_site["\\03_UDN Clinical Site\\"]==site].index)
    print("# of patients in site : ",len(pat_in_site))
    # HPO_terms value of every patient of the site, and the patients of the site that are diagnosed
    list_HPO_site=[HPO_terms[pat] for pat in pat_in_site]
    diag_cases=[pat for pat in pat_in_site if pat in diagnosed_set]
    print("Min : ",np.min(list_HPO_site)," Max : ",np.max(list_HPO_site)," Avg : ",np.average(list_HPO_site)," Std : ",np.std(list_HPO_site))
    print("Percentage solved cases : ",len(diag_cases)/len(pat_in_site)*100)

### Count of primary symptoms 

In [None]:
# get the primary symptoms for patients with at least one phenotype, for adult or pediatric, diagnosed and undiagnosed
primary_symptoms = primary_symptoms.loc[list(patient_phen)]
primary_symptoms_ad = primary_symptoms.loc[list_adult_diagnosed]
primary_symptoms_and = primary_symptoms.loc[list_adult_undiagnosed]
primary_symptoms_pd = primary_symptoms.loc[list_pediatric_diagnosed]
primary_symptoms_pnd = primary_symptoms.loc[list_pediatric_undiagnosed]

In [None]:
# number of distinct patients per primary symptom category: whole cohort first, then adult/pediatric x
# diagnosed/undiagnosed
pscount, pscount_ad, pscount_and, pscount_pd, pscount_pnd = (
    subset.groupby("\\01_Primary symptom category reported by patient or caregiver\\")['Patient ID'].nunique()
    for subset in (
        primary_symptoms,
        primary_symptoms_ad,
        primary_symptoms_and,
        primary_symptoms_pd,
        primary_symptoms_pnd,
    )
)

In [None]:
# print the primary symptom counts group by group, with a dashed separator between two consecutive groups
for _rank, (_group, _counts) in enumerate((
    ("general", pscount),
    ("adult diagnosed", pscount_ad),
    ("adult undiagnosed", pscount_and),
    ("pediatric diagnosed", pscount_pd),
    ("pediatric undiagnosed", pscount_pnd),
)):
    if _rank > 0:
        print("---------------------------------------------")
    print("Primary symptom count " + _group)
    print(_counts)

### Family history

In [None]:
# get family history for patients with at least one phenotype, diagnosed or undiagnosed
family_history = family_history.loc[list(patient_phen)]
family_history_d = family_history.loc[list_diagnosed_phen]
family_history_nd = family_history.loc[list_undiagnosed_phen]

In [None]:
# number of distinct patients per value of "Affected Relatives": all patients, then diagnosed, then undiagnosed
fhcount, fhcount_d, fhcount_nd = (
    subset.groupby("\\08_Family history (from PhenoTips)\\Affected Relatives\\")['Patient ID'].nunique()
    for subset in (family_history, family_history_d, family_history_nd)
)

In [None]:
print("Affected relatives count general")
print(fhcount)
print("Affected relatives count diagnosed")
print(fhcount_d)
print("Affected relatives count undiagnosed")
print(fhcount_nd)

In [None]:
# get natal history for patients with at least one phenotype
natal_history = natal_history.loc[list(patient_phen)]

In [None]:
# replace missing values (stored as 0) by NaN
# np.nan is spelled in lower case: the np.NaN alias no longer exists in NumPy 2.0 and later
natal_history = natal_history.replace(0, np.nan)

In [None]:
# get natal history for pediatric and adult patients
natal_history_adult=natal_history.loc[adult_patients]
natal_history_pediatric=natal_history.loc[pediatric_patients]

In [None]:
natal_history.count()

In [None]:
# summary statistics (count, mean, std, quantiles) of the natal history for the entire network
natal_history.describe()

In [None]:
# summary statistics (count, mean, std, quantiles) of the natal history for adult patients
natal_history_adult.describe()

In [None]:
# summary statistics (count, mean, std, quantiles) of the natal history for pediatric patients
natal_history_pediatric.describe()

In [None]:
# get the number of positive or negative occurrences for any given phenotype. Ex: if count_pos_phen[i]=3, 
# then there are three patients in the database that are positive for the phenotype header_phen[i]
# The first column of phenotypes is skipped (presumably the patient identifier -- verify), hence the range from 1
count_pos_phen,count_neg_phen=[],[]
for i in range(1,phenotypes.shape[1]):
    cts=phenotypes.iloc[:,i].value_counts()
    # value_counts() is indexed by the values themselves, so the counts are looked up by label;
    # a value that never occurs in the column counts as 0
    count_pos_phen.append(cts.get("Positive",0))
    count_neg_phen.append(cts.get("Negative",0))
    

In [None]:
collec.Counter(family_history["\\08_Family history (from PhenoTips)\\Consanguinity\\"])

In [None]:
def get_best_phenotypes_consang(patient_phen,family_history):
    """Counts the phenotypes of the consanguineous patients and compares the most frequent ones to the whole cohort
    Parameters : patient_phen :  dictionary with patients as keys, with values being dictionaries with keys ("pos","neg") 
                                 with a list of the positive and negative phenotypes presented by each patient
                 family_history : dataframe with family history, indexed by patient, containing the PhenoTips 
                                  consanguinity column
    Returns : dictionary with keys "pos" and "neg", each one a dictionary giving, for every phenotype, the number of
              consanguineous patients presenting it
    Shows the 18 most frequent positive and the 10 most frequent negative phenotypes, with their frequency among 
    consanguineous patients and in the whole cohort, as well as the Mann Whitney U stats for difference in 
    distribution between the consanguineous and general community. If there is no consanguineous patient, only the
    count is printed since no frequency can be computed.
    Uses the notebook-level variables phenotypes, count_pos_phen and count_neg_phen.
    """
    consang_col="\\08_Family history (from PhenoTips)\\Consanguinity\\"
    count_phenotype_consang={"pos": {}, "neg": {}}
    csgcount=0
    for patient in list(patient_phen):
        # the column is selected by name: a position would silently point to another column as soon as the
        # column order of the dataframe changes
        consang = family_history.loc[patient,consang_col]
        # explicit comparison to True so that NaN or any other non-boolean value counts as "not consanguineous"
        if consang==True:
            csgcount+=1
            for sign in ("pos","neg"):
                counts=count_phenotype_consang[sign]
                for phen in patient_phen[patient][sign]:
                    counts[phen]=counts.get(phen,0)+1
    if csgcount==0:
        # the percentages below divide by csgcount
        print("How many consang ?",csgcount)
        return count_phenotype_consang

    header=list(phenotypes)[1:]

    def report(sign,title,top,general_counts):
        # print the `top` most frequent phenotypes of one sign and test them against the cohort frequencies
        phen_list=list(count_phenotype_consang[sign])
        val=[count_phenotype_consang[sign][phen] for phen in phen_list]
        indsort=np.argsort(val)[::-1][:top]
        phen_list=np.array(phen_list)[indsort]
        val=np.array(val)[indsort]
        print(title)
        comp_mw_true=[]
        for j,phen in enumerate(phen_list):
            # the phenotype name is the last component of the column header; only the first match is used
            for i,p in enumerate(header):
                if p.split("\\")[-2]==phen:
                    general=general_counts[i]/phenotypes.shape[0]*100
                    print(phen,"consang % ",val[j]/csgcount*100," general % ",general)
                    comp_mw_true.append(general)
                    break
        consang_pct=np.multiply(val,100/csgcount)
        print("Mann-Whitney "+sign+" : ")
        print("Medians : ",np.median(consang_pct),np.median(comp_mw_true))
        # called through scipy.stats (imported as st at the top of the notebook) so that the function does not
        # depend on a later cell having imported mannwhitneyu
        print(st.mannwhitneyu(consang_pct,comp_mw_true))

    report("pos",'Best positive phenotypes',18,count_pos_phen)
    report("neg",'Best negative phenotypes',10,count_neg_phen)
    print("How many consang ?",csgcount)
    return count_phenotype_consang

In [None]:
count_phenotype_consang=get_best_phenotypes_consang(patient_phen,family_history)

In [None]:
# mat_age holds the maternal ages that were actually reported: the NaN entries are masked out
mat_age=np.array(natal_history["\\09_Prenatal and perinatal history (from PhenoTips)\\Maternal Age\\"])
isnan_mat=np.isnan(mat_age)
mat_age=mat_age[~isnan_mat]

In [None]:
# pat_age holds the paternal ages that were actually reported: the NaN entries are masked out
pat_age=np.array(natal_history["\\09_Prenatal and perinatal history (from PhenoTips)\\Paternal Age\\"])
isnan_pat=np.isnan(pat_age)
pat_age=pat_age[~isnan_pat]

In [None]:
# distribution of paternal age in the US in 2009 (cf. article)
# the three lists are parallel: USA_dist_pat[i] is the percentage for the age group labelled tranches_pat[i],
# whose inclusive [low, high] bounds are boundaries_pat[i]
# NOTE(review): consecutive bins share an endpoint ([40,44]/[44,50] and [44,50]/[50,100]) and the label "44-50"
# overlaps "40-44" -- confirm the intended bin edges against the source article
USA_dist_pat=[4.7,17.7,25.1,26.6,16.3,6.7,2.1,0.8]
tranches_pat=["0-19","20-24","25-29","30-34","35-39","40-44","44-50",">50"]
boundaries_pat=[[0,19],[20,24],[25,29],[30,34],[35,39],[40,44],[44,50],[50,100]]

# distribution of maternal age in the US in 2009 (cf. article)
# same layout as above
# NOTE(review): [35,39] and [39,100] share the endpoint 39 while the label says ">39" -- confirm the intended edge
USA_dist_mat = [3.1,6.9,24.4,28.2,23.1,11.5,2.8]
tranches_mat=["0-18","19","20-24","25-29","30-34","35-39",">39"]
boundaries_mat=[[0,18],[19,19],[20,24],[25,29],[30,34],[35,39],[39,100]]

In [None]:
def distrib_age(parent_age, known_dist,tranches,boundaries,mat_or_pat):
    """Shows the distribution of parental age compared between UDN and a known reference distribution
    Parameters : parent_age: array of parental age in the UDN database
                 known_dist: list, known distribution (in %) of parental age for the age groups given in tranches
                 tranches: list of str, age groups that correspond to the known distribution 
                 boundaries: array of 2-D arrays, with the inclusive boundaries in int corresponding to the splits 
                             given in tranches
                 mat_or_pat: str used as prefix of the x-axis label (the notebook passes "Maternal" or "Paternal")
    Returns: list with the percentage of parent_age falling in each age group, in the order of tranches
    Shows a joint plot of UDN distribution and known distribution of parental age
    """
    n_ages=len(parent_age)
    age_share=[0 for _ in tranches]
    for age,count in collec.Counter(parent_age).items():
        for i,(low,high) in enumerate(boundaries):
            if low<=age<=high:
                age_share[i]+=count/n_ages*100
                # an age is assigned to the first group that contains it only: when two consecutive groups
                # share a boundary value, the age would otherwise be counted twice and the shares would sum
                # to more than 100%
                break
    plt.figure(figsize=(20,15))
    plt.plot(tranches,age_share,'b',label="Distribution in UDN")
    plt.plot(tranches,known_dist,'r',label="Distribution in USA in 2009")
    plt.xlabel(mat_or_pat+" age at birth",fontsize=20)
    plt.ylabel("Distribution in UDN vs USA in 2009 (%)",fontsize=20)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
           ncol=2, mode="expand", borderaxespad=0.,fontsize=20)
    plt.show()
    return age_share

In [None]:
dist_age_mat=distrib_age(mat_age,USA_dist_mat,tranches_mat,boundaries_mat,"Maternal")

# t-test between the UDN and the US maternal age distributions; the reference list is USA_dist_mat
# (the name USA_dist is not assigned in the cells that define the reference distributions)
ttest_ind(dist_age_mat,USA_dist_mat)

### Paternal age

In [None]:
dist_age_pat=distrib_age(pat_age,USA_dist_pat,tranches_pat,boundaries_pat,"Paternal")

ttest_ind(dist_age_pat,USA_dist_pat)

In [None]:
# split the patients with a reported paternal age at the 35 y/o cut-off: strictly above 35 is "old",
# 35 or below is "young"
paternal_age_df=natal_history["\\09_Prenatal and perinatal history (from PhenoTips)\\Paternal Age\\"].dropna()
patients_with_pat_age=list(paternal_age_df.index)
old_pat_age=[patient for patient,age in paternal_age_df.items() if age>35]
young_pat_age=[patient for patient,age in paternal_age_df.items() if age<=35]

In [None]:
# demographics description for all patients having reported paternal age
demographics.loc[old_pat_age+young_pat_age].describe()

In [None]:
# demographics for patients with paternal age >35 (old_pat_age is built with a strict comparison)
demographics.loc[old_pat_age].describe()

In [None]:
# demographics for patients with paternal age <=35 (young_pat_age includes patients whose father was exactly 35)
demographics.loc[young_pat_age].describe()

In [None]:
# Mann Whitney U test for diff between young and old paternal age for Age at UDN evaluation
# called through scipy.stats (imported as st at the top of the notebook): the bare mannwhitneyu name is only
# imported in the Statistics section further down
st.mannwhitneyu(list(demographics["\\00_Demographics\\Age at UDN Evaluation (in years)\\"].loc[young_pat_age]),list(demographics["\\00_Demographics\\Age at UDN Evaluation (in years)\\"].loc[old_pat_age]))

In [None]:
# Mann Whitney U test for diff between young and old paternal age for Age at symptom onset
# called through scipy.stats (imported as st at the top of the notebook): the bare mannwhitneyu name is only
# imported in the Statistics section further down
# missing onset ages cannot be ranked, so they are removed from both samples before the test
st.mannwhitneyu(list(demographics["\\00_Demographics\\Age at symptom onset in years\\"].loc[young_pat_age].dropna()),list(demographics["\\00_Demographics\\Age at symptom onset in years\\"].loc[old_pat_age].dropna()))

In [None]:
# get the list of patient phenotypes for patients with old, young paternal age and all those having reported paternal age
# the two patient lists are turned into sets once, so that each membership test is constant-time
old_pat_set,young_pat_set=set(old_pat_age),set(young_pat_age)
old_pat_patient_phen={pat: phen for pat,phen in patient_phen.items() if pat in old_pat_set}
young_pat_patient_phen={pat: phen for pat,phen in patient_phen.items() if pat in young_pat_set}
all_pat_patient_phen={pat: phen for pat,phen in patient_phen.items() if pat in old_pat_set or pat in young_pat_set}
len(old_pat_patient_phen),len(young_pat_patient_phen),len(all_pat_patient_phen)

In [None]:
def get_best_phenotype_pat(patient_phen):
    """Ranks the positive phenotypes by the fraction of the given patients that present them
    Parameters: patient_phen :  dictionary with patients as keys, with values being dictionaries with keys ("pos","neg") 
                                 with a list of the positive and negative phenotypes presented by each patient
    Returns: sorted_dict_bp: ordered dictionary with phenotypes as keys and the fraction of patients as value, 
                             from the most to the least frequent phenotype
    """
    n_patients=len(patient_phen)
    fraction={}
    for phen_by_sign in patient_phen.values():
        # every patient presenting the phenotype adds 1/n_patients to its fraction
        for phen in phen_by_sign["pos"]:
            fraction[phen]=fraction.get(phen,0)+1/n_patients
    ranked=sorted(fraction.items(), key=lambda item: item[1], reverse=True)
    sorted_dict_bp=collections.OrderedDict(ranked)
    return sorted_dict_bp

In [None]:
# show best phenotypes for patients with older paternal age
get_best_phenotype_pat(old_pat_patient_phen)

In [None]:
# show best phenotypes for patients with younger paternal age
get_best_phenotype_pat(young_pat_patient_phen)

In [None]:
# show best phenotype for all patients having reported paternal age
get_best_phenotype_pat(all_pat_patient_phen)

In [None]:
# Mann Whitney U test for best phenotypes between young and old paternal age
# the two lists are hard-coded values (presumably copied from the outputs of the cells above -- verify)
# called through scipy.stats (imported as st at the top of the notebook): the bare mannwhitneyu name is only
# imported in the Statistics section further down
st.mannwhitneyu([32,20,17,19,23,18,12],[37,27,32,27,14,19,22])

In [None]:
diag_old_pat=diagnostics.loc[old_pat_age]["\\14_Disorders (in OMIM, from PhenoTips)\\"]
len(diag_old_pat.dropna())

In [None]:
natal_history_adult_diagnosed.describe()

In [None]:
natal_history_adult_undiagnosed.describe()

In [None]:
natal_history_pediatric_diagnosed.describe()

In [None]:
natal_history_pediatric_undiagnosed.describe()

In [None]:
diag_young_pat=diagnostics.loc[young_pat_age]["\\14_Disorders (in OMIM, from PhenoTips)\\"]
len(diag_young_pat.dropna())

135 diagnosed cases (56% of the 239) for 429 (41%) of all patients

### Genomics

In [None]:
def get_gene_data(filename,var_or_gene):
    """Retrieve genetic data from a text file (formatted from JSON file)
    Parameters: filename: string, name of the text file with the genetic information
                var_or_gene: string, text found before the first "<" on the lines that open a new variant or gene 
                             entry (the notebook passes "Var" for variants and "Gene" for genes)
    Returns: genomic_data: dictionary with UDN ID as key and list of dictionaries as value, each dictionary containing 
                           information about genes or variants; only the patients present in the notebook-level
                           patient_phen are kept
    Prints the number of patients before and after that filtering
    """
    genomic_data={}
    with open(filename,"r") as pg:
        for line in pg:
            marker=line.split("<")[0]
            fields=line.split(" ")
            if marker=="ID":
                # patient header line: the ID is the 4th space-separated token
                pid=fields[3].split("\n")[0]
                genomic_data[pid]=[]
            elif marker==var_or_gene:
                # entry header line: the 2nd token is the index of the entry in the patient's list
                var=int(fields[1].split("\n")[0])
                genomic_data[pid].append({})
            elif len(fields)!=1:
                # "key value" line describing the current entry of the current patient
                genomic_data[pid][var][fields[0].split("\n")[0]]=fields[1].split("\n")[0]
    print(len(genomic_data))
    # single pass over the patients instead of copying the whole dictionary once per removed patient
    genomic_data={patient: entries for patient,entries in genomic_data.items() if patient in patient_phen}
    print(len(genomic_data))
    return genomic_data

In [None]:
variants=get_gene_data("patient_genomic.txt","Var")

In [None]:
genes=get_gene_data("patient_genes.txt","Gene")

In [None]:
# get the list of patients that present a candidate gene or candidate variants
list_patient_genes=list(genes.keys())
list_patient_variants=list(variants.keys())

In [None]:
# overlap between the patients with candidate genes and the patients with candidate variants
# the two patient lists are turned into sets once, so that each membership test is constant-time
patients_with_genes,patients_with_variants=set(list_patient_genes),set(list_patient_variants)
print("Patients in both", len([patient for patient in patient_phen if patient in patients_with_genes and patient in patients_with_variants]))
print("Patients with only genes", len([patient for patient in patient_phen if patient in patients_with_genes and not(patient in patients_with_variants)]))
# NOTE(review): this line prints the list of patients, not its length like the two lines above -- confirm it is intended
print("Patients with only variants", [patient for patient in patient_phen if not(patient in patients_with_genes) and patient in patients_with_variants])

In [None]:
# count the number of solved cases for people with an indicated gene or an indicated variant
print("Number of solved and unsolved cases for genes indicated : ",collec.Counter(status.loc[list(genes.keys())]["\\13_Status\\"]))
print("Number of solved and unsolved cases for variants indicated : ",collec.Counter(status.loc[list(variants.keys())]["\\13_Status\\"]))

In [None]:
def get_dist_genomic(genomic_data,var_or_gene):
    """Get the distribution associated to genomic data for its characteristics
    Parameters: genomic_data: dictionary, with UDN ID as key and list with dictionaries as value, dict contaning characteristics
                              of the considered genomic data
                var_or_gene: string, "Var" to count the "effect" of variants, "Gen" to count the "status" of genes
    Returns: gene_effects: counter, with distribution of characteristics for selected genomic data; an entry that 
                           lacks the characteristic or the "gene" key is counted under "NA"
    Raises: ValueError if var_or_gene is neither "Var" nor "Gen"
    """
    if var_or_gene=="Var":
        field="effect"
    elif var_or_gene=="Gen":
        field="status"
    else:
        # reported once, up front, instead of once per entry
        raise ValueError("var_or_gene must be Var or Gen")
    gene_effects=collec.Counter()
    for entries in genomic_data.values():
        for entry in entries:
            # the characteristic only counts when the entry also names its gene; the "gene" value itself is not
            # needed for the tally, so an entry without it falls under "NA" instead of raising a KeyError
            if field in entry and "gene" in entry:
                gene_effects[entry[field]]+=1
            else:
                gene_effects["NA"]+=1
    return gene_effects

In [None]:
# get the count of mutation types for candidate variants
gene_effects=get_dist_genomic(variants,"Var")
gene_effects

In [None]:
# get the distribution of gene status for candidate genes
gene_status=get_dist_genomic(genes,"Gen")
gene_status

In [None]:
def plot_distribution_genomic_data(genomic_data,namefile,var_or_gene):
    """Show the distribution of counts of candidate genes or variant per patient in the UDN database
    Parameters: genomic_data: dictionary, with UDN ID as key and list with dictionaries as value, dict contaning characteristics
                              of the considered genomic data
                namefile: string, name of the file to save the figure in 
                var_or_gene: string used in the printed message, the title and the x-axis label (the notebook passes 
                             "variants" or "genes")
    Returns: None
    Show the distribution in a scatter plot and the counter, as well as total number of candidate genes/variants
    """
    count_gene_per_patient=collec.Counter(len(entries) for entries in genomic_data.values())
    print(count_gene_per_patient)
    X_gene=list(count_gene_per_patient)
    Y_gene=[count_gene_per_patient[ct] for ct in X_gene]
    print("Number of total candidate ",var_or_gene," : ",np.sum([x*y for x,y in zip(X_gene,Y_gene)]))
    plt.figure(figsize=(10,5))
    plt.plot(X_gene,Y_gene,"o")
    # ticks from 0 to at least 17, extended when a patient has more candidates so that every point gets a tick
    plt.xticks(np.arange(0,max(18,max(X_gene,default=0)+1)))
    plt.title("Distribution of number of candidate "+var_or_gene+" per patient")
    plt.xlabel("Number of candidate "+var_or_gene)
    plt.ylabel("Count of patients")
    plt.savefig(namefile,bbox_inches="tight",dpi=300)
    plt.show()

In [None]:
plot_distribution_genomic_data(variants,"Count_dist_var_per_pat_2.png","variants")

In [None]:
plot_distribution_genomic_data(genes,"Count_genes_per_pat_1.png","genes")

### Statistics

In [None]:
from scipy.stats import mannwhitneyu

All results are shown using the Mann-Whitney U statistic. The closer the statistic is to 0, the more the two distributions differ. This can also be assessed with the p-value: if p < 0.05, the distributions are considered significantly different.

#### Diagnosed vs undiagnosed

In [None]:
print("Age at UDN Evaluation, adult")
mannwhitneyu(np.array(demographics_adult_diagnosed["\\00_Demographics\\Age at UDN Evaluation (in years)\\"]),np.array(demographics_adult_undiagnosed["\\00_Demographics\\Age at UDN Evaluation (in years)\\"]))

In [None]:
print("Age at UDN Evaluation, pediatric")
mannwhitneyu(np.array(demographics_pediatric_diagnosed["\\00_Demographics\\Age at UDN Evaluation (in years)\\"]),np.array(demographics_pediatric_undiagnosed["\\00_Demographics\\Age at UDN Evaluation (in years)\\"]))

In [None]:
print("Age at symptom onset, adult")
# missing onset ages cannot be ranked, so they are removed from both samples before the test
mannwhitneyu(np.array(demographics_adult_diagnosed["\\00_Demographics\\Age at symptom onset in years\\"].dropna()),np.array(demographics_adult_undiagnosed["\\00_Demographics\\Age at symptom onset in years\\"].dropna()))


In [None]:
print("Age at symptom onset, pediatric")
# missing onset ages cannot be ranked, so they are removed from both samples before the test
mannwhitneyu(np.array(demographics_pediatric_diagnosed["\\00_Demographics\\Age at symptom onset in years\\"].dropna()),np.array(demographics_pediatric_undiagnosed["\\00_Demographics\\Age at symptom onset in years\\"].dropna()))


In [None]:
# compares the per-category primary symptom counts of diagnosed vs undiagnosed adults, expressed as percentages
# NOTE(review): pscount_ad / pscount_and are adult-only counts but they are scaled by the sizes of the whole
# diagnosed / undiagnosed lists (adult + pediatric) -- confirm the intended denominators
print("Primary symptoms, adults")
print(np.median(np.multiply(list(pscount_ad),1/len(list_diagnosed_phen)*100)),np.median(np.multiply(list(pscount_and),1/len(list_undiagnosed_phen)*100)))
mannwhitneyu(np.multiply(list(pscount_ad),1/len(list_diagnosed_phen)*100),np.multiply(list(pscount_and),1/len(list_undiagnosed_phen)*100))

In [None]:
# compares the per-category primary symptom counts of diagnosed vs undiagnosed pediatric patients, as percentages
# NOTE(review): pscount_pd / pscount_pnd are pediatric-only counts but they are scaled by the sizes of the whole
# diagnosed / undiagnosed lists (adult + pediatric) -- confirm the intended denominators
print("Primary symptoms, pediatric")
print(np.median(np.multiply(list(pscount_pd),1/len(list_diagnosed_phen)*100)),np.median(np.multiply(list(pscount_pnd),1/len(list_undiagnosed_phen)*100)))
mannwhitneyu(np.multiply(list(pscount_pd),1/len(list_diagnosed_phen)*100),np.multiply(list(pscount_pnd),1/len(list_undiagnosed_phen)*100))

In [None]:
# compares the per-site patient counts of diagnosed vs undiagnosed adults, expressed as percentages
# NOTE(review): cscount_ad / cscount_and are presumably adult-only counts, yet they are scaled by the sizes of the
# whole diagnosed / undiagnosed lists -- confirm the intended denominators
print("Clinical sites, adults")
print(np.median(np.multiply(list(cscount_ad),1/len(list_diagnosed_phen)*100)),np.median(np.multiply(list(cscount_and),1/len(list_undiagnosed_phen)*100)))
mannwhitneyu(np.multiply(list(cscount_ad),1/len(list_diagnosed_phen)*100),np.multiply(list(cscount_and),1/len(list_undiagnosed_phen)*100))

In [None]:
# compares the per-site patient counts of diagnosed vs undiagnosed pediatric patients, expressed as percentages
# NOTE(review): cscount_pd / cscount_pnd are presumably pediatric-only counts, yet they are scaled by the sizes of
# the whole diagnosed / undiagnosed lists -- confirm the intended denominators
print("Clinical sites, pediatric")
print(np.median(np.multiply(list(cscount_pd),1/len(list_diagnosed_phen)*100)),np.median(np.multiply(list(cscount_pnd),1/len(list_undiagnosed_phen)*100)))
mannwhitneyu(np.multiply(list(cscount_pd),1/len(list_diagnosed_phen)*100),np.multiply(list(cscount_pnd),1/len(list_undiagnosed_phen)*100))

# Clustering


In [None]:
# get the index of unique phenotypes in the phenotype Dataframe
# a phenotype is identified by the last component of its header; only its first occurrence is kept
mat_phen_ind=[]
uniquep=[]
# same names as uniquep, kept as a set so that the "already seen" test does not scan the whole list each time
seen_phen=set()
for i,phen in enumerate(header_phen):
    phen_name=phen.split("\\")[-2]
    if phen_name not in seen_phen:
        seen_phen.add(phen_name)
        mat_phen_ind.append(i)
        uniquep.append(phen_name)
len(mat_phen_ind)

In [None]:
# take the patient ID column out of the phenotype dataframe
matrix_phen=phenotypes.drop("Patient ID",axis=1)

In [None]:
# transform the phenotype dataframe to obtain a matrix of unique phenotypes, with only patients that have been evaluated,
# with 1 if the phenotype is positively present, 0 if negative or NaN
mat_phen=matrix_phen.iloc[:,mat_phen_ind]
mat_phen=mat_phen.loc[list(patient_phen.keys())]
mat_phen=mat_phen.replace(to_replace={"Positive": 1, "Negative": 0, np.nan: 0})

In [None]:
# same transformation as for the whole cohort, restricted to the adult patients: matrix of unique phenotypes,
# with 1 if the phenotype is positively present, 0 if negative or NaN
mat_phen_adult=matrix_phen.iloc[:,mat_phen_ind]
mat_phen_adult=mat_phen_adult.loc[adult_patients]
mat_phen_adult=mat_phen_adult.replace(to_replace={"Positive": 1, "Negative": 0, np.nan: 0})

In [None]:
# same transformation as for the whole cohort, restricted to the pediatric patients: matrix of unique phenotypes,
# with 1 if the phenotype is positively present, 0 if negative or NaN
mat_phen_pediatric=matrix_phen.iloc[:,mat_phen_ind]
mat_phen_pediatric=mat_phen_pediatric.loc[pediatric_patients]
mat_phen_pediatric=mat_phen_pediatric.replace(to_replace={"Positive": 1, "Negative": 0, np.nan: 0})

In [None]:
# The matrix is comprised of 1042 patients with at least 1 phenotype, and 3965 unique phenotypes
mat_phen.shape

In [None]:
# The matrix is comprised of 232 patients with at least 1 phenotype, and 3965 unique phenotypes
mat_phen_adult.shape

In [None]:
# The matrix is comprised of 809 patients with at least 1 phenotype, and 3965 unique phenotypes
mat_phen_pediatric.shape

In [None]:
# we compute the jaccard similarity matrix for the phenotypic matrix, total patients
from sklearn.metrics.pairwise import pairwise_distances
jac_sim_un = 1 - pairwise_distances(mat_phen, metric = "jaccard")

In [None]:
# we compute the jaccard similarity matrix for the phenotypic matrix, adult patients
jac_sim_un_adult = 1 - pairwise_distances(mat_phen_adult, metric = "jaccard")

In [None]:
# we compute the jaccard similarity matrix for the phenotypic matrix, pediatric patients
jac_sim_un_pediatric = 1 - pairwise_distances(mat_phen_pediatric, metric = "jaccard")

In [None]:
def graph_of_patients_js(UDN_IDs,sim_matrix):
    """Builds the weighted graph of UDN patients from a pairwise similarity matrix: one node per patient, and one
    edge for every pair (i, j) with j >= i, weighted by sim_matrix[i, j]. The pair of a patient with itself is 
    included, and no pair is filtered out on its weight.
    Parameters: UDN_IDs: list of UDN IDs of patients to consider, in the order of the rows of sim_matrix
                sim_matrix: array, similarity matrix of pairwise similarity between each patient
    Returns : G: networkx graph of UDN patients 
              pos: positions of nodes, as computed by the 2-D spring layout of networkx
    """
    print("udnlen",len(UDN_IDs))
    print("cslen",len(sim_matrix))
    n_rows=sim_matrix.shape[0]
    G=nx.Graph()
    # nodes first, in the order of UDN_IDs, then the upper triangle (diagonal included) as weighted edges
    G.add_nodes_from(UDN_IDs[row] for row in range(n_rows))
    G.add_weighted_edges_from(
        (UDN_IDs[row],UDN_IDs[col],sim_matrix[row,col])
        for row in range(n_rows)
        for col in range(row,sim_matrix.shape[1])
    )
    pos=nx.spring_layout(G,dim=2)
    return G,pos

In [None]:
graph_un,pos_un=graph_of_patients_js(list(patient_phen.keys()),jac_sim_un)

In [None]:
graph_un_adult,pos_un_adult=graph_of_patients_js(adult_patients,jac_sim_un_adult)

In [None]:
graph_un_pediatric,pos_un_pediatric=graph_of_patients_js(pediatric_patients,jac_sim_un_pediatric)

In [None]:
# writes the computed graph in a gml format, to be able to use Gephi to analyze it further
nx.write_gml(graph_un,"graph_un.gml")

In [None]:
# writes the computed graph in a gml format, to be able to use Gephi to analyze it further
nx.write_gml(graph_un_adult,"graph_un_adult.gml")

In [None]:
# writes the computed graph in a gml format, to be able to use Gephi to analyze it further
nx.write_gml(graph_un_pediatric,"graph_un_pediatric.gml")

In [None]:
def compute_clusters_community(graph,resolution=1.0):
    """Compute the clusters in a graph using Louvain's community detection method
    Parameters : graph: networkx graph of UDN patients computed using the pairwise similarity between patients
                 resolution: float, passed as is to community_louvain.best_partition. Defaults to 1.0 (the default 
                             of best_partition itself) so that the function can be called with the graph only
    Returns: clusters: dictionary with the cluster number as key and a list containing all the patients in the cluster
                       as value
    Prints the size of every cluster and the number of clusters made of a single patient
    """
    partition = community_louvain.best_partition(graph,resolution=resolution)
    print("Partition done")
    # partition maps each node to its cluster number: invert it into cluster number -> list of nodes
    clusters={}
    for node,cluster_id in partition.items():
        clusters.setdefault(cluster_id,[]).append(node)
    count=0
    for cluster,members in clusters.items():
        print("Length of cluster ",cluster,":",len(members))
        if len(members)==1:
            count+=1
    print("Number of clusters with only one patient (outliers) :",count)
    return clusters

In [None]:
clusters_un=compute_clusters_community(graph_un)

In [None]:
clusters_un_adult_test=compute_clusters_community(graph_un_adult,3.0)

In [None]:
clusters_un_pediatric_test=compute_clusters_community(graph_un_pediatric,1.2)

In [None]:
# we compute the indices of clusters with more than 2 patients, the indices of pairs and the indices of groups with only 1
ind_groups=[cluster for cluster in clusters_un if len(clusters_un[cluster])>2]
ind_pairs=[cluster for cluster in clusters_un if len(clusters_un[cluster])==2]
ind_outliers=[cluster for cluster in clusters_un if len(clusters_un[cluster])==1]
ind_groups,ind_pairs,ind_outliers

In [None]:
# we compute the indices for different types of groups: more than 2 or 3, only 2 or 3, more than 10...
# adult network: "large" groups have 10 patients or more, "small" groups have 3 to 9 patients, pairs have exactly 2
# pediatric network: groups have 3 patients or more, pairs have exactly 2
# NOTE(review): clusters_un_adult and clusters_un_pediatric are not assigned in the nearby cells, which store the
# clustering results in clusters_un_adult_test / clusters_un_pediatric_test -- confirm they are defined elsewhere
ind_lg_groups_adult=[cluster for cluster in clusters_un_adult if len(clusters_un_adult[cluster])>9]
ind_sm_groups_adult=[cluster for cluster in clusters_un_adult if len(clusters_un_adult[cluster])<10 and len(clusters_un_adult[cluster])>2]
ind_groups_pediatric=[cluster for cluster in clusters_un_pediatric if len(clusters_un_pediatric[cluster])>2]
ind_pairs_adult=[cluster for cluster in clusters_un_adult if len(clusters_un_adult[cluster])==2]
ind_pairs_pediatric=[cluster for cluster in clusters_un_pediatric if len(clusters_un_pediatric[cluster])==2]
# NOTE(review): the cluster numbers below are hard-coded; presumably they were read off one specific clustering
# run -- verify them against the current content of clusters_un_adult_test / clusters_un_pediatric_test
ind_groups_adult_test=[0,1,2,3]
ind_other_adult_test=[4,13]
ind_groups_pediatric_test=[0,1,2,3,4]
ind_other_pediatric_test=[5,6,7,8,9,10,12]
print("Lg groups adult network : ",ind_lg_groups_adult)
print("Sm groups adult network : ",ind_sm_groups_adult)
print("Pairs adult network : ",ind_pairs_adult)
print("Clusters pediatric network : ",ind_groups_pediatric)
print("Pairs pediatric network : ",ind_pairs_pediatric)

### Pair analysis

In [None]:
# Prints, for each small cluster, the positive phenotypes that its patients have in common
def pair_analysis(clusters_un,ind_pairs,patient_phen):
    """Prints the positive phenotypes shared by the patients of clusters of 2 or 3 patients
    Parameters: clusters_un: dictionary with the cluster number as key and the list of patients in the cluster as value
                ind_pairs: list of the cluster numbers to analyze
                patient_phen: dictionary with patients as keys, with values being dictionaries whose "pos" key holds
                              the positive phenotypes of the patient
    Returns: None
    For a cluster of 2 patients the common phenotypes are printed; for a cluster of 3 patients the phenotypes 
    common to all three and to each pair are printed; for any other size only the cluster header is printed
    """
    def shared_pos(*members):
        # positive phenotypes presented by every one of the given patients
        return set.intersection(*(set(patient_phen[member]["pos"]) for member in members))

    for cluster in ind_pairs:
        print("Cluster C",cluster)
        members=clusters_un[cluster]
        if len(members)==2:
            print(shared_pos(members[0],members[1]))
        elif len(members)==3:
            first,second,third=members[0],members[1],members[2]
            print("all : ",shared_pos(first,second,third))
            print("0 and 1 : ",shared_pos(first,second))
            print("1 and 2 : ",shared_pos(second,third))
            print("0 and 2 : ",shared_pos(first,third))



In [None]:
# Print the shared phenotypes of the small clusters (ind_other_*_test) of the adult and pediatric test clusterings
print("Adult network")
pair_analysis(clusters_un_adult_test,ind_other_adult_test,patient_phen)
print("---------------------")
print("Pediatric network")
pair_analysis(clusters_un_pediatric_test,ind_other_pediatric_test,patient_phen)

In [None]:
# Average number of HPO terms per patient in each small cluster (ind_other_*_test), then the average of those
# cluster averages with its 95% CI
# NOTE(review): HPO_count_adult_test and HPO_count_pediatric_test are assigned in a later cell (get_HPO_count calls),
# which therefore has to be run before this one
all_HPO_counts_sm=[]
print("Adult clusters")
for i in ind_other_adult_test:
    all_HPO_counts_sm.append(np.average(HPO_count_adult_test[i]))
    print("Cluster C",i," : ",np.average(HPO_count_adult_test[i]))
print("Pediatric clusters")
for i in ind_other_pediatric_test:
    all_HPO_counts_sm.append(np.average(HPO_count_pediatric_test[i]))
    print("Cluster C",i," : ",np.average(HPO_count_pediatric_test[i]))
# the overall figure is an unweighted mean over clusters, not over patients
print("Overall average : ",np.average(all_HPO_counts_sm)," CI 95% : ",get_CI(all_HPO_counts_sm))

### Cluster analysis

In [None]:
def calculate_diag_OR(clusters,clusters_ind,status):
    """Calculate, for each cluster, the Odds Ratio (OR) of being diagnosed when in the cluster versus in any other
    considered cluster, together with its 95% confidence interval
    Parameters: clusters: dictionary with cluster number as key and list of patients in cluster as value
                clusters_ind: list, indices of cluster to take into account 
                status: dataframe indexed by patient, whose status column is "solved" for diagnosed patients
    Returns: OR_diag: dictionary with cluster number as key and the Odds Ratio (OR) for each cluster
             IC: dictionary with cluster number as key and a dictionary {"up": upper bound, "low": lower bound} of
                 the 95% CI as value; both bounds are None when a cell of the 2x2 table is empty
    The 2x2 table of a cluster is: a diagnosed in the cluster, b not diagnosed in the cluster, c diagnosed in the
    other clusters, d not diagnosed in the other clusters. OR = (a/c)/(b/d), CI = exp(log(OR) +/- 1.96*SE) with
    SE = sqrt(1/a+1/b+1/c+1/d).
    """
    status_column="\\13_Status\\"
    diag={cluster: sum(1 for patient in clusters[cluster] if status.loc[patient][status_column]=="solved")
          for cluster in clusters_ind}
    undiag={cluster: len(clusters[cluster])-diag[cluster] for cluster in clusters_ind}
    total_diag,total_undiag=sum(diag.values()),sum(undiag.values())
    OR_diag,IC={},{}
    for cluster in diag:
        a,b=diag[cluster],undiag[cluster]
        # the reference group is made of the OTHER clusters only, for diagnosed and not diagnosed patients alike
        # (the not diagnosed count used to include the cluster itself, which inflated the OR)
        c,d=total_diag-a,total_undiag-b
        # numpy floats so that an empty cell gives inf/nan (with a warning) instead of raising ZeroDivisionError
        OR_diag[cluster]=(np.float64(a)/c)/(np.float64(b)/d)
        if 0 in (a,b,c,d):
            # the standard error is undefined as soon as one cell of the table is empty
            IC[cluster]={"up": None,"low": None}
        else:
            half_width=1.96*np.sqrt(1/a+1/b+1/c+1/d)
            IC[cluster]={"up": np.exp(np.log(OR_diag[cluster])+half_width)
                        ,"low": np.exp(np.log(OR_diag[cluster])-half_width)}
    return OR_diag,IC

In [None]:
# Odds ratios of being diagnosed, with their 95% CI, for the large adult clusters (more than 9 patients)
OR_diag_lg_adult,IC_lg_adult=calculate_diag_OR(clusters_un_adult,ind_lg_groups_adult,status)

In [None]:
# Odds ratios of being diagnosed, with their 95% CI, for the small adult clusters (3 to 9 patients)
OR_diag_sm_adult,IC_sm_adult=calculate_diag_OR(clusters_un_adult,ind_sm_groups_adult,status)

In [None]:
# Odds ratios of being diagnosed, with their 95% CI, for the pediatric clusters of more than 2 patients
OR_diag_pediatric,IC_pediatric=calculate_diag_OR(clusters_un_pediatric,ind_groups_pediatric,status)

In [None]:
# Odds ratios of being diagnosed, with their 95% CI, for the hard-coded clusters of the adult test clustering
OR_diag_adult_test,IC_adult_test=calculate_diag_OR(clusters_un_adult_test,ind_groups_adult_test,status)

In [None]:
# Odds ratios of being diagnosed, with their 95% CI, for the hard-coded clusters of the pediatric test clustering
OR_diag_pediatric_test,IC_pediatric_test=calculate_diag_OR(clusters_un_pediatric_test,ind_groups_pediatric_test,status)

In [None]:
def phenotype_enrichment_analysis(patients_clustered,patient_phen,polarity_HPO):
    """Rank the phenotypes of a cluster by the proportion of its patients presenting them, for a given polarity
    Parameters: patients_clustered: list of patients in the cluster 
                patient_phen: dictionary with patient as key and dictionary as value, with key "pos" or "neg" and
                              list of unique phenotypes with positive or negative association as value
                polarity_HPO: string, "pos" or "neg", polarity used for the ranking
    Returns: phen_ranked: array of phenotypes, from the most to the least represented in the cluster
             values: array of the proportion of patients of the cluster presenting each phenotype, in the same
                     order (values[i] is the proportion for phen_ranked[i])
    """
    proportions={}
    for patient in patients_clustered:
        # every patient weighs the same share of the cluster
        share=1/len(patients_clustered)
        for phen in patient_phen[patient][polarity_HPO]:
            proportions[phen]=proportions.get(phen,0)+share
    names=np.array(list(proportions))
    shares=np.array([proportions[name] for name in names])
    # argsort is ascending, so it is reversed to get the most represented phenotype first
    order=np.argsort(shares)[::-1]
    return names[order],shares[order]

In [None]:
def get_HPO_count(patients_clustered,HPO_terms):
    """Collect the number of HPO terms of the patients of each cluster, with its average and 95% CI
    Parameters: patients_clustered: dictionary with cluster number as key and list of patients in the cluster as value
                HPO_terms: dictionary with patient as key and count of HPO terms for the patient as value
    Returns: HPO_cluster: dictionary with cluster number as key and list of HPO counts of the patients as value
             avg_HPO_clusters: dictionary with cluster number as key and average number of HPO terms per patient as value
             CI_HPO_clusters: dictionary with cluster number as key and 95% CI (from get_CI) of that average as value
    """
    HPO_cluster={cluster: [HPO_terms[patient] for patient in members]
                 for cluster,members in patients_clustered.items()}
    avg_HPO_clusters={cluster: np.average(counts) for cluster,counts in HPO_cluster.items()}
    CI_HPO_clusters={cluster: get_CI(counts) for cluster,counts in HPO_cluster.items()}
    return HPO_cluster,avg_HPO_clusters,CI_HPO_clusters

In [None]:
# HPO term counts per patient, their average and 95% CI, for every cluster of the adult and pediatric clusterings
HPO_count_adult,avg_HPO_clusters_adult,CI_HPO_clusters_adult=get_HPO_count(clusters_un_adult,HPO_terms)
HPO_count_pediatric,avg_HPO_clusters_pediatric,CI_HPO_clusters_pediatric=get_HPO_count(clusters_un_pediatric,HPO_terms)

In [None]:
# Same HPO term statistics for every cluster of the adult and pediatric test clusterings
HPO_count_adult_test,avg_HPO_clusters_adult_test,CI_HPO_clusters_adult_test=get_HPO_count(clusters_un_adult_test,HPO_terms)
HPO_count_pediatric_test,avg_HPO_clusters_pediatric_test,CI_HPO_clusters_pediatric_test=get_HPO_count(clusters_un_pediatric_test,HPO_terms)

In [None]:
def get_phen_ranked(clusters_un,ind_groups):
    """Rank the positively and negatively associated phenotypes of the patients of each cluster
    Parameters: clusters_un: dictionary with cluster number as key and list of patients in the cluster as value
                ind_groups: list, indices of the clusters to analyse
    Returns: phen_ranked_pos: dictionary with cluster number as key and the result of phenotype_enrichment_analysis
                              for the "pos" polarity as value (ranked phenotypes and their composition in the cluster)
             phen_ranked_neg: same for the "neg" polarity
    The patient_phen dictionary is not a parameter: it is read from the notebook (global) scope.
    """
    def ranked(polarity):
        return {cluster: phenotype_enrichment_analysis(clusters_un[cluster],patient_phen,polarity)
                for cluster in ind_groups}
    return ranked("pos"),ranked("neg")

In [None]:
# Ranked positive and negative phenotypes for the large adult, small adult and pediatric clusters
phen_ranked_pos_lg_adult,phen_ranked_neg_lg_adult=get_phen_ranked(clusters_un_adult,ind_lg_groups_adult)
phen_ranked_pos_sm_adult,phen_ranked_neg_sm_adult=get_phen_ranked(clusters_un_adult,ind_sm_groups_adult)
phen_ranked_pos_pediatric,phen_ranked_neg_pediatric=get_phen_ranked(clusters_un_pediatric,ind_groups_pediatric)

In [None]:
# Ranked positive and negative phenotypes for the hard-coded clusters of the adult and pediatric test clusterings
phen_ranked_pos_TEST_adult,phen_ranked_neg_TEST_adult=get_phen_ranked(clusters_un_adult_test,ind_groups_adult_test)
phen_ranked_pos_TEST_pediatric,phen_ranked_neg_TEST_pediatric=get_phen_ranked(clusters_un_pediatric_test,ind_groups_pediatric_test)

In [None]:
def show_best_phenotypes_clusters(phen_ranked,nb,clusters_un):
    """Shows the nb best ranked phenotypes for each cluster that has ranked phenotypes
    Parameters: phen_ranked: dictionary with cluster number as key, two arrays as value, one with list of phenotypes 
                             ranked according to composition, second with composition of each phenotype
                nb: int, number of best phenotypes to show
                clusters_un: dictionary with cluster number as key and list of patients in the cluster as value,
                             used to print the size of each cluster
    Returns: None
    Shows the nb best phenotypes for each cluster with their composition (all of them when the cluster has
    fewer than nb ranked phenotypes)
    """
    for cluster in phen_ranked:
        phenotypes,composition=phen_ranked[cluster][0],phen_ranked[cluster][1]
        print("Cluster ",cluster)
        print("Cluster len ",len(clusters_un[cluster]))
        # never go past the available phenotypes (the limit used to be compared with a hard-coded 10 instead of nb)
        for i in range(min(nb,len(phenotypes))):
            print(phenotypes[i],composition[i])

In [None]:
# Print the best positively associated phenotypes of each adult test cluster (nb=5)
show_best_phenotypes_clusters(phen_ranked_pos_TEST_adult,5,clusters_un_adult_test)

In [None]:
def heatmap_phen(clusters_un,phen_ranked,ind_groups,ad_or_ped,nb_phen,figsize,vmin,vmax,figname):
    """Displays heatmap of phenotype enrichment analysis for each cluster with analyzed composition
    Parameters: clusters_un: dictionary with cluster number as key and list of patients in the cluster as value
                phen_ranked: dictionary with cluster number as key, two arrays as value, one with list of phenotypes 
                             ranked according to composition, second with composition of each phenotype
                ind_groups: list of int, indices to take into consideration
                ad_or_ped: str, "adult" or "pediatric", changes the display
                nb_phen: int, number of best phen to display
                figsize: int, size of the figure displayed
                vmin: int, minimum value for the heatmap (here percentage)
                vmax: int, max value for the heatmap (here percentage)
                figname: str, name under which you save the heatmap
    Returns: None
    Raises: ValueError if ad_or_ped is neither "adult" nor "pediatric"
    Shows the heatmap of phenotype enrichment analysis for each cluster and saves it as figname.png
    """
    suffix={"adult": "A","pediatric": "P"}.get(ad_or_ped)
    if suffix is None:
        # an unknown value used to fail only at plotting time, on an undefined cluster_list
        raise ValueError("ad_or_ped can only be adult or pediatric")
    # clusters are labelled from C1 while their keys start at 0
    cluster_list=["Cluster C"+str(cluster+1)+suffix+", N="+str(len(clusters_un[cluster])) for cluster in ind_groups]
    # rows of the heatmap: for each cluster, its nb_phen best phenotypes that no previous cluster already listed
    list_phen_max=[]
    for cluster in ind_groups:
        added=0
        # iterating the ranked phenotypes stops cleanly when a cluster has fewer than nb_phen new phenotypes
        # (an index-based loop used to run past the end of the array in that case)
        for phen in phen_ranked[cluster][0]:
            if added>=nb_phen:
                break
            if phen not in list_phen_max:
                list_phen_max.append(phen)
                added+=1
    # one row per phenotype, one column per cluster, values in percent of the patients of the cluster
    heatmap_mat=[]
    for phen in list_phen_max:
        row=[]
        for cluster in ind_groups:
            if phen in phen_ranked[cluster][0]:
                indphen=np.where(phen_ranked[cluster][0]==phen)[0][0]
                row.append(phen_ranked[cluster][1][indphen]*100)
            else:
                # phenotype absent from the cluster
                row.append(0)
        heatmap_mat.append(row)
    sns.set()
    fig,ax=plt.subplots(figsize=(figsize,figsize))
    sns.heatmap(heatmap_mat,cbar=True,cmap="YlGnBu",xticklabels=cluster_list,yticklabels=list_phen_max,ax=ax,vmin=vmin,vmax=vmax)
    plt.ylabel("Phenotypes")
    plt.savefig(figname+".png",bbox_inches="tight",dpi=350)
    plt.show()

In [None]:
# heatmap for positive associations, adult test clusters: 5 best phenotypes per cluster, figure size 12,
# color scale from 0 to 50 (percent), saved as heatmap_adult_clusters_test_1.png
heatmap_phen(clusters_un_adult_test,phen_ranked_pos_TEST_adult,ind_groups_adult_test,"adult",5,12,0,50,"heatmap_adult_clusters_test_1")

In [None]:
# heatmap for positive associations, pediatric test clusters: 5 best phenotypes per cluster, figure size 12,
# color scale from 0 to 75 (percent), saved as heatmap_pediatric_clusters_test_1.png
heatmap_phen(clusters_un_pediatric_test,phen_ranked_pos_TEST_pediatric,ind_groups_pediatric_test,"pediatric",5,12,0,75,"heatmap_pediatric_clusters_test_1")

In [None]:
def metadata_collection(patients_clustered,metadata):
    """Gather, cluster by cluster, the metadata of the patients
    Parameters: patients_clustered: dictionary with cluster number as key and list of patients in the cluster as value
                metadata: dataframe with metadata, indexed by patient
    Returns: metadata_clusters: dictionary with clusters as keys and dictionary as value, with key the metadata
                                considered (column of the dataframe) and list of values for patients in the cluster
                                as value
    The first column of the dataframe is not collected: its key is present but its list stays empty.
    """
    columns=list(metadata.columns)
    metadata_clusters={}
    for cl,patients in patients_clustered.items():
        collected={meta: [] for meta in columns}
        for patient in patients:
            for meta in columns[1:]:
                collected[meta].append(metadata.loc[patient][meta])
        metadata_clusters[cl]=collected
    return metadata_clusters

In [None]:
# get the demographics of the patients of every cluster, for the adult and pediatric clusterings
demographics_coll_adult=metadata_collection(clusters_un_adult,demographics)
demographics_coll_pediatric=metadata_collection(clusters_un_pediatric,demographics)

In [None]:
# get the demographics of the patients of every cluster, for the adult and pediatric test clusterings
demographics_coll_adult_test=metadata_collection(clusters_un_adult_test,demographics)
demographics_coll_pediatric_test=metadata_collection(clusters_un_pediatric_test,demographics)

In [None]:
def show_metadata_clusters(ind_groups,demographics_coll,attribute):
    """Prints the average and 95% CI of a numeric attribute (for example the age at UDN evaluation) for each cluster
    Parameters: ind_groups: list of int, indices of clusters to consider
                demographics_coll: dictionary with clusters as keys and dictionary as value, with key the metadata
                                considered and list of values for patients in the cluster as value
                attribute: str, attribute to consider (must be in demographics_coll)
    Returns: None
    Missing values (NaN) are left out of the average and of the CI
    """
    for cluster in ind_groups:
        values=np.array(demographics_coll[cluster][attribute])
        known=values[~np.isnan(values)]
        print("Cluster C",cluster,"Average ",attribute," : ",np.average(known)," CI 95% : ",get_CI(known))

In [None]:
def get_metadata_clusters(ind_groups,demographics_coll,attribute):
    """Computes the average and 95% CI of a numeric attribute for each cluster
    Parameters: ind_groups: list of int, indices of clusters to consider
                demographics_coll: dictionary with clusters as keys and dictionary as value, with key the metadata
                                considered and list of values for patients in the cluster as value
                attribute: str, attribute to consider (must be in demographics_coll)
    Returns: avg_att: dictionary with clusters as keys and average of considered attribute as value
             CI_att: dictionary with clusters as keys and the 95% CI returned by get_CI as value
    Missing values (NaN) are left out of the average and of the CI
    """
    avg_att,CI_att={},{}
    for cluster in ind_groups:
        values=np.array(demographics_coll[cluster][attribute])
        known=values[~np.isnan(values)]
        avg_att[cluster],CI_att[cluster]=np.average(known),get_CI(known)
    return avg_att,CI_att

In [None]:
# Average and 95% CI of the age at symptom onset and of the age at UDN evaluation, per adult cluster
# NOTE(review): ind_groups_adult is not assigned in the nearby cells (only ind_lg_groups_adult and
# ind_sm_groups_adult are) -- confirm it is defined elsewhere in the notebook
avg_onset_adult,CI_onset_adult=get_metadata_clusters(ind_groups_adult,demographics_coll_adult,'\\00_Demographics\\Age at symptom onset in years\\')
avg_UDN_eval_adult,CI_UDN_eval_adult=get_metadata_clusters(ind_groups_adult,demographics_coll_adult,'\\00_Demographics\\Age at UDN Evaluation (in years)\\')

In [None]:
# Average and 95% CI of the age at symptom onset and of the age at UDN evaluation, per adult test cluster
avg_onset_adult_test,CI_onset_adult_test=get_metadata_clusters(ind_groups_adult_test,demographics_coll_adult_test,'\\00_Demographics\\Age at symptom onset in years\\')
avg_UDN_eval_adult_test,CI_UDN_eval_adult_test=get_metadata_clusters(ind_groups_adult_test,demographics_coll_adult_test,'\\00_Demographics\\Age at UDN Evaluation (in years)\\')

In [None]:
# Average and 95% CI of the age at symptom onset and of the age at UDN evaluation, per pediatric cluster
avg_onset_pediatric,CI_onset_pediatric=get_metadata_clusters(ind_groups_pediatric,demographics_coll_pediatric,'\\00_Demographics\\Age at symptom onset in years\\')
avg_UDN_eval_pediatric,CI_UDN_eval_pediatric=get_metadata_clusters(ind_groups_pediatric,demographics_coll_pediatric,'\\00_Demographics\\Age at UDN Evaluation (in years)\\')

In [None]:
# Average and 95% CI of the age at symptom onset and of the age at UDN evaluation, per pediatric test cluster
avg_onset_pediatric_test,CI_onset_pediatric_test=get_metadata_clusters(ind_groups_pediatric_test,demographics_coll_pediatric_test,'\\00_Demographics\\Age at symptom onset in years\\')
avg_UDN_eval_pediatric_test,CI_UDN_eval_pediatric_test=get_metadata_clusters(ind_groups_pediatric_test,demographics_coll_pediatric_test,'\\00_Demographics\\Age at UDN Evaluation (in years)\\')

In [None]:
# Print the average age at symptom onset and its 95% CI for each pediatric cluster of more than 2 patients
show_metadata_clusters(ind_groups_pediatric,demographics_coll_pediatric,'\\00_Demographics\\Age at symptom onset in years\\')

In [None]:
def get_distrib(attribute,demographics_coll):
    """Get the distribution of an attribute in each cluster
    Parameters: attribute: string, attribute we want the distribution of
                demographics_coll: dictionary with clusters as keys and dictionary as value, with key the metadata
                                considered and list of values for patients in the cluster as value
    Returns: counter: dictionary with cluster as key and dictionary as value, with each observed value of the
                      attribute as key and its number of occurrences in the cluster as value
    Missing values (NaN) of numeric attributes are not counted; a cluster without any value gets an empty dictionary
    """
    counter={}
    for cluster in demographics_coll:
        dc=np.array(demographics_coll[cluster][attribute])
        # the dtype is checked instead of the type of the first element, so that an empty list of values
        # no longer raises an IndexError
        if np.issubdtype(dc.dtype,np.floating):
            dc=dc[np.logical_not(np.isnan(dc))]
        counter[cluster]=dict(collec.Counter(dc))
    return counter

In [None]:
# Count of each gender value in every adult cluster
gender_distrib_adult=get_distrib('\\00_Demographics\\Gender\\',demographics_coll_adult)

In [None]:
# Count of each gender value in every adult test cluster
gender_distrib_adult_test=get_distrib('\\00_Demographics\\Gender\\',demographics_coll_adult_test)

In [None]:
# Count of each gender value in every pediatric cluster
gender_distrib_pediatric=get_distrib('\\00_Demographics\\Gender\\',demographics_coll_pediatric)

In [None]:
# Count of each gender value in every pediatric test cluster; the last line displays the result in the notebook
gender_distrib_pediatric_test=get_distrib('\\00_Demographics\\Gender\\',demographics_coll_pediatric_test)
gender_distrib_pediatric_test

### Cluster statistics

In [None]:
# Kruskal-Wallis test comparing the per-patient HPO term counts of the four adult test clusters (0 to 3)
kr_HPO_ad=kruskal(HPO_count_adult_test[0],HPO_count_adult_test[1],HPO_count_adult_test[2],HPO_count_adult_test[3])

In [None]:
# Kruskal-Wallis test comparing the per-patient HPO term counts of the five pediatric test clusters (0 to 4)
kr_HPO_ped=kruskal(HPO_count_pediatric_test[0],HPO_count_pediatric_test[1],HPO_count_pediatric_test[2],HPO_count_pediatric_test[3],HPO_count_pediatric_test[4])

In [None]:
def get_stats_value(value_considered,ad_or_ped,ind_clusters,demographics_coll):
    """Get the Kruskal Wallis H index and p-value for a type of demographics, compared between clusters
    Parameters: value_considered: string, type of demographics we want the KW H index and p-value
                ad_or_ped: str, "adult" or "pediatric"; only checked for validity, the clusters compared are the
                           ones listed in ind_clusters
                ind_clusters: list of int, indices of clusters to consider
                demographics_coll: dictionary with clusters as keys and dictionary as value, with key the metadata considered
                                and list of values for patients in the cluster as value
    Returns: Kruskal-Wallis statistic, 2-D tuple with H-index (0) and p-value (1), or None when ad_or_ped is
             neither "adult" nor "pediatric"
    Prints the KW H index and p-value
    """
    if ad_or_ped not in ("adult","pediatric"):
        print("ad_or_ped can only be adult or pediatric")
        return None
    samples=[]
    for i in ind_clusters:
        values=np.array(demographics_coll[i][value_considered])
        # NaN are removed from numeric attributes; checking the dtype also works for an empty list of values
        if np.issubdtype(values.dtype,np.floating):
            values=values[np.logical_not(np.isnan(values))]
        samples.append(values)
    # the test runs on the clusters of ind_clusters (cluster numbers 0..3 / 0..4 used to be hard-coded)
    # and is computed once for both the print and the return
    result=kruskal(*samples)
    print(result)
    return result

In [None]:
# Kruskal-Wallis test comparing the age at UDN evaluation between the adult test clusters
kr_UDN_ad=get_stats_value('\\00_Demographics\\Age at UDN Evaluation (in years)\\',"adult",ind_groups_adult_test,demographics_coll_adult_test)

In [None]:
# Kruskal-Wallis test comparing the age at UDN evaluation between the pediatric test clusters
kr_UDN_ped=get_stats_value('\\00_Demographics\\Age at UDN Evaluation (in years)\\',"pediatric",ind_groups_pediatric_test,demographics_coll_pediatric_test)

In [None]:
# Kruskal-Wallis test comparing the age at symptom onset between the adult test clusters
kr_onset_ad=get_stats_value('\\00_Demographics\\Age at symptom onset in years\\',"adult",ind_groups_adult_test,demographics_coll_adult_test)

In [None]:
# Kruskal-Wallis test comparing the age at symptom onset between the pediatric test clusters
kr_onset_ped=get_stats_value('\\00_Demographics\\Age at symptom onset in years\\',"pediatric",ind_groups_pediatric_test,demographics_coll_pediatric_test)

## Creating tables

In [None]:
from docx import Document
from docx.shared import Inches
# presumably the number of pediatric and adult clusters shown in the tables -- the create_table calls below pass
# these counts as literals, TODO confirm whether these globals are still used
n_ped,n_ad=5,4

def create_table(n_ad,a_or_p,clusters_un,avg_HPO_clusters,CI_HPO_clusters,gender_distrib,OR_diag,IC_OR,avg_onset,CI_onset,avg_UDN_eval,CI_UDN_eval,docname):
    """Creates a word document with automatically rendered table of cluster characteristics
    Parameters: n_ad: int, number of clusters; clusters 0 to n_ad-1 are read from every dictionary below
                a_or_p: str, "A" or "P", changes the display 
                clusters_un: dictionary with cluster as key and list of UDN IDs as value
                avg_HPO_clusters: dictionary with cluster as key and avg HPO per patient as value
                CI_HPO_clusters: dictionary with cluster as key and tuple of 95% CI for avg HPO as value
                gender_distrib: dictionary with cluster as key and count of "Female"/"Male" as value
                OR_diag: dictionary with cluster as key and OR as value
                IC_OR: dictionary with cluster as key and dictionary {"low": ..., "up": ...} of the 95% CI for OR
                       as value (a bound may be None)
                avg_onset: dictionary with cluster as key and avg onset as value
                CI_onset: dictionary with cluster as key and tuple of 95% CI for onset as value
                avg_UDN_eval: dictionary with cluster as key and avg UDN eval as value
                CI_UDN_eval: dictionary with cluster as key and tuple of 95% CI for avg UDN eval as value
                docname: str, name of document to save
    Returns: None
    Saves a word document (docname.docx) with table of cluster characteristics: one column per cluster, one row
    per characteristic. A value that is None is written as "NA".
    """
    def rounded(value):
        # np.round is used because its alias np.round_ was removed in NumPy 2.0
        return "NA" if value is None else str(np.round(value,decimals=1))

    def with_CI(value,low,up):
        return rounded(value)+" (95% CI: "+rounded(low)+" - "+rounded(up)+")"

    def gender_ratio(distrib):
        # a gender that is absent from a cluster has no entry in its distribution
        females,males=distrib.get("Female",0),distrib.get("Male",0)
        if males==0:
            # the ratio cannot be scaled to 10 males: raw counts are shown instead
            return str(females)+":0"
        return str(int(np.round(females*10/males)))+":10"

    # one (row label, function giving the cell text of a cluster) pair per row of the table, in display order
    rows=[
        ("# of patients per cluster",lambda cl: str(len(clusters_un[cl]))),
        ("Female:male ratio",lambda cl: gender_ratio(gender_distrib[cl])),
        ("Avg # of HPO terms per patient",lambda cl: with_CI(avg_HPO_clusters[cl],CI_HPO_clusters[cl][0],CI_HPO_clusters[cl][1])),
        ("Odds ratio diagnosed",lambda cl: with_CI(OR_diag[cl],IC_OR[cl]["low"],IC_OR[cl]["up"])),
        ("Average age at onset in y",lambda cl: with_CI(avg_onset[cl],CI_onset[cl][0],CI_onset[cl][1])),
        ("Average age at UDN evaluation in y",lambda cl: with_CI(avg_UDN_eval[cl],CI_UDN_eval[cl][0],CI_UDN_eval[cl][1])),
    ]

    document = Document()

    document.add_heading('Tables'+docname, 0)

    table = document.add_table(rows=1, cols=n_ad+1)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'Clusters'
    # column i of the table shows cluster i-1, labelled from C1
    for i in range(1,n_ad+1):
        hdr_cells[i].text = "Cluster C"+str(i)+a_or_p
    for label,cell_text in rows:
        row_cells = table.add_row().cells
        row_cells[0].text = label
        for i in range(1,n_ad+1):
            row_cells[i].text = cell_text(i-1)
    document.add_page_break()

    document.save(docname+'.docx')

In [None]:
# Table of characteristics of the 4 adult test clusters, saved as adult_table_test_2.docx
create_table(4,"A",clusters_un_adult_test,avg_HPO_clusters_adult_test,CI_HPO_clusters_adult_test,gender_distrib_adult_test,OR_diag_adult_test,IC_adult_test,avg_onset_adult_test,CI_onset_adult_test,avg_UDN_eval_adult_test,CI_UDN_eval_adult_test,"adult_table_test_2")

In [None]:
# Table of characteristics of the 5 pediatric test clusters, saved as pediatric_table_test_2.docx
create_table(5,"P",clusters_un_pediatric_test,avg_HPO_clusters_pediatric_test,CI_HPO_clusters_pediatric_test,gender_distrib_pediatric_test,OR_diag_pediatric_test,IC_pediatric_test,avg_onset_pediatric_test,CI_onset_pediatric_test,avg_UDN_eval_pediatric_test,CI_UDN_eval_pediatric_test,"pediatric_table_test_2")

In [None]:
# Table of characteristics of pediatric clusters 0 to 9, saved as pediatric_table.docx
# NOTE(review): OR_diag_pediatric, avg_onset_pediatric and avg_UDN_eval_pediatric only hold the clusters listed in
# ind_groups_pediatric -- confirm that clusters 0 to 9 are all part of it, otherwise this call raises a KeyError
create_table(10,"P",clusters_un_pediatric,avg_HPO_clusters_pediatric,CI_HPO_clusters_pediatric,gender_distrib_pediatric,OR_diag_pediatric,IC_pediatric,avg_onset_pediatric,CI_onset_pediatric,avg_UDN_eval_pediatric,CI_UDN_eval_pediatric,"pediatric_table")

In [None]:
def create_stat_table(kr_HPO_ad,kr_HPO_ped,kr_UDN_ad,kr_UDN_ped,kr_onset_ad,kr_onset_ped,docname):
    """Creates word document to save Kruskal Wallis results for clusters
    Parameters: kr_HPO_ad: Kruskal Wallis results of HPO for adult clusters
                kr_HPO_ped: Kruskal Wallis results of HPO for pediatric clusters
                kr_UDN_ad: Kruskal Wallis results of UDN eval for adult clusters
                kr_UDN_ped: Kruskal Wallis results of UDN eval for pediatric clusters
                kr_onset_ad: Kruskal Wallis results of onset age for adult clusters
                kr_onset_ped: Kruskal Wallis results of onset age for pediatric clusters
                docname: str, name of doc to save (without the .docx extension)
    Every kr_* argument is indexed as kr[0] (reported as H) and kr[1] (reported as p).
    Returns: None
    Saves the word document as docname + '.docx'
    """
    def format_result(kr):
        # H rounded to one decimal, p-value in scientific notation with two decimals.
        # np.round is used because the np.round_ alias no longer exists in NumPy 2.0.
        return "H = "+str(np.round(kr[0],decimals=1))+" , p = "+str(np.format_float_scientific(kr[1],precision=2))

    document = Document()

    document.add_heading('Tables stats '+docname, 0)

    table = document.add_table(rows=1, cols=3)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'Variable'
    hdr_cells[1].text = 'Kruskal-Wallis H index and p-value'
    # Sub-header row naming the two population columns
    row_cells = table.add_row().cells
    row_cells[1].text = "Adult"
    row_cells[2].text = "Pediatric"
    # One table row per variable: (row label, adult result, pediatric result)
    stat_rows = (
        ("Avg # of HPO terms per patient", kr_HPO_ad, kr_HPO_ped),
        ("Average age at onset in y", kr_onset_ad, kr_onset_ped),
        ("Average age at UDN evaluation in y", kr_UDN_ad, kr_UDN_ped),
    )
    for label, kr_ad, kr_ped in stat_rows:
        row_cells = table.add_row().cells
        row_cells[0].text = label
        row_cells[1].text = format_result(kr_ad)
        row_cells[2].text = format_result(kr_ped)
    document.add_page_break()

    document.save(docname+'.docx')

In [None]:
# The sixth argument is the pediatric onset-age result; it was previously passed
# kr_HPO_ped, which repeated the HPO-term statistics in the onset row.
# NOTE(review): assumes kr_onset_ped was computed in an earlier cell, like kr_onset_ad.
create_stat_table(kr_HPO_ad,kr_HPO_ped,kr_UDN_ad,kr_UDN_ped,kr_onset_ad,kr_onset_ped,"statistics adult and pediatric_3")

### Get HPO numbers for phenotypes in database

In [None]:
# Builds the HPO mapping, with HPO term names as keys, and as value a dictionary with keys: id (the HPO id), parent
# (list with the ids of the parents of the HPO term), syn (list of synonyms of the HPO term, if any), xref (list of
# external reference numbers, for UMLS, etc.)
mapping_HPO={}
# The file is only parsed, never modified, so plain read mode is enough
with open("hpo.txt","r") as hpo:
    lines = hpo.readlines()
for i, line in enumerate(lines):
    tag = line.split(" ")[0]
    if line.split(":")[0]=="id":
        hpoid = line.split(" ")[1].split("\n")[0]
        # the term name is read from the line that directly follows the id line
        name = " ".join(lines[i+1].split(" ")[1:]).split("\n")[0]
        mapping_HPO[name] = {"id": hpoid, "xref": [], "parent": [], "syn":[]}
    elif tag=="xref:":
        mapping_HPO[name]["xref"].append(line.split(" ")[1].split("\n")[0])
    elif tag=="is_a:":
        # strip the newline as for xref, in case the parent id is the last token on the line
        mapping_HPO[name]["parent"].append(line.split(" ")[1].split("\n")[0])
    elif tag=="synonym:":
        # the synonym text is the first double-quoted string on the line
        # NOTE(review): the text is cut at the first apostrophe, so synonyms containing one are stored truncated
        namesyn = line.split("\"")[1].split("'")[0]
        if "obsolete " in namesyn:
            # keep only what follows the "obsolete " marker
            namesyn = namesyn.split("obsolete ")[1]
        mapping_HPO[name]["syn"].append(namesyn)

In [None]:
# Synonym -> HPO id lookup: every synonym parsed into mapping_HPO points to the id of its term.
# Manual additions (for mistakes/missing tokens/missing terms...) are assigned right after.
syn_mapping = {}
for dis, hpo_entry in mapping_HPO.items():
    syn_mapping.update(dict.fromkeys(hpo_entry["syn"], hpo_entry["id"]))
# Manual phenotype name -> HPO id entries added to syn_mapping, one assignment per name.
# Keys are written exactly as they are looked up, including names with no separator
# between two words (e.g. "AplasiaHypoplasia ..."), so they must not be "corrected".
# NOTE(review): entries assigned "" have no HPO id; they produce an empty id wherever
# syn_mapping is used to resolve them.
syn_mapping["Contracture of the distal interphalangeal joints of the fingers"]="HP:0009697"
syn_mapping["Decreased testosterone in males"]="HP:0040171"
syn_mapping["EMG myopathic abnormalities"]="HP:0003458"
syn_mapping["EMG myotonic discharges"]="HP:0100284"
syn_mapping["Increased IgE level"]="HP:0003212"
syn_mapping["EMG chronic denervation signs"]="HP:0003444"
syn_mapping["Primitive reflexes (palmomental, snout, glabellar)"]="HP:0002476"
syn_mapping["EMG slow motor conduction"]="HP:0100287"
syn_mapping["Severe Myopia"]="HP:0011003"
syn_mapping["Arthralgiaarthritis"]="HP:0005059"
syn_mapping["Decreased CSF homovanillic acid (HVA)"]="HP:0003785"
syn_mapping["Hip Subluxation"]="HP:0030043"
syn_mapping["Increased IgM level"]="HP:0003496"
syn_mapping["Hyperpigmentedhypopigmented macules"]="HP:0007441"
syn_mapping["Noninflammatory macular atrophy"]="HP:0007401"
syn_mapping["Capillary hemangiomas"]="HP:0005306"
syn_mapping["AplasiaHypoplasia of the lungs"]="HP:0006703"
syn_mapping["Abnormal serum cobalamin"]="HP:0040126"
syn_mapping["Cone-rod dystrophy"]="HP:0000548"
syn_mapping["AplasiaHypoplasia of the inner ear"]="HP:0008774"
syn_mapping["Decreased number of CD4+ T cells"]="HP:0032183"
syn_mapping["Enlarged kidneys"]="HP:0000113"
syn_mapping["Nephroblastoma (Wilms tumor)"]="HP:0002667"
syn_mapping["Cervical vertebral fusion (C2C3)"]="HP:0002949"
syn_mapping["Pulmonary hypertension"]="HP:0030950"
syn_mapping["Prominent epicanthal folds"]="HP:0000286"
syn_mapping["Cellulitis due to immunodeficiency"]="HP:0100658"
syn_mapping["Abnormal brain cholinecreatine ratio by MRS"]="HP:0012709"
syn_mapping["Small palpebral fissure"]="HP:0045025"
syn_mapping["EMG impaired neuromuscular transmission"]="HP:0100285"
syn_mapping["Absenthypoplastic coccyx"]="HP:0008436"
syn_mapping["Short tubular bones (hand)"]="HP:0001248"
syn_mapping["Increased serum Insulin-like growth factor 1"]="HP:0030269"
syn_mapping["AplasiaHypoplasia of the middle phalanx of the 4th toe"]="HP:0100373"
syn_mapping["AplasiaHypoplasia of the middle phalanx of the 5th toe"]="HP:0100374"
syn_mapping["Cortical thickening (humeral)"]="HP:0003868"
syn_mapping["Abnormality of Descemet's membrane"]="HP:0011490"
syn_mapping["Increased serum free triiodothyronine (fT3)"]="HP:0011788"
syn_mapping["Limited knee flexionextension"]="HP:0005085"
syn_mapping["Limited pronationsupination of forearm"]="HP:0006394"
syn_mapping["Biilateral vocal cord paralysis"]="HP:0012820"
syn_mapping["Macroreticular retinal dystrophy"]="HP:0000556"
syn_mapping["Congenital visual impairment"]="HP:0000505"
syn_mapping["AtrophyDegeneration affecting the brainstem"]="HP:0007366"
syn_mapping["Midface prominence"]="HP:0430026"
syn_mapping["Basal lamina 'onion bulb' formation"]="HP:0003400"
syn_mapping["Neutrophillia"]="HP:0011897"
syn_mapping["Elevated circulating parathyroid hormone (PTH) level"]="HP:0003165"
syn_mapping["Generalized cerebral atrophyhypoplasia"]="HP:0007058"
syn_mapping["Somnolence"]="HP:0001262"
syn_mapping["Increased IgA level"]="HP:0003261"
syn_mapping["Increased blood urea nitrogen (BUN)"]="HP:0003138"
syn_mapping["Decreased number of CD8+ T cells"]="HP:0410385"
syn_mapping["Abnormality of acetylcarnitine metabolism"]="HP:0012071"
syn_mapping["Hemisacrum (S2-S5)"]="HP:0009790"
syn_mapping["EMG Positive sharp waves"]="HP:0030007"
syn_mapping["Spastichyperactive bladder"]="HP:0005340"
syn_mapping["AplasiaHypoplasia of the clavicles"]="HP:0006710"
syn_mapping["Congenital exotropia"]="HP:0000577"
syn_mapping["Abnormal rapid eye movement (REM) sleep"]="HP:0002494"
syn_mapping["Flared metaphyses (elbow)"]="HP:0003950"
syn_mapping["Cortical subperiosteal resorption (humeral metaphyses)"]="HP:0003909"
syn_mapping["Abnormality of ornithine metabolism"]="HP:0012025"
syn_mapping["Oligodactyly (feet)"]="HP:0001849"
syn_mapping["AplasiaHypoplasia of the musculature of the pelvis"]="HP:0001471"
syn_mapping["Status Asthmaticus"]="HP:0012653"
syn_mapping["Vitreoretinal abnormalities"]="HP:0007773"
syn_mapping["Abnormality of natural killer cell number"]="HP:0040089"
syn_mapping["AplasiaHypoplasia of the colon"]="HP:0100811"
syn_mapping["AplasiaHypoplasia of the nasal bone"]="HP:0010940"
syn_mapping["Abnormality of B cell number"]="HP:0010975"
syn_mapping["Abnormality of lymphocytes"]="HP:0004332"
syn_mapping["AplasiaHypoplasia of the eyebrow"]="HP:0100840"
syn_mapping["AplasiaHypoplasia of the mandible"]="HP:0009118"
syn_mapping["AplasiaHypoplasia of the distal phalanges of the hand"]="HP:0009835"
syn_mapping["AplasiaHypoplasia of the middle phalanges of the hand"]="HP:0009843"
syn_mapping["AplasiaHypoplasia of the thumb"]="HP:0009601"
syn_mapping["Oligodactyly (hands)"]="HP:0012165"
syn_mapping["AplasiaHypoplasia of the middle phalanx of the 3rd toe"]="HP:0100372"
syn_mapping["AplasiaHypoplasia of the 3rd toe"]="HP:0010331"
syn_mapping["AplasiaHypoplasia of the 4th toe"]="HP:0010337"
syn_mapping["AplasiaHypoplasia of the 5th toe"]="HP:0010343"
syn_mapping["AplasiaHypoplasia of toe"]="HP:0001991"
syn_mapping["Abnormality of the metaphyses"]="HP:0003907"
syn_mapping["AplasiaHypoplasia of the ulna"]="HP:0006495"
syn_mapping["Abnormality of the fibula"]="HP:0010595"
syn_mapping["Abnormality of the metatarsal bones"]="HP:0001832"
syn_mapping["Abnormality of the tibia"]="HP:0002992"
syn_mapping["Decreasedabsent ankle reflexes"]="HP:0200101"
syn_mapping["Abnormal enzymecoenzyme activity"]="HP:0012379"
syn_mapping["Abnormality of carbohydrate metabolismhomeostasis"]="HP:0011013"
syn_mapping["Abnormality of aspartate family amino acid metabolism"]="HP:0010899"
syn_mapping["Abnormality of citrulline metabolism"]="HP:0011965"
syn_mapping["Abnormality of homocysteine metabolism"]="HP:0010919"
syn_mapping["Chromsome breakage"]="HP:0040012"
syn_mapping["Decreased activity of the pyruvate dehydrogenase (PDH) complex"]="HP:0002928"
syn_mapping["Abnormality of copper homeostasis"]="HP:0010836"
syn_mapping["Abnormal serum iron"]="HP:0040130"
syn_mapping["Abnormality of lipid metabolism"]="HP:0003119"
syn_mapping["Abnormality of fatty-acid metabolism"]="HP:0004359"
syn_mapping["Abnormality of carnitine metabolism"]="HP:0010967"
syn_mapping["Abnormality of long-chain fatty-acid metabolism"]="HP:0010964"
syn_mapping["Abnormality of glycine metabolism"]="HP:0010895"
syn_mapping["Abnormality of the esophagus"]="HP:0025270"
syn_mapping["Elevated hepatic transaminases"]="HP:0002910"
syn_mapping["Abnormality of cardiac atrium"]="HP:0005120"
syn_mapping["Ebstein's anomaly of the tricuspid valve"]="HP:0010316"
syn_mapping["Effort-induced polymorphic ventricular tachycardias"]="HP:0004758"
syn_mapping["Abnormality of circle of Willis"]="HP:0012518"
syn_mapping["Coronary artery disease"]="HP:0006704"
syn_mapping["Peripheral arterial disease"]="HP:0004950"
syn_mapping["Dilatation of the ascending aorta"]=""  # no HPO id assigned
syn_mapping["AplasiaHypoplasia of the tragus"]="HP:0009913"
syn_mapping["Adrenocorticotropin (ACTH) deficient adrenal insufficiency"]="HP:0011735"
syn_mapping["Abnormality of the conjunctiva"]="HP:0008054"
syn_mapping["Congenital primary aphakia"]=""  # no HPO id assigned
syn_mapping["AplasiaHypoplasia of the optic nerve"]="HP:0008058"
syn_mapping["Abnormality of the vitreous humor"]="HP:0004327"
syn_mapping["Congenital strabismus"]="HP:0000486"
syn_mapping["Abnormality of vision evoked potentials"]="HP:0000649"
syn_mapping["Hemianopic blurring of vision"]="HP:0001125"
syn_mapping["Congenital glaucoma"]="HP:0008007"
syn_mapping["AplasiaHypoplasia of the testes"]="HP:0010468"
syn_mapping["Aplastichypoplastic toenail"]="HP:0010624"
syn_mapping["AplasiaHypoplasia of the nails"]="HP:0008386"
syn_mapping["EMG axonal abnormality"]="HP:0003482"
syn_mapping["EMG neuropathic changes"]="HP:0003445"
syn_mapping["Abnormality of the globus pallidus"]="HP:0002454"
syn_mapping["AplasiaHypoplasia of the corpus callosum"]="HP:0007370"
syn_mapping["AplasiaHypoplasia of the cerebral white matter"]="HP:0012429"
syn_mapping["AtrophyDegeneration affecting the cerebrum"]="HP:0007369"
syn_mapping["Porencephaly"]=""  # no HPO id assigned
syn_mapping["AplasiaHypoplasia of the cerebellar vermis"]="HP:0006817"
syn_mapping["Brain very small"]=""  # no HPO id assigned
syn_mapping["AtrophyDegeneration involving the spinal cord"]="HP:0007344"
syn_mapping["AplasiaHypoplasia involving the central nervous system"]="HP:0002977"
syn_mapping["AtrophyDegeneration affecting the central nervous system"]="HP:0007367"
syn_mapping["CNS infection"]="HP:0011450"
syn_mapping["Abnormal pyramidal signs"]="HP:0007256"
syn_mapping["Hemiplegiahemiparesis"]="HP:0004374"
syn_mapping["Reduced consciousnessconfusion"]="HP:0004372"
syn_mapping["Inability to walk by childhoodadolescence"]="HP:0006915"
syn_mapping["Abnormal emotionaffect behavior"]="HP:0100851"
syn_mapping["Abnormal fearanxiety-related behavior"]="HP:0100852"
syn_mapping["Abnormality of the lung"]="HP:0002088"
syn_mapping["Abnormality of the tracheobronchial system"]=""  # no HPO id assigned
syn_mapping["AplasiaHypoplasia of the ribs"]="HP:0006712"
syn_mapping["Chronic recurrent multifocal osteomyelitis"]=""  # no HPO id assigned
syn_mapping["Increased number of peripheral CD3+ T cells"]=""  # no HPO id assigned

In [None]:
# Collect each phenotype name once (first-seen order), then resolve it to its HPO id.
# The name is the second-to-last backslash-separated component of the phenotype path;
# the first element of `phenotypes` is skipped.
list_unique_phen = []
HPO_unique_phen = []
for phen in list(phenotypes)[1:]:
    phen_name = phen.split("\\")[-2]
    if phen_name in list_unique_phen:
        continue
    list_unique_phen.append(phen_name)
for phen in list_unique_phen:
    # primary term names take precedence; otherwise fall back to the synonym/manual mapping
    hpo_entry = mapping_HPO.get(phen)
    HPO_unique_phen.append(syn_mapping[phen] if hpo_entry is None else hpo_entry["id"])
HPO_unique_phen

In [None]:
# Write one HPO id per line; the with-block closes the file, so no explicit close() is needed
with open("HPO_unique_phen.txt","w") as hpoun:
    hpoun.writelines(hpo_id+"\n" for hpo_id in HPO_unique_phen)

In [None]:
# list of unique HPO terms with more than 5% representation in pediatric clusters
# For each cluster, element [0] holds the phenotype names and element [1] the matching values
# compared against the 0.05 threshold (presumably the proportion of patients - confirm upstream).
list_unique_phen_cl,HPO_unique_phen_cl=[],[]
for cl in phen_ranked_pos_pediatric:
    cl_phens, cl_values = phen_ranked_pos_pediatric[cl][0], phen_ranked_pos_pediatric[cl][1]
    for phen, value in zip(cl_phens, cl_values):
        # a phenotype above the threshold in several clusters is kept only once (first-seen order)
        if value>0.05 and phen not in list_unique_phen_cl:
            list_unique_phen_cl.append(phen)
for phen in list_unique_phen_cl:
    # primary term names take precedence; otherwise fall back to the synonym/manual mapping
    if phen in mapping_HPO:
        HPO_unique_phen_cl.append(mapping_HPO[phen]["id"])
    else:
        HPO_unique_phen_cl.append(syn_mapping[phen])
len(HPO_unique_phen_cl)

In [None]:
# saves a csv file with the cluster number in the first column and the HPO id in the second column,
# for the first 5 phenotypes listed for each cluster (fewer if the cluster lists fewer than 5)
import csv
# newline='' lets the csv module control line endings (avoids blank rows on Windows)
with open('clusters_HPO_terms_adult.csv', mode='w', newline='') as clHPO:
    csvwriter = csv.writer(clHPO, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for cl in phen_ranked_pos_TEST_adult:
        # slicing instead of range(5) so clusters with fewer than 5 phenotypes do not raise IndexError
        for phen in phen_ranked_pos_TEST_adult[cl][0][:5]:
            if phen in mapping_HPO:
                csvwriter.writerow([cl, mapping_HPO[phen]["id"]])
            else:
                csvwriter.writerow([cl, syn_mapping[phen]])

In [None]:
# save all the HPO ids collected in HPO_unique_phen_cl in a txt file, one id per line
# (the with-block closes the file, so no explicit close() is needed)
with open("HPO_unique_phen_cl.txt","w") as hpoun:
    hpoun.writelines(hpo_id+"\n" for hpo_id in HPO_unique_phen_cl)