In [2]:
from ontoma import OnToma
import pandas as pd
from pathlib import Path

In [3]:
# Project root is the parent of the directory the notebook is run from.
ROOT = Path.cwd().parent

# Single definition of the workbook location, shared by both sheet reads.
SUPP_TABLE_PATH = ROOT / 'data/Supp Table 8 - Collapsing analysis top hits.xlsx'

# Passing a list of sheet names parses the workbook once and returns a
# {sheet_name: DataFrame} dict, instead of opening the file once per sheet.
supp_table_sheets = pd.read_excel(SUPP_TABLE_PATH, sheet_name=['Binary', 'Quantitative'])
binary = supp_table_sheets['Binary']
quant = supp_table_sheets['Quantitative']

In [4]:
# Show the first association of each table to inspect the available columns.
for header, table in (
    ("Assoc with binary trait:\n", binary),
    ("Assoc with quantitative trait:\n", quant),
):
    print(header, table.iloc[0])

Assoc with binary trait:
 model                                                      ptv
phenotype                         20002#1427#polycystic kidney
root          Chapter XIV Diseases of the genitourinary system
Gene                                                      PKD1
CaseQ                                                       39
CaseNQ                                                     134
Case.Freq                                               0.2254
CtrlQ                                                       43
CtrlNQ                                                  268739
Ctrl.Freq                                               0.0002
p.value                                                    0.0
OR                                                   1818.9554
OR.LCI                                               1142.1813
OR.UCI                                               2896.7369
OMIM_code                                                    1
OMIM         Polycystic kidne

## Get all phenotypes together

- 1765 associations with 844 binary and quantitative traits without cleaning (689 are binary, 155 are quantitative)
- Of the 844 traits, 738 have at least one OMIM code.
  - There are 414 distinct OMIM codes. More than 90% of them are mappable with OnToma (387/414); the other 27 are not.
  - The remaining 106 traits do not have an OMIM xref, so their phenotype string needs to be queried instead.

In [5]:
# Columns shared by both tables once the quantitative `Pheno` column is renamed.
trait_columns = ['phenotype', 'OMIM']

binary_traits = binary[trait_columns]
quant_traits = quant.rename(columns={'Pheno': 'phenotype'})[trait_columns]

# Stack both trait tables and keep each (phenotype, OMIM) pair only once.
stacked_traits = pd.concat([binary_traits, quant_traits])
pheno = stacked_traits.drop_duplicates().reset_index(drop=True)

print(pheno.shape)
pheno.head()


(1765, 2)


Unnamed: 0,phenotype,OMIM
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...
4,Union#Q61#Q61 Cystic kidney disease,Polycystic kidney disease 1; 173900 (3); Autos...


In [6]:
# Collect every run of six digits found in the free-text OMIM column;
# each row ends up with a list of code strings.
omim_annotation = pheno['OMIM']
six_digit_codes = omim_annotation.str.findall(r"\d{6}")
pheno['extracted_omim'] = six_digit_codes
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,[173900]
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,"[605039, 614286]"
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,[173900]
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,[173900]
4,Union#Q61#Q61 Cystic kidney disease,Polycystic kidney disease 1; 173900 (3); Autos...,[173900]


In [7]:
# One row per (phenotype, OMIM code): unpack the code lists, then add the
# "OMIM:" prefix. Rows that came from the same original row keep the same
# index label after the explode.
pheno = (
    pheno
    .explode('extracted_omim')
    .assign(extracted_omim=lambda exploded: 'OMIM:' + exploded['extracted_omim'])
)
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900


In [8]:
# Distinct OMIM identifiers; rows without an extracted code are ignored.
omim_codes_present = pheno['extracted_omim'].dropna()
omims = set(omim_codes_present.unique())

len(omims)

414

In [9]:
# Count phenotypes with (False) and without (True) any OMIM code.
# A phenotype can span several rows after the explode. Looking only at its first
# row (drop_duplicates) makes the count depend on row order, so a phenotype is
# flagged True only when *all* of its rows lack a code.
(
    pheno
    .assign(extracted_omim=pheno['extracted_omim'].isna())
    .groupby('phenotype')['extracted_omim']
    .all()
    .value_counts()
)

False    738
True     106
Name: extracted_omim, dtype: int64

## Map OMIM codes

In [39]:
# Shared OnToma client; `map_omim` below queries it through `otmap.find_term`.
# The log output of this cell reports an EFO cache (terms, xrefs, synonyms) being loaded.
otmap = OnToma()

INFO     - ontoma.interface - Using EFO cache from /var/folders/54/2j7x_lqn343_d6hjm7mcv9rc0000gn/T/tmpuh407fuw.
INFO:ontoma.interface:Using EFO cache from /var/folders/54/2j7x_lqn343_d6hjm7mcv9rc0000gn/T/tmpuh407fuw.
INFO     - ontoma.interface - Loaded 20060 terms, 89072 xrefs, and 65125 synonyms from EFO cache.
INFO:ontoma.interface:Loaded 20060 terms, 89072 xrefs, and 65125 synonyms from EFO cache.


In [58]:
def map_omim(term):
    """
    Map a single term to an ontology identifier using the shared ``otmap`` client.

    OnToma is first queried with the code flag; if that returns no result,
    the default (string) lookup is used as a fallback.

    Args:
        term: Query passed to ``otmap.find_term`` (e.g. ``"OMIM:173900"``).

    Returns:
        The ``id_ot_schema`` of the first result of the first query that
        returns anything, or ``None`` when both queries come back empty or
        the lookup raises.
    """
    try:
        # Try the code lookup first, then the default lookup.
        for query_options in ({'code': True}, {}):
            res = otmap.find_term(term, **query_options)
            if len(res) > 0:
                return res[0].id_ot_schema
    except Exception as err:
        # Best effort: one failing term must not abort the whole batch, but
        # report why it failed so it is not mistaken for "no mapping found".
        print(f"{term}: lookup failed ({err!r})")
    return None

In [75]:
# Look up every distinct OMIM code once; the value is None when no mapping was found.
omims_mapped = {}
for omim_code in omims:
    omims_mapped[omim_code] = map_omim(omim_code)

INFO     - ontoma.interface - Processed: OMIM:187800 → [OnTomaResult(query='OMIM:187800', id_normalised='ORDO:140957', id_ot_schema='Orphanet_140957', id_full_uri='http://www.orpha.net/ORDO/Orphanet_140957', label='autosomal dominant macrothrombocytopenia')]
INFO:ontoma.interface:Processed: OMIM:187800 → [OnTomaResult(query='OMIM:187800', id_normalised='ORDO:140957', id_ot_schema='Orphanet_140957', id_full_uri='http://www.orpha.net/ORDO/Orphanet_140957', label='autosomal dominant macrothrombocytopenia')]
INFO     - ontoma.interface - Processed: OMIM:158320 → [OnTomaResult(query='OMIM:158320', id_normalised='ORDO:587', id_ot_schema='Orphanet_587', id_full_uri='http://www.orpha.net/ORDO/Orphanet_587', label='muir-torre syndrome')]
INFO:ontoma.interface:Processed: OMIM:158320 → [OnTomaResult(query='OMIM:158320', id_normalised='ORDO:587', id_ot_schema='Orphanet_587', id_full_uri='http://www.orpha.net/ORDO/Orphanet_587', label='muir-torre syndrome')]
INFO     - ontoma.interface - Processed:

In [74]:
# OMIM codes for which `map_omim` returned nothing.
# `is None` is the identity check for a missing mapping; iterating over
# `.items()` avoids a second dictionary lookup per key.
unmapped_omims = [term for term, mapped_id in omims_mapped.items() if mapped_id is None]

print(len(unmapped_omims))
unmapped_omims

27


['OMIM:138900',
 'OMIM:601884',
 'OMIM:601800',
 'OMIM:601551',
 'OMIM:615264',
 'OMIM:112010',
 'OMIM:227220',
 'OMIM:615881',
 'OMIM:617321',
 'OMIM:110500',
 'OMIM:608404',
 'OMIM:617671',
 'OMIM:612797',
 'OMIM:617981',
 'OMIM:111150',
 'OMIM:616622',
 'OMIM:601816',
 'OMIM:614419',
 'OMIM:614490',
 'OMIM:601550',
 'OMIM:613463',
 'OMIM:617948',
 'OMIM:610762',
 'OMIM:138500',
 'OMIM:112050',
 'OMIM:617966',
 'OMIM:609338']

### Clean Phenotype string

Steps:
- Strip the codes and tags at the beginning of the string. Ex. "41202#Q12#Q12 Congenital lens malformations"
- Split by `|` and keep the first element. Ex. "41202#Q613#Q61.3 Polycystic kidney| unspecified"
- Split and explode by `and`. Ex. "Source of report of L92 (granulomatous disorders of skin and subcutaneous tissue)"
- Remove `Other` from string. Ex. "41202#Q82#Q82 Other congenital malformations of skin"
- Remove the parentheses surrounding the description. Ex. "Source of report of L52 (erythema nodosum)"
- Make the string lowercase.

In [8]:
# Display the exploded table (one row per phenotype / OMIM code pair) before the phenotype strings are cleaned.
pheno

Unnamed: 0,phenotype,OMIM,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900
...,...,...,...
1764,Aspartate aminotransferase,Leukemia; juvenile myelomonocytic; 607785 (3);...,OMIM:607785
1764,Aspartate aminotransferase,Leukemia; juvenile myelomonocytic; 607785 (3);...,OMIM:162210
1764,Aspartate aminotransferase,Leukemia; juvenile myelomonocytic; 607785 (3);...,OMIM:162200
1764,Aspartate aminotransferase,Leukemia; juvenile myelomonocytic; 607785 (3);...,OMIM:601321


In [12]:
# 1. Drop a leading "<field>#<field>#<code with a dot> " prefix, e.g.
#    "41272#W365#W36.5 Diagnostic extraction of bone marrow NEC" -> "Diagnostic extraction of bone marrow NEC"
#    Strings that do not start with such a prefix are left unchanged.

dotted_code_prefix = r'^(\w+#\w+#\w*\.\w+\s)'
pheno['cleaned_pheno'] = pheno['phenotype'].str.replace(dotted_code_prefix, '', regex=True)

pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,20002#1427#polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Union#C92#C92 Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Union#C92#C92 Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Source of report of Q61 (cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Polycystic kidney| adult type


In [14]:
# 2. Drop a leading "Union#<field>#<code> " prefix, e.g.
#    "Union#C92#C92 Myeloid leukaemia" -> "Myeloid leukaemia"

union_prefix = r'^(Union#\w+#\w+\s)'
pheno['cleaned_pheno'] = pheno['cleaned_pheno'].str.replace(union_prefix, '', regex=True)
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,20002#1427#polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Source of report of Q61 (cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Polycystic kidney| adult type


In [15]:
# 3. Drop a leading "<field>#<field>#" prefix, e.g.
#    "20002#1427#polycystic kidney" -> "polycystic kidney"
# NOTE(review): a non-Union prefix whose third field has no dot
# (e.g. "41202#Q12#Q12 Congenital lens malformations") is not caught by steps 1-2,
# so after this step it still starts with "Q12 " - confirm a later step handles it.

two_field_prefix = r'^(\w+#\w+#)'
pheno['cleaned_pheno'] = pheno['cleaned_pheno'].str.replace(two_field_prefix, '', regex=True)
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Source of report of Q61 (cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Polycystic kidney| adult type


In [16]:
# 4. Drop a leading "Source of report of <code> " prefix, e.g.
#    "Source of report of Q61 (cystic kidney disease)" -> "(cystic kidney disease)"
#    The parentheses are still present after this step.

source_of_report_prefix = r'^(Source of report of \w+\s)'
pheno['cleaned_pheno'] = pheno['cleaned_pheno'].str.replace(source_of_report_prefix, '', regex=True)
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,(cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Polycystic kidney| adult type


In [19]:
# 5. Keep only the text before the first `|`.
#    Ex. "41202#Q613#Q61.3 Polycystic kidney| unspecified" (the code prefix is already gone by now)

pipe_separated = pheno['cleaned_pheno'].str.split(r'|', n=1, expand=False)
pheno['cleaned_pheno'] = pipe_separated.str.get(0)
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,(cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Polycystic kidney


In [24]:
# 6. Remove every parenthesis character. Ex. "(erythema nodosum)" -> "erythema nodosum"

# '[()]' is a regex character class, so `regex=True` must be explicit:
# relying on the default raises a FutureWarning on pandas 1.x, and on
# pandas >= 2.0 the pattern is taken literally and no parenthesis is removed.
pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.replace('[()]', '', regex=True)
pheno.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,cystic kidney disease
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Polycystic kidney
