In [2]:
from ontoma import OnToma
import pandas as pd
from pathlib import Path

In [3]:
ROOT = Path.cwd().parent

binary = pd.read_excel(Path.joinpath(ROOT / 'data/Supp Table 8 - Collapsing analysis top hits.xlsx'), sheet_name='Binary')
quant = pd.read_excel(Path.joinpath(ROOT / 'data/Supp Table 8 - Collapsing analysis top hits.xlsx'), sheet_name='Quantitative')

In [4]:
print("Assoc with binary trait:\n", binary.iloc[0])
print("Assoc with quantitative trait:\n", quant.iloc[0])

Assoc with binary trait:
 model                                                      ptv
phenotype                         20002#1427#polycystic kidney
root          Chapter XIV Diseases of the genitourinary system
Gene                                                      PKD1
CaseQ                                                       39
CaseNQ                                                     134
Case.Freq                                               0.2254
CtrlQ                                                       43
CtrlNQ                                                  268739
Ctrl.Freq                                               0.0002
p.value                                                    0.0
OR                                                   1818.9554
OR.LCI                                               1142.1813
OR.UCI                                               2896.7369
OMIM_code                                                    1
OMIM         Polycystic kidne

In [6]:
quant.columns

Index(['model', 'Gene', 'Field', 'Path', 'Pheno', 'nSamples', 'nCarriers',
       'nNonCarriers', 'p-value', 'beta', 'lower', 'upper', 'se', 'OMIM'],
      dtype='object')

## Get all phenotypes together

- 1765 associations with 844 binary and quantitative traits without cleaning (689 are binary, 155 are quantitative)
- 1516 **statistically significant** assocs.
- There are 746 different phenotypes, of which 702 I am able to map using OnToma (94%).
- Out of 746, 667 have OMIM notation.
  - There are 396 different OMIM codes. More than 90% are mappable with OnToma (370/396).
  - 79 do not have an OMIM xref. Phenotype string needs to be queried.

- Phenotypes after cleaning: 151 out of 534 mappable with OnToma after several cleaning steps.

In [11]:
pheno = (
    pd.concat(
        [binary[['phenotype', 'OMIM', 'p.value']], quant.rename(columns={'Pheno':'phenotype', 'p-value':'p.value'})[['phenotype', 'OMIM', 'p.value']]]
    )

    # Extract significant associations. Threshold = 2x10^-9
    .query('`p.value` < 2e-9').drop('p.value', axis=1)

    .drop_duplicates()
    .reset_index(drop=True)
)
print(pheno.shape)
pheno.head()


(1516, 2)


Unnamed: 0,phenotype,OMIM
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...
4,Union#Q61#Q61 Cystic kidney disease,Polycystic kidney disease 1; 173900 (3); Autos...


In [17]:
pheno['extracted_omim'] = pheno['OMIM'].str.findall(r"\d{6}")
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,[173900]
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,"[605039, 614286]"
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,"[605039, 614286]"
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,[173900]
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,[173900]


In [18]:
pheno = pheno.explode('extracted_omim')
pheno['extracted_omim'] = 'OMIM:' + pheno.extracted_omim
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286


In [19]:
omims = set(pheno.extracted_omim.dropna().values)

len(omims)

396

In [20]:
pheno.drop_duplicates(subset='phenotype').extracted_omim.isna().value_counts()

False    667
True      79
Name: extracted_omim, dtype: int64

## Map OMIM codes

In [None]:
otmap = OnToma()

In [22]:
def ontoma_lookup(term, query_code):
    """
    OnToma is first queried with the code flag, if this fails common steps for a string are used.
    """
    try:
        if query_code:
            res = otmap.find_term(term, code=True)

            if len(res) > 0:
                return res[0].id_ot_schema

        res = otmap.find_term(term)
        if len(res) > 0:
            return res[0].id_ot_schema

        return
    except Exception:
        print(term)

In [None]:
omims_mapped = {term: ontoma_lookup(term, query_code=True) for term in omims}

In [24]:
unmapped_omims = [term for term in omims_mapped.keys() if omims_mapped[term] == None]

print(len(unmapped_omims))
unmapped_omims

26


['OMIM:601800',
 'OMIM:608404',
 'OMIM:617981',
 'OMIM:613463',
 'OMIM:111150',
 'OMIM:112050',
 'OMIM:110500',
 'OMIM:616622',
 'OMIM:615881',
 'OMIM:610762',
 'OMIM:617671',
 'OMIM:612797',
 'OMIM:615264',
 'OMIM:138500',
 'OMIM:617966',
 'OMIM:601551',
 'OMIM:601816',
 'OMIM:112010',
 'OMIM:609338',
 'OMIM:601884',
 'OMIM:614490',
 'OMIM:138900',
 'OMIM:617948',
 'OMIM:614419',
 'OMIM:617321',
 'OMIM:601550']

### Clean Phenotype string

Steps:
- Codes and Tags that are at the beginning of the string need to be cleaned. Ex. "41202#Q12#Q12 Congenital lens malformations"
- Split by `|` and keep the first element. Ex. "41202#Q613#Q61.3 Polycystic kidney| unspecified"
- Split and explode by `and`. Ex. "Source of report of L92 (granulomatous disorders of skin and subcutaneous tissue)". Problem: "Chapter B - Endocrine System and Breast". Won't do, the problems are more prevalent than the gains
- Remove `Other` from string. Ex. "41202#Q82#Q82 Other congenital malformations of skin"
- Remove trailing parenthesis. Ex. "Source of report of L52 (erythema nodosum)"
- Make the string lowercase and remove trailing whitespaces.

In [28]:
pheno

Unnamed: 0,phenotype,OMIM,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286
...,...,...,...
1514,Mean corpuscular haemoglobin,Anemia; sideroblastic; 1; 300751 (3); X-linked...,OMIM:300751
1514,Mean corpuscular haemoglobin,Anemia; sideroblastic; 1; 300751 (3); X-linked...,OMIM:300752
1514,Mean corpuscular haemoglobin,Anemia; sideroblastic; 1; 300751 (3); X-linked...,OMIM:300751
1514,Mean corpuscular haemoglobin,Anemia; sideroblastic; 1; 300751 (3); X-linked...,OMIM:300752


In [29]:
# 1. Remove cases like "41272#W365#W36.5 Diagnostic extraction of bone marrow NEC" -> "Diagnostic extraction of bone marrow NEC"

pheno['cleaned_pheno'] = pheno.phenotype.str.split(r'^(\w+#\w+#\w*\.\w+\s)', expand=False).str[-1]

pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,20002#1427#polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Union#C92#C92 Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Union#C92#C92 Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Union#C92#C92 Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Union#C92#C92 Myeloid leukaemia


In [30]:
# 2. Remove cases like "Union#C92#C92 Myeloid leukaemia" -> "Myeloid leukaemia"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^(Union#\w+#\w+\s)', expand=False).str[-1]
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,20002#1427#polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [31]:
# 3. Remove cases like "20002#1427#polycystic kidney" -> "polycystic kidney"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^(\w+#\w+#)', expand=False).str[-1]
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [32]:
# 4. Remove cases like "Source of report of Q61 (cystic kidney disease)" -> "cystic kidney disease"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^(Source of report of \w+\s)', expand=False).str[-1]
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [33]:
# 5. Split by `|` and keep the first element. Ex. "41202#Q613#Q61.3 Polycystic kidney| unspecified"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'|', expand=False).str[0]
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [34]:
# 6. Remove trailing parenthesis. Ex. "(erythema nodosum)" -> "erythema nodosum"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.replace('[()]', '')
pheno.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [35]:
# 7. Remove cases like 'Union#BlockJ09-J18#J09-J18 Influenza and pneumonia' -> ' Influenza and pneumonia'

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^Union\S+', expand=False).str[-1]

pheno[pheno['phenotype'].str.contains('Union')].head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Polycystic kidney


In [36]:
# 8. Remove `Other` from string. Ex. "41202#Q82#Q82 Other congenital malformations of skin"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'Other\s', expand=False).str[-1]

pheno[pheno['phenotype'].str.contains('Other')].head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
15,Union#L30#L30 Other dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:146700,dermatitis
15,Union#L30#L30 Other dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:605803,dermatitis
15,Union#L30#L30 Other dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:146700,dermatitis
15,Union#L30#L30 Other dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:605803,dermatitis
49,Union#N28#N28 Other disorders of kidney and ur...,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,disorders of kidney and ureter


In [37]:
# 9. Remove cases like 'C92 Myeloid leukaemia' -> ' Myeloid leukaemia'

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^\w\d{2}', expand=False).str[-1]

pheno[pheno['phenotype'].str.contains(r'^\w\d{2}')].head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
5,20002#1452#eczema|dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:146700,eczema
5,20002#1452#eczema|dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:605803,eczema
5,20002#1452#eczema|dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:146700,eczema
5,20002#1452#eczema|dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:605803,eczema


In [38]:
# 10. Lower and strip string

pheno['cleaned_pheno'] = pheno.cleaned_pheno.apply(lambda X: X.strip().lower())

pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia


In [39]:
phenos = set(pheno.cleaned_pheno.dropna().values)

len(phenos)

534

In [None]:
phenos_mapped = {pheno: ontoma_lookup(pheno, query_code=False) for pheno in phenos}

In [41]:
len([pheno for pheno in phenos_mapped if phenos_mapped[pheno] != None])

151

### Build mappings into df to coalesce

In [43]:
pheno['pheno_mapping'] = pheno['cleaned_pheno']
pheno['omim_mapping'] = pheno['extracted_omim']

pheno = pheno.replace({'pheno_mapping': phenos_mapped, 'omim_mapping': omims_mapped})
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno,pheno_mapping,omim_mapping
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney,EFO_0008620,EFO_1001496
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia,EFO_0000222,Orphanet_97297
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia,EFO_0000222,EFO_0000198
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia,EFO_0000222,Orphanet_97297
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia,EFO_0000222,EFO_0000198


In [44]:
pheno['mapping'] = pheno.bfill(axis=1).iloc[:, 5]

pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno,pheno_mapping,omim_mapping,mapping
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney,EFO_0008620,EFO_1001496,EFO_1001496
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia,EFO_0000222,Orphanet_97297,Orphanet_97297
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia,EFO_0000222,EFO_0000198,EFO_0000198
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia,EFO_0000222,Orphanet_97297,Orphanet_97297
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia,EFO_0000222,EFO_0000198,EFO_0000198


In [48]:
pheno_agg = pheno.dropna(subset=['mapping']).groupby('phenotype')['mapping'].apply(set).reset_index(name='mappings')

pheno_agg.head()

False    702
Name: mappings, dtype: int64


Unnamed: 0,phenotype,mappings
0,20001#1002#breast cancer,"{Orphanet_524, MONDO_0012565, EFO_0000673, EFO..."
1,20001#1003#skin cancer,"{EFO_0000756, Orphanet_404560, Orphanet_618}"
2,20001#1020#large bowel cancer|colorectal cancer,"{EFO_0000497, Orphanet_252202, MONDO_0023113, ..."
3,20001#1022#colon cancer|sigmoid cancer,"{EFO_0000497, Orphanet_252202, MONDO_0023113, ..."
4,20001#1030#eye and|or adnexal cancer,"{MONDO_0001187, EFO_0005717, MONDO_0002629, EF..."
