In [1]:
import re

from ontoma import OnToma
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
ROOT = Path.cwd().parent

binary = pd.read_excel(Path.joinpath(ROOT / 'data/Supp Table 8 - Collapsing analysis top hits.xlsx'), sheet_name='Binary')
quant = pd.read_excel(Path.joinpath(ROOT / 'data/Supp Table 8 - Collapsing analysis top hits.xlsx'), sheet_name='Quantitative')

In [4]:
print("Assoc with binary trait:\n", binary.iloc[0])
print("Assoc with quantitative trait:\n", quant.iloc[0])

Assoc with binary trait:
 model                                                      ptv
phenotype                         20002#1427#polycystic kidney
root          Chapter XIV Diseases of the genitourinary system
Gene                                                      PKD1
CaseQ                                                       39
CaseNQ                                                     134
Case.Freq                                               0.2254
CtrlQ                                                       43
CtrlNQ                                                  268739
Ctrl.Freq                                               0.0002
p.value                                                    0.0
OR                                                   1818.9554
OR.LCI                                               1142.1813
OR.UCI                                               2896.7369
OMIM_code                                                    1
OMIM         Polycystic kidne

In [6]:
quant.columns

Index(['model', 'Gene', 'Field', 'Path', 'Pheno', 'nSamples', 'nCarriers',
       'nNonCarriers', 'p-value', 'beta', 'lower', 'upper', 'se', 'OMIM'],
      dtype='object')

## Get all phenotypes together

- 1765 associations with 844 binary and quantitative traits without cleaning (689 are binary, 155 are quantitative)
- 1516 **statistically significant** assocs.
- There are 746 different phenotypes, of which 702 I am able to map using OnToma (94%).
- Out of 746, 667 have OMIM notation.
  - There are 396 different OMIM codes. More than 90% are mappable with OnToma (370/396).
  - 79 do not have an OMIM xref. Phenotype string needs to be queried.

- Phenotypes after cleaning: 151 out of 534 mappable with OnToma after several cleaning steps.

In [52]:
pheno = (
    pd.concat(
        [binary[['phenotype', 'OMIM', 'p.value']], quant.rename(columns={'Pheno':'phenotype', 'p-value':'p.value'})[['phenotype', 'OMIM', 'p.value']]]
    )

    # Extract significant associations. Threshold = 2x10^-9
    .query('`p.value` < 2e-9').drop('p.value', axis=1)

    .drop_duplicates()
    .reset_index(drop=True)
)
print(pheno.shape)
pheno.head()


(1516, 2)


Unnamed: 0,phenotype,OMIM
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...
4,Union#Q61#Q61 Cystic kidney disease,Polycystic kidney disease 1; 173900 (3); Autos...


In [17]:
pheno['extracted_omim'] = pheno['OMIM'].str.findall(r"\d{6}")
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,[173900]
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,"[605039, 614286]"
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,"[605039, 614286]"
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,[173900]
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,[173900]


In [18]:
pheno = pheno.explode('extracted_omim')
pheno['extracted_omim'] = 'OMIM:' + pheno.extracted_omim
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286


In [19]:
omims = set(pheno.extracted_omim.dropna().values)

len(omims)

396

In [20]:
pheno.drop_duplicates(subset='phenotype').extracted_omim.isna().value_counts()

False    667
True      79
Name: extracted_omim, dtype: int64

## Map OMIM codes

In [None]:
otmap = OnToma()

In [22]:
def ontoma_lookup(term, query_code):
    """
    OnToma is first queried with the code flag, if this fails common steps for a string are used.
    """
    try:
        if query_code:
            res = otmap.find_term(term, code=True)

            if len(res) > 0:
                return res[0].id_ot_schema

        res = otmap.find_term(term)
        if len(res) > 0:
            return res[0].id_ot_schema

        return
    except Exception:
        print(term)

In [None]:
omims_mapped = {term: ontoma_lookup(term, query_code=True) for term in omims}

In [24]:
unmapped_omims = [term for term in omims_mapped.keys() if omims_mapped[term] == None]

print(len(unmapped_omims))
unmapped_omims

26


['OMIM:601800',
 'OMIM:608404',
 'OMIM:617981',
 'OMIM:613463',
 'OMIM:111150',
 'OMIM:112050',
 'OMIM:110500',
 'OMIM:616622',
 'OMIM:615881',
 'OMIM:610762',
 'OMIM:617671',
 'OMIM:612797',
 'OMIM:615264',
 'OMIM:138500',
 'OMIM:617966',
 'OMIM:601551',
 'OMIM:601816',
 'OMIM:112010',
 'OMIM:609338',
 'OMIM:601884',
 'OMIM:614490',
 'OMIM:138900',
 'OMIM:617948',
 'OMIM:614419',
 'OMIM:617321',
 'OMIM:601550']

### Clean Phenotype string

Steps:
- Codes and Tags that are at the beginning of the string need to be cleaned. Ex. "41202#Q12#Q12 Congenital lens malformations"
- Split by `|` and keep the first element. Ex. "41202#Q613#Q61.3 Polycystic kidney| unspecified"
- Split and explode by `and`. Ex. "Source of report of L92 (granulomatous disorders of skin and subcutaneous tissue)". Problem: "Chapter B - Endocrine System and Breast". Won't do, the problems are more prevalent than the gains
- Remove `Other` from string. Ex. "41202#Q82#Q82 Other congenital malformations of skin"
- Remove trailing parenthesis. Ex. "Source of report of L52 (erythema nodosum)"
- Make the string lowercase and remove trailing whitespaces.

In [28]:
pheno

Unnamed: 0,phenotype,OMIM,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286
...,...,...,...
1514,Mean corpuscular haemoglobin,Anemia; sideroblastic; 1; 300751 (3); X-linked...,OMIM:300751
1514,Mean corpuscular haemoglobin,Anemia; sideroblastic; 1; 300751 (3); X-linked...,OMIM:300752
1514,Mean corpuscular haemoglobin,Anemia; sideroblastic; 1; 300751 (3); X-linked...,OMIM:300751
1514,Mean corpuscular haemoglobin,Anemia; sideroblastic; 1; 300751 (3); X-linked...,OMIM:300752


In [29]:
# 1. Remove cases like "41272#W365#W36.5 Diagnostic extraction of bone marrow NEC" -> "Diagnostic extraction of bone marrow NEC"

pheno['cleaned_pheno'] = pheno.phenotype.str.split(r'^(\w+#\w+#\w*\.\w+\s)', expand=False).str[-1]

pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,20002#1427#polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Union#C92#C92 Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Union#C92#C92 Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Union#C92#C92 Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Union#C92#C92 Myeloid leukaemia


In [30]:
# 2. Remove cases like "Union#C92#C92 Myeloid leukaemia" -> "Myeloid leukaemia"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^(Union#\w+#\w+\s)', expand=False).str[-1]
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,20002#1427#polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [31]:
# 3. Remove cases like "20002#1427#polycystic kidney" -> "polycystic kidney"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^(\w+#\w+#)', expand=False).str[-1]
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [32]:
# 4. Remove cases like "Source of report of Q61 (cystic kidney disease)" -> "cystic kidney disease"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^(Source of report of \w+\s)', expand=False).str[-1]
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [33]:
# 5. Split by `|` and keep the first element. Ex. "41202#Q613#Q61.3 Polycystic kidney| unspecified"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'|', expand=False).str[0]
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [34]:
# 6. Remove trailing parenthesis. Ex. "(erythema nodosum)" -> "erythema nodosum"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.replace('[()]', '')
pheno.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia


In [35]:
# 7. Remove cases like 'Union#BlockJ09-J18#J09-J18 Influenza and pneumonia' -> ' Influenza and pneumonia'

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^Union\S+', expand=False).str[-1]

pheno[pheno['phenotype'].str.contains('Union')].head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,Myeloid leukaemia
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,Polycystic kidney


In [36]:
# 8. Remove `Other` from string. Ex. "41202#Q82#Q82 Other congenital malformations of skin"

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'Other\s', expand=False).str[-1]

pheno[pheno['phenotype'].str.contains('Other')].head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
15,Union#L30#L30 Other dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:146700,dermatitis
15,Union#L30#L30 Other dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:605803,dermatitis
15,Union#L30#L30 Other dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:146700,dermatitis
15,Union#L30#L30 Other dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:605803,dermatitis
49,Union#N28#N28 Other disorders of kidney and ur...,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,disorders of kidney and ureter


In [37]:
# 9. Remove cases like 'C92 Myeloid leukaemia' -> ' Myeloid leukaemia'

pheno['cleaned_pheno'] = pheno.cleaned_pheno.str.split(r'^\w\d{2}', expand=False).str[-1]

pheno[pheno['phenotype'].str.contains(r'^\w\d{2}')].head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
5,20002#1452#eczema|dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:146700,eczema
5,20002#1452#eczema|dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:605803,eczema
5,20002#1452#eczema|dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:146700,eczema
5,20002#1452#eczema|dermatitis,Ichthyosis vulgaris; 146700 (3); Autosomal dom...,OMIM:605803,eczema


In [38]:
# 10. Lower and strip string

pheno['cleaned_pheno'] = pheno.cleaned_pheno.apply(lambda X: X.strip().lower())

pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia


In [39]:
phenos = set(pheno.cleaned_pheno.dropna().values)

len(phenos)

534

In [None]:
phenos_mapped = {pheno: ontoma_lookup(pheno, query_code=False) for pheno in phenos}

In [41]:
len([pheno for pheno in phenos_mapped if phenos_mapped[pheno] != None])

151

### Build mappings into df to coalesce

In [43]:
pheno['pheno_mapping'] = pheno['cleaned_pheno']
pheno['omim_mapping'] = pheno['extracted_omim']

pheno = pheno.replace({'pheno_mapping': phenos_mapped, 'omim_mapping': omims_mapped})
pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno,pheno_mapping,omim_mapping
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney,EFO_0008620,EFO_1001496
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia,EFO_0000222,Orphanet_97297
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia,EFO_0000222,EFO_0000198
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia,EFO_0000222,Orphanet_97297
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia,EFO_0000222,EFO_0000198


In [44]:
pheno['mapping'] = pheno.bfill(axis=1).iloc[:, 5]

pheno.head()

Unnamed: 0,phenotype,OMIM,extracted_omim,cleaned_pheno,pheno_mapping,omim_mapping,mapping
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,OMIM:173900,polycystic kidney,EFO_0008620,EFO_1001496,EFO_1001496
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia,EFO_0000222,Orphanet_97297,Orphanet_97297
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia,EFO_0000222,EFO_0000198,EFO_0000198
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:605039,myeloid leukaemia,EFO_0000222,Orphanet_97297,Orphanet_97297
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,OMIM:614286,myeloid leukaemia,EFO_0000222,EFO_0000198,EFO_0000198


In [48]:
pheno_agg = pheno.dropna(subset=['mapping']).groupby('phenotype')['mapping'].apply(set).reset_index(name='mappings')

pheno_agg.head()

False    702
Name: mappings, dtype: int64


Unnamed: 0,phenotype,mappings
0,20001#1002#breast cancer,"{Orphanet_524, MONDO_0012565, EFO_0000673, EFO..."
1,20001#1003#skin cancer,"{EFO_0000756, Orphanet_404560, Orphanet_618}"
2,20001#1020#large bowel cancer|colorectal cancer,"{EFO_0000497, Orphanet_252202, MONDO_0023113, ..."
3,20001#1022#colon cancer|sigmoid cancer,"{EFO_0000497, Orphanet_252202, MONDO_0023113, ..."
4,20001#1030#eye and|or adnexal cancer,"{MONDO_0001187, EFO_0005717, MONDO_0002629, EF..."


## 2. A different approach.
Let's not use the phenotype strings and parse the information differently.
We know that most of the associations (specially for binary traits) are covered with data from OMIM/ClinVar.
Since the novelty is low, we want to suggest the best possible mapping with a 3-step strategy.

1. Exact matches with OnToma from the cleaned phenotype string as obtained above.
2. Cross reference between ICD10 and EFO from the codes mentiones in the phenotype string.
3. Highest ranking phenotype in the OT Platform for the given gene.

### 2.1. Union phenotypes

We want to handle union phenotypes with special care as they can some times be a conglomerate of diseases ("Source of report of L28 (lichen simplex chronicus and prurigo)") rather than an aggregated phenotype ("Union#J931#J93.1 Other spontaneous pneumothorax").

Data about them is collected in the Supp table 1.

In [3]:
union = pd.read_excel(Path.joinpath(ROOT / 'data/Supp table 1 - Studied phenotypes.xlsx'), sheet_name='Union Mapping')

union.head()

Unnamed: 0,Phenotype,Diagnoses - ICD10,Diagnoses - main ICD10,Underlying (primary) cause of death: ICD10,Contributory (secondary) causes of death: ICD10,Type of cancer: ICD10,"Non-cancer illness code, self-reported",Other Fields #1,Other Fields #2,Derived: First Occurrence (300K_v2 only),Chapter
0,Union#A01#A01 Typhoid and paratyphoid fevers,41270#A01#A01 Typhoid and paratyphoid fevers,41202#A01#A01 Typhoid and paratyphoid fevers,40001#A01#A01 Typhoid and paratyphoid fevers,40002#A01#A01 Typhoid and paratyphoid fevers,.,20002#1577#typhoid fever,.,.,130003,Chapter I Certain infectious and parasitic dis...
1,Union#A02#A02 Other Salmonella infections,41270#A02#A02 Other Salmonella infections,41202#A02#A02 Other Salmonella infections,40001#A02#A02 Other Salmonella infections,40002#A02#A02 Other Salmonella infections,.,.,.,.,130005,Chapter I Certain infectious and parasitic dis...
2,Union#A020#A02.0 Salmonella gastro-enteritis,41270#A020#A02.0 Salmonella gastro-enteritis,41202#A020#A02.0 Salmonella gastro-enteritis,40001#A020#A02.0 Salmonella gastro-enteritis,40002#A020#A02.0 Salmonella gastro-enteritis,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...
3,Union#A03#A03 Shigellosis,41270#A03#A03 Shigellosis,41202#A03#A03 Shigellosis,40001#A03#A03 Shigellosis,40002#A03#A03 Shigellosis,.,.,.,.,130007,Chapter I Certain infectious and parasitic dis...
4,Union#A04#A04 Other bacterial intestinal infec...,41270#A04#A04 Other bacterial intestinal infec...,41202#A04#A04 Other bacterial intestinal infec...,40001#A04#A04 Other bacterial intestinal infec...,40002#A04#A04 Other bacterial intestinal infec...,.,.,.,.,130009,Chapter I Certain infectious and parasitic dis...


In [4]:
# Columns about primary and secondary diseases seem to be identical. How often is this true?

union.iloc[:, [3,4]]

# It seems to be the same except for the code in the beginning. This code is the ID that UKB gives to their phenotypes. They make the distinction between primary and secondary. The union phenotype is the aggregation of them both.


Unnamed: 0,Underlying (primary) cause of death: ICD10,Contributory (secondary) causes of death: ICD10
0,40001#A01#A01 Typhoid and paratyphoid fevers,40002#A01#A01 Typhoid and paratyphoid fevers
1,40001#A02#A02 Other Salmonella infections,40002#A02#A02 Other Salmonella infections
2,40001#A020#A02.0 Salmonella gastro-enteritis,40002#A020#A02.0 Salmonella gastro-enteritis
3,40001#A03#A03 Shigellosis,40002#A03#A03 Shigellosis
4,40001#A04#A04 Other bacterial intestinal infec...,40002#A04#A04 Other bacterial intestinal infec...
...,...,...
4906,40001#W449#W44.9 Unspecified place,40002#W449#W44.9 Unspecified place
4907,40001#W45#W45 Foreign body or object entering ...,40002#W45#W45 Foreign body or object entering ...
4908,40001#W54#W54 Bitten or struck by dog,40002#W54#W54 Bitten or struck by dog
4909,40001#W55#W55 Bitten or struck by other mammals,40002#W55#W55 Bitten or struck by other mammals


In [5]:
# Let's compare them removing the codes

tmp = union.copy().iloc[:, [3,4]]


tmp['primary'] = tmp.iloc[:, 0].str.replace(r'^\d{5}', '')
tmp['secondary'] = tmp.iloc[:, 1].str.replace(r'^\d{5}', '')
tmp = tmp.iloc[:, [2,3]]

comparison_column = np.where(tmp['primary'] == tmp['secondary'], True, False)
tmp['equal'] = comparison_column

tmp.head()

  
  import sys


Unnamed: 0,primary,secondary,equal
0,#A01#A01 Typhoid and paratyphoid fevers,#A01#A01 Typhoid and paratyphoid fevers,True
1,#A02#A02 Other Salmonella infections,#A02#A02 Other Salmonella infections,True
2,#A020#A02.0 Salmonella gastro-enteritis,#A020#A02.0 Salmonella gastro-enteritis,True
3,#A03#A03 Shigellosis,#A03#A03 Shigellosis,True
4,#A04#A04 Other bacterial intestinal infections,#A04#A04 Other bacterial intestinal infections,True


In [6]:
print(tmp.query('equal == False').drop_duplicates().shape)

tmp.query('equal == False').drop_duplicates()

# The differences in the string are minimal...

(21, 3)


Unnamed: 0,primary,secondary,equal
651,#ChapterI#Chapter I Certain infectious and par...,#Chapter I#Chapter I Certain infectious and pa...,False
652,#ChapterII#Chapter II Neoplasms,#Chapter II#Chapter II Neoplasms,False
653,#ChapterIII#Chapter III Diseases of the blood ...,#Chapter III#Chapter III Diseases of the blood...,False
654,#ChapterIV#Chapter IV Endocrine| nutritional a...,#Chapter IV#Chapter IV Endocrine| nutritional ...,False
655,#ChapterIX#Chapter IX Diseases of the circulat...,#Chapter IX#Chapter IX Diseases of the circula...,False
656,#ChapterV#Chapter V Mental and behavioural dis...,#Chapter V#Chapter V Mental and behavioural di...,False
657,#ChapterVI#Chapter VI Diseases of the nervous ...,#Chapter VI#Chapter VI Diseases of the nervous...,False
658,#ChapterVII#Chapter VII Diseases of the eye an...,#Chapter VII#Chapter VII Diseases of the eye a...,False
659,#ChapterVIII#Chapter VIII Diseases of the ear ...,#Chapter VIII#Chapter VIII Diseases of the ear...,False
660,#ChapterX#Chapter X Diseases of the respirator...,#Chapter X#Chapter X Diseases of the respirato...,False


### 2.1.2 ICD10 codes extraction

... **To find the cross reference between ICD10 and EFO from the codes mentiones in the phenotype string.**

Relevant notes:
- They all start with "Union..."
- Some of them are are a "block" of codes. For example, "Union#BlockA00-A09#A00-A09 Intestinal infectious diseases" refers to all codes between A00 and A09. I have to extract these.
- Some of the phenotypes refer to complete ICD-10 Chapters, for which the group of terms is not provided. We want to drop these. For example, 'Union#ChapterI#Chapter I Certain infectious and parasitic diseases'.


In [7]:
union.head()

Unnamed: 0,Phenotype,Diagnoses - ICD10,Diagnoses - main ICD10,Underlying (primary) cause of death: ICD10,Contributory (secondary) causes of death: ICD10,Type of cancer: ICD10,"Non-cancer illness code, self-reported",Other Fields #1,Other Fields #2,Derived: First Occurrence (300K_v2 only),Chapter
0,Union#A01#A01 Typhoid and paratyphoid fevers,41270#A01#A01 Typhoid and paratyphoid fevers,41202#A01#A01 Typhoid and paratyphoid fevers,40001#A01#A01 Typhoid and paratyphoid fevers,40002#A01#A01 Typhoid and paratyphoid fevers,.,20002#1577#typhoid fever,.,.,130003,Chapter I Certain infectious and parasitic dis...
1,Union#A02#A02 Other Salmonella infections,41270#A02#A02 Other Salmonella infections,41202#A02#A02 Other Salmonella infections,40001#A02#A02 Other Salmonella infections,40002#A02#A02 Other Salmonella infections,.,.,.,.,130005,Chapter I Certain infectious and parasitic dis...
2,Union#A020#A02.0 Salmonella gastro-enteritis,41270#A020#A02.0 Salmonella gastro-enteritis,41202#A020#A02.0 Salmonella gastro-enteritis,40001#A020#A02.0 Salmonella gastro-enteritis,40002#A020#A02.0 Salmonella gastro-enteritis,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...
3,Union#A03#A03 Shigellosis,41270#A03#A03 Shigellosis,41202#A03#A03 Shigellosis,40001#A03#A03 Shigellosis,40002#A03#A03 Shigellosis,.,.,.,.,130007,Chapter I Certain infectious and parasitic dis...
4,Union#A04#A04 Other bacterial intestinal infec...,41270#A04#A04 Other bacterial intestinal infec...,41202#A04#A04 Other bacterial intestinal infec...,40001#A04#A04 Other bacterial intestinal infec...,40002#A04#A04 Other bacterial intestinal infec...,.,.,.,.,130009,Chapter I Certain infectious and parasitic dis...


In [23]:
# The phenotype usually follows the pattern: "Union#{code1}#{code2} {label}" (except when there is a chapter). Ex. "Union#A410#A41.0 Septicaemia due to Staphylococcus aureus".
# I want code2 because it is the more exact one to the ICD10 nomenclature

tmp = union.copy()[~union['Phenotype'].str.contains('Chapter')]

tmp['icd_code'] = tmp.Phenotype.str.split('#', expand=False).str[-1].str.split(' ').str[0]
tmp.head(5)

Unnamed: 0,Phenotype,Diagnoses - ICD10,Diagnoses - main ICD10,Underlying (primary) cause of death: ICD10,Contributory (secondary) causes of death: ICD10,Type of cancer: ICD10,"Non-cancer illness code, self-reported",Other Fields #1,Other Fields #2,Derived: First Occurrence (300K_v2 only),Chapter,icd_code
0,Union#A01#A01 Typhoid and paratyphoid fevers,41270#A01#A01 Typhoid and paratyphoid fevers,41202#A01#A01 Typhoid and paratyphoid fevers,40001#A01#A01 Typhoid and paratyphoid fevers,40002#A01#A01 Typhoid and paratyphoid fevers,.,20002#1577#typhoid fever,.,.,130003,Chapter I Certain infectious and parasitic dis...,A01
1,Union#A02#A02 Other Salmonella infections,41270#A02#A02 Other Salmonella infections,41202#A02#A02 Other Salmonella infections,40001#A02#A02 Other Salmonella infections,40002#A02#A02 Other Salmonella infections,.,.,.,.,130005,Chapter I Certain infectious and parasitic dis...,A02
2,Union#A020#A02.0 Salmonella gastro-enteritis,41270#A020#A02.0 Salmonella gastro-enteritis,41202#A020#A02.0 Salmonella gastro-enteritis,40001#A020#A02.0 Salmonella gastro-enteritis,40002#A020#A02.0 Salmonella gastro-enteritis,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...,A02.0
3,Union#A03#A03 Shigellosis,41270#A03#A03 Shigellosis,41202#A03#A03 Shigellosis,40001#A03#A03 Shigellosis,40002#A03#A03 Shigellosis,.,.,.,.,130007,Chapter I Certain infectious and parasitic dis...,A03
4,Union#A04#A04 Other bacterial intestinal infec...,41270#A04#A04 Other bacterial intestinal infec...,41202#A04#A04 Other bacterial intestinal infec...,40001#A04#A04 Other bacterial intestinal infec...,40002#A04#A04 Other bacterial intestinal infec...,.,.,.,.,130009,Chapter I Certain infectious and parasitic dis...,A04


In [24]:
# Blocks are also nicely extracted
tmp[tmp.Phenotype.str.contains('Block')]

Unnamed: 0,Phenotype,Diagnoses - ICD10,Diagnoses - main ICD10,Underlying (primary) cause of death: ICD10,Contributory (secondary) causes of death: ICD10,Type of cancer: ICD10,"Non-cancer illness code, self-reported",Other Fields #1,Other Fields #2,Derived: First Occurrence (300K_v2 only),Chapter,icd_code
162,Union#BlockA00-A09#A00-A09 Intestinal infectio...,41270#BlockA00-A09#A00-A09 Intestinal infectio...,41202#BlockA00-A09#A00-A09 Intestinal infectio...,40001#BlockA00-A09#A00-A09 Intestinal infectio...,40002#BlockA00-A09#A00-A09 Intestinal infectio...,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...,A00-A09
163,Union#BlockA15-A19#A15-A19 Tuberculosis,41270#BlockA15-A19#A15-A19 Tuberculosis,41202#BlockA15-A19#A15-A19 Tuberculosis,40001#BlockA15-A19#A15-A19 Tuberculosis,40002#BlockA15-A19#A15-A19 Tuberculosis,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...,A15-A19
164,Union#BlockA20-A28#A20-A28 Certain zoonotic ba...,41270#BlockA20-A28#A20-A28 Certain zoonotic ba...,41202#BlockA20-A28#A20-A28 Certain zoonotic ba...,40001#BlockA20-A28#A20-A28 Certain zoonotic ba...,40002#BlockA20-A28#A20-A28 Certain zoonotic ba...,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...,A20-A28
165,Union#BlockA30-A49#A30-A49 Other bacterial dis...,41270#BlockA30-A49#A30-A49 Other bacterial dis...,41202#BlockA30-A49#A30-A49 Other bacterial dis...,40001#BlockA30-A49#A30-A49 Other bacterial dis...,40002#BlockA30-A49#A30-A49 Other bacterial dis...,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...,A30-A49
166,Union#BlockA50-A64#A50-A64 Infections with a p...,41270#BlockA50-A64#A50-A64 Infections with a p...,41202#BlockA50-A64#A50-A64 Infections with a p...,40001#BlockA50-A64#A50-A64 Infections with a p...,40002#BlockA50-A64#A50-A64 Infections with a p...,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...,A50-A64
...,...,...,...,...,...,...,...,...,...,...,...,...
389,Union#BlockZ30-Z39#Z30-Z39 Persons encounterin...,41270#BlockZ30-Z39#Z30-Z39 Persons encounterin...,41202#BlockZ30-Z39#Z30-Z39 Persons encounterin...,40001#BlockZ30-Z39#Z30-Z39 Persons encounterin...,40002#BlockZ30-Z39#Z30-Z39 Persons encounterin...,.,.,.,.,.,Chapter XXI Factors influencing health status ...,Z30-Z39
390,Union#BlockZ40-Z54#Z40-Z54 Persons encounterin...,41270#BlockZ40-Z54#Z40-Z54 Persons encounterin...,41202#BlockZ40-Z54#Z40-Z54 Persons encounterin...,40001#BlockZ40-Z54#Z40-Z54 Persons encounterin...,40002#BlockZ40-Z54#Z40-Z54 Persons encounterin...,.,.,.,.,.,Chapter XXI Factors influencing health status ...,Z40-Z54
391,Union#BlockZ55-Z65#Z55-Z65 Persons with potent...,41270#BlockZ55-Z65#Z55-Z65 Persons with potent...,41202#BlockZ55-Z65#Z55-Z65 Persons with potent...,40001#BlockZ55-Z65#Z55-Z65 Persons with potent...,40002#BlockZ55-Z65#Z55-Z65 Persons with potent...,.,.,.,.,.,Chapter XXI Factors influencing health status ...,Z55-Z65
392,Union#BlockZ70-Z76#Z70-Z76 Persons encounterin...,41270#BlockZ70-Z76#Z70-Z76 Persons encounterin...,41202#BlockZ70-Z76#Z70-Z76 Persons encounterin...,40001#BlockZ70-Z76#Z70-Z76 Persons encounterin...,40002#BlockZ70-Z76#Z70-Z76 Persons encounterin...,.,.,.,.,.,Chapter XXI Factors influencing health status ...,Z70-Z76


In [25]:
blocks = list({block for block in tmp[tmp.Phenotype.str.contains('Block')].icd_code})

blocks[:5]

['R80-R82', 'J20-J22', 'O20-O29', 'O85-O92', 'S20-S29']

In [26]:
# I'll extract all codes contained between the blocks

def extract_codes_in_a_block(block:str):
    """
    Ex. 'Q80-Q83' --> ['Q80', 'Q81', 'Q82', 'Q83']
    """
    try:
        first_letter = block[0]
        low_limit, up_limit = re.findall(r'\d{2}', block)
        return [first_letter + str(i) for i in range(int(low_limit), int(up_limit) + 1)]
    except Exception as e:
        print(e)
        print(f'Block {block} could not be extracted.')


In [27]:
blocks_extracted = {block: extract_codes_in_a_block(block) for block in blocks}

blocks_extracted['Q80-Q89']

['Q80', 'Q81', 'Q82', 'Q83', 'Q84', 'Q85', 'Q86', 'Q87', 'Q88', 'Q89']

In [28]:
# Bring the extracted codes from the block to the df 

tmp['icd_code'] = tmp.icd_code.map(blocks_extracted).fillna(tmp.icd_code)
tmp = tmp.explode('icd_code')

tmp.head()

Unnamed: 0,Phenotype,Diagnoses - ICD10,Diagnoses - main ICD10,Underlying (primary) cause of death: ICD10,Contributory (secondary) causes of death: ICD10,Type of cancer: ICD10,"Non-cancer illness code, self-reported",Other Fields #1,Other Fields #2,Derived: First Occurrence (300K_v2 only),Chapter,icd_code
0,Union#A01#A01 Typhoid and paratyphoid fevers,41270#A01#A01 Typhoid and paratyphoid fevers,41202#A01#A01 Typhoid and paratyphoid fevers,40001#A01#A01 Typhoid and paratyphoid fevers,40002#A01#A01 Typhoid and paratyphoid fevers,.,20002#1577#typhoid fever,.,.,130003,Chapter I Certain infectious and parasitic dis...,A01
1,Union#A02#A02 Other Salmonella infections,41270#A02#A02 Other Salmonella infections,41202#A02#A02 Other Salmonella infections,40001#A02#A02 Other Salmonella infections,40002#A02#A02 Other Salmonella infections,.,.,.,.,130005,Chapter I Certain infectious and parasitic dis...,A02
2,Union#A020#A02.0 Salmonella gastro-enteritis,41270#A020#A02.0 Salmonella gastro-enteritis,41202#A020#A02.0 Salmonella gastro-enteritis,40001#A020#A02.0 Salmonella gastro-enteritis,40002#A020#A02.0 Salmonella gastro-enteritis,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...,A02.0
3,Union#A03#A03 Shigellosis,41270#A03#A03 Shigellosis,41202#A03#A03 Shigellosis,40001#A03#A03 Shigellosis,40002#A03#A03 Shigellosis,.,.,.,.,130007,Chapter I Certain infectious and parasitic dis...,A03
4,Union#A04#A04 Other bacterial intestinal infec...,41270#A04#A04 Other bacterial intestinal infec...,41202#A04#A04 Other bacterial intestinal infec...,40001#A04#A04 Other bacterial intestinal infec...,40002#A04#A04 Other bacterial intestinal infec...,.,.,.,.,130009,Chapter I Certain infectious and parasitic dis...,A04
