In [None]:
from typing import List
import re

from ontoma import OnToma
import numpy as np
import pandas as pd
from pathlib import Path

from pyspark.sql import DataFrame, SparkSession, Window
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructType, StructField
import pyspark.sql.functions as F

spark = (SparkSession.builder
         .master('local[*]')
         .config("spark.driver.memory", "15g")
         .appName('spark')
         .getOrCreate())

In [2]:
ROOT = Path.cwd().parent

In [5]:

binary = pd.read_excel(Path.joinpath(ROOT / 'data/publication/Supp Table 8 - Collapsing analysis top hits.xlsx'), sheet_name='Binary')
quant = pd.read_excel(Path.joinpath(ROOT / 'data/publication/Supp Table 8 - Collapsing analysis top hits.xlsx'), sheet_name='Quantitative')

In [4]:
print("Assoc with binary trait:\n", binary.iloc[0])
print("Assoc with quantitative trait:\n", quant.iloc[0])

Assoc with binary trait:
 model                                                      ptv
phenotype                         20002#1427#polycystic kidney
root          Chapter XIV Diseases of the genitourinary system
Gene                                                      PKD1
CaseQ                                                       39
CaseNQ                                                     134
Case.Freq                                               0.2254
CtrlQ                                                       43
CtrlNQ                                                  268739
Ctrl.Freq                                               0.0002
p.value                                                    0.0
OR                                                   1818.9554
OR.LCI                                               1142.1813
OR.UCI                                               2896.7369
OMIM_code                                                    1
OMIM         Polycystic kidne

In [6]:
quant.columns

Index(['model', 'Gene', 'Field', 'Path', 'Pheno', 'nSamples', 'nCarriers',
       'nNonCarriers', 'p-value', 'beta', 'lower', 'upper', 'se', 'OMIM'],
      dtype='object')

## Get all phenotypes together

- 1765 associations with 844 binary and quantitative traits without cleaning (689 are binary, 155 are quantitative)
- 1516 **statistically significant** assocs.
- There are 746 different phenotypes, of which 702 I am able to map using OnToma (94%).
- ~Out of 746, 667 have OMIM notation.~ OMIM annotation is not an accurate way to map phenotypes.
  - There are 396 different OMIM codes. More than 90% are mappable with OnToma (370/396).
  - 79 do not have an OMIM xref. Phenotype string needs to be queried.

- Phenotypes after cleaning: 151 out of 534 mappable with OnToma after several cleaning steps.

In [6]:
pheno = (
    pd.concat(
        [binary[['phenotype', 'OMIM', 'p.value', 'Gene']], quant.rename(columns={'Pheno':'phenotype', 'p-value':'p.value'})[['phenotype', 'OMIM', 'Gene', 'p.value']]]
    )

    # Extract significant associations. Threshold = 2x10^-9
    .query('(`p.value` < 2e-9) and (Gene.notnull())').drop('p.value', axis=1)

    .drop_duplicates()
    .reset_index(drop=True)
)
print(pheno.shape)
pheno.head()


(1703, 3)


Unnamed: 0,phenotype,OMIM,Gene
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,PKD1
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1
4,Union#Q61#Q61 Cystic kidney disease,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1


In [131]:
pheno['extracted_omim'] = pheno['OMIM'].str.findall(r"\d{6}")
pheno.head()

Unnamed: 0,phenotype,OMIM,Gene,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,[173900]
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,"[605039, 614286]"
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,[173900]
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,[173900]
4,Union#Q61#Q61 Cystic kidney disease,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,[173900]


In [132]:
pheno = pheno.explode('extracted_omim')
pheno['extracted_omim'] = 'OMIM:' + pheno.extracted_omim
pheno.head()

Unnamed: 0,phenotype,OMIM,Gene,extracted_omim
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:605039
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:614286
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900


In [133]:
omims = set(pheno.extracted_omim.dropna().values)

len(omims)

396

In [134]:
pheno.drop_duplicates(subset='phenotype').extracted_omim.isna().value_counts()

False    667
True      79
Name: extracted_omim, dtype: int64

## Map OMIM codes

In [None]:
otmap = OnToma()

In [20]:
def ontoma_lookup(term, query_code):
    """
    OnToma is first queried with the code flag, if this fails common steps for a string are used.
    """
    try:
        if query_code:
            res = otmap.find_term(term, code=True)

            if len(res) > 0:
                return res[0].id_ot_schema

        res = otmap.find_term(term)
        if len(res) > 0:
            return res[0].id_ot_schema

        return
    except Exception:
        print(term)

In [None]:
omims_mapped = {term: ontoma_lookup(term, query_code=True) for term in omims}

In [24]:
unmapped_omims = [term for term in omims_mapped.keys() if omims_mapped[term] == None]

print(len(unmapped_omims))
unmapped_omims

26


['OMIM:601800',
 'OMIM:608404',
 'OMIM:617981',
 'OMIM:613463',
 'OMIM:111150',
 'OMIM:112050',
 'OMIM:110500',
 'OMIM:616622',
 'OMIM:615881',
 'OMIM:610762',
 'OMIM:617671',
 'OMIM:612797',
 'OMIM:615264',
 'OMIM:138500',
 'OMIM:617966',
 'OMIM:601551',
 'OMIM:601816',
 'OMIM:112010',
 'OMIM:609338',
 'OMIM:601884',
 'OMIM:614490',
 'OMIM:138900',
 'OMIM:617948',
 'OMIM:614419',
 'OMIM:617321',
 'OMIM:601550']

### Clean Phenotype string

Steps:
- Codes and Tags that are at the beginning of the string need to be cleaned. Ex. "41202#Q12#Q12 Congenital lens malformations"
- Split by `|` and keep the first element. Ex. "41202#Q613#Q61.3 Polycystic kidney| unspecified"
- Split and explode by `and`. Ex. "Source of report of L92 (granulomatous disorders of skin and subcutaneous tissue)". Problem: "Chapter B - Endocrine System and Breast". Won't do, the problems are more prevalent than the gains
- Remove `Other` from string. Ex. "41202#Q82#Q82 Other congenital malformations of skin"
- Remove trailing parenthesis. Ex. "Source of report of L52 (erythema nodosum)"
- Make the string lowercase and remove trailing whitespaces.

In [156]:
ontoma = pheno.filter(items=['phenotype'])

ontoma

Unnamed: 0,phenotype
0,20002#1427#polycystic kidney
1,Union#C92#C92 Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type
...,...
1699,Leg predicted mass (left)
1700,Alkaline phosphatase
1701,Mean corpuscular haemoglobin
1701,Mean corpuscular haemoglobin


In [157]:
# 1. Remove cases like "41272#W365#W36.5 Diagnostic extraction of bone marrow NEC" -> "Diagnostic extraction of bone marrow NEC"

ontoma['cleaned_pheno'] = ontoma.phenotype.str.split(r'^(\w+#\w+#\w*\.\w+\s)', expand=False).str[-1]

ontoma.head()

Unnamed: 0,phenotype,cleaned_pheno
0,20002#1427#polycystic kidney,20002#1427#polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Union#C92#C92 Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Union#C92#C92 Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Source of report of Q61 (cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney| adult type


In [158]:
# 2. Remove cases like "Union#C92#C92 Myeloid leukaemia" -> "Myeloid leukaemia"

ontoma['cleaned_pheno'] = ontoma.cleaned_pheno.str.split(r'^(Union#\w+#\w+\s)', expand=False).str[-1]
ontoma.head()

Unnamed: 0,phenotype,cleaned_pheno
0,20002#1427#polycystic kidney,20002#1427#polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Source of report of Q61 (cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney| adult type


In [159]:
# 3. Remove cases like "20002#1427#polycystic kidney" -> "polycystic kidney"

ontoma['cleaned_pheno'] = ontoma.cleaned_pheno.str.split(r'^(\w+#\w+#)', expand=False).str[-1]
ontoma.head()

Unnamed: 0,phenotype,cleaned_pheno
0,20002#1427#polycystic kidney,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Source of report of Q61 (cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney| adult type


In [160]:
# 4. Remove cases like "Source of report of Q61 (cystic kidney disease)" -> "cystic kidney disease"

ontoma['cleaned_pheno'] = ontoma.cleaned_pheno.str.split(r'^(Source of report of \w+\s)', expand=False).str[-1]
ontoma.head()

Unnamed: 0,phenotype,cleaned_pheno
0,20002#1427#polycystic kidney,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),(cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney| adult type


In [161]:
# 5. Split by `|` and keep the first element. Ex. "41202#Q613#Q61.3 Polycystic kidney| unspecified"

ontoma['cleaned_pheno'] = ontoma.cleaned_pheno.str.split(r'|', expand=False).str[0]
ontoma.head()

Unnamed: 0,phenotype,cleaned_pheno
0,20002#1427#polycystic kidney,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),(cystic kidney disease)
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney


In [162]:
# 6. Remove trailing parenthesis. Ex. "(erythema nodosum)" -> "erythema nodosum"

ontoma['cleaned_pheno'] = ontoma.cleaned_pheno.str.replace('[()]', '')
ontoma.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,phenotype,cleaned_pheno
0,20002#1427#polycystic kidney,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),cystic kidney disease
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney


In [163]:
# 7. Remove cases like 'Union#BlockJ09-J18#J09-J18 Influenza and pneumonia' -> ' Influenza and pneumonia'

ontoma['cleaned_pheno'] = ontoma.cleaned_pheno.str.split(r'^Union\S+', expand=False).str[-1]

ontoma[ontoma['phenotype'].str.contains('Union')].head()

Unnamed: 0,phenotype,cleaned_pheno
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Myeloid leukaemia
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney
4,Union#Q61#Q61 Cystic kidney disease,Cystic kidney disease
7,Union#D56#D56 Thalassaemia,Thalassaemia


In [164]:
# 8. Remove `Other` from string. Ex. "41202#Q82#Q82 Other congenital malformations of skin"

ontoma['cleaned_pheno'] = ontoma.cleaned_pheno.str.split(r'Other\s', expand=False).str[-1]

ontoma[ontoma['phenotype'].str.contains('Other')].head()

Unnamed: 0,phenotype,cleaned_pheno
15,Union#L30#L30 Other dermatitis,dermatitis
15,Union#L30#L30 Other dermatitis,dermatitis
49,Union#N28#N28 Other disorders of kidney and ur...,disorders of kidney and ureter
86,41200#X33#X33 Other blood transfusion,blood transfusion
86,41200#X33#X33 Other blood transfusion,blood transfusion


In [165]:
# 9. Remove cases like 'C92 Myeloid leukaemia' -> ' Myeloid leukaemia'

ontoma['cleaned_pheno'] = ontoma.cleaned_pheno.str.split(r'^\w\d{2}', expand=False).str[-1]

ontoma[ontoma['phenotype'].str.contains(r'^\w\d{2}')].head()

Unnamed: 0,phenotype,cleaned_pheno
0,20002#1427#polycystic kidney,polycystic kidney
5,20002#1452#eczema|dermatitis,eczema
5,20002#1452#eczema|dermatitis,eczema
6,40001#C92#C92 Myeloid leukaemia,Myeloid leukaemia
6,40001#C92#C92 Myeloid leukaemia,Myeloid leukaemia


In [166]:
# 10. Lower and strip string

ontoma['cleaned_pheno'] = ontoma.cleaned_pheno.apply(lambda X: X.strip().lower())

ontoma.head()

Unnamed: 0,phenotype,cleaned_pheno
0,20002#1427#polycystic kidney,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),cystic kidney disease
3,Union#Q612#Q61.2 Polycystic kidney| adult type,polycystic kidney


In [167]:
phenos = set(ontoma.cleaned_pheno.dropna().values)

len(phenos)

534

In [None]:
phenos_mapped = {pheno: ontoma_lookup(pheno, query_code=False) for pheno in phenos}

In [147]:
len([pheno for pheno in phenos_mapped if phenos_mapped[pheno] != None])

151

## 2. A different approach.
Let's not use the phenotype strings and parse the information differently.
We know that most of the associations (specially for binary traits) are covered with data from OMIM/ClinVar.
Since the novelty is low, we want to suggest the best possible mapping with a 3-step strategy.

1. Exact matches with OnToma from the cleaned phenotype string as obtained above (stored in `ontoma`).
2. Cross reference between ICD10 and EFO from the codes mentioned in the phenotype string (stored in `icd`).
3. Highest ranking phenotype in the OT Platform for the given gene (stored in `assocs`).

### 2.1. Union phenotypes

We want to handle union phenotypes with special care as they can some times be a conglomerate of diseases ("Source of report of L28 (lichen simplex chronicus and prurigo)") rather than an aggregated phenotype ("Union#J931#J93.1 Other spontaneous pneumothorax").

Data about them is collected in the Supp table 1.

In [8]:
unions = (
    pd.read_excel(Path.joinpath(ROOT / 'data/publication/Supp table 1 - Studied phenotypes.xlsx'), sheet_name='Union Mapping')

    # Merge with `pheno` to keep only the phenotypes for which there is associations
    .merge(pheno.filter(items=['phenotype']), left_on='Phenotype', right_on='phenotype', how='inner')
    .drop('phenotype', axis=1)

)

unions.head()

Unnamed: 0,Phenotype,Diagnoses - ICD10,Diagnoses - main ICD10,Underlying (primary) cause of death: ICD10,Contributory (secondary) causes of death: ICD10,Type of cancer: ICD10,"Non-cancer illness code, self-reported",Other Fields #1,Other Fields #2,Derived: First Occurrence (300K_v2 only),Chapter
0,Union#A41#A41 Other septicaemia,41270#A41#A41 Other septicaemia,41202#A41#A41 Other septicaemia,40001#A41#A41 Other septicaemia,40002#A41#A41 Other septicaemia,.,20002#1657#septicaemia | sepsis,.,.,130071,Chapter I Certain infectious and parasitic dis...
1,Union#A419#A41.9 Septicaemia| unspecified,41270#A419#A41.9 Septicaemia| unspecified,41202#A419#A41.9 Septicaemia| unspecified,40001#A419#A41.9 Septicaemia| unspecified,40002#A419#A41.9 Septicaemia| unspecified,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...
2,Union#A419#A41.9 Septicaemia| unspecified,41270#A419#A41.9 Septicaemia| unspecified,41202#A419#A41.9 Septicaemia| unspecified,40001#A419#A41.9 Septicaemia| unspecified,40002#A419#A41.9 Septicaemia| unspecified,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...
3,Union#B35#B35 Dermatophytosis,41270#B35#B35 Dermatophytosis,41202#B35#B35 Dermatophytosis,40001#B35#B35 Dermatophytosis,40002#B35#B35 Dermatophytosis,.,.,.,.,130227,Chapter I Certain infectious and parasitic dis...
4,Union#BlockA30-A49#A30-A49 Other bacterial dis...,41270#BlockA30-A49#A30-A49 Other bacterial dis...,41202#BlockA30-A49#A30-A49 Other bacterial dis...,40001#BlockA30-A49#A30-A49 Other bacterial dis...,40002#BlockA30-A49#A30-A49 Other bacterial dis...,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...


In [169]:
# Columns about primary and secondary diseases seem to be identical. How often is this true?

unions.iloc[:, [3,4]]

# It seems to be the same except for the code in the beginning. 
# This code is the ID that UKB gives to their phenotypes. 
# They make the distinction between primary and secondary. 
# The union phenotype is the aggregation of them both.


Unnamed: 0,Underlying (primary) cause of death: ICD10,Contributory (secondary) causes of death: ICD10
0,40001#A41#A41 Other septicaemia,40002#A41#A41 Other septicaemia
1,40001#A419#A41.9 Septicaemia| unspecified,40002#A419#A41.9 Septicaemia| unspecified
2,40001#A419#A41.9 Septicaemia| unspecified,40002#A419#A41.9 Septicaemia| unspecified
3,40001#B35#B35 Dermatophytosis,40002#B35#B35 Dermatophytosis
4,40001#B35#B35 Dermatophytosis,40002#B35#B35 Dermatophytosis
...,...,...
956,40001#R31#R31 Unspecified haematuria,40002#R31#R31 Unspecified haematuria
957,40001#T82#T82 Complications of cardiac and vas...,40002#T82#T82 Complications of cardiac and vas...
958,40001#T828#T82.8 Other complications of cardia...,40002#T828#T82.8 Other complications of cardia...
959,40001#T86#T86 Failure and rejection of transpl...,40002#T86#T86 Failure and rejection of transpl...


In [170]:
# Let's compare them removing the codes

tmp = unions.copy().iloc[:, [3,4]]


tmp['primary'] = tmp.iloc[:, 0].str.replace(r'^\d{5}', '')
tmp['secondary'] = tmp.iloc[:, 1].str.replace(r'^\d{5}', '')
tmp = tmp.iloc[:, [2,3]]

comparison_column = np.where(tmp['primary'] == tmp['secondary'], True, False)
tmp['equal'] = comparison_column

tmp.head()

  
  import sys


Unnamed: 0,primary,secondary,equal
0,#A41#A41 Other septicaemia,#A41#A41 Other septicaemia,True
1,#A419#A41.9 Septicaemia| unspecified,#A419#A41.9 Septicaemia| unspecified,True
2,#A419#A41.9 Septicaemia| unspecified,#A419#A41.9 Septicaemia| unspecified,True
3,#B35#B35 Dermatophytosis,#B35#B35 Dermatophytosis,True
4,#B35#B35 Dermatophytosis,#B35#B35 Dermatophytosis,True


In [171]:
print(tmp.query('equal == False').drop_duplicates().shape)

tmp.query('equal == False').drop_duplicates()

# The differences in the string are minimal...

(5, 3)


Unnamed: 0,primary,secondary,equal
392,#ChapterII#Chapter II Neoplasms,#Chapter II#Chapter II Neoplasms,False
407,#ChapterIII#Chapter III Diseases of the blood ...,#Chapter III#Chapter III Diseases of the blood...,False
420,#ChapterIV#Chapter IV Endocrine| nutritional a...,#Chapter IV#Chapter IV Endocrine| nutritional ...,False
422,#ChapterVII#Chapter VII Diseases of the eye an...,#Chapter VII#Chapter VII Diseases of the eye a...,False
423,#ChapterXVII#Chapter XVII Congenital malformat...,#Chapter XVII#Chapter XVII Congenital malforma...,False


### 2.1.2 ICD10 codes extraction

... **To find the cross reference between ICD10 and EFO from the codes mentiones in the phenotype string.**

Results:
- Circa 50% coverage with EFO. (3238/6552)

Relevant notes:
- They all start with "Union..."
- Some of them are are a "block" of codes. For example, "Union#BlockA00-A09#A00-A09 Intestinal infectious diseases" refers to all codes between A00 and A09. I have to extract these.
- Some of the phenotypes refer to complete ICD-10 Chapters, for which the group of terms is not provided. We want to drop these. For example, 'Union#ChapterI#Chapter I Certain infectious and parasitic diseases'.


In [172]:
unions.head()

Unnamed: 0,Phenotype,Diagnoses - ICD10,Diagnoses - main ICD10,Underlying (primary) cause of death: ICD10,Contributory (secondary) causes of death: ICD10,Type of cancer: ICD10,"Non-cancer illness code, self-reported",Other Fields #1,Other Fields #2,Derived: First Occurrence (300K_v2 only),Chapter
0,Union#A41#A41 Other septicaemia,41270#A41#A41 Other septicaemia,41202#A41#A41 Other septicaemia,40001#A41#A41 Other septicaemia,40002#A41#A41 Other septicaemia,.,20002#1657#septicaemia | sepsis,.,.,130071,Chapter I Certain infectious and parasitic dis...
1,Union#A419#A41.9 Septicaemia| unspecified,41270#A419#A41.9 Septicaemia| unspecified,41202#A419#A41.9 Septicaemia| unspecified,40001#A419#A41.9 Septicaemia| unspecified,40002#A419#A41.9 Septicaemia| unspecified,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...
2,Union#A419#A41.9 Septicaemia| unspecified,41270#A419#A41.9 Septicaemia| unspecified,41202#A419#A41.9 Septicaemia| unspecified,40001#A419#A41.9 Septicaemia| unspecified,40002#A419#A41.9 Septicaemia| unspecified,.,.,.,.,.,Chapter I Certain infectious and parasitic dis...
3,Union#B35#B35 Dermatophytosis,41270#B35#B35 Dermatophytosis,41202#B35#B35 Dermatophytosis,40001#B35#B35 Dermatophytosis,40002#B35#B35 Dermatophytosis,.,.,.,.,130227,Chapter I Certain infectious and parasitic dis...
4,Union#B35#B35 Dermatophytosis,41270#B35#B35 Dermatophytosis,41202#B35#B35 Dermatophytosis,40001#B35#B35 Dermatophytosis,40002#B35#B35 Dermatophytosis,.,.,.,.,130227,Chapter I Certain infectious and parasitic dis...


In [173]:
# The phenotype usually follows the pattern: "Union#{code1}#{code2} {label}" (except when there is a chapter).
# Ex. "Union#A410#A41.0 Septicaemia due to Staphylococcus aureus".
# I want code2 because it is the more exact one to the ICD10 nomenclature

icd = (
    unions.copy()[~unions['Phenotype'].str.contains('Chapter')]
    .filter(items=['Phenotype'])
)

icd['icd_code'] = icd.Phenotype.str.split('#', expand=False).str[-1].str.split(' ').str[0]
icd.head(5)

Unnamed: 0,Phenotype,icd_code
0,Union#A41#A41 Other septicaemia,A41
1,Union#A419#A41.9 Septicaemia| unspecified,A41.9
2,Union#A419#A41.9 Septicaemia| unspecified,A41.9
3,Union#B35#B35 Dermatophytosis,B35
4,Union#B35#B35 Dermatophytosis,B35


In [174]:
# Blocks are also nicely extracted
icd[icd.Phenotype.str.contains('Block')].head()

Unnamed: 0,Phenotype,icd_code
5,Union#BlockA30-A49#A30-A49 Other bacterial dis...,A30-A49
6,Union#BlockA30-A49#A30-A49 Other bacterial dis...,A30-A49
7,Union#BlockC15-C26#C15-C26 Malignant neoplasms...,C15-C26
8,Union#BlockC15-C26#C15-C26 Malignant neoplasms...,C15-C26
9,Union#BlockC15-C26#C15-C26 Malignant neoplasms...,C15-C26


In [175]:
blocks = list({block for block in icd[icd.Phenotype.str.contains('Block')].icd_code})

blocks[:5]

['I30-I52', 'Q65-Q79', 'I70-I79', 'L20-L30', 'Q20-Q28']

In [176]:
# I'll extract all codes contained between the blocks

def extract_codes_in_a_block(block:str) -> List[str]:
    """
    Ex. 'Q80-Q83' --> ['Q80', 'Q81', 'Q82', 'Q83']
    """
    try:
        first_letter = block[0]
        low_limit, up_limit = re.findall(r'\d{2}', block)
        return [first_letter + str(i) for i in range(int(low_limit), int(up_limit) + 1)]
    except Exception as e:
        print(e)
        print(f'Block {block} could not be extracted.')


In [177]:
blocks_extracted = {block: extract_codes_in_a_block(block) for block in blocks}

blocks_extracted['Q80-Q89']

['Q80', 'Q81', 'Q82', 'Q83', 'Q84', 'Q85', 'Q86', 'Q87', 'Q88', 'Q89']

In [178]:
# Bring the extracted codes from the block to the df 

icd['icd_id'] = icd.icd_code.map(blocks_extracted).fillna(icd.icd_code)
icd = icd.explode('icd_id')
icd['icd_id'] = icd.icd_id.apply(lambda X: 'ICD10:' + X)

icd.head(10)

Unnamed: 0,Phenotype,icd_code,icd_id
0,Union#A41#A41 Other septicaemia,A41,ICD10:A41
1,Union#A419#A41.9 Septicaemia| unspecified,A41.9,ICD10:A41.9
2,Union#A419#A41.9 Septicaemia| unspecified,A41.9,ICD10:A41.9
3,Union#B35#B35 Dermatophytosis,B35,ICD10:B35
4,Union#B35#B35 Dermatophytosis,B35,ICD10:B35
5,Union#BlockA30-A49#A30-A49 Other bacterial dis...,A30-A49,ICD10:A30
5,Union#BlockA30-A49#A30-A49 Other bacterial dis...,A30-A49,ICD10:A31
5,Union#BlockA30-A49#A30-A49 Other bacterial dis...,A30-A49,ICD10:A32
5,Union#BlockA30-A49#A30-A49 Other bacterial dis...,A30-A49,ICD10:A33
5,Union#BlockA30-A49#A30-A49 Other bacterial dis...,A30-A49,ICD10:A34


In [243]:
## Merge xref from the disease index

diseases = (
    spark.read.parquet('/Users/irene/Documents/dev/pyspark/22.02.4/diseases')
    .select(F.col('id').alias('efo_id'), F.col('name').alias('efo_name'), 'dbXRefs')
    .withColumn('icd_id', F.explode('dbXRefs')).drop('dbXRefs')
    .filter(F.col('icd_id').contains('ICD10'))
    .toPandas()
)

diseases.head()

Unnamed: 0,efo_id,efo_name,icd_id
0,MONDO_0001574,capillary disease,ICD10:I78.9
1,MONDO_0001574,capillary disease,ICD10:I78
2,MONDO_0011895,idiopathic hypereosinophilic syndrome,ICD10:D47.5
3,MONDO_0017375,congenital enterovirus infection,ICD10:P35.8
4,MONDO_0024305,acquired hyperprolactinemia,ICD10:E22.1


In [246]:
diseases[diseases.icd_id.str.contains('\.')].shape[0]

8263

In [180]:
icd = (
    icd[['icd_id', 'Phenotype']]
    .merge(diseases, on='icd_id', how='left').drop_duplicates()
    .rename(columns={'efo_id': 'icd_mapping'})
)

icd.head()

Unnamed: 0,icd_id,Phenotype,icd_mapping,efo_name
0,ICD10:A41,Union#A41#A41 Other septicaemia,,
1,ICD10:A41.9,Union#A419#A41.9 Septicaemia| unspecified,,
3,ICD10:B35,Union#B35#B35 Dermatophytosis,MONDO_0004678,dermatophytosis
4,ICD10:B35,Union#B35#B35 Dermatophytosis,EFO_0007511,tinea favosa
7,ICD10:A30,Union#BlockA30-A49#A30-A49 Other bacterial dis...,EFO_0001054,leprosy


### 2.3. Highest ranking phenotype in the OT Platform for the given gene

In [181]:
assocs = pheno.filter(items=['phenotype', 'Gene']).drop_duplicates()

assocs.head()

Unnamed: 0,phenotype,Gene
0,20002#1427#polycystic kidney,PKD1
1,Union#C92#C92 Myeloid leukaemia,ASXL1
2,Source of report of Q61 (cystic kidney disease),PKD1
3,Union#Q612#Q61.2 Polycystic kidney| adult type,PKD1
4,Union#Q61#Q61 Cystic kidney disease,PKD1


In [183]:
# Gene symbols have to be mapped

targets = (
    spark.read.parquet('/Users/irene/Documents/dev/pyspark/22.02.4/targets')
    .select(F.col('approvedSymbol').alias('Gene'), F.col('id').alias('targetId'))
    .toPandas()
)

assocs = (
    assocs.merge(targets, on='Gene', how='left')
    # G6PC cannot be mapped
    .dropna(axis=0, how='any')
)
print(assocs.shape[0])
assocs.head()

1700


Unnamed: 0,phenotype,Gene,targetId
0,20002#1427#polycystic kidney,PKD1,ENSG00000008710
1,Union#C92#C92 Myeloid leukaemia,ASXL1,ENSG00000171456
2,Source of report of Q61 (cystic kidney disease),PKD1,ENSG00000008710
3,Union#Q612#Q61.2 Polycystic kidney| adult type,PKD1,ENSG00000008710
4,Union#Q61#Q61 Cystic kidney disease,PKD1,ENSG00000008710


In [184]:
# Extract the top assoc from the associationByOverallIndirect dataset

w = Window.partitionBy('targetId')
top_assoc = (
    spark.read.parquet('/Users/irene/Documents/dev/pyspark/22.02.4/associationByOverallIndirect')
    .drop('evidenceCount')
    .withColumn('maxScore', F.max('score').over(w))
    .where(F.col('score') == F.col('maxScore'))
    .drop('maxScore', 'evidenceCount')
    .persist()
    .toPandas()
)

top_assoc.head()

22/03/17 10:05:29 WARN CacheManager: Asked to cache already cached data.


Unnamed: 0,diseaseId,targetId,score
0,EFO_0001444,ENSG00000059588,0.304875
1,OTAR_0000018,ENSG00000070182,0.751753
2,EFO_0001444,ENSG00000070366,0.404388
3,EFO_0000618,ENSG00000072071,0.161405
4,MONDO_0045024,ENSG00000073536,0.059111


In [185]:
assocs = (
    assocs.merge(top_assoc.drop('score', axis=1), on='targetId', how='left')
    .rename(columns={'diseaseId': 'top_assoc_mapping'})
)

assocs.head()

Unnamed: 0,phenotype,Gene,targetId,top_assoc_mapping
0,20002#1427#polycystic kidney,PKD1,ENSG00000008710,EFO_0009690
1,Union#C92#C92 Myeloid leukaemia,ASXL1,ENSG00000171456,OTAR_0000018
2,Source of report of Q61 (cystic kidney disease),PKD1,ENSG00000008710,EFO_0009690
3,Union#Q612#Q61.2 Polycystic kidney| adult type,PKD1,ENSG00000008710,EFO_0009690
4,Union#Q61#Q61 Cystic kidney disease,PKD1,ENSG00000008710,EFO_0009690


### 2.4. Build final df with all mappings

In [186]:
pheno.head()

Unnamed: 0,phenotype,OMIM,Gene,extracted_omim,cleaned_pheno
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,polycystic kidney
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:605039,myeloid leukaemia
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:614286,myeloid leukaemia
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,cystic kidney disease
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,polycystic kidney


In [215]:
# Strategy 1: Mappings resulting from running ontoma on cleaned phenotypes

pheno_mapped = pheno.copy()
pheno_mapped['ontoma_mapping'] = pheno_mapped['cleaned_pheno']

pheno_mapped = pheno_mapped.replace({'ontoma_mapping': phenos_mapped})
pheno_mapped.head()

Unnamed: 0,phenotype,OMIM,Gene,extracted_omim,cleaned_pheno,ontoma_mapping
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,polycystic kidney,EFO_0008620
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:605039,myeloid leukaemia,EFO_0000222
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:614286,myeloid leukaemia,EFO_0000222
2,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,cystic kidney disease,EFO_0008615
3,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,polycystic kidney,EFO_0008620


In [216]:
# Strategy 2: Mappings resulting from ICD10 cross references

pheno_mapped = (
        pheno_mapped.merge(
        icd.filter(['Phenotype', 'icd_mapping']),
        left_on='phenotype', right_on='Phenotype',
        how='left')
        .drop('Phenotype', axis=1)
)

pheno_mapped.head()

Unnamed: 0,phenotype,OMIM,Gene,extracted_omim,cleaned_pheno,ontoma_mapping,icd_mapping
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,polycystic kidney,EFO_0008620,
1,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:605039,myeloid leukaemia,EFO_0000222,MONDO_0004643
2,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:614286,myeloid leukaemia,EFO_0000222,MONDO_0004643
3,Source of report of Q61 (cystic kidney disease),Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,cystic kidney disease,EFO_0008615,
4,Union#Q612#Q61.2 Polycystic kidney| adult type,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,polycystic kidney,EFO_0008620,


In [217]:
# Strategy 3: Mappings resulting from the known top association for that gene

pheno_mapped = pheno_mapped.merge(
    assocs.filter(['phenotype', 'top_assoc_mapping']),
    on='phenotype',
    how='left'
)

pheno_mapped.head()

Unnamed: 0,phenotype,OMIM,Gene,extracted_omim,cleaned_pheno,ontoma_mapping,icd_mapping,top_assoc_mapping
0,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,polycystic kidney,EFO_0008620,,EFO_0009690
1,20002#1427#polycystic kidney,Polycystic kidney disease 1; 173900 (3); Autos...,PKD1,OMIM:173900,polycystic kidney,EFO_0008620,,EFO_0009690
2,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:605039,myeloid leukaemia,EFO_0000222,MONDO_0004643,OTAR_0000018
3,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:605039,myeloid leukaemia,EFO_0000222,MONDO_0004643,MONDO_0045024
4,Union#C92#C92 Myeloid leukaemia,Bohring-Opitz syndrome; 605039 (3); Autosomal ...,ASXL1,OMIM:605039,myeloid leukaemia,EFO_0000222,MONDO_0004643,MONDO_0023370


In [242]:
# Prepare output 

diseases = (
    spark.read.parquet('/Users/irene/Documents/dev/pyspark/22.02.4/diseases')
    .select('id', 'name').toPandas()
)

cols = ['phenotype', 'Gene', 'ontoma_mapping', 'ontoma_mapping_label', 'icd_mapping', 'icd_mapping_label', 'top_assoc_mapping', 'top_assoc_mapping_label']

(
    pheno_mapped
    .merge(
        diseases.rename(columns={'name': 'ontoma_mapping_label'}),
        left_on='ontoma_mapping', right_on='id', how='left').drop('id', axis=1)
    .merge(
        diseases.rename(columns={'name': 'icd_mapping_label'}),
        left_on='icd_mapping', right_on='id', how='left').drop('id', axis=1)
    .merge(
        diseases.rename(columns={'name': 'top_assoc_mapping_label'}),
        left_on='top_assoc_mapping', right_on='id', how='left').drop('id', axis=1)
    .filter(cols)
    .drop_duplicates()
    .to_csv('../output/az_efo_mapping.csv', index=False)
)

In [235]:

metrics = pheno_mapped.groupby('phenotype').agg({'ontoma_mapping': 'nunique', 'icd_mapping': 'nunique', 'top_assoc_mapping': 'nunique'})
metrics['sum'] = metrics.sum(axis=1)

metrics.head()

Unnamed: 0_level_0,ontoma_mapping,icd_mapping,top_assoc_mapping,sum
phenotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
20001#1002#breast cancer,1,0,1,2
20001#1003#skin cancer,1,0,1,2
20001#1020#large bowel cancer|colorectal cancer,1,0,1,2
20001#1022#colon cancer|sigmoid cancer,1,0,1,2
20001#1030#eye and|or adnexal cancer,0,0,1,1


- 746 total phenotypes
- 260 mappable with ontoma
- 143 with crossreferences to ICD10
- 746 with a top association

### 3. All phenotypes susceptible to be mapped

- Results from ExWAS - P ≤ 2 × 10−9
- Results from PheWAS - P ≤ 2 × 10−9


In [54]:
ROOT = Path.cwd().parent

#var_binary = spark.read.csv(str(Path.joinpath(ROOT / 'data/full_set/2022_03_07/azphewas-com-450k-exwas-binary.csv.bz2')), header='True')
#var_quant = spark.read.csv(str(Path.joinpath(ROOT / 'data/full_set/2022_03_07/azphewas-com-450k-exwas-quantitative.csv.bz2')), header='True')
#gene_binary = spark.read.csv(str(Path.joinpath(ROOT / 'data/full_set/2022_03_07/azphewas-com-450k-phewas-binary.csv.bz2')), header='True')
#gene_quant = spark.read.csv(str(Path.joinpath(ROOT / 'data/full_set/2022_03_07/azphewas-com-450k-phewas-quantitative.csv.bz2')), header='True')