# Tutorial: Querying an OWL ontology using SPARQL commands

We will use owlready2 to access and query our ontology. Specifically, we will query for treatments matching variants in cBioPortal.

<font color="red">

Similar functionality is implemented as reusable functions in `query_therapy_regimen.py` for modularity and task decomposition.

</font>

### Import Modules

In [1]:
import os
import re
import csv

import numpy as np
import pandas as pd
import owlready2 as or2

from sample_patients import sample_patient_records
from query_therapy_regimen import get_therapy_given_gene_variant_disease, load_ongology

### Load Ontology

**Note:** Update the path to the local ontology to match your local file structure.

In [2]:
# Relative path of the OncoKB OWL file to open.
local_ontology = "../ontology/oncokb.owl"

# Obtain the ontology handle first, then load it; `onto` ends up bound to
# whatever `load()` returns, exactly as in a single chained call.
onto = or2.get_ontology(local_ontology)
onto = onto.load()

### Load cBioPortal Mutation Data

**Note:** Update the path to the cBioPortal mutation data to match your local file structure.

In [3]:
# Sample mutation data for a subset of patients.
n_patients = 14
csv_path = "mutations.csv"
# The sample size is embedded in the output file name.
out_path = f"mutations_{n_patients}.csv"

# NOTE(review): presumably writes the sampled rows to `out_path` (that path is
# read back with pd.read_csv afterwards) - confirm against sample_patients.py.
sample_patient_records(
    csv_path,
    out_path,
    n_patients=n_patients,
)

Done


In [4]:
# Read the sampled patient records and keep only the patient ID, protein
# change and Entrez gene ID columns.
cBioPortal_mutations = pd.read_csv(out_path)[
    ["patientId", "proteinChange", "entrezGeneId"]
]
cBioPortal_mutations.head()

Unnamed: 0,patientId,proteinChange,entrezGeneId
0,Patient0001,G12C,3845
1,Patient0001,R216*,324
2,Patient0001,R505C,55294
3,Patient0001,E1286*,324
4,Patient0001,R4822H,58508


### Load Gene List

In [5]:
# Attach gene information to each mutation by Entrez Gene ID.
# Only the first two columns of the TSV are read; the join below requires one
# of them to be named "Entrez_Id".
gene_list = pd.read_csv("CancerGeneList.tsv", sep="\t", usecols=[0, 1])

# Left join keeps every mutation row; the duplicated key column from the gene
# list is dropped once the join is done.
cBioPortal_mutations = pd.merge(
    cBioPortal_mutations,
    gene_list,
    how="left",
    left_on="entrezGeneId",
    right_on="Entrez_Id",
).drop(columns="Entrez_Id")
cBioPortal_mutations.head()

Unnamed: 0,patientId,proteinChange,entrezGeneId,Gene_Symbol
0,Patient0001,G12C,3845,KRAS
1,Patient0001,R216*,324,APC
2,Patient0001,R505C,55294,FBXW7
3,Patient0001,E1286*,324,APC
4,Patient0001,R4822H,58508,KMT2C


### Load Disease Data

In [6]:
# Clinical records, with exact duplicate rows removed before joining.
clinical_data = pd.read_csv("clinical_data.csv", index_col=0).drop_duplicates()

# Left join on patientId, then project down to the mutation columns plus the
# "name" column that the join contributes.
cBioPortal_mutations = pd.merge(
    cBioPortal_mutations, clinical_data, how="left", on="patientId"
)[["patientId", "proteinChange", "Gene_Symbol", "entrezGeneId", "name"]]

# Second left join, keyed on "name", swaps that column for "shortName".
# dropna() then removes every row that has a missing value in any kept column.
cancer_types = pd.read_csv("cancer_types.csv", index_col=0)
cBioPortal_mutations = pd.merge(
    cBioPortal_mutations, cancer_types, how="left", on="name"
)[["patientId", "proteinChange", "Gene_Symbol", "entrezGeneId", "shortName"]].dropna()
cBioPortal_mutations.head()

Unnamed: 0,patientId,proteinChange,Gene_Symbol,entrezGeneId,shortName
397,Patient0011,K642E,KIT,3815,MEL
398,Patient0011,E2653K,FAT4,79633,MEL


### Query for Patient Gene/Variant/Disease Combinations in OncoKB and OncoKB+CIViC

Query for the therapy regimens associated with each gene/variant/disease combination for each patient sampled from cBioPortal.

In [7]:
# Distinct patient IDs in the mutation table (np.unique returns them sorted).
patient_id_values = cBioPortal_mutations["patientId"].to_numpy()
unique_patients = np.unique(patient_id_values)
print(f"Number of unique patients in evaluation: {len(unique_patients)}")

Number of unique patients in evaluation: 1


In [None]:
# For each ontology, count the therapy hits returned for every patient's
# gene/variant/disease combinations across all evidence levels, then write one
# CSV of per-patient counts per ontology to ../output/<ontology name>.csv.
#
# NOTE(review): `load_ongology` (sic) is the name imported from
# query_therapy_regimen, so the spelling is kept as-is here.
ontologies = [load_ongology("../ontology/oncokb.owl"), load_ongology("../ontology/oncokb_civic.owl")]
ontology_names = ["oncokb", "oncokb_civic"]
evidence_levels = ["1", "2", "3", "4", "R1", "R2"]

# Fixed header, so an empty patient list still yields a valid (header-only)
# CSV instead of failing on `n_therapies[0]`.
output_fields = ["patient", "n_therapies"]

# Make sure the destination directory exists before any file is opened.
os.makedirs("../output", exist_ok=True)

for ontology_name, ontology in zip(ontology_names, ontologies):
    n_therapies = []
    for i, patient in enumerate(unique_patients):
        # Lightweight progress report every 1000 patients.
        if (i + 1) % 1000 == 0:
            print(f"patient {(i+1)} of {len(unique_patients)}")

        patient_records = cBioPortal_mutations[cBioPortal_mutations["patientId"] == patient]
        n_hits = 0

        # Columns are addressed by name rather than by position so that a
        # change in column order cannot silently swap the query arguments.
        for gene, variant, disease in zip(
            patient_records["Gene_Symbol"],
            patient_records["proteinChange"],
            patient_records["shortName"],
        ):
            for level in evidence_levels:
                therapies = get_therapy_given_gene_variant_disease(
                    ontology, gene, variant, disease, level
                )
                n_hits += len(therapies)

        n_therapies.append({"patient": patient, "n_therapies": n_hits})

    # `with` closes the file even if writing fails; newline="" is what the csv
    # module requires so that no extra blank lines appear on Windows.
    with open(f"../output/{ontology_name}.csv", "w", newline="") as out_file:
        dict_writer = csv.DictWriter(out_file, output_fields)
        dict_writer.writeheader()
        dict_writer.writerows(n_therapies)
