# External Data Retrieval
Retrieves additional data from external APIs (Ensembl and MaveDB) and saves it to disk for further analysis and offline use.

In [1]:
# All imports
import pandas as pd
import pickle
from pathlib import Path
import utils

## Data retrieved from the Ensembl API

In [10]:
# Load DataFrames
# Columns present in both the train and test files, mapped to the dtype each
# one is parsed as. Declaring them once keeps `usecols` and `dtype` in sync.
_variant_dtypes: dict[str, type] = {
    "accession": str,
    "ensp": str,
    "pos": int,
    "ref_long": str,
    "ref_short": str,
    "alt_long": str,
    "alt_short": str,
}
# Only the training file is read with the extra "scoreset" column.
_train_dtypes: dict[str, type] = {**_variant_dtypes, "scoreset": str}

# `usecols` ignores element order, so deriving it from the dtype mapping
# selects exactly the same columns as an explicit list.
train_df = pd.read_csv(utils.RAW_TRAIN_PATH, usecols=list(_train_dtypes), dtype=_train_dtypes)
test_df = pd.read_csv(utils.RAW_TEST_PATH, usecols=list(_variant_dtypes), dtype=_variant_dtypes)

In [None]:
# Ensembl Protein ID to Sequence mapping
# Each split gets its own dict, handed to utils.get_full_sequence as the second
# positional argument for every "ensp" value in that split.
# NOTE(review): presumably the helper fills the dict as a cache so repeated IDs
# do not trigger repeated lookups - confirm against utils.get_full_sequence.
train_ensp_sequence_map: dict[str, str] = {}
train_sequence_series: pd.Series = train_df["ensp"].apply(
    utils.get_full_sequence, args=(train_ensp_sequence_map,)
)

test_ensp_sequence_map: dict[str, str] = {}
test_sequence_series: pd.Series = test_df["ensp"].apply(
    utils.get_full_sequence, args=(test_ensp_sequence_map,)
)

In [None]:
# Save the mappings using pickle
# One (mapping, destination) pair per data split; each is written with the
# same helper.
for _sequence_map, _pickle_path in (
    (train_ensp_sequence_map, utils.TRAIN_ENSP_SEQUENCE_MAP_PATH),
    (test_ensp_sequence_map, utils.TEST_ENSP_SEQUENCE_MAP_PATH),
):
    utils.dict_to_pickle(_sequence_map, _pickle_path)

In [2]:
# Ensembl Variant Effect Predictor (VEP)
# Loads previously saved VEP results for the training set from the file at
# utils.TRAIN_VEP_DATA_PATH; the result is treated as a list of dicts, one per
# VEP record.
# NOTE(review): the helper name suggests the file is unpickled - only load
# pickle files from a trusted source, since unpickling can execute code.
train_vep_data: list[dict] = utils.vep_from_pickle(utils.TRAIN_VEP_DATA_PATH)

In [None]:
# Example usage of VEP data
# NOTE: "strand" is always 1. To get actual strand, use ["transcript_consequences"][0]["strand"]
print(len(train_vep_data))

# Inspect the first record and its first transcript consequence.
first_row: dict = train_vep_data[0]
first_consequence: dict = first_row["transcript_consequences"][0]
print(first_row.keys())

print(first_row["most_severe_consequence"])
print(first_consequence["impact"])
print(first_consequence["biotype"])
print(first_consequence["consequence_terms"])
print(first_consequence["strand"])

# Collect the distinct values seen across all records. A missing field is
# recorded as the sentinel "error" instead of aborting the whole summary.
variant_types: set[str] = set()
impact_types: set[str] = set()
bio_types: set[str] = set()
# Dict keys give insertion-ordered de-duplication with O(1) membership checks,
# replacing a linear `not in` scan over a growing list.
seen_consequences: dict[str, None] = {}
for row in train_vep_data:
    variant_types.add(row.get("most_severe_consequence", "error"))

    # A record may lack transcript consequences entirely (missing key or empty
    # list); fall back to an empty dict so the "error" sentinel is used, in
    # line with every other lookup in this loop.
    transcript_consequences: list[dict] = row.get("transcript_consequences") or []
    primary_consequence: dict = transcript_consequences[0] if transcript_consequences else {}
    impact_types.add(primary_consequence.get("impact", "error"))
    bio_types.add(primary_consequence.get("biotype", "error"))

    for transcript_consequence in transcript_consequences:
        # update() keeps the position of keys already present, so the
        # first-seen order of consequence terms is preserved.
        seen_consequences.update(
            dict.fromkeys(transcript_consequence.get("consequence_terms", ["error"]))
        )

# Keep the public name and list type in case later cells rely on them.
consequence_types: list[str] = list(seen_consequences)

print(variant_types)
print(impact_types)
print(bio_types)
print(consequence_types)

178554
dict_keys(['seq_region_name', 'input', 'assembly_name', 'allele_string', 'strand', 'transcript_consequences', 'most_severe_consequence', 'end', 'start', 'colocated_variants', 'id'])
frameshift_variant
HIGH
protein_coding
['frameshift_variant']
-1
{'frameshift_variant', 'inframe_insertion', 'stop_gained', 'splice_acceptor_variant', 'splice_donor_variant', '5_prime_UTR_variant'}
{'MODERATE', 'HIGH', 'MODIFIER', 'LOW'}
{'lncRNA', 'protein_coding_CDS_not_defined', 'nonsense_mediated_decay', 'protein_coding', 'TEC'}
['frameshift_variant', 'splice_region_variant', 'upstream_gene_variant', 'stop_gained', '3_prime_UTR_variant', 'inframe_insertion', 'stop_retained_variant', 'downstream_gene_variant', 'intron_variant', 'non_coding_transcript_variant', 'start_lost', 'start_retained_variant', 'stop_lost', '5_prime_UTR_variant', 'non_coding_transcript_exon_variant', 'NMD_transcript_variant', 'splice_acceptor_variant', 'splice_donor_variant', 'splice_polypyrimidine_tract_variant', 'splice_don

## Data retrieved from the MaveDB API