# External Data Retrieval
Retrieves additional data from external APIs and saves it locally for further analysis and offline use.

In [1]:
# All imports
import pandas as pd
import pickle
from pathlib import Path
import utils

## Data retrieved from the Ensembl API

In [10]:
# Load the raw train and test variant tables, restricted to the columns used in this notebook.
# Every selected column is parsed as str except "pos", which is parsed as int.
train_columns = ["accession", "scoreset", "ensp", "pos", "ref_long", "ref_short", "alt_long", "alt_short"]
test_columns = ["accession", "ensp", "pos", "ref_long", "ref_short", "alt_long", "alt_short"]
train_df = pd.read_csv(
    utils.RAW_TRAIN_PATH,
    usecols=train_columns,
    dtype={column: (int if column == "pos" else str) for column in train_columns},
)
test_df = pd.read_csv(
    utils.RAW_TEST_PATH,
    usecols=test_columns,
    dtype={column: (int if column == "pos" else str) for column in test_columns},
)

In [None]:
# Ensembl Protein ID (ENSP) -> sequence mapping, one dict per split.
# Each dict starts empty and is passed as the second argument to utils.get_full_sequence
# for every row; presumably that helper fills it in as a cache -- confirm in utils.
train_ensp_sequence_map: dict[str, str] = {}
test_ensp_sequence_map: dict[str, str] = {}
train_sequence_series: pd.Series = train_df["ensp"].apply(
    utils.get_full_sequence, args=(train_ensp_sequence_map,)
)
test_sequence_series: pd.Series = test_df["ensp"].apply(
    utils.get_full_sequence, args=(test_ensp_sequence_map,)
)

In [None]:
# Persist both ENSP -> sequence mappings through utils.dict_to_pickle (train first, then test)
for sequence_map, pickle_path in (
    (train_ensp_sequence_map, utils.TRAIN_ENSP_SEQUENCE_MAP_PATH),
    (test_ensp_sequence_map, utils.TEST_ENSP_SEQUENCE_MAP_PATH),
):
    utils.dict_to_pickle(sequence_map, pickle_path)

In [2]:
# Ensembl Variant Effect Predictor (VEP): load the training-set VEP records
# from the file at utils.TRAIN_VEP_DATA_PATH via utils.vep_from_pickle.
train_vep_path = utils.TRAIN_VEP_DATA_PATH
train_vep_data: list[dict] = utils.vep_from_pickle(train_vep_path)

In [None]:
# Quick tour of the loaded VEP records: record count, then the first record in detail.
# NOTE: the top-level "strand" field is always 1. The actual strand is found under
#       ["transcript_consequences"][0]["strand"].
print(len(train_vep_data))
first_record = train_vep_data[0]
print(first_record.keys())
print(first_record["most_severe_consequence"])
print(first_record["transcript_consequences"][0].keys())
print(first_record)
# Gather every distinct "most_severe_consequence" value across the VEP data;
# records without that key contribute the placeholder "error".
variant_types: set[str] = {
    record.get("most_severe_consequence", "error") for record in train_vep_data
}
print(variant_types)

178554
dict_keys(['seq_region_name', 'input', 'assembly_name', 'allele_string', 'strand', 'transcript_consequences', 'most_severe_consequence', 'end', 'start', 'colocated_variants', 'id'])
frameshift_variant
dict_keys(['cdna_end', 'consequence_terms', 'cdna_start', 'impact', 'strand', 'cds_start', 'amino_acids', 'transcript_id', 'hgnc_id', 'gene_symbol_source', 'cds_end', 'protein_start', 'variant_allele', 'gene_symbol', 'gene_id', 'protein_end', 'codons', 'biotype'])
{'seq_region_name': 'X', 'input': 'X:g.15594962..15594964delinsTTA', 'assembly_name': 'GRCh38', 'allele_string': 'C/TTA', 'strand': 1, 'transcript_consequences': [{'cdna_end': 277, 'consequence_terms': ['frameshift_variant'], 'cdna_start': 277, 'impact': 'HIGH', 'strand': -1, 'cds_start': 228, 'amino_acids': 'Q/HX', 'transcript_id': 'ENST00000252519', 'hgnc_id': 'HGNC:13557', 'gene_symbol_source': 'HGNC', 'cds_end': 228, 'protein_start': 76, 'variant_allele': 'TTA', 'gene_symbol': 'ACE2', 'gene_id': 'ENSG00000130234', 'pr

## Data retrieved from the MaveDB API