# External Data Retrieval
Retrieves additional data from external APIs (Ensembl and MaveDB) and saves it to disk for further analysis and offline use.

In [1]:
# All imports
import pandas as pd
import pickle
from pathlib import Path
import utils

## Data retrieved from the Ensembl API

In [2]:
# Load DataFrames
# Columns read from both raw CSVs. Every column is parsed as text except the
# integer "pos" column.
shared_columns = ["accession", "ensp", "pos", "ref_long", "ref_short", "alt_long", "alt_short"]
# "scoreset" is additionally read from the training CSV only.
train_columns = shared_columns + ["scoreset"]

train_df = pd.read_csv(
    utils.RAW_TRAIN_PATH,
    usecols=train_columns,
    dtype={column: (int if column == "pos" else str) for column in train_columns},
)
test_df = pd.read_csv(
    utils.RAW_TEST_PATH,
    usecols=shared_columns,
    dtype={column: (int if column == "pos" else str) for column in shared_columns},
)

In [None]:
# Ensembl Protein ID to Sequence mapping
# Each dict starts empty and is handed to utils.get_full_sequence on every call;
# presumably the helper fills it in as a per-ID cache — TODO confirm in utils.
train_ensp_sequence_map: dict[str, str] = {}
test_ensp_sequence_map: dict[str, str] = {}

# One helper call per row: get_full_sequence(<ensp value>, <mapping dict>).
train_sequence_series: pd.Series = train_df["ensp"].apply(
    utils.get_full_sequence, args=(train_ensp_sequence_map,)
)
test_sequence_series: pd.Series = test_df["ensp"].apply(
    utils.get_full_sequence, args=(test_ensp_sequence_map,)
)

In [None]:
# Save the mappings using pickle
# Train mapping first, then test, each to its own path defined in utils.
for ensp_sequence_map, pickle_path in (
    (train_ensp_sequence_map, utils.TRAIN_ENSP_SEQUENCE_MAP_PATH),
    (test_ensp_sequence_map, utils.TEST_ENSP_SEQUENCE_MAP_PATH),
):
    utils.dict_to_pickle(ensp_sequence_map, pickle_path)

In [51]:
# Ensembl Variant Effect Predictor (VEP)
# TODO: will process pickle files later
# The file is read as a sequence of separately pickled objects (one per batch).
# Reading stops after 1398 batches or at end of file, whichever comes first.
# NOTE(review): pickle.load can execute arbitrary code; this assumes the file
# was produced locally and is trusted — confirm.
train_vep_data: list[list[dict]] = []
with open("../data/train/train_vep.pkl", "rb") as f:
    num_batches: int = 0
    while num_batches < 1398:
        try:
            batch = pickle.load(f)
        except EOFError:
            # Fewer than 1398 batches in the file: keep what was read so far.
            break
        train_vep_data.append(batch)
        num_batches += 1


In [63]:
# Sanity check on the loaded VEP batches:
#   1. print the flat (cross-batch) index of every record that has no
#      "transcript_consequences" entry,
#   2. print the total number of records,
#   3. print the last record twice, once via flat-index arithmetic and once via
#      negative indexing, so the two can be compared by eye.
batch_size = 100  # assumed number of records per pickled batch — TODO confirm against the retrieval code
i = 0
for item in train_vep_data:
    for vep_data in item:
        if vep_data.get("transcript_consequences") is None:
            print(i)
        i += 1

print(i)
# After the loop, i is the record count, so the last record has flat index i - 1.
# Converting that index (rather than i itself, minus one on each coordinate)
# selects the right batch and offset even when the total is not an exact
# multiple of batch_size.
last_index = i - 1
print(train_vep_data[last_index // batch_size][last_index % batch_size])
print(train_vep_data[-1][-1])

139800
{'most_severe_consequence': 'frameshift_variant', 'seq_region_name': '1', 'strand': 1, 'allele_string': 'T/GAA', 'id': '1:g.45342988..45342990delinsGAA', 'input': '1:g.45342988..45342990delinsGAA', 'end': 45342988, 'assembly_name': 'GRCh38', 'start': 45342988, 'transcript_consequences': [{'variant_allele': 'GAA', 'hgnc_id': 'HGNC:7527', 'distance': 2908, 'impact': 'MODIFIER', 'strand': -1, 'gene_id': 'ENSG00000132781', 'transcript_id': 'ENST00000354383', 'biotype': 'protein_coding', 'gene_symbol_source': 'HGNC', 'gene_symbol': 'MUTYH', 'consequence_terms': ['upstream_gene_variant']}, {'strand': -1, 'hgnc_id': 'HGNC:7527', 'variant_allele': 'GAA', 'distance': 2873, 'impact': 'MODIFIER', 'biotype': 'protein_coding', 'transcript_id': 'ENST00000355498', 'gene_symbol': 'MUTYH', 'gene_symbol_source': 'HGNC', 'consequence_terms': ['upstream_gene_variant'], 'gene_id': 'ENSG00000132781'}, {'gene_id': 'ENSG00000070759', 'gene_symbol': 'TESK2', 'gene_symbol_source': 'HGNC', 'consequence_te

In [53]:
# Write every loaded batch to the restore file as a sequence of separately
# pickled objects (one pickle.dump per batch).
# The file is opened in "wb" rather than "ab" so that re-running this cell
# rewrites the file instead of appending a second copy of every batch.
with open("../data/train/train_vep_restore.pkl", "wb") as f:
    for batch in train_vep_data:
        pickle.dump(batch, f)

## Data retrieved from the MaveDB API