# External Data Retrieval
Retrieves additional data from external APIs (Ensembl and MaveDB) and saves it locally for further analysis and offline use.

In [1]:
# All imports
import pandas as pd
import pickle
from pathlib import Path
import utils

## Data retrieved from the Ensembl API

In [2]:
# Load DataFrames
# Columns shared by the train and test CSVs, with the dtype each is parsed as.
variant_column_dtypes: dict[str, type] = {
    "accession": str,
    "ensp": str,
    "pos": int,
    "ref_long": str,
    "ref_short": str,
    "alt_long": str,
    "alt_short": str,
}
# The training CSV is additionally read with its "scoreset" column.
train_column_dtypes: dict[str, type] = {**variant_column_dtypes, "scoreset": str}

# usecols is derived from the dtype mapping so the two can never drift apart;
# pandas ignores the element order of usecols.
train_df = pd.read_csv(
    utils.RAW_TRAIN_PATH,
    usecols=list(train_column_dtypes),
    dtype=train_column_dtypes,
)
test_df = pd.read_csv(
    utils.RAW_TEST_PATH,
    usecols=list(variant_column_dtypes),
    dtype=variant_column_dtypes,
)

In [None]:
# Ensembl Protein ID to Sequence mapping
def _sequence_fetcher(ensp_sequence_map: dict[str, str]):
    """Return a one-argument callable that resolves an ENSP id via utils.get_full_sequence.

    The given dict is forwarded as the second argument on every call.
    NOTE(review): presumably utils.get_full_sequence fills this dict as a cache
    (it starts empty here and is pickled afterwards) - confirm in utils.
    """
    def fetch(ensp):
        return utils.get_full_sequence(ensp, ensp_sequence_map)

    return fetch


train_ensp_sequence_map: dict[str, str] = {}
test_ensp_sequence_map: dict[str, str] = {}

# One lookup per row of the "ensp" column; train is processed before test.
train_sequence_series: pd.Series = train_df["ensp"].apply(
    _sequence_fetcher(train_ensp_sequence_map)
)
test_sequence_series: pd.Series = test_df["ensp"].apply(
    _sequence_fetcher(test_ensp_sequence_map)
)

In [None]:
# Save the mappings using pickle
# Each ENSP -> sequence dict is written to its own path; train first, then test.
for ensp_map, pickle_path in (
    (train_ensp_sequence_map, utils.TRAIN_ENSP_SEQUENCE_MAP_PATH),
    (test_ensp_sequence_map, utils.TEST_ENSP_SEQUENCE_MAP_PATH),
):
    utils.dict_to_pickle(ensp_map, pickle_path)

In [6]:
# Ensembl Variant Effect Predictor (VEP)
# The file is read as a stream of pickled batches written back-to-back, so
# pickle.load() is called repeatedly until it raises EOFError at end of file.
# Each batch is annotated as a list of dicts and is flattened into train_vep_data.
# NOTE(security): pickle.load can execute arbitrary code - only load files
# that this project produced itself.
train_vep_data: list[dict] = []
with open("../data/train/train_vep.pkl", "rb") as f:
    num_batches: int = 0  # number of batches successfully read
    while True:
        try:
            batch: list[dict] = pickle.load(f)
        except EOFError:
            # No more pickled objects in the stream.
            break
        train_vep_data.extend(batch)
        num_batches += 1


In [7]:
# Sanity check: print the position of every VEP record that has no
# "transcript_consequences" value (key absent or set to None), then a
# separator and the total number of records scanned.
for idx, item in enumerate(train_vep_data):
    if item.get("transcript_consequences") is None:
        print(idx)

i = len(train_vep_data)  # total record count
print("------")
print(i)

------
178554


In [8]:
# Write every VEP record to one consolidated file, one pickle.dump() per
# record, so it can be read back with repeated pickle.load() calls until EOFError.
# Opened with "wb" (truncate) rather than "ab" (append): the cell writes the
# complete train_vep_data list, so appending would add a duplicate copy of
# every record each time the cell is re-run.
with open("../data/train/train_vep_full.pkl", "wb") as f:
    for item in train_vep_data:
        pickle.dump(item, f)

## Data retrieved from the MaveDB API