# External Data Retrieval
Retrieves additional data from external APIs (such as Ensembl) and computes protein embeddings with ESM C. The results are saved to disk for further analysis and offline use.

In [1]:
# All imports
import pandas as pd
import numpy as np
import torch
import pickle
from pathlib import Path
import utils

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

## Data retrieved from the Ensembl API

In [2]:
# Load DataFrames
# Column -> dtype schema for the test table. The same dict supplies both the
# columns to read (usecols) and their explicit dtypes.
test_dtypes = {
    "accession": str,
    "ensp": str,
    "pos": int,
    "ref_long": str,
    "ref_short": str,
    "alt_long": str,
    "alt_short": str,
}
# The training table is read with the same columns plus "scoreset".
train_dtypes = {**test_dtypes, "scoreset": str}

train_df = pd.read_csv(utils.RAW_TRAIN_PATH, usecols=list(train_dtypes), dtype=train_dtypes)
test_df = pd.read_csv(utils.RAW_TEST_PATH, usecols=list(test_dtypes), dtype=test_dtypes)

In [None]:
# Ensembl Protein ID to Sequence mapping
def _sequences_for(df: pd.DataFrame, ensp_sequence_map: dict[str, str]) -> pd.Series:
    """Apply utils.get_full_sequence to every "ensp" value in df.

    ensp_sequence_map is passed to utils.get_full_sequence on each call.
    NOTE(review): presumably the helper uses it as a cache and fills it in
    place (it starts empty here and is pickled afterwards) -- confirm in utils.
    """
    return df["ensp"].apply(lambda ensp: utils.get_full_sequence(ensp, ensp_sequence_map))


train_ensp_sequence_map: dict[str, str] = {}
test_ensp_sequence_map: dict[str, str] = {}
train_sequence_series: pd.Series = _sequences_for(train_df, train_ensp_sequence_map)
test_sequence_series: pd.Series = _sequences_for(test_df, test_ensp_sequence_map)

In [None]:
# Save the mappings using pickle
# Both splits target the same output path, so pickling them one after the other
# would leave only the last-written (test) mapping on disk, assuming
# utils.dict_to_pickle overwrites its target. Merge the two mappings and write
# once so the saved file covers every protein ID seen in either split.
combined_ensp_sequence_map: dict[str, str] = {**train_ensp_sequence_map, **test_ensp_sequence_map}
utils.dict_to_pickle(combined_ensp_sequence_map, utils.ENSP_SEQUENCE_MAP_PATH)

In [None]:
# Ensembl Variant Effect Predictor (VEP)
# Load the training-set VEP data through utils.vep_from_pickle from
# utils.TRAIN_VEP_DATA_PATH. The cells below index each entry as a dict.
train_vep_data: list[dict] = utils.vep_from_pickle(utils.TRAIN_VEP_DATA_PATH)

In [None]:
# Example usage of VEP data
# NOTE: the record-level "strand" value is always 1. The actual strand is stored
# under ["transcript_consequences"][0]["strand"].
print(len(train_vep_data))
first_record = train_vep_data[0]
print(first_record.keys())

print(first_record["most_severe_consequence"])
first_transcript = first_record["transcript_consequences"][0]
for field in ("impact", "biotype", "consequence_terms", "strand"):
    print(first_transcript[field])

# Collect the distinct values seen across all records; "error" marks a missing field.
variant_types: set[str] = set()
impact_types: set[str] = set()
bio_types: set[str] = set()
# Dict keys act as an insertion-ordered set, so terms stay in first-seen order.
seen_consequences: dict[str, None] = {}
for record in train_vep_data:
    variant_types.add(record.get("most_severe_consequence", "error"))
    # Impact and biotype are sampled from the first transcript consequence only.
    primary = record["transcript_consequences"][0]
    impact_types.add(primary.get("impact", "error"))
    bio_types.add(primary.get("biotype", "error"))
    # Consequence terms are gathered from every transcript consequence.
    for transcript in record["transcript_consequences"]:
        seen_consequences.update(dict.fromkeys(transcript.get("consequence_terms", ["error"])))
consequence_types: list[str] = list(seen_consequences)

for collected in (variant_types, impact_types, bio_types, consequence_types):
    print(collected)

## Data retrieved from the MaveDB API

# Embeddings from ESM C

In [3]:
# Reload the ENSP ID -> sequence mapping from utils.ENSP_SEQUENCE_MAP_PATH,
# the same path the save cell in the Ensembl section writes to.
ensp_sequence_map: dict[str, str] = utils.pickle_to_dict(utils.ENSP_SEQUENCE_MAP_PATH)

In [4]:
# Build the ENSP ID -> embedding mapping, one utils.get_embedding call per sequence.
ensp_embeddings_map: dict[str, torch.Tensor] = {}
num_embeddings = 0  # remains 0 if the sequence map is empty
for num_embeddings, (ensp, seq) in enumerate(ensp_sequence_map.items(), start=1):
    ensp_embeddings_map[ensp] = utils.get_embedding(seq)
    # Running count so progress is visible while the loop runs.
    print(f"Processed {num_embeddings} embeddings")

Processed 1 embeddings
Processed 2 embeddings
Processed 3 embeddings
Processed 4 embeddings
Processed 5 embeddings
Processed 6 embeddings
Processed 7 embeddings
Processed 8 embeddings
Processed 9 embeddings
Processed 10 embeddings
Processed 11 embeddings
Processed 12 embeddings
Processed 13 embeddings
Processed 14 embeddings
Processed 15 embeddings
Processed 16 embeddings
Processed 17 embeddings
Processed 18 embeddings
Processed 19 embeddings
Processed 20 embeddings
Processed 21 embeddings
Processed 22 embeddings
Processed 23 embeddings
Processed 24 embeddings
Processed 25 embeddings
Processed 26 embeddings
Processed 27 embeddings
Processed 28 embeddings
Processed 29 embeddings
Processed 30 embeddings
Processed 31 embeddings
Processed 32 embeddings
Processed 33 embeddings
Processed 34 embeddings
Processed 35 embeddings
Processed 36 embeddings
Processed 37 embeddings
Processed 38 embeddings
Processed 39 embeddings
Processed 40 embeddings
Processed 41 embeddings
Processed 42 embeddings
P

In [5]:
# Sanity check: number of embeddings, then each protein ID with its tensor shape.
print(len(ensp_embeddings_map))
for ensp, embedding in ensp_embeddings_map.items():
    print(ensp, embedding.shape)

136
ENSP00000252519.3 torch.Size([1152])
ENSP00000378426.2 torch.Size([1152])
ENSP00000478114.2 torch.Size([1152])
ENSP00000155840.2 torch.Size([1152])
ENSP00000262916.6 torch.Size([1152])
ENSP00000381803.3 torch.Size([1152])
ENSP00000229335.6 torch.Size([1152])
ENSP00000509238.1 torch.Size([1152])
ENSP00000370839.6 torch.Size([1152])
ENSP00000352035.2 torch.Size([1152])
ENSP00000315417.3 torch.Size([1152])
ENSP00000498217.1 torch.Size([1152])
ENSP00000368332.4 torch.Size([1152])
ENSP00000419199.1 torch.Size([1152])
ENSP00000284629.2 torch.Size([1152])
ENSP00000257861.3 torch.Size([1152])
ENSP00000358081.4 torch.Size([1152])
ENSP00000338814.5 torch.Size([1152])
ENSP00000494522.3 torch.Size([1152])
ENSP00000419081.2 torch.Size([1152])
ENSP00000380256.4 torch.Size([1152])
ENSP00000352264.5 torch.Size([1152])
ENSP00000392028.1 torch.Size([1152])
ENSP00000493738.1 torch.Size([1152])
ENSP00000311083.5 torch.Size([1152])
ENSP00000221996.5 torch.Size([1152])
ENSP00000225387.3 torch.Size([1152

In [6]:
# Save the ENSP ID -> embedding mapping to utils.ENSP_EMBEDDINGS_MAP_PATH.
utils.dict_to_pickle(ensp_embeddings_map, utils.ENSP_EMBEDDINGS_MAP_PATH)