# External Data Retrieval
Retrieves additional data from external APIs and saves it to disk for further analysis and offline use.

In [1]:
# All imports
import pandas as pd
import numpy as np
import torch
import pickle
from pathlib import Path
import utils

## Data retrieved from the Ensembl API

In [2]:
# Load DataFrames
def _read_variant_table(csv_path, columns: list[str]) -> pd.DataFrame:
    """Read only `columns` from `csv_path`, parsing "pos" as int and every other column as str."""
    column_dtypes = {name: (int if name == "pos" else str) for name in columns}
    return pd.read_csv(csv_path, usecols=columns, dtype=column_dtypes)


# The train table carries one extra column ("scoreset") compared with the test table.
train_df = _read_variant_table(
    utils.RAW_TRAIN_PATH,
    ["accession", "scoreset", "ensp", "pos", "ref_long", "ref_short", "alt_long", "alt_short"],
)
test_df = _read_variant_table(
    utils.RAW_TEST_PATH,
    ["accession", "ensp", "pos", "ref_long", "ref_short", "alt_long", "alt_short"],
)

In [None]:
# Ensembl Protein ID to Sequence mapping
# Both dicts start empty and are handed to utils.get_full_sequence on every call;
# they are pickled in the next cell, so the helper is expected to fill them in
# (presumably as a per-ENSP cache -- TODO confirm against utils).
train_ensp_sequence_map: dict[str, str] = {}
test_ensp_sequence_map: dict[str, str] = {}


def _train_sequence_for(ensp):
    """Look up the full sequence for one train-set ENSP id."""
    return utils.get_full_sequence(ensp, train_ensp_sequence_map)


def _test_sequence_for(ensp):
    """Look up the full sequence for one test-set ENSP id."""
    return utils.get_full_sequence(ensp, test_ensp_sequence_map)


train_sequence_series: pd.Series = train_df["ensp"].apply(_train_sequence_for)
test_sequence_series: pd.Series = test_df["ensp"].apply(_test_sequence_for)

In [None]:
# Save the mappings using pickle (train first, then test)
for sequence_map, pickle_path in (
    (train_ensp_sequence_map, utils.TRAIN_ENSP_SEQUENCE_MAP_PATH),
    (test_ensp_sequence_map, utils.TEST_ENSP_SEQUENCE_MAP_PATH),
):
    utils.dict_to_pickle(sequence_map, pickle_path)

In [None]:
# Ensembl Variant Effect Predictor (VEP)
# Load the train-set VEP records (one dict per record) through utils.vep_from_pickle.
train_vep_data: list[dict] = utils.vep_from_pickle(
    utils.TRAIN_VEP_DATA_PATH
)

In [None]:
# Example usage of VEP data
# NOTE: "strand" is always 1. To get actual strand, use ["transcript_consequences"][0]["strand"]
print(len(train_vep_data))
print(train_vep_data[0].keys())

# Inspect the first record and its first transcript consequence.
first_record = train_vep_data[0]
first_transcript = first_record["transcript_consequences"][0]
print(first_record["most_severe_consequence"])
print(first_transcript["impact"])
print(first_transcript["biotype"])
print(first_transcript["consequence_terms"])
print(first_transcript["strand"])

# Collect the distinct values seen across all records. Any missing field is
# recorded as the sentinel "error" instead of raising, so gaps in the data show
# up in the printed summaries.
variant_types: set[str] = set()
impact_types: set[str] = set()
bio_types: set[str] = set()
# Dict keys preserve first-seen order and give O(1) membership checks, unlike
# scanning a list for every term.
seen_consequences: dict[str, None] = {}
for row in train_vep_data:
    variant_types.add(row.get("most_severe_consequence", "error"))

    # A record may lack "transcript_consequences" (or carry an empty list);
    # treat that like any other missing field rather than crashing on [0].
    transcript_consequences = row.get("transcript_consequences") or []
    leading_transcript = transcript_consequences[0] if transcript_consequences else {}
    impact_types.add(leading_transcript.get("impact", "error"))
    bio_types.add(leading_transcript.get("biotype", "error"))

    for transcript in transcript_consequences:
        for consequence in transcript.get("consequence_terms", ["error"]):
            seen_consequences.setdefault(consequence, None)

# Expose the result as a list, in first-seen order.
consequence_types: list[str] = list(seen_consequences)
print(variant_types)
print(impact_types)
print(bio_types)
print(consequence_types)

## Data retrieved from the MaveDB API

## Embeddings from ESM C

In [3]:
# Reload the ENSP -> sequence mappings from their pickle files (train, then test).
train_ensp_sequence_map, test_ensp_sequence_map = map(
    utils.pickle_to_dict,
    (utils.TRAIN_ENSP_SEQUENCE_MAP_PATH, utils.TEST_ENSP_SEQUENCE_MAP_PATH),
)

In [5]:
# Entries whose key appears in test_ensp_sequence_map but not in train_ensp_sequence_map
missing_in_train = {
    ensp: sequence
    for ensp, sequence in test_ensp_sequence_map.items()
    if ensp not in train_ensp_sequence_map
}
print(f"Number of sequences in test set not in train set: {len(missing_in_train)}")

# Entries whose key appears in train_ensp_sequence_map but not in test_ensp_sequence_map
missing_in_test = {
    ensp: sequence
    for ensp, sequence in train_ensp_sequence_map.items()
    if ensp not in test_ensp_sequence_map
}
print(f"Number of sequences in train set not in test set: {len(missing_in_test)}")

Number of sequences in test set not in train set: 0
Number of sequences in train set not in test set: 0


In [None]:
# NOTE(review): this cell performs the same train/test key comparison as the
# cell directly above it; consider keeping only one of the two.

# ENSPs present in the test mapping but absent from the train mapping
missing_in_train = dict(
    entry
    for entry in test_ensp_sequence_map.items()
    if entry[0] not in train_ensp_sequence_map
)
print(f"Number of sequences in test set not in train set: {len(missing_in_train)}")

# ENSPs present in the train mapping but absent from the test mapping
missing_in_test = dict(
    entry
    for entry in train_ensp_sequence_map.items()
    if entry[0] not in test_ensp_sequence_map
)
print(f"Number of sequences in train set not in test set: {len(missing_in_test)}")

In [4]:
# Compute one embedding per train-set ENSP by passing its sequence to utils.get_embedding.
train_ensp_embeddings_map: dict[str, torch.Tensor] = {
    ensp: utils.get_embedding(seq)
    for ensp, seq in train_ensp_sequence_map.items()
}

  return torch.tensor(x)


In [5]:
# Report how many embeddings were produced, then the tensor shape for each ENSP.
print(len(train_ensp_embeddings_map))
for ensp, embedding in train_ensp_embeddings_map.items():
    print(ensp, embedding.shape)

136
ENSP00000252519.3 torch.Size([805, 1152])
ENSP00000378426.2 torch.Size([163, 1152])
ENSP00000478114.2 torch.Size([1863, 1152])
ENSP00000155840.2 torch.Size([676, 1152])
ENSP00000262916.6 torch.Size([695, 1152])
ENSP00000381803.3 torch.Size([360, 1152])
ENSP00000229335.6 torch.Size([198, 1152])
ENSP00000509238.1 torch.Size([188, 1152])
ENSP00000370839.6 torch.Size([318, 1152])
ENSP00000352035.2 torch.Size([872, 1152])
ENSP00000315417.3 torch.Size([326, 1152])
ENSP00000498217.1 torch.Size([411, 1152])
ENSP00000368332.4 torch.Size([562, 1152])
ENSP00000419199.1 torch.Size([587, 1152])
ENSP00000284629.2 torch.Size([475, 1152])
ENSP00000257861.3 torch.Size([819, 1152])
ENSP00000358081.4 torch.Size([575, 1152])
ENSP00000338814.5 torch.Size([447, 1152])
ENSP00000494522.3 torch.Size([505, 1152])
ENSP00000419081.2 torch.Size([153, 1152])
ENSP00000380256.4 torch.Size([786, 1152])
ENSP00000352264.5 torch.Size([639, 1152])
ENSP00000392028.1 torch.Size([2997, 1152])
ENSP00000493738.1 torch.Size

In [6]:
# Write the ENSP -> embedding mapping to disk for offline reuse.
utils.dict_to_pickle(
    train_ensp_embeddings_map,
    utils.TRAIN_ENSP_EMBEDDINGS_MAP_PATH,
)