In [6]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import sys
from pathlib import Path

# Resolve project locations relative to the notebook's working directory: the
# repository root is taken to be the parent of the current directory.
repo_path = Path().absolute().parent
data_path = repo_path / "data"

# Make the project's src/ directory importable. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
src_path = str(repo_path / "src")
if src_path not in sys.path:
    sys.path.append(src_path)

In [8]:
# Recompute data_path from repo_path. This is the same expression the path-setup
# cell uses, so the value is unchanged if that cell has already run.
data_path = repo_path / "data"

In [3]:
import utils

# Prepare the local data directory via the project helper. Judging by the cell
# output, it reuses an already-downloaded GenePT zip and skips archive members
# that already exist with the same size — confirm details in utils.setup_data_dir.
utils.setup_data_dir()

File already exists at /Users/rj/personal/GenePT-tools/data/GenePT_emebdding_v2.zip
Extracting files...
Extracting GenePT_emebdding_v2/
Skipping GenePT_emebdding_v2/NCBI_UniProt_summary_of_genes.json - already exists with same size
Skipping GenePT_emebdding_v2/GenePT_gene_embedding_ada_text.pickle - already exists with same size
Skipping GenePT_emebdding_v2/GenePT_gene_protein_embedding_model_3_text.pickle. - already exists with same size
Skipping GenePT_emebdding_v2/NCBI_summary_of_genes.json - already exists with same size
Extraction complete!
Setup finished!


# Load data

In [90]:
import json
import pickle
import pandas as pd

# Directory the GenePT archive is extracted into. "emebdding" (sic) is the
# spelling used by the archive itself, so it must not be "corrected" here.
genept_dir = data_path / "GenePT_emebdding_v2"


def _load_json(path):
    """Parse the JSON file at ``path``, decoding it explicitly as UTF-8.

    Pinning the encoding avoids depending on the platform's locale default.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE(security): unpickling can execute arbitrary code. Only use this on
    files obtained from a source you trust.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# Gene descriptions, loaded from JSON.
ncbi_summary_of_genes = _load_json(genept_dir / "NCBI_summary_of_genes.json")
ncbi_uniprot_summary_of_genes = _load_json(
    genept_dir / "NCBI_UniProt_summary_of_genes.json"
)

# Pre-computed gene embeddings, loaded from pickle files.
genept_embedding_ada_text = _load_pickle(
    genept_dir / "GenePT_gene_embedding_ada_text.pickle"
)
# The trailing "." really is part of this file's name in the extracted archive.
genept_embedding_large_3 = _load_pickle(
    genept_dir / "GenePT_gene_protein_embedding_model_3_text.pickle."
)

# both the gene_id and ensembl_id are non-unique
# The frame's index is moved into a regular column; a column called "index"
# (the name reset_index gives an unnamed index) is renamed to "gene_name".
gene_info_table_df = (
    pd.read_parquet(data_path / "gene_info_table.parquet")
    .reset_index()
    .rename(columns={"index": "gene_name"})
)

In [77]:
def flatten_embedding(gene_id, embedding):
    """Return one flat list: ``gene_id`` followed by every element of ``embedding``."""
    row = [gene_id]
    row.extend(embedding)
    return row


def embedding_dimension(embedding_dict):
    """Return the length of the first embedding vector in ``embedding_dict``.

    Only the first value (in the mapping's iteration order) is inspected; the
    remaining vectors are assumed, not checked, to have the same length.

    Args:
        embedding_dict: Mapping whose values are sized sequences (they must
            support ``len``).

    Returns:
        The length of the first value, or ``0`` when the mapping is empty.
    """
    # An empty mapping used to leak a bare StopIteration out of next().
    if not embedding_dict:
        return 0
    return len(next(iter(embedding_dict.values())))


def _description_frame(summaries):
    """Build a frame indexed by ``gene_id`` with a single ``description`` column."""
    frame = pd.DataFrame(summaries.items(), columns=["gene_id", "description"])
    return frame.set_index("gene_id")


def _embedding_frame(embeddings):
    """Build a frame indexed by ``gene_id`` with one integer-named column per component.

    The column count comes from ``embedding_dimension``, i.e. from the first
    vector in the mapping.
    """
    rows = [flatten_embedding(name, vector) for name, vector in embeddings.items()]
    header = ["gene_id", *range(embedding_dimension(embeddings))]
    return pd.DataFrame(rows, columns=header).set_index("gene_id")


# Text descriptions keyed by gene id.
ncbi_summary_of_genes_df = _description_frame(ncbi_summary_of_genes)
ncbi_uniprot_summary_of_genes_df = _description_frame(ncbi_uniprot_summary_of_genes)

# Embedding matrices keyed by gene id.
genept_embedding_ada_text_df = _embedding_frame(genept_embedding_ada_text)
genept_embedding_large_3_df = _embedding_frame(genept_embedding_large_3)

In [78]:
# Report each frame's deep memory footprint (object contents included),
# converted from bytes by dividing by 1024**2.
for frame_name, frame in (
    ("ncbi_summary_of_genes_df", ncbi_summary_of_genes_df),
    ("ncbi_uniprot_summary_of_genes_df", ncbi_uniprot_summary_of_genes_df),
    ("genept_embedding_ada_text_df", genept_embedding_ada_text_df),
    ("genept_embedding_large_3_df", genept_embedding_large_3_df),
):
    megabytes = frame.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage of {frame_name}: {megabytes:.2f} MB")

Memory usage of ncbi_summary_of_genes_df: 13.37 MB
Memory usage of ncbi_uniprot_summary_of_genes_df: 21.19 MB
Memory usage of genept_embedding_ada_text_df: 1104.15 MB
Memory usage of genept_embedding_large_3_df: 3141.53 MB


In [80]:
# Preview the first rows of the frame (index: gene_id, columns: embedding components).
genept_embedding_large_3_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.01274,-0.018946,-0.000315,-0.002249,0.000806,-0.008889,0.012874,-0.021547,0.007848,-0.000621,...,-0.008572,0.027384,0.025506,-0.025798,-0.013755,-0.013857,0.017296,-0.014403,-0.007316,-0.00101
A1BG-AS1,0.007532,-0.001736,-0.006132,-0.007356,0.026744,-0.023528,0.042288,0.004891,0.008089,-0.017632,...,0.000914,0.026236,0.025545,-0.016278,0.002029,-0.012032,0.010967,-0.023175,-0.015389,-0.008167
A1CF,-0.000421,0.020869,-0.014518,0.001609,-0.012133,-0.005558,0.015109,-0.022753,0.002784,-0.001344,...,0.005719,0.008235,0.031373,-0.013686,-0.003167,-0.029448,0.017611,-0.00664,-0.012428,-0.027936
A2M,-0.018102,0.009826,-0.010828,0.037944,0.002778,-0.028767,-0.013993,-0.026498,0.00711,-0.00907,...,-0.003185,0.000203,-0.007097,-0.008017,-4.9e-05,-0.010028,-0.00148,-0.01219,0.00399,0.015846
A2M-AS1,0.021392,0.014905,-0.012388,0.003324,0.002296,-0.03529,0.016554,0.007071,0.049048,-0.010633,...,0.00596,0.013611,0.01085,-0.004093,0.032661,-0.010186,0.007305,-0.01805,-0.012423,-0.029334


In [30]:
# Parse the batch response file, which holds one JSON document per line.
#  - Iterate the file object lazily instead of calling readlines(), which kept a
#    second full copy of the file in memory alongside the parsed records.
#  - Skip blank lines so a stray empty line cannot crash json.loads.
#  - Decode explicitly as UTF-8 instead of relying on the platform default.
batch_response_path = repo_path / "data/generated/batch-requests/full_batch_response.jsonl"
with open(batch_response_path, "r", encoding="utf-8") as f:
    full_batch_response = [json.loads(line) for line in f if line.strip()]

# The file handle is no longer needed; the remaining steps work on parsed records.
responses = [response["response"] for response in full_batch_response]

# Message text of the first choice in each response body.
contents = [
    response["body"]["choices"][0]["message"]["content"] for response in responses
]

In [33]:
# Attach the generated descriptions to gene ids. The pairing is purely
# positional: contents[i] is labelled with the i-th entry of the NCBI/UniProt
# summary index.
# NOTE(review): this is only correct if the response file lists genes in that
# same order — TODO confirm (e.g. by checking each record's request id).
full_batch_response_df = pd.DataFrame(
    contents,
    columns=["generated_description"],
    index=ncbi_uniprot_summary_of_genes_df.index,
)

In [37]:
# Deep memory footprint of the generated-description frame (bytes / 1024**2).
full_batch_response_mb = full_batch_response_df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage of full_batch_response_df: {full_batch_response_mb:.2f} MB")

Memory usage of full_batch_response_df: 140.86 MB


In [56]:
# Locate and load the combined embedding file from the generated-embeddings directory.
embedding_dir = data_path.joinpath("generated/embeddings")
combined_embedding_path = embedding_dir.joinpath(
    "embedding_associations_age_drugs_pathways_openai_large.parquet"
)
combined_embedding_df = pd.read_parquet(combined_embedding_path)

# Create Hugging Face dataset

In [91]:
# Staging directory for the dataset files; created on demand, parents included.
huggingface_dataset_path = data_path / "huggingface_dataset"
huggingface_dataset_path.mkdir(parents=True, exist_ok=True)

# Write each source table to its own Parquet file in the staging directory.
for parquet_name, frame in (
    ("ncbi_summary.parquet", ncbi_summary_of_genes_df),
    ("ncbi_uniprot_summary.parquet", ncbi_uniprot_summary_of_genes_df),
    ("gene_info.parquet", gene_info_table_df),
    ("generated_descriptions_gpt4o_mini_combined.parquet", full_batch_response_df),
):
    frame.to_parquet(huggingface_dataset_path / parquet_name)

In [92]:
# Write the dataset card. The YAML front matter declares four configs, each
# pointing at one Parquet file, with "ncbi_summary" marked as the default.
# The encoding is pinned to UTF-8 so the card is written identically on every
# platform instead of using the locale's default encoding.
# Text fixes relative to the previous card: "reproce" -> "reproduce", the
# duplicated "Contains contains", and "ensmble_id" -> "ensembl_id".
with open(huggingface_dataset_path / "README.md", "w", encoding="utf-8") as f:
    f.write(
        """---
configs:
- config_name: ncbi_summary
  data_files: "ncbi_summary.parquet"
  default: true
- config_name: ncbi_uniprot_summary
  data_files: "ncbi_uniprot_summary.parquet"
- config_name: gene_info
  data_files: "gene_info.parquet"
- config_name: generated_descriptions_gpt4o_mini_combined
  data_files: "generated_descriptions_gpt4o_mini_combined.parquet"
---

---

# Gene Description Dataset

This dataset reproduces and expands upon the GenePT project and paper, and makes it easier to reproduce and access using standard tools.
The goal is to allow users to compose embeddings across dimensions in order to specialize for specific tasks, and add to the existing
base embeddings by generating new descriptions and embedding them in the same space.
        
## Dataset Description

This dataset
  1. reproduces the data from the GenePT project and paper in a more easy-to-use format. This paper itself aggregates data from multiple sources, so please refer to the paper and repository for detailed source information.
     Citation: Chen YT, Zou J. (2023+) GenePT: A Simple But Effective Foundation Model for Genes and Cells Built From ChatGPT. bioRxiv preprint: https://www.biorxiv.org/content/10.1101/2023.10.16.562533v2.
     GitHub: https://github.com/yiqunchen/GenePT
  2. Adds descriptions of genes extracted from various LLMs, across multiple dimensions, such as regulatory pathways, drug interactions, etc. Currently we use GPT-4o-mini to generate descriptions, and only
     have a combined description that includes several factors.  We will add composable dimensions soon

## Dataset Structure

The dataset contains four main components:

1. NCBI Summary - Contains gene descriptions from NCBI
2. NCBI-UniProt Summary - Contains combined gene descriptions from NCBI and UniProt
3. Gene Info Table - Contains a mapping between gene_id, ensembl_id and gene functional annotation
4. Generated Descriptions (Combined)
    - Contains AI-generated gene descriptions 
    - Model: GPT-4o-mini
    - Factors:
      a. Associated genes
      b. Aging related information
      c. Drug interactions
      d. Pathways and biological processes

## Source Data:
Some of the data was sourced from the following sources upstream of the GenePT project:
- NCBI Gene Database
- UniProt Database
The licenses for these databases continue to apply where applicable.
        
"""
    )

In [93]:
from dotenv import load_dotenv

load_dotenv()

from datasets import Dataset
from huggingface_hub import HfApi
import os

# Build the Hub client from the HF_WRITE_TOKEN environment variable.
# The lookup yields None when the variable is unset.
token = os.environ.get("HF_WRITE_TOKEN")
api = HfApi(token=token)

In [94]:
# Hub repository id that the dataset Parquet files are uploaded to.
dataset_repo_id = "honicky/genept-composable-embeddings-source-data"

In [95]:
# Map each dataset config name to its Parquet file ("<config name>.parquet").
configs = {
    config_name: f"{config_name}.parquet"
    for config_name in (
        "ncbi_summary",
        "ncbi_uniprot_summary",
        "gene_info",
        "generated_descriptions_gpt4o_mini_combined",
    )
}

# Push every Parquet file to the root of the dataset repository, one upload per file.
# NOTE(review): only the files listed in `configs` are uploaded here; a README.md
# in the staging directory is not among them — confirm it is published separately.
for config_name, parquet_file in configs.items():
    api.upload_file(
        repo_id=dataset_repo_id,
        repo_type="dataset",
        path_in_repo=parquet_file,
        path_or_fileobj=str(huggingface_dataset_path / parquet_file),
    )

print("Dataset uploaded successfully to HuggingFace Hub")

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
gene_info.parquet: 100%|██████████| 1.26M/1.26M [00:00<00:00, 2.46MB/s]
No files have been modified since last commit. Skipping to prevent empty commit.


Dataset uploaded successfully to HuggingFace Hub


# Upload embeddings


In [85]:
# Hub repository id for the embedding files.
model_repo_id = "honicky/genept-composable-embeddings"

# Local staging directory for that repository. mkdir is a no-op when the
# directory already exists and creates missing parent directories otherwise.
huggingface_model_path = data_path.joinpath("huggingface_model")
huggingface_model_path.mkdir(parents=True, exist_ok=True)

In [86]:
import shutil

# Copy the combined embedding file into the staging directory under its own name.
shutil.copy(combined_embedding_path, huggingface_model_path, follow_symlinks=True)

# Write the two original GenePT embedding frames alongside it as Parquet.
for parquet_name, frame in (
    ("embedding_original_ada_text.parquet", genept_embedding_ada_text_df),
    ("embedding_original_large_3.parquet", genept_embedding_large_3_df),
):
    frame.to_parquet(huggingface_model_path / parquet_name)

In [66]:
# Write the model card. The YAML front matter links the source-data dataset
# repository. The encoding is pinned to UTF-8 so the card is written identically
# on every platform instead of using the locale's default encoding.
# Text fixes relative to the previous card: the model name `gtp-4o-mini` ->
# `gpt-4o-mini`, "repsitory" -> "repository", "other in the future" ->
# "others in the future", "each genes" -> "each gene", "is licensed" -> "are licensed".
with open(huggingface_model_path / "README.md", "w", encoding="utf-8") as f:
    f.write(
        """---
datasets:
- honicky/genept-composable-embeddings-source-data
---

# GenePT Composable Embeddings

This model is a set of embeddings for a list of about 33K functional genes, created using OpenAI embedding models (and others in the future) to embed text about the genes. Details about the process and evaluations can be found in the paper:
            
Chen YT, Zou J. (2023+) GenePT: A Simple But Effective Foundation Model for Genes and Cells Built From ChatGPT. bioRxiv preprint: https://www.biorxiv.org/content/10.1101/2023.10.16.562533v2.
            
and on GitHub: https://github.com/yiqunchen/GenePT

In this repository, we (not the original authors) are collecting modifications of the original embeddings with the intent of creating a set of composable embeddings for genes.  These embeddings will encode specific information about each gene regarding a set of factors, such as aging, drug interactions, pathways, etc.  The repository also contains the original embeddings. 
            
## Dataset
            
The base dataset was collected from NCBI and UniProt, and contains a set of gene descriptions.  We have used `gpt-4o-mini` (and potentially other models in the future) to generate descriptions of the genes, and other factors as mentioned above. We have collected the source datasets in the `honicky/genept-composable-embeddings-source-data` Dataset repository.

## Model
            
The model is used by multiplying the gene embedding vectors by the expression level for each gene and summing the results (e.g. a matrix multiplication). See the original paper for more details.

## Code
            
The https://github.com/honicky/GenePT-tools repository contains the latest code for building and using the models, as well as some example notebooks.

## License

The original models and data in this repository are licensed under the MIT license. The original GenePT weights are governed by the license of the original GenePT repository.
"""
    )

In [87]:
# Collect the regular, non-hidden files staged for the model repository.
# Sub-directories are skipped because each upload call sends a single file, and
# dot-files (OS/editor metadata) are not meant to be published. Sorting makes
# the upload order deterministic; os.listdir returns entries in arbitrary order.
files_to_upload = sorted(
    entry.name
    for entry in huggingface_model_path.iterdir()
    if entry.is_file() and not entry.name.startswith(".")
)

# Upload each file on a best-effort basis: a failure is reported together with
# its cause and the loop carries on with the remaining files.
for file in files_to_upload:
    local_path = os.path.join(huggingface_model_path, file)
    try:
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=file,
            repo_id=model_repo_id,
            repo_type="model",
        )
    except Exception as e:
        # Include the exception: the bare message used to hide why an upload failed.
        print(f"Error uploading {file}: {e!r}")
    else:
        print(f"Successfully uploaded {file}")

embedding_original_large_3.parquet: 100%|██████████| 1.39G/1.39G [01:35<00:00, 14.6MB/s]


Successfully uploaded embedding_original_large_3.parquet


embedding_original_ada_text.parquet: 100%|██████████| 582M/582M [00:29<00:00, 19.8MB/s] 


Successfully uploaded embedding_original_ada_text.parquet


No files have been modified since last commit. Skipping to prevent empty commit.


Successfully uploaded README.md


No files have been modified since last commit. Skipping to prevent empty commit.


Successfully uploaded embedding_associations_age_drugs_pathways_openai_large.parquet


In [88]:
# (rows, columns) of the frame: one row per gene_id, one column per embedding component.
genept_embedding_large_3_df.shape

(133736, 3072)

In [89]:
# Display the frame; the notebook renders a truncated view of large frames.
genept_embedding_ada_text_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,-0.043067,-0.020245,-0.002102,-0.017412,-0.004772,0.018790,-0.014167,0.031064,-0.031476,-0.015532,...,0.017876,0.011861,0.018005,-0.022229,-0.010683,0.012982,-0.017103,-0.046699,-0.006304,0.005322
A1BG-AS1,-0.027003,-0.006847,0.007299,-0.025647,-0.014492,0.028187,-0.025594,0.015941,-0.013449,-0.011075,...,0.005185,-0.005292,0.023267,-0.001981,-0.007359,0.017417,-0.014718,-0.044753,-0.007658,-0.001966
A1CF,-0.040730,-0.014075,0.006621,-0.016098,-0.014863,0.028676,-0.013485,0.035556,-0.019065,-0.008430,...,0.016544,0.017621,-0.017489,-0.015796,-0.027547,0.031906,-0.001667,-0.029385,-0.015073,-0.006473
A2M,-0.029095,-0.001240,-0.015252,-0.025177,-0.012216,0.018987,-0.026927,0.009624,-0.008991,0.006676,...,0.029121,-0.007874,0.013072,-0.018191,-0.007998,0.017277,0.000805,-0.049257,0.005132,0.001392
A2M-AS1,-0.033439,-0.007015,-0.002068,-0.011076,-0.027708,0.030681,-0.032979,0.020639,-0.007839,-0.005092,...,0.012800,-0.012820,0.016814,-0.009313,-0.022518,0.005643,-0.010042,-0.035331,-0.011279,0.003676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
KRTAP28p1,-0.023102,-0.027802,0.012746,-0.025011,-0.025319,0.011838,-0.012279,0.012532,-0.002612,-0.001223,...,0.021593,0.001784,-0.006917,-0.015584,-0.016211,0.015237,-0.004383,-0.029992,-0.011564,-0.028737
ELOC26,-0.018420,-0.001902,0.016638,-0.047502,-0.040348,0.031001,-0.014815,0.021983,-0.009484,0.016926,...,-0.009923,-0.017529,-0.000795,-0.000451,-0.016761,0.033331,-0.005880,-0.003310,0.013431,-0.021874
FASRL,-0.020013,-0.018781,0.006378,-0.025457,-0.029833,0.016620,-0.011967,0.019238,-0.029445,0.005222,...,0.003350,0.005149,0.013137,-0.017991,-0.004148,0.010685,-0.019307,-0.018268,-0.019376,-0.017714
COMT2,-0.021027,-0.005336,-0.008690,-0.036186,-0.029208,0.038274,-0.008141,0.021674,-0.031666,0.009516,...,0.014181,0.013824,-0.018714,-0.005782,-0.021146,0.024820,0.002504,-0.003615,-0.014974,-0.036582


In [74]:
# (rows, columns) of the combined embedding frame loaded from Parquet.
combined_embedding_df.shape

(33703, 3072)