# Inference with BulkRNABert - PyTorch version from HuggingFace

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/instadeepai/nucleotide-transformer/blob/main/notebooks/bulk_rna_bert/inference_bulkrnabert_pytorch_example.ipynb)

## Installation and imports

In [None]:
!pip install pandas
!pip install transformers
!pip install torch

In [None]:
try:
    import nucleotide_transformer
except:
    !pip install git+https://github.com/instadeepai/nucleotide-transformer@main | tail -n 1
    import nucleotide_transformer

In [None]:
import numpy as np
import pandas as pd
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoConfig, AutoModel, AutoTokenizer

## Load model

In [None]:
# Hugging Face Hub repository from which the config, tokenizer and model are loaded
model_repo_id = "InstaDeepAI/BulkRNABert"

# Fetch the configuration first so it can be adjusted before the model is built.
# trust_remote_code=True allows transformers to run the code shipped in the repository.
config = AutoConfig.from_pretrained(model_repo_id, trust_remote_code=True)
# Request the embeddings of layer 4 (the last transformer layer) in the model outputs
config.embeddings_layers_to_save = (4,)

tokenizer = AutoTokenizer.from_pretrained(model_repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(model_repo_id, config=config, trust_remote_code=True)

## Download the data

In [None]:
# Fetch the sample bulk RNA-seq CSV from the model repository on the Hugging Face Hub.
# hf_hub_download returns the path of the downloaded file.
sample_file_location = {
    "repo_id": "InstaDeepAI/BulkRNABert",
    "filename": "data/tcga_sample.csv",
    "repo_type": "model",
}
csv_path = hf_hub_download(**sample_file_location)

## Load dataset and preprocess

In [None]:
# Read the CSV and discard the "identifier" column so only expression values remain
expression_table = pd.read_csv(csv_path).drop(columns=["identifier"])

# Keep the first row only (a single sample), as a 2-D array
first_sample_expression = expression_table.to_numpy()[:1, :]

# Apply the log10(1 + x) transform to the expression values
gene_expression_array = np.log10(1 + first_sample_expression)

# The number of columns must match the number of genes the model is configured for
assert gene_expression_array.shape[1] == config.n_genes

# Tokenize: convert the expression values to PyTorch input ids
encoded_batch = tokenizer.batch_encode_plus(gene_expression_array, return_tensors="pt")
gene_expression_ids = encoded_batch["input_ids"]

## Inference

In [None]:
# Compute BulkRNABert's embeddings.
# Gradients are not needed for inference, so the forward pass runs under torch.no_grad();
# this avoids building the autograd graph and returns tensors detached from it.
with torch.no_grad():
    model_outputs = model(gene_expression_ids)

# "embeddings_4" matches the layer requested through config.embeddings_layers_to_save = (4,).
# The mean is taken over axis 1 (presumably the gene/token axis, giving one vector per
# sample — confirm against the model's output shape).
gene_expression_mean_embeddings = model_outputs["embeddings_4"].mean(axis=1)  # embeddings can be used for downstream tasks.