# Inference with MOJO - PyTorch version from HuggingFace


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/instadeepai/nucleotide-transformer/blob/main/notebooks/mojo/inference_mojo_pytorch_example.ipynb)

## Installation and imports

In [None]:
# Install the third-party packages used by this notebook. Each line is an
# IPython shell escape (`!`), so this cell only runs inside a notebook kernel.
!pip install pandas
!pip install transformers
!pip install torch

In [None]:
try:
    import nucleotide_transformer
except:
    !pip install git+https://github.com/instadeepai/nucleotide-transformer@main | tail -n 1
    import nucleotide_transformer

In [None]:
import numpy as np
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import hf_hub_download

## Load model

In [None]:
# Load the MOJO tokenizer and model from the same Hugging Face repository.
# `trust_remote_code=True` lets transformers execute the model/tokenizer code
# shipped inside that repository.
mojo_repo_id = "InstaDeepAI/MOJO"

tokenizer = AutoTokenizer.from_pretrained(
    mojo_repo_id,
    trust_remote_code=True,
)
model = AutoModel.from_pretrained(mojo_repo_id, trust_remote_code=True)

## Download, load and preprocess the data

In [None]:
# Build `omic_dict`: one array per omic, holding the first `n_examples` rows
# of the sample CSV downloaded from the model repository.
n_examples = 4
omic_dict = {}

for omic in ("rnaseq", "methylation"):
    csv_path = hf_hub_download(
        repo_type="model",
        repo_id="InstaDeepAI/MOJO",
        filename=f"data/tcga_{omic}_sample.csv",
    )
    # Remove the "identifier" and "cohort" columns before converting to an array.
    omic_frame = pd.read_csv(csv_path).drop(columns=["identifier", "cohort"])
    omic_array = omic_frame.to_numpy()[:n_examples, :]
    if omic == "rnaseq":
        # Only the rnaseq values are transformed with log10(1 + x).
        omic_array = np.log10(1 + omic_array)
    # The number of columns must match the sequence length the model is configured for.
    assert omic_array.shape[1] == model.config.sequence_length
    omic_dict[omic] = omic_array

In [None]:
# Tokenize every omic array in one call, then keep only the "input_ids" entry
# of each omic's encoding.
encoded_omics = tokenizer.batch_encode_plus(
    omic_dict,
    pad_to_fixed_length=True,
    return_tensors="pt",
)
omic_ids = {
    omic_name: encoding["input_ids"]
    for omic_name, encoding in encoded_omics.items()
}

## Inference

In [None]:
import torch

# Run the forward pass with gradient tracking disabled: this cell only does
# inference, so building an autograd graph would just cost memory.
with torch.no_grad():
    model_outputs = model(omic_ids)

# Average the "after_transformer_embedding" output over axis 1
# (presumably the sequence dimension — confirm against the model card).
omic_mean_embeddings = model_outputs["after_transformer_embedding"].mean(axis=1)  # embeddings can be used for downstream tasks.