## working with ESM2 embeddings

In [None]:
! esm-extract esm2_t33_650M_UR50D data/temp/test_esm.fasta data/temp/ --include mean

In [97]:
import io
import os

import pandas as pd
from biomart import BiomartServer

def _fetchFromServer(ensemble_server, attributes):
    """Query the human gene dataset of a BioMart server and parse the reply.

    Args:
        ensemble_server (str): URL of the BioMart server to query.
        attributes (list): BioMart attribute names to request.

    Returns:
        pd.DataFrame: the tab-separated server response parsed by pandas.
    """
    mart = BiomartServer(ensemble_server, verbose=True)
    human_genes = mart.datasets["hsapiens_gene_ensembl"]
    print(attributes)
    response = human_genes.search({"attributes": attributes}, header=1)
    tsv_buffer = io.StringIO(response.content.decode())
    return pd.read_csv(tsv_buffer, sep="\t")

def createFoldersFor(filepath):
    """Create every missing parent folder needed to save a file at `filepath`.

    Only the directory part of `filepath` is created; the final path component
    is treated as a file name and is never created. A path ending with a
    separator (e.g. "/tmp/biomart/") therefore creates the folder itself.

    Args:
        filepath (str): path of the file about to be written; "~" is expanded.
    """
    folder = os.path.dirname(os.path.expanduser(filepath))
    # A bare file name has no directory part, and os.makedirs("") would raise.
    if folder:
        # exist_ok avoids the race between checking for a folder and creating it.
        os.makedirs(folder, exist_ok=True)

def getBiomartTable(
    ensemble_server="http://feb2023.archive.ensembl.org/biomart",
    useCache=False,
    cache_folder="/tmp/biomart/",
    attributes=[],
    bypass_attributes=False,
):
    """Build a gene annotation dataframe from Ensembl's BioMart.

    The table is downloaded from `ensemble_server` (or read from a local CSV
    cache) and its columns are named after the requested attributes. When both
    "ensembl_gene_id" and "hgnc_symbol" are present, rows where both are
    missing are dropped and a missing symbol is replaced by the gene id.

    Args:
        ensemble_server (str, optional): URL of the BioMart server to query.
        useCache (bool, optional): read `<cache_folder>/.biomart.csv` when it
            exists and holds exactly the requested columns. Defaults to False.
        cache_folder (str, optional): folder of the cache file, created if
            missing; "~" is expanded and a trailing "/" is optional.
        attributes (list, optional): extra BioMart attributes appended to the
            default ones. Never mutated by this function.
        bypass_attributes (bool, optional): when True, request only
            `attributes` and skip the four default attributes.

    Raises:
        ValueError: if the download is not a dataframe, or if its number of
            columns differs from the number of requested attributes.

    Returns:
        pd.DataFrame: one column per requested attribute, original row index kept.
    """
    default_attributes = (
        []
        if bypass_attributes
        else [
            "ensembl_gene_id",
            "hgnc_symbol",
            "gene_biotype",
            "entrezgene_id",
        ]
    )
    # Build a fresh list so the shared default `attributes` list is never touched.
    columns = default_attributes + list(attributes)

    cache_folder = os.path.expanduser(cache_folder)
    if cache_folder:
        os.makedirs(cache_folder, exist_ok=True)
    cachefile = os.path.join(cache_folder, ".biomart.csv")

    res = None
    if useCache and os.path.isfile(cachefile):
        cached = pd.read_csv(cachefile)
        # A cache written for another attribute set (or by an older version that
        # stored BioMart's display headers) must not be relabelled positionally:
        # it is ignored and refreshed by the download below.
        if list(cached.columns) == columns:
            print("fetching gene names from biomart cache")
            res = cached

    if res is None:
        print("downloading gene names from biomart")
        res = _fetchFromServer(ensemble_server, columns)
        if not isinstance(res, pd.DataFrame):
            raise ValueError("should be a dataframe")
        res.columns = columns
        # Cached with attribute names as header so a later read can be validated.
        res.to_csv(cachefile, index=False)

    if "ensembl_gene_id" in res.columns and "hgnc_symbol" in res.columns:
        has_no_identifier = res["ensembl_gene_id"].isna() & res["hgnc_symbol"].isna()
        # Explicit copy: the assignment below must not target a view of the raw table.
        res = res[~has_no_identifier].copy()
        missing_symbol = res["hgnc_symbol"].isna()
        res.loc[missing_symbol, "hgnc_symbol"] = res.loc[
            missing_symbol, "ensembl_gene_id"
        ]

    return res

In [107]:
biomart2 = getBiomartTable()

downloading gene names from biomart
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] is alive.
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] Fetching datasets
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] Fetching databases
[BiomartDatabase:'Ensembl Genes 109'] Fetching datasets
[BiomartDatabase:'Mouse strains 109'] Fetching datasets
[BiomartDatabase:'Sequence'] Fetching datasets
[BiomartDatabase:'Ontology'] Fetching datasets
[BiomartDatabase:'Genomic features 109'] Fetching datasets
[BiomartDatabase:'Ensembl Variation 109'] Fetching datasets
[BiomartDatabase:'Ensembl Regulation 109'] Fetching datasets
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id']
[BiomartDataset:'hsapiens_gene_ensembl'] Searching using following params:
{'attributes': ['ensembl_gene_id',
                'hgnc_symbol',
                'gene_biotype',
                'entrezgene_id']}
[BiomartDataset:'hsapiens_gene_ensemb

In [101]:
biomart = getBiomartTable(attributes=               
    ["ensembl_transcript_id",
        "protein_id",
        "ensembl_peptide_id",
        #"peptide",
       # "uniprotswissprot",
    ], bypass_attributes=False, useCache=False)

downloading gene names from biomart
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] is alive.
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] Fetching datasets
[BiomartServer:'http://feb2023.archive.ensembl.org/biomart/martservice'] Fetching databases
[BiomartDatabase:'Ensembl Genes 109'] Fetching datasets
[BiomartDatabase:'Mouse strains 109'] Fetching datasets
[BiomartDatabase:'Sequence'] Fetching datasets
[BiomartDatabase:'Ontology'] Fetching datasets
[BiomartDatabase:'Genomic features 109'] Fetching datasets
[BiomartDatabase:'Ensembl Variation 109'] Fetching datasets
[BiomartDatabase:'Ensembl Regulation 109'] Fetching datasets
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'ensembl_transcript_id', 'protein_id', 'ensembl_peptide_id']
[BiomartDataset:'hsapiens_gene_ensembl'] Searching using following params:
{'attributes': ['ensembl_gene_id',
                'hgnc_symbol',
                'gene_biotype',
          

In [106]:
biomart[biomart.gene_biotype.isin(['Mt_tRNA', 'snRNA', 'sRNA'])]

Unnamed: 0,ensembl_gene_id,hgnc_symbol,gene_biotype,entrezgene_id,ensembl_transcript_id,protein_id,ensembl_peptide_id


In [110]:
biomart2[biomart2.gene_biotype.isin(['Mt_tRNA', 'snRNA', 'sRNA'])].head(20)

Unnamed: 0,ensembl_gene_id,hgnc_symbol,gene_biotype,entrezgene_id
0,ENSG00000210049,MT-TF,Mt_tRNA,
2,ENSG00000210077,MT-TV,Mt_tRNA,
4,ENSG00000209082,MT-TL1,Mt_tRNA,
6,ENSG00000210100,MT-TI,Mt_tRNA,
7,ENSG00000210107,MT-TQ,Mt_tRNA,
8,ENSG00000210112,MT-TM,Mt_tRNA,
10,ENSG00000210117,MT-TW,Mt_tRNA,
11,ENSG00000210127,MT-TA,Mt_tRNA,
12,ENSG00000210135,MT-TN,Mt_tRNA,
13,ENSG00000210140,MT-TC,Mt_tRNA,


In [105]:
main_genes

Unnamed: 0,ensembl_gene_id,hgnc_symbol,gene_biotype,entrezgene_id,ensembl_transcript_id,protein_id,ensembl_peptide_id
0,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,CAA24026,ENSP00000354687
1,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAB58943,ENSP00000354687
2,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,BAA07290,ENSP00000354687
3,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAP89036,ENSP00000354687
4,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAP89049,ENSP00000354687
...,...,...,...,...,...,...,...
901591,ENSG00000162437,RAVER2,protein_coding,55225.0,ENST00000418058,,ENSP00000397069
901592,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000697276,,ENSP00000514413
901593,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000699524,,ENSP00000514414
901594,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000697277,,ENSP00000514416


In [109]:
import gget

In [118]:
len(ls)/2

47.0

In [124]:
ls[val*2].split(' ')[0][1:]

'ENST00000576342'

In [None]:
from multiprocessing import Pool

# Number of gene ids sent to gget in a single call.
size = 600

genes = biomart2[biomart2.gene_biotype.isin(['protein_coding'])].ensembl_gene_id.tolist()
# Stepping by `size` never yields an empty trailing batch, which the previous
# slicing did whenever len(genes) was an exact multiple of `size`.
groups = [genes[start:start + size] for start in range(0, len(genes), size)]
with Pool() as p:
    # Pool.apply blocks until each call returns, so the batches used to run one
    # after the other; starmap dispatches them all and keeps the input order.
    results = p.starmap(gget.seq, [(sub, True, False) for sub in groups])

# Flatten the per-batch outputs into a single list.
res = [item for sublist in results for item in sublist]

In [141]:
from scprint.utils import get_seq
%reload_ext autoreload
%autoreload 2

In [150]:
get_seq(biomart2[biomart2.gene_biotype.isin(['protein_coding'])].ensembl_gene_id.tolist()[:4], translate=True, isoforms=False, save=True)

Sun Nov 12 16:33:29 2023 INFO Requesting amino acid sequence of the canonical transcript ENST00000361390 of gene ENSG00000198888 from UniProt.
Sun Nov 12 16:33:29 2023 INFO Requesting amino acid sequence of the canonical transcript ENST00000361453 of gene ENSG00000198763 from UniProt.
Sun Nov 12 16:33:29 2023 INFO Requesting amino acid sequence of the canonical transcript ENST00000361624 of gene ENSG00000198804 from UniProt.
Sun Nov 12 16:33:29 2023 INFO Requesting amino acid sequence of the canonical transcript ENST00000361739 of gene ENSG00000198712 from UniProt.


> [0;32m/Users/jkobject/Documents/code/scPRINT/scprint/utils/get_seq.py[0m(300)[0;36mseq[0;34m()[0m
[0;32m    298 [0;31m            [0;32mimport[0m [0mpdb[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    299 [0;31m            [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 300 [0;31m            [0mdf_uniprot[0m [0;34m=[0m [0mdf_uniprot[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0minfo_df[0m[0;34m,[0m [0mon[0m[0;34m=[0m[0;34m"canonical_transcript"[0m[0;34m,[0m [0mhow[0m[0;34m=[0m[0;34m"inner"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    301 [0;31m[0;34m[0m[0m
[0m[0;32m    302 [0;31m[0;34m[0m[0m
[0m
             index         ensembl_id uniprot_id pdb_id ncbi_gene_id  \
0  ENSG00000198888  ENSG00000198888.2        NaN    NaN          NaN   
1  ENSG00000198763  ENSG00000198763.3        NaN    NaN          NaN   
2  ENSG00000198804  ENSG00000198804.2        NaN    NaN          NaN   

In [None]:
python launch/predict.py --config="pretrained/extract_embedding.yml" \
--data_path="./data/examples/example.fasta" --save_dir="./resuts" \
--save_frequency 1 --save_embeddings

In [102]:
biomart

Unnamed: 0,ensembl_gene_id,hgnc_symbol,gene_biotype,entrezgene_id,ensembl_transcript_id,protein_id,ensembl_peptide_id
0,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,CAA24026,ENSP00000354687
1,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAB58943,ENSP00000354687
2,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,BAA07290,ENSP00000354687
3,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAP89036,ENSP00000354687
4,ENSG00000198888,MT-ND1,protein_coding,4535.0,ENST00000361390,AAP89049,ENSP00000354687
...,...,...,...,...,...,...,...
901591,ENSG00000162437,RAVER2,protein_coding,55225.0,ENST00000418058,,ENSP00000397069
901592,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000697276,,ENSP00000514413
901593,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000699524,,ENSP00000514414
901594,ENSG00000122432,SPATA1,protein_coding,100505741.0,ENST00000697277,,ENSP00000514416


In [12]:
import os
import glob
import torch

# glob returns files in arbitrary order; sorting keeps row i tied to the same
# file on every run.
embedding_files = sorted(glob.glob("../../data/temp/*.pt"))
if not embedding_files:
    raise FileNotFoundError("no .pt embedding files found in ../../data/temp/")

tensor_list = []
for file in embedding_files:
    # Layer-33 mean representation stored in each file.
    tensor = torch.load(file)['mean_representations'][33]
    tensor_list.append(tensor)

# The mean representations are 1-D vectors (see the cell output), so torch.cat
# would glue them into one flat vector; stack keeps one row per file instead.
# NOTE(review): use `concatenated_tensor.flatten()` if the flat layout was intended.
concatenated_tensor = torch.stack(tensor_list, dim=0)


{33: tensor([-0.0224,  0.0423, -0.0291,  ..., -0.1396, -0.0235, -0.0449])}

In [None]:
# get all genes in the dataset, all species
# load them from biomart (for each species)
# for all protein codings
    # get the fasta file from uniprot using gget seq
    # if not available use gget seq from ensembl
        # use https://github.com/prestevez/dna2proteins to convert to protein
        # merge with the uniprot fasta
    # get the embedding of the fasta file using esm
    

# for all non protein codings (RNA based)
    # get the fasta file from ensembl using gget seq
    # use https://github.com/ml4bio/RNA-FM to embed the sequence

# can we have learned embeddings for just a subset of the elements of the transformer?
# can we have two KQV matrices, one for the protein coding, one for the RNA based? -> we would need at least to not have a skip connection for this first layer
# else we have an additional FCN layer that maps both to the actual embedding size
# adds 



In [None]:
def get_structural_embeddings(biotype, ids):
    """Placeholder for retrieving structural embeddings of the given `ids`.

    No retrieval is implemented yet. The function only validates `biotype` so
    that the planned branches are spelled out and the cell can be executed.

    Args:
        biotype (str): one of "proteins", "protein_coding_genes",
            "non_coding_genes", "coding_transcripts", "non_coding_transcripts".
        ids (list): identifiers to embed (unused for now).

    Raises:
        ValueError: if `biotype` is not one of the planned values.
        NotImplementedError: for every planned `biotype`.
    """
    planned_biotypes = (
        "proteins",
        "protein_coding_genes",
        "non_coding_genes",
        "coding_transcripts",
        "non_coding_transcripts",
    )
    if biotype not in planned_biotypes:
        raise ValueError(
            "unknown biotype " + repr(biotype) + ", expected one of " + repr(planned_biotypes)
        )

    # TODO for "non_coding_transcripts":
    #   - get fasta file
    #   - subset fasta file
    raise NotImplementedError(
        "structural embeddings are not implemented yet for " + repr(biotype)
    )

In [None]:
store it as an additional array in the dataset (make it behave as a varm)

add anndata idioms in the dataset

add GRN as a sparse array in the data (make it behave as a varp)

In [None]:

https://ftp.ensembl.org/pub/release-110/fasta/mus_musculus/pep/Mus_musculus.GRCm39.pep.all.fa.gz

https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz

https://ftp.ensembl.org/pub/release-110/fasta/mus_musculus/ncrna/Mus_musculus.GRCm39.ncrna.fa.gz

In [2]:
import ftplib
import os

def list_files(ftp, match=''):
    """Return the names listed by `ftp.nlst()` that end with `match`.

    Args:
        ftp: object exposing an `nlst()` method returning file names.
        match (str, optional): required suffix; the default '' keeps every name.

    Returns:
        list: matching names, in the order `nlst()` returned them.
    """
    matching = []
    for name in ftp.nlst():
        if name.endswith(match):
            matching.append(name)
    return matching

def load_fasta_species(species="homo_sapiens"):
    """Download the peptide and ncRNA fasta files of `species` from Ensembl.

    For each of the `pep` and `ncrna` folders of Ensembl release 110, the first
    file with the expected suffix is saved under '../../data/fasta/'. Files
    already present locally are not downloaded again.

    Args:
        species (str, optional): species folder name on the Ensembl FTP server.
            Defaults to "homo_sapiens".

    Raises:
        FileNotFoundError: if a folder holds no file with the expected suffix.
    """
    local_dir = '../../data/fasta/'
    os.makedirs(local_dir, exist_ok=True)
    targets = (('pep', '.all.fa.gz'), ('ncrna', '.ncrna.fa.gz'))

    # The context manager closes the connection even when a download fails.
    with ftplib.FTP('ftp.ensembl.org') as ftp:
        ftp.login()
        for subdir, suffix in targets:
            remote_dir = '/pub/release-110/fasta/' + species + '/' + subdir + '/'
            ftp.cwd(remote_dir)
            matches = list_files(ftp, suffix)
            if not matches:
                raise FileNotFoundError(
                    "no file ending with " + suffix + " in " + remote_dir
                )
            file = matches[0]
            local_file_path = local_dir + file
            if os.path.exists(local_file_path):
                continue
            # Download under a temporary name so an interrupted transfer is not
            # mistaken for a complete file by the existence check on the next run.
            partial_path = local_file_path + '.part'
            try:
                with open(partial_path, 'wb') as local_file:
                    ftp.retrbinary('RETR ' + file, local_file.write)
            except BaseException:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                raise
            os.replace(partial_path, local_file_path)

'221 Goodbye.'

In [None]:
# NOTE(review): `data`, `args`, `model` and `train` are not defined anywhere in the
# visible notebook, so this cell cannot run on a fresh kernel. They presumably come
# from the external project invoked by the predict.py command above (confirm).
seqs, label, test_dl  = data.load_data_EMB(args.data_embedding) 
features = train.make_feature(model, test_dl, seqs)