In [1]:
"""
~/datasets/...


phage_fragmented/
x4
    - Dataset-1_temperate.fasta

        => >Temp_gi|149|ref|NC_013055| Burkholderia phage KS9 linear:0-12169:1
            => >Temp
    idem pour viru

    - Dataset-1_virulent.fasta
    - Dataset-2_temperate.fasta
    - Dataset-2_virulent.fasta
    
refseq_simulated_metagenome/
    - host_chr_pvog_fragments.fasta
       => BACT


>Viru
ACTG

>Temp
ACTG

>Bact
ACTG

"""

'\n~/datasets/...\n\n\nphage_fragmented/\nx4\n    - Dataset-1_temperate.fasta\n\n        => >Temp_gi|149|ref|NC_013055| Burkholderia phage KS9 linear:0-12169:1\n            => >Temp\n    idem pour viru\n\n    - Dataset-1_virulent.fasta\n    - Dataset-2_temperate.fasta\n    - Dataset-2_virulent.fasta\n    \nrefseq_simulated_metagenome/\n    - host_chr_pvog_fragments.fasta\n       => BACT\n\n\n>Viru\nACTG\n\n>Temp\nACTG\n\n>Bact\nACTG\n\n'

In [2]:
import os

# Chemins vers les dossiers contenant les fichiers .fasta
phage_dir = 'datasets/phage_fragmented/'
refseq_dir = 'datasets/refseq_simulated_metagenome/'

# Liste des fichiers à traiter
fichiers = [
    'Dataset-2_temperate_fragmented.fasta',
    'Dataset-1_virulent_fragmented.fasta',
    'Dataset-2_virulent_fragmented.fasta',
    'Dataset-1_temperate_fragmented.fasta',
    'host_chr_pvog_fragments.fasta'
]

# Chemin du fichier de sortie (dans le dossier datasets)
output_file = 'datasets/fusion_sequences.fasta'

# Créer le dossier 'datasets' si nécessaire
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Fonction pour extraire et modifier l'en-tête
def modifier_header(header):
    # Extraire la première colonne avant le premier "|"
    first_column = header.split('|')[0]
    if first_column.startswith(">Viru"):
        return ">Viru"
    elif first_column.startswith(">Temp"):
        return ">Temp"
    else:
        return ">Bact"

# Fonction pour lire et fusionner les séquences
def fusionner_sequences():
    with open(output_file, 'w') as output:
        # Traitement des fichiers du dossier phage_fragmented
        for filename in fichiers:
            file_path = None
            if filename in os.listdir(phage_dir):
                file_path = os.path.join(phage_dir, filename)
            elif filename in os.listdir(refseq_dir):
                file_path = os.path.join(refseq_dir, filename)

            if file_path:
                with open(file_path, 'r') as file:
                    header = None
                    sequence = []
                    for line in file:
                        line = line.strip()
                        if line.startswith('>'):
                            # Si on a déjà une séquence, on l'écrit dans le fichier
                            if header:
                                output.write(f"{header}\n")
                                output.write(''.join(sequence) + '\n')
                            # Nouveau header
                            header = modifier_header(line)
                            sequence = []
                        else:
                            sequence.append(line)
                    # Ajouter la dernière séquence lue
                    if header:
                        output.write(f"{header}\n")
                        output.write(''.join(sequence) + '\n')

# Exécution de la fusion
fusionner_sequences()
print(f"Les séquences ont été fusionnées et enregistrées dans {output_file}")


Les séquences ont été fusionnées et enregistrées dans datasets/fusion_sequences.fasta


In [3]:
# Fonction pour compter les occurrences de Bact, Temp, et Viru dans le fichier
def compter_occurrences(fichier):
    counts = {'Bact': 0, 'Temp': 0, 'Viru': 0}
    
    with open(fichier, 'r') as file:
        for line in file:
            # Vérifier si la ligne est un en-tête
            if line.startswith('>'):
                # Vérifier le début de l'en-tête
                if line.startswith('>Viru'):
                    counts['Viru'] += 1
                elif line.startswith('>Temp'):
                    counts['Temp'] += 1
                else:
                    counts['Bact'] += 1
    
    return counts

# Appeler la fonction pour compter les occurrences dans le fichier fusionné
resultats = compter_occurrences('datasets/fusion_sequences.fasta')

# Affichage des résultats
print(f"Viru : {resultats['Viru']}")
print(f"Temp : {resultats['Temp']}")
print(f"Bact : {resultats['Bact']}")


Viru : 11565
Temp : 2769
Bact : 104003


In [4]:
"""# Fonction pour afficher les 50 premières lignes du fichier fusionné (en-tête + séquences)
def afficher_head(fichier, n_lignes=50):
    with open(fichier, 'r') as file:
        count = 0
        for line in file:
            if count < n_lignes:
                print(line.strip())  # Affiche chaque ligne sans sauter de ligne supplémentaire
                count += 1
            else:
                break

# Appeler la fonction pour afficher les 50 premières lignes du fichier fusionné
afficher_head('datasets/fusion_sequences_shuffled.fasta', n_lignes=50)"""


"# Fonction pour afficher les 50 premières lignes du fichier fusionné (en-tête + séquences)\ndef afficher_head(fichier, n_lignes=50):\n    with open(fichier, 'r') as file:\n        count = 0\n        for line in file:\n            if count < n_lignes:\n                print(line.strip())  # Affiche chaque ligne sans sauter de ligne supplémentaire\n                count += 1\n            else:\n                break\n\n# Appeler la fonction pour afficher les 50 premières lignes du fichier fusionné\nafficher_head('datasets/fusion_sequences_shuffled.fasta', n_lignes=50)"

In [5]:
"""from Bio import SeqIO
import random

input_fasta = "datasets/fusion_sequences.fasta"
shuffled_fasta = "datasets/fusion_sequences_shuffled.fasta"

# Lire toutes les séquences
records = list(SeqIO.parse(input_fasta, "fasta"))

# Mélanger aléatoirement
random.shuffle(records)

# Écrire dans un nouveau fichier
with open(shuffled_fasta, "w") as out_f:
    SeqIO.write(records, out_f, "fasta")

print(f"{len(records)} séquences mélangées écrites dans : {shuffled_fasta}")"""


'from Bio import SeqIO\nimport random\n\ninput_fasta = "datasets/fusion_sequences.fasta"\nshuffled_fasta = "datasets/fusion_sequences_shuffled.fasta"\n\n# Lire toutes les séquences\nrecords = list(SeqIO.parse(input_fasta, "fasta"))\n\n# Mélanger aléatoirement\nrandom.shuffle(records)\n\n# Écrire dans un nouveau fichier\nwith open(shuffled_fasta, "w") as out_f:\n    SeqIO.write(records, out_f, "fasta")\n\nprint(f"{len(records)} séquences mélangées écrites dans : {shuffled_fasta}")'

In [6]:
"""from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Fichier d'entrée/sortie (même nom, on écrase)
fasta_path = "datasets/fusion_sequences_shuffled.fasta"

# Lire, modifier les IDs, puis réécrire
records = []
for i, record in enumerate(SeqIO.parse(fasta_path, "fasta")):
    new_id = f"{record.id}_{i:05d}"  # Exemple : Bact_00000, Temp_00001, etc.
    new_record = SeqRecord(record.seq, id=new_id, description="")
    records.append(new_record)

with open(fasta_path, "w") as f:
    SeqIO.write(records, f, "fasta")

print(f"{len(records)} séquences réécrites avec identifiants uniques dans : {fasta_path}")"""


'from Bio import SeqIO\nfrom Bio.SeqRecord import SeqRecord\n\n# Fichier d\'entrée/sortie (même nom, on écrase)\nfasta_path = "datasets/fusion_sequences_shuffled.fasta"\n\n# Lire, modifier les IDs, puis réécrire\nrecords = []\nfor i, record in enumerate(SeqIO.parse(fasta_path, "fasta")):\n    new_id = f"{record.id}_{i:05d}"  # Exemple : Bact_00000, Temp_00001, etc.\n    new_record = SeqRecord(record.seq, id=new_id, description="")\n    records.append(new_record)\n\nwith open(fasta_path, "w") as f:\n    SeqIO.write(records, f, "fasta")\n\nprint(f"{len(records)} séquences réécrites avec identifiants uniques dans : {fasta_path}")'

In [7]:
from Bio import SeqIO

fasta_path = "datasets/fusion_sequences_shuffled.fasta"

nb_sequences = sum(1 for _ in SeqIO.parse(fasta_path, "fasta"))
print(f"Nombre total de séquences dans le fichier : {nb_sequences}")


Nombre total de séquences dans le fichier : 118337


In [8]:
from Bio import SeqIO

ids = [rec.id for rec in SeqIO.parse("datasets/fusion_sequences_shuffled.fasta", "fasta")][:10]
print(ids)


['Bact_00000', 'Bact_00001', 'Bact_00002', 'Bact_00003', 'Bact_00004', 'Bact_00005', 'Bact_00006', 'Bact_00007', 'Viru_00008', 'Bact_00009']


In [9]:
from Bio import SeqIO 
import torch
from evo import Evo
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

# Paramètres
fasta_in = "datasets/fusion_sequences_shuffled.fasta"  # fichier shuffle
batch_size = 8
window = 256
max_seqs = 10  # Limite à 10 séquences

# Charger le modèle Evo-1-8k-base sur GPU
evo_model = Evo('evo-1-8k-base')
model, tokenizer = evo_model.model, evo_model.tokenizer
model.eval()
device = torch.device("cuda:0")
model.to(device)

# Générateur de fenêtres (on saute les vides)
def windows(seq, size=window):
    for i in range(0, len(seq), size):
        w = seq[i:i+size]
        if w:            # ignore les chaînes vides
            yield w

# Générateur limité aux N premières séquences
def process_first_n_fasta(fasta, batch_size, max_seqs):
    buffer_w, buffer_id = [], []
    count = 0
    for rec in SeqIO.parse(fasta, "fasta"):
        if count >= max_seqs:
            break
        for w in windows(str(rec.seq)):
            buffer_w.append(w)
            buffer_id.append(rec.id)
            if len(buffer_w) == batch_size:
                yield buffer_w, buffer_id
                buffer_w, buffer_id = [], []
        count += 1
    if buffer_w:
        yield buffer_w, buffer_id

# Traitement
all_embs = {}  # { seq_id: [window_emb1, window_emb2, ...] }

for windows_batch, ids_batch in tqdm(
        process_first_n_fasta(fasta_in, batch_size, max_seqs),
        desc="Batches"):
    # 1) Tokenisation
    tensors = [torch.tensor(tokenizer.tokenize(w)) for w in windows_batch]
    padded = pad_sequence(
        tensors, batch_first=True, padding_value=tokenizer.pad_id
    ).to(device).long()

    # 2) Padding mask
    padding_mask = (padded != tokenizer.pad_id).to(device)

    # 3) Inférence
    with torch.no_grad(), torch.cuda.amp.autocast():
        outputs = model(padded, padding_mask=padding_mask)[0]  # (batch, L, D)

    # 4) Pooling sur la longueur, ignore pad
    mask = padding_mask.unsqueeze(-1)         # (batch, L, 1)
    summed = (outputs * mask).sum(dim=1)      # (batch, D)
    lengths = mask.sum(dim=1)                 # (batch, 1)
    lengths[lengths == 0] = 1                 # empêche division par zéro
    embs = (summed / lengths).cpu()           # (batch, D)

    # 5) Regroupement par séquence
    for seq_id, emb in zip(ids_batch, embs):
        all_embs.setdefault(seq_id, []).append(emb)

    torch.cuda.empty_cache()

# 6) Moyenne finale : 1 embedding par séquence
final_embeddings = {
    seq_id: torch.stack(embs, 0).mean(dim=0)
    for seq_id, embs in all_embs.items()
}

# Affichage
print(f"{len(final_embeddings)} embeddings finaux générés.")
print("Séquences traitées :", list(final_embeddings.keys())[:10])
print("Dimension d’un embedding :", next(iter(final_embeddings.values())).shape)


config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

configuration_hyena.py:   0%|          | 0.00/3.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- configuration_hyena.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hyena.py:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

engine.py:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

utils.py:   0%|          | 0.00/2.87k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- engine.py
- utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


cache.py:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- cache.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


layers.py:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- layers.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.py:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

positional_embeddings.py:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- positional_embeddings.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.py:   0%|          | 0.00/4.40k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- tokenizer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- model.py
- positional_embeddings.py
- tokenizer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/togethercomputer/evo-1-131k-base:
- modeling_hyena.py
- engine.py
- cache.py
- layers.py
- model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/34.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Batches: 37it [00:41,  1.13s/it]

10 embeddings finaux générés.
Séquences traitées : ['Bact_00000', 'Bact_00001', 'Bact_00002', 'Bact_00003', 'Bact_00004', 'Bact_00005', 'Bact_00006', 'Bact_00007', 'Viru_00008', 'Bact_00009']
Dimension d’un embedding : torch.Size([512])





In [10]:
for seq_id, emb in final_embeddings.items():
    print(f"{seq_id} → {emb.tolist()[:10]}...")  # Affiche les 10 premières valeurs


Bact_00000 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
Bact_00001 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
Bact_00002 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
Bact_00003 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
Bact_00004 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
Bact_00005 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
Bact_00006 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
Bact_00007 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
Viru_00008 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
Bact_00009 → [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]...
