simple test if it works:

In [None]:
from evo import Evo
from tqdm.auto import tqdm

### Importing both datasets (RefSeq & DeePhage)

In [10]:
import os
import zipfile
import py7zr
import gdown

# ---------- FONCTIONS DE TÉLÉCHARGEMENT & EXTRACTION ----------

def download_and_extract_zip(url, extract_to, archive_name):
    print(f"Téléchargement de {archive_name} (ZIP)...")
    gdown.download(url, archive_name, quiet=False)
    print(f"Extraction de {archive_name}...")
    with zipfile.ZipFile(archive_name, 'r') as zip_ref:
        zip_ref.extractall(path=extract_to)
    os.remove(archive_name)
    print(f"{archive_name} extrait avec succès.\n")

def download_and_extract_7z_from_drive(file_id, extract_to, archive_name):
    os.makedirs(extract_to, exist_ok=True)
    print(f"Téléchargement de {archive_name} depuis Google Drive...")
    gdown.download(id=file_id, output=archive_name, quiet=False)
    print(f"Extraction de {archive_name}...")
    with py7zr.SevenZipFile(archive_name, mode='r') as archive:
        archive.extractall(path=extract_to)
    os.remove(archive_name)
    print(f"{archive_name} extrait avec succès.\n")

# ---------- CRÉATION DES DOSSIERS ----------

os.makedirs("datasets/refseq_simulated_metagenome", exist_ok=True)
os.makedirs("datasets/deephage_lifestyle", exist_ok=True)

# ---------- 1. DATASET REFSEQ ----------

refseq_url = "https://figshare.com/ndownloader/articles/19739884/versions/1"
download_and_extract_zip(
    refseq_url,
    "datasets/refseq_simulated_metagenome",
    "refseq_simulated_metagenome.zip"
)

# ---------- 2. DATASET DEEPHAGE ----------

deephage_file_id = "13KfvYgR946gnzBAaM3TYsGPUwvoqlN-Z"
download_and_extract_7z_from_drive(
    deephage_file_id,
    "datasets/deephage_lifestyle",
    "deephage-dataset.7z"
)


Téléchargement de refseq_simulated_metagenome.zip (ZIP)...


Downloading...
From: https://figshare.com/ndownloader/articles/19739884/versions/1
To: d:\K Docs\Phagos x AWS\phage-classification-hackathon\refseq_simulated_metagenome.zip
100%|██████████| 975M/975M [00:42<00:00, 23.1MB/s] 


Extraction de refseq_simulated_metagenome.zip...
refseq_simulated_metagenome.zip extrait avec succès.

Téléchargement de deephage-dataset.7z depuis Google Drive...


Downloading...
From: https://drive.google.com/uc?id=13KfvYgR946gnzBAaM3TYsGPUwvoqlN-Z
To: d:\K Docs\Phagos x AWS\phage-classification-hackathon\deephage-dataset.7z
100%|██████████| 29.5M/29.5M [00:00<00:00, 42.2MB/s]


Extraction de deephage-dataset.7z...
deephage-dataset.7z extrait avec succès.



### Chargement des séquences FASTA DeePhage

Nous utilisons uniquement les séquences **annotées manuellement** :

- `Dataset-1_virulent.fasta` → phages **lytiques** (label `1`)
- `Dataset-1_temperate.fasta` → phages **lysogéniques** (label `0`)


In [3]:
from Bio import SeqIO

def load_sequences(fasta_file, label):
    sequences = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences.append((str(record.seq), label))
    return sequences

# Charger les deux jeux
data = []
data += load_sequences("datasets/deephage_lifestyle/Dataset-1_virulent.fasta", 1)
data += load_sequences("datasets/deephage_lifestyle/Dataset-1_temperate.fasta", 0)

print(f"{len(data)} séquences chargées.")


225 séquences chargées.


In [4]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    sequences, labels = zip(*batch)
    return list(sequences), list(labels)

dataloader = DataLoader(data, batch_size=64, shuffle=True, collate_fn=collate_fn)


### Génération des embeddings avec Evo

On encode chaque séquence d’ADN en un vecteur de représentation à l’aide du modèle `Evo`.


In [5]:
from evo import Evo
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

evo_model = Evo("evo-1-131k-base")
model, tokenizer = evo_model.model, evo_model.tokenizer
model.to(device)
model.eval()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

StripedHyena(
  (embedding_layer): VocabParallelEmbedding(512, 4096)
  (norm): RMSNorm()
  (unembed): VocabParallelEmbedding(512, 4096)
  (blocks): ModuleList(
    (0-7): 8 x ParallelGatedConvBlock(
      (pre_norm): RMSNorm()
      (post_norm): RMSNorm()
      (filter): ParallelHyenaFilter()
      (projections): Linear(in_features=4096, out_features=12288, bias=True)
      (out_filter_dense): Linear(in_features=4096, out_features=4096, bias=True)
      (mlp): ParallelGatedMLP(
        (l1): Linear(in_features=4096, out_features=10928, bias=False)
        (l2): Linear(in_features=4096, out_features=10928, bias=False)
        (l3): Linear(in_features=10928, out_features=4096, bias=False)
      )
    )
    (8): AttentionBlock(
      (pre_norm): RMSNorm()
      (post_norm): RMSNorm()
      (inner_mha_cls): MHA(
        (Wqkv): Linear(in_features=4096, out_features=12288, bias=True)
        (inner_attn): FlashSelfAttention(
          (drop): Dropout(p=0.0, inplace=False)
        )
        

In [6]:
def get_representation(sequence):
    tokens = tokenizer.tokenize(sequence)
    input_ids = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0).to(device)
    with torch.no_grad():
        logits, _ = model(input_ids)
    return logits.mean(dim=1).squeeze().cpu()


In [7]:
import numpy as np
from tqdm import tqdm

X = []
y = []

progressBar = tqdm(dataloader, desc="Loading...", leave=False)

for i, (seq, label) in enumerate(progressBar):
    try:
        # Corriger le type : si seq est une liste, prendre le premier élément
        if isinstance(seq, (list, tuple)):
            seq = seq[0]
        # Vérifier que c’est bien une string
        if not isinstance(seq, str):
            seq = str(seq)

        rep = get_representation(seq)  # doit renvoyer un np.array ou torch tensor
        X.append(rep)
        y.append(label)
    except Exception as e:
        snippet = seq[:30] if isinstance(seq, str) else str(seq)[:30]
        print(f"[{i}] Erreur: {e}, ex: {snippet}...")

# Ne pas empiler si vide
if X:
    X = np.stack(X)
    y = np.array(y)
    print("Forme finale de X:", X.shape)
    print("Forme finale de y:", y.shape)
else:
    print("Aucun embedding valide généré.")


Loading...:  50%|█████     | 2/4 [00:00<00:00,  4.10it/s]

[0] Erreur: CUDA out of memory. Tried to allocate 8.96 GiB. GPU 0 has a total capacity of 21.98 GiB of which 7.61 GiB is free. Including non-PyTorch memory, this process has 14.36 GiB memory in use. Of the allocated memory 14.05 GiB is allocated by PyTorch, and 9.73 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables), ex: agttaataatttactcgtatagctcagtgg...
[1] Erreur: CUDA out of memory. Tried to allocate 10.67 GiB. GPU 0 has a total capacity of 21.98 GiB of which 6.73 GiB is free. Including non-PyTorch memory, this process has 15.24 GiB memory in use. Of the allocated memory 14.43 GiB is allocated by PyTorch, and 526.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_se

Loading...:  75%|███████▌  | 3/4 [00:00<00:00,  3.21it/s]

[2] Erreur: CUDA out of memory. Tried to allocate 3.04 GiB. GPU 0 has a total capacity of 21.98 GiB of which 2.48 GiB is free. Including non-PyTorch memory, this process has 19.48 GiB memory in use. Of the allocated memory 19.17 GiB is allocated by PyTorch, and 10.24 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables), ex: gtcccgcccccataggacacgctaaccaaa...


                                                         

[3] Erreur: CUDA out of memory. Tried to allocate 11.88 GiB. GPU 0 has a total capacity of 21.98 GiB of which 5.52 GiB is free. Including non-PyTorch memory, this process has 16.45 GiB memory in use. Of the allocated memory 14.69 GiB is allocated by PyTorch, and 1.46 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables), ex: gtcaaattttcacgcccgcgaaaaatgaaa...
Aucun embedding valide généré.




In [8]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modèle
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Prédiction
y_pred = clf.predict(X_test)

# Résultats
print("Rapport de classification :\n")
print(classification_report(y_test, y_pred, target_names=["Lysogenic", "Lytic"]))


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# t-SNE
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X)

# Affichage
plt.figure(figsize=(8,6))
plt.scatter(X_tsne[y==0,0], X_tsne[y==0,1], c="blue", label="Lysogenic (0)", alpha=0.6)
plt.scatter(X_tsne[y==1,0], X_tsne[y==1,1], c="red", label="Lytic (1)", alpha=0.6)
plt.legend()
plt.title("Projection t-SNE des séquences phage (Evo embeddings)")
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2")
plt.grid(True)
plt.show()


### Classification des phages : modèle de régression logistique

On entraîne un classifieur supervisé (`LogisticRegression`) pour prédire le style de vie du phage.
