Test if it works:

In [1]:
from evo import Evo
from tqdm.auto import tqdm

### Importing both datasets (RefSeq & DeePhage)

In [2]:
"""
%%bash
set -e  # stop on error

echo "Création des dossiers..."
mkdir -p datasets/refseq_simulated_metagenome
mkdir -p datasets/deephage_lifestyle

echo "Téléchargement Dataset 3 (refseq_simulated_metagenome)..."
wget -q -O datasets/refseq_simulated_metagenome.tar.gz https://phagos-rd-hackathon25-datasets.s3.eu-west-1.amazonaws.com/refseq_simulated_metagenome/refseq_simulated_metagenome.tar.gz
echo "Extraction Dataset 3..."
tar -xzf datasets/refseq_simulated_metagenome.tar.gz -C datasets/refseq_simulated_metagenome
rm datasets/refseq_simulated_metagenome.tar.gz
echo "Dataset 3 prêt."

echo "Téléchargement Dataset 4 (deephage_lifestyle)..."
wget -q -O datasets/deephage_lifestyle.tar.gz https://s3.eu-west-1.amazonaws.com/phagos-rd-hackathon25-datasets/deephage/deephage_lifestyle.tar.gz
echo "Extraction Dataset 4..."
tar -xzf datasets/deephage_lifestyle.tar.gz -C datasets/deephage_lifestyle
rm datasets/deephage_lifestyle.tar.gz
echo "Dataset 4 prêt."

echo "Tous les datasets ont été installés."
"""


'\n%%bash\nset -e  # stop on error\n\necho "Création des dossiers..."\nmkdir -p datasets/refseq_simulated_metagenome\nmkdir -p datasets/deephage_lifestyle\n\necho "Téléchargement Dataset 3 (refseq_simulated_metagenome)..."\nwget -q -O datasets/refseq_simulated_metagenome.tar.gz https://phagos-rd-hackathon25-datasets.s3.eu-west-1.amazonaws.com/refseq_simulated_metagenome/refseq_simulated_metagenome.tar.gz\necho "Extraction Dataset 3..."\ntar -xzf datasets/refseq_simulated_metagenome.tar.gz -C datasets/refseq_simulated_metagenome\nrm datasets/refseq_simulated_metagenome.tar.gz\necho "Dataset 3 prêt."\n\necho "Téléchargement Dataset 4 (deephage_lifestyle)..."\nwget -q -O datasets/deephage_lifestyle.tar.gz https://s3.eu-west-1.amazonaws.com/phagos-rd-hackathon25-datasets/deephage/deephage_lifestyle.tar.gz\necho "Extraction Dataset 4..."\ntar -xzf datasets/deephage_lifestyle.tar.gz -C datasets/deephage_lifestyle\nrm datasets/deephage_lifestyle.tar.gz\necho "Dataset 4 prêt."\n\necho "Tous le

### Chargement des séquences FASTA DeePhage

Nous utilisons uniquement les séquences **annotées manuellement** :

- `Dataset-1_virulent.fasta` → phages **lytiques** (label `1`)
- `Dataset-1_temperate.fasta` → phages **lysogéniques** (label `0`)


In [3]:
from Bio import SeqIO

def load_sequences(fasta_file, label):
    sequences = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences.append((str(record.seq), label))
    return sequences

# Charger les deux jeux
data = []
data += load_sequences("datasets/deephage_lifestyle/Dataset-1_virulent.fasta", 1)
data += load_sequences("datasets/deephage_lifestyle/Dataset-1_temperate.fasta", 0)

print(f"{len(data)} séquences chargées.")


225 séquences chargées.


In [4]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    sequences, labels = zip(*batch)
    return list(sequences), list(labels)

dataloader = DataLoader(data, batch_size=64, shuffle=True, collate_fn=collate_fn)


### Génération des embeddings avec Evo

On encode chaque séquence d’ADN en un vecteur de représentation à l’aide du modèle `Evo`.


In [5]:
from evo import Evo
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

evo_model = Evo("evo-1-131k-base")
model, tokenizer = evo_model.model, evo_model.tokenizer
model.to(device)
model.eval()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

StripedHyena(
  (embedding_layer): VocabParallelEmbedding(512, 4096)
  (norm): RMSNorm()
  (unembed): VocabParallelEmbedding(512, 4096)
  (blocks): ModuleList(
    (0-7): 8 x ParallelGatedConvBlock(
      (pre_norm): RMSNorm()
      (post_norm): RMSNorm()
      (filter): ParallelHyenaFilter()
      (projections): Linear(in_features=4096, out_features=12288, bias=True)
      (out_filter_dense): Linear(in_features=4096, out_features=4096, bias=True)
      (mlp): ParallelGatedMLP(
        (l1): Linear(in_features=4096, out_features=10928, bias=False)
        (l2): Linear(in_features=4096, out_features=10928, bias=False)
        (l3): Linear(in_features=10928, out_features=4096, bias=False)
      )
    )
    (8): AttentionBlock(
      (pre_norm): RMSNorm()
      (post_norm): RMSNorm()
      (inner_mha_cls): MHA(
        (Wqkv): Linear(in_features=4096, out_features=12288, bias=True)
        (inner_attn): FlashSelfAttention(
          (drop): Dropout(p=0.0, inplace=False)
        )
        

In [6]:
def get_representation(sequence):
    tokens = tokenizer.tokenize(sequence)
    input_ids = torch.tensor(tokens, dtype=torch.int32).unsqueeze(0).to(device)
    with torch.no_grad():
        logits, _ = model(input_ids)
    return logits.mean(dim=1).squeeze().cpu()


In [9]:
import numpy as np

X = []
y = []

progressBar = tqdm(dataloader, desc="Loading...", leave=False)

for (seq, label) in progressBar:
    try:
        rep = get_representation(seq)
        X.append(rep)
        y.append(label)
    except Exception as e:
        print(f"[{i}] Erreur: {e}, ex: {seq[0][:30]}...")


X = np.stack(X)
y = np.array(y)

print("Forme finale de X:", X.shape)
print("Forme finale de y:", y.shape)


                                           

[3] Erreur: a bytes-like object is required, not 'list', ex: taatagggaaaaagttagtataaatttaca...
[3] Erreur: a bytes-like object is required, not 'list', ex: tctgatataatataagattacctctcagaa...
[3] Erreur: a bytes-like object is required, not 'list', ex: aaagacgctgttaaataatcagcctttaaa...
[3] Erreur: a bytes-like object is required, not 'list', ex: tggtggggctacaccccttacaccataagc...




ValueError: need at least one array to stack

In [8]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modèle
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Prédiction
y_pred = clf.predict(X_test)

# Résultats
print("Rapport de classification :\n")
print(classification_report(y_test, y_pred, target_names=["Lysogenic", "Lytic"]))


ModuleNotFoundError: No module named 'sklearn'

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# t-SNE
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X)

# Affichage
plt.figure(figsize=(8,6))
plt.scatter(X_tsne[y==0,0], X_tsne[y==0,1], c="blue", label="Lysogenic (0)", alpha=0.6)
plt.scatter(X_tsne[y==1,0], X_tsne[y==1,1], c="red", label="Lytic (1)", alpha=0.6)
plt.legend()
plt.title("Projection t-SNE des séquences phage (Evo embeddings)")
plt.xlabel("t-SNE 1")
plt.ylabel("t-SNE 2")
plt.grid(True)
plt.show()


### Classification des phages : modèle de régression logistique

On entraîne un classifieur supervisé (`LogisticRegression`) pour prédire le style de vie du phage.
