In [1]:
import pandas as pd
import plotly.express as px

In [2]:
# Input files, given as relative paths.
# CSV with the full member table (columns: phrog, sequence, product, function).
fp_all_members = "all_members_7k_function_500_hyp.csv"
# CSV with cluster-representative metadata, including an 'identifier' column.
fp_clusterreps_with_ids = "foldseek_clusterreps_7k_function_500_hyp_with_ids.csv"
# CSV with the cluster representatives, including their 'sequence' column.
fp_clusterreps = "foldseek_clusterreps_7k_function_500_hyp.csv"
# FASTA with the cluster-representative sequences; parsed with read_fasta.
fp_clusterreps_fasta = "foldseek_clusterreps_7k_function_500_hyp.fasta"

In [3]:
def read_fasta(filepath):
    """
    Reads a FASTA file and returns a dictionary of sequences with their headers.

    A line starting with '>' opens a new record. Every following line, up to
    the next '>' line or the end of the file, is stripped of surrounding
    whitespace and concatenated to form that record's sequence. Lines that
    appear before the first header belong to no record and are ignored. If a
    header occurs more than once, the last occurrence overwrites earlier ones.

    Args:
        filepath (str): Path to the FASTA file.

    Returns:
        dict: A dictionary where keys are headers (without '>') and values are sequences.
    """
    fasta_dict = {}
    header = None  # None means "no record opened yet"
    sequence_parts = []
    with open(filepath, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith(">"):
                # Test against None rather than truthiness: a bare ">" line
                # yields an empty-string header, which is still a record and
                # must not be dropped together with its sequence.
                if header is not None:
                    fasta_dict[header] = ''.join(sequence_parts)
                header = line[1:]  # Remove '>'
                sequence_parts = []  # Start collecting the new record
            elif header is not None:
                sequence_parts.append(line)
        if header is not None:  # Flush the final record
            fasta_dict[header] = ''.join(sequence_parts)
    return fasta_dict

In [4]:
# Load both input tables in one pass: the full member table first, then the
# cluster-representative metadata (order matches the unpacking targets).
df_all_members, df_clusterreps_with_ids = (
    pd.read_csv(csv_path)
    for csv_path in (fp_all_members, fp_clusterreps_with_ids)
)

In [5]:
# Dimensions (rows, columns) of the full member table.
df_all_members.shape

(7500, 4)

In [6]:
# Preview the first rows of the full member table.
df_all_members.head()

Unnamed: 0,phrog,sequence,product,function
0,267,MTASLTEGHLAYRESIGAATSGSSFESGVTAGKLWLPIWSGEVVNA...,major head protein,head and packaging
1,138,MYLSESQVKKWDPILEHPDLPKIDDNYKKQVTAVLLENQEKALQEE...,major head protein,head and packaging
2,11233,MEYSFNIDYAQEFGVSEAIMIKNFQFWIKKNSANKDSNHDGRTWTF...,replication initiation protein,"DNA, RNA and nucleotide metabolism"
3,2816,MNCWHCQTELIYNGDQEMADDSRYSTMTNLSCPKCFCEVEVYLPRDAYD,protease,other
4,2662,MNPISTIRSWFTREKGMSASTIAWLRGDDLSDDGMSLSLASAQEQS...,portal protein,head and packaging


In [7]:
# Dimensions (rows, columns) of the cluster-representative metadata table.
df_clusterreps_with_ids.shape

(7500, 4)

In [8]:
# Preview the first rows of the cluster-representative metadata table.
df_clusterreps_with_ids.head()

Unnamed: 0,identifier,phrog,product,function
0,phage_0001,118,head-tail adaptor,connector
1,phage_0002,1076,FmdB-like transcriptional regulator,transcription regulation
2,phage_0003,952,Rz-like spanin,lysis
3,phage_0004,4611,minor head protein,head and packaging
4,phage_0005,6050,HNH endonuclease,"DNA, RNA and nucleotide metabolism"


In [9]:
# Number of distinct values in the 'phrog' column.
df_clusterreps_with_ids['phrog'].nunique()

2504

In [10]:
# Row count per 'phrog' value, most frequent first.
df_clusterreps_with_ids['phrog'].value_counts()

phrog
18345    67
10089    52
5668     51
6651     45
1        39
         ..
19653     1
14242     1
18020     1
31712     1
8354      1
Name: count, Length: 2504, dtype: int64

In [11]:
# Number of distinct 'product' annotations.
df_clusterreps_with_ids['product'].nunique()

457

In [12]:
# Row count per 'product' annotation, most frequent first.
df_clusterreps_with_ids['product'].value_counts()

product
tail fiber protein                         554
tail length tape measure protein           519
virion structural protein                  411
tail protein                               329
endolysin                                  178
                                          ... 
Ku-like DNA end binding                      1
nicotinamide phosphoribosyl transferase      1
CsrA-like regulator                          1
Srd anti-sigma factor                        1
phosphoesterase                              1
Name: count, Length: 457, dtype: int64

In [13]:
# Row count per 'function' category, most frequent first.
df_clusterreps_with_ids['function'].value_counts()

function
tail                                                 2081
head and packaging                                   1403
DNA, RNA and nucleotide metabolism                   1275
other                                                 641
transcription regulation                              500
unknown function                                      500
moron, auxiliary metabolic gene and host takeover     385
lysis                                                 378
connector                                             196
integration and excision                              141
Name: count, dtype: int64

In [15]:
# Prefix every phrog value with "p-", turning the numeric IDs into string labels.
# The guard keeps the cell idempotent: values that already carry the prefix
# (because this cell was run before in the same kernel) are left unchanged
# instead of becoming "p-p-<id>".
df_clusterreps_with_ids["phrog"] = df_clusterreps_with_ids["phrog"].apply(
    lambda phrog_id: phrog_id if str(phrog_id).startswith("p-") else f"p-{phrog_id}"
)

In [16]:
# Inspect the metadata table after the 'phrog' values received the "p-" prefix.
df_clusterreps_with_ids

Unnamed: 0,identifier,phrog,product,function
0,phage_0001,p-118,head-tail adaptor,connector
1,phage_0002,p-1076,FmdB-like transcriptional regulator,transcription regulation
2,phage_0003,p-952,Rz-like spanin,lysis
3,phage_0004,p-4611,minor head protein,head and packaging
4,phage_0005,p-6050,HNH endonuclease,"DNA, RNA and nucleotide metabolism"
...,...,...,...,...
7495,phage_7496,p-6958,virion structural protein,head and packaging
7496,phage_7497,p-445,terminase small subunit,head and packaging
7497,phage_7498,p-18508,kinase,other
7498,phage_7499,p-309,terminase small subunit,head and packaging


In [19]:
# Write the metadata table to CSV; index=False leaves the row index out of the file.
df_clusterreps_with_ids.to_csv("phages_metadata.csv", index=False)

In [23]:
# Load the cluster-representative CSV that includes the 'sequence' column.
df_fp_clusterreps = pd.read_csv(fp_clusterreps)

In [24]:
# Preview the first rows of the cluster-representative table.
df_fp_clusterreps.head()

Unnamed: 0,phrog,sequence,product,function
0,118,VTKDDIWKTLLMVRQAYQDSLDGKSISFTGVNGRAITNHDPKALRD...,head-tail adaptor,connector
1,1076,MPLYSFTCEGCQRTTDVPLRLKEMDRPTQHPRCPTCATYSYMQRVV...,FmdB-like transcriptional regulator,transcription regulation
2,952,MPGSDPETNGDLSADIRQLENALARCASQVKMIKHCQDENDAQTRQ...,Rz-like spanin,lysis
3,4611,MAQIKPEGIVQSDPQVKLIEAIVKQAYLDIFQHIQAGKDSQSVKVK...,minor head protein,head and packaging
4,6050,VTRLVIDWMDYITNIIHYHNFNFLNNIFNQLNIKYIIIIIIAKNII...,HNH endonuclease,"DNA, RNA and nucleotide metabolism"


In [15]:
# Parse the cluster-representative FASTA into a {header: sequence} dict.
sequences = read_fasta(fp_clusterreps_fasta)

In [16]:
# Number of distinct headers parsed from the FASTA file.
len(sequences)

7500

In [17]:
# Preview only the first five records. Echoing the whole dict would write
# every parsed sequence into the notebook output and bloat the saved file.
dict(list(sequences.items())[:5])

{'phage_0001': 'VTKDDIWKTLLMVRQAYQDSLDGKSISFTGVNGRAITNHDPKALRDELEYWERRWRAVNSRGGSYKLANFL',
 'phage_0002': 'MPLYSFTCEGCQRTTDVPLRLKEMDRPTQHPRCPTCATYSYMQRVVTSASLSFKGQGWTPKHY',
 'phage_0003': 'MPGSDPETNGDLSADIRQLENALARCASQVKMIKHCQDENDAQTRQPAQSAD',
 'phage_0004': 'MAQIKPEGIVQSDPQVKLIEAIVKQAYLDIFQHIQAGKDSQSVKVKIKGLEKIVAEYDLDLQAWADVTVPGLYKEGMDNAIKQAIKDNIVYTFEDKFATFHQQTIQLIVQNAYKYTQKIADGLEEAGTSAITAEQAEKVAIQVARGEIAGSDLKTIAKNVENELRASSLSAITYKNGRNVSVDGYARTLARSILTEAQVTGIQNTHIEEGYDLVQVSDHFGECAICRPWENEVLSLTGRTRGYTTLDKAKEAGLFHSNCRHSISPYFEGLASVSKFWDVESQSYISKADPLVEQAKMMKLEGKTFADFMTVNNIKNYGNINIINVNTVKEEFFIPRFQQGGFLTQKSASGISYFAQDRINALNQALKIFAKVGKANVKDFKGGYSLDKIIDNPLLFEKYPRLKEQNVVFADFHTDKKYGLYYDNTIFINSKLYEKNPIKLTSTIVHEVQHVKQDINKMVSISEEIEKGKIKKAWDSQREIDARRQQQLFVESQESKEKLKKVWDSVQDISKENILKAYEQFTTKIGIQQYNNINEAIKTGDKAKLRRIIAKENNPEIKNSLQRMLKYL',
 'phage_0005': 'VTRLVIDWMDYITNIIHYHNFNFLNNIFNQLNIKYIIIIIIAKNIINFVYNMVFLFLLDLTCSILCKCVFKIGSWVVYKSYDGIYYLYKRVCNNTINNHIENKKDELDLSPYVIITEEEYDNLKNNKKCKIHKRNVSSLTKKMVASNQEWKCGS