# Clustering
## Input
We will cluster the sequences using MMseqs2. First we need to combine the sequences we downloaded into one file.

In [26]:
import os
from Bio import SeqIO
from collections import defaultdict

In [35]:
# --- inputs ---
data_dir = "../data"  # directory scanned for the per-species .faa files
seq_file = "all_proteins_combined.fasta"  # merged FASTA that is fed to MMseqs2
cluster_file = "mmseqs_output_cluster.tsv"  # two-column TSV: representative, member

# --- outputs ---
path_raw_gene_families = "../raw_gene_families"  # one FASTA per cluster, paralogs kept
path_clean_families = "../filtered_gene_families"  # one FASTA per cluster, one sequence per species

In [None]:
# Combine all per-species FASTA files into one input file for MMseqs2.
# Each header is rewritten to ">{species}|{original_id}" so the species can be
# recovered from the sequence ID later on.

print(f"Creating {seq_file}...")

with open(seq_file, "w") as outfile:
    count = 0  # number of genome files merged
    # sorted() makes the merge order reproducible; os.listdir() order is arbitrary
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".faa"):
            continue

        # use filename as species label (like Bifidobacterium_longum);
        # strip only the trailing extension, not every ".faa" substring
        species_tag = filename[: -len(".faa")]

        with open(os.path.join(data_dir, filename), "r") as infile:
            ends_with_newline = True
            for line in infile:
                if line.startswith(">"):
                    # keep only the ID token, drop the free-text description
                    original_id = line.split()[0].lstrip(">")
                    outfile.write(f">{species_tag}|{original_id}\n")
                    ends_with_newline = True
                else:
                    outfile.write(line)
                    ends_with_newline = line.endswith("\n")

            # a file without a final newline would otherwise glue the next
            # species' header onto its last sequence line
            if not ends_with_newline:
                outfile.write("\n")
        count += 1

print(f"Merged {count} genomes into {seq_file}")

Creating all_proteins_combined.fasta...
Merged 55 genomes into all_proteins_combined.fasta


## Running clustering script
Now we run MMseqs2 on all_proteins_combined.fasta using *run_mmseqs.py*. I decided to do this via a separate script so that I can run it on the Entropy compute cluster and make it faster.

On the cluster I simply open a bash session

*srun --pty --cpus-per-task=4 --time=100 bash*

and run the script

*time python3 run_mmseqs.py*

which took only 1 min 11 s, thanks to utilizing 40 CPU cores.



## Output
We obtain 3 files:
* mmseqs_output_cluster.tsv - this file says which sequence belongs to which cluster. Contains two columns: cluster name (a cluster is named after one of its representatives) and name of the sequence belonging to it. 
* mmseqs_output_rep_seq.fasta - the representative sequences that the clusters are named after
* mmseqs_output_all_seqs.fasta - all the sequences reorganized in accordance with the clustering.


### Gene families

In [None]:

# Index every sequence of the combined FASTA by its ID
seq_dict = SeqIO.to_dict(SeqIO.parse(seq_file, "fasta"))

# Group member IDs under their cluster representative
# (each TSV row is "<representative>\t<member>")
clusters = defaultdict(list)
with open(cluster_file) as handle:
    for row in handle:
        representative, sequence_id = row.strip().split('\t')
        clusters[representative].append(sequence_id)

n_clusters = len(clusters)
print(f"{n_clusters} clusters in total")


28743 clusters in total


In [32]:
# Write each cluster to an individual FASTA file, keeping the original
# "{species}|{id}" sequence IDs.

# the output directory may not exist yet; writing into it would fail otherwise
os.makedirs(path_raw_gene_families, exist_ok=True)

for i, (rep, members) in enumerate(clusters.items()):
    # progress report every 1000 clusters
    if i % 1000 == 0:
        print(f"{i}/{n_clusters}")

    # members that are absent from the combined FASTA are skipped
    records = [seq_dict[m] for m in members if m in seq_dict]

    # replace special characters for valid file name
    safe_name = rep.replace("|", "_").replace("/", "_")
    output_path = os.path.join(path_raw_gene_families, f"cluster_{safe_name}.fasta")

    SeqIO.write(records, output_path, "fasta")

0/28743
1000/28743
2000/28743
3000/28743
4000/28743
5000/28743
6000/28743
7000/28743
8000/28743
9000/28743
10000/28743
11000/28743
12000/28743
13000/28743
14000/28743
15000/28743
16000/28743
17000/28743
18000/28743
19000/28743
20000/28743
21000/28743
22000/28743
23000/28743
24000/28743
25000/28743
26000/28743
27000/28743
28000/28743


In [37]:
# Filter paralogs and rename the sequences so that their IDs contain only the
# species name. For every cluster two files are written:
#   * path_clean_families:    the longest sequence of each species
#   * path_raw_gene_families: every member, paralogs included (an existing
#                             file with the same name is overwritten)


def _renamed_copy(record, species):
    """Return a copy of *record* whose ID is *species* and whose description is empty."""
    renamed = record[:]  # full slice -> new record, the one in seq_dict stays untouched
    renamed.id = species
    renamed.description = ""  # cleared so the header carries only the species name
    return renamed


# make sure both output directories exist before writing into them
os.makedirs(path_clean_families, exist_ok=True)
os.makedirs(path_raw_gene_families, exist_ok=True)

for i, (rep, members) in enumerate(clusters.items()):
    # progress report every 1000 clusters
    if i % 1000 == 0:
        print(f"{i}/{n_clusters}")

    species_best = {}  # to save longest sequences for a given species
    all_records_renamed = []

    for member in members:
        record = seq_dict.get(member)
        if record is None:
            # member is absent from the combined FASTA
            continue

        # extract species from current long IDs
        # (Bifidobacterium_actinocoloniiforme|WP_003817716.1 -> Bifidobacterium_actinocoloniiforme)
        species = member.split('|')[0]

        # only keep if strictly longer; on a tie the first sequence seen wins
        if species not in species_best or len(record.seq) > len(species_best[species].seq):
            species_best[species] = record

        # renaming sequences with paralogs as well
        all_records_renamed.append(_renamed_copy(record, species))

    # rename species to clean names
    clean_records = [
        _renamed_copy(record, species) for species, record in species_best.items()
    ]

    # replace special characters for valid file name
    safe_name = rep.replace("|", "_").replace("/", "_")

    output_path = os.path.join(path_clean_families, f"cluster_{safe_name}.fasta")
    SeqIO.write(clean_records, output_path, "fasta")

    output_path_raw = os.path.join(path_raw_gene_families, f"cluster_{safe_name}.fasta")
    SeqIO.write(all_records_renamed, output_path_raw, "fasta")

0/28743
1000/28743
2000/28743
3000/28743
4000/28743
5000/28743
6000/28743
7000/28743
8000/28743
9000/28743
10000/28743
11000/28743
12000/28743
13000/28743
14000/28743
15000/28743
16000/28743
17000/28743
18000/28743
19000/28743
20000/28743
21000/28743
22000/28743
23000/28743
24000/28743
25000/28743
26000/28743
27000/28743
28000/28743
