labgem · jpjarnoux · Jun 12, 2024 · Jun 10, 2024 · Jun 11, 2024 · Jun 11, 2024
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -120,7 +120,7 @@ jobs:
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS
+        ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang  --cpu $NUM_CPUS 
         ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS
         ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS
         ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS

diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md
@@ -54,6 +54,11 @@ You can do this from the command line:
 
 An example of what clusters.tsv should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv)
 
+```{note}
+When you provide your clustering, *PPanGGOLiN* will translate the representative gene sequence of each family and write it in the HDF5 file.
+```
+
+
 
 ### Defragmentation
 

diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
@@ -19,7 +19,7 @@
 from ppanggolin.pangenome import Pangenome
 from ppanggolin.genome import Gene
 from ppanggolin.geneFamily import GeneFamily
-from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir, mk_outdir
+from ppanggolin.utils import read_compressed_or_not, restricted_float, run_subprocess, create_tmpdir
 from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome
 from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file
 from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations, translate_genes, create_mmseqs_db
@@ -61,7 +61,8 @@ def check_pangenome_for_clustering(pangenome: Pangenome, sequences: Path, force:
     elif pangenome.status["geneSequences"] == "inFile":
         logging.getLogger("PPanGGOLiN").debug("Write sequences from pangenome file")
         write_gene_sequences_from_pangenome_file(pangenome.file, sequences, add="ppanggolin_",
-                                                 compress=False, disable_bar=disable_bar)  # write CDS sequences to the tmpFile
+                                                 compress=False,
+                                                 disable_bar=disable_bar)  # write CDS sequences to the tmpFile
     else:
         raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster "
                         "the genes in gene families. Either provide clustering results (see --clusters), "
@@ -286,7 +287,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
     date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime())
     dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}'
     with create_tmpdir(tmpdir, basename=dir_name, keep_tmp=keep_tmp_files) as tmp_path:
-        sequence_path = tmp_path/'nucleotide_sequences.fna'
+        sequence_path = tmp_path / 'nucleotide_sequences.fna'
         check_pangenome_for_clustering(pangenome, sequence_path, force, disable_bar=disable_bar)
         logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...")
         rep, tsv = first_clustering(sequence_path, tmp_path, cpu, code, coverage, identity, mode)
@@ -356,20 +357,48 @@ def infer_singletons(pangenome: Pangenome):
     logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families")
 
 
-def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False,
-                    disable_bar: bool = False):
+def get_family_representative_sequences(pangenome: Pangenome, code: int = 11, cpu: int = 1,
+                                        tmpdir: Path = None, keep_tmp: bool = False):
+    tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir
+    with create_tmpdir(tmpdir, "get_proteins_sequences", keep_tmp) as tmp:
+        repres_path = tmp / "representative.fna"
+        with open(repres_path, "w") as repres_seq:
+            for family in pangenome.gene_families:
+                repres_seq.write(f">{family.name}\n")
+                repres_seq.write(f"{family.representative.dna}\n")
+        translate_db = translate_genes(sequences=repres_path, tmpdir=tmp, cpu=cpu,
+                                       is_single_line_fasta=True, code=code)
+        outpath = tmp / "representative_protein_genes.fna"
+        cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
+        run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n")
+        with open(outpath, "r") as repres_prot:
+            lines = repres_prot.readlines()
+            while len(lines) > 0:
+                family_name = lines.pop(0).strip()[1:]
+                family_seq = lines.pop(0).strip()
+                family = pangenome.get_gene_family(family_name)
+                family.add_sequence(family_seq)
+
+
+def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False,
+                    code: int = 11, cpu: int = 1, tmpdir: Path = None, keep_tmp: bool = False,
+                    force: bool = False, disable_bar: bool = False):
     """
     Get the pangenome information, the gene families and the genes with an associated gene family.
     Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.
 
     :param pangenome: Input Pangenome
     :param families_tsv_file: MMseqs2 clustering results
     :param infer_singleton: creates a new family for each gene with no associated family
+    :param code: Genetic code used for sequence translation.
+    :param cpu: Number of CPU cores to use for clustering.
+    :param tmpdir: Path to a temporary directory for intermediate files.
+    :param keep_tmp: Keep temporary files (useful for debugging).
     :param force: force to write in the pangenome
     :param disable_bar: Allow to disable progress bar
     """
     check_pangenome_former_clustering(pangenome, force)
-    check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar)
+    check_pangenome_info(pangenome, need_annotations=True, need_gene_sequences=True, disable_bar=disable_bar)
 
     logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...")
     filesize = os.stat(families_tsv_file).st_size
@@ -421,10 +450,12 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
                     f"You can either update your cluster file to ensure each gene has a cluster assignment, "
                     f"or use the '--infer_singletons' option to automatically infer a cluster for each non-clustered gene."
                 )
+    get_family_representative_sequences(pangenome, code, cpu, tmpdir, keep_tmp)
 
     pangenome.status["genesClustered"] = "Computed"
     if frag:  # if there was fragment information in the file.
         pangenome.status["defragmented"] = "Computed"
+    pangenome.status["geneFamilySequences"] = "Computed"
     pangenome.parameters["cluster"] = {}
     pangenome.parameters["cluster"]["# read_clustering_from_file"] = True
     pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton
@@ -450,7 +481,8 @@ def launch(args: argparse.Namespace):
         if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table,
                     args.coverage, args.identity, args.mode]:
             logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.")
-        read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar)
+        read_clustering(pangenome, args.clusters, args.infer_singletons, args.translation_table,
+                        args.cpu, args.tmpdir, args.keep_tmp, args.force, disable_bar=args.disable_prog_bar)
         logging.getLogger("PPanGGOLiN").info("Done reading the cluster file")
     write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar)
 
@@ -488,10 +520,6 @@ def parser_clust(parser: argparse.ArgumentParser):
     clust.add_argument('--no_defrag', required=False, default=False, action="store_true",
                        help="DO NOT Use the defragmentation strategy to link potential fragments "
                             "with their original gene family.")
-    clust.add_argument("--translation_table", required=False, default="11",
-                       help="Translation table (genetic code) to use.")
-
-    clust.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
 
     read = parser.add_argument_group(title="Read clustering arguments")
     read.add_argument('--clusters', required=False, type=Path,
@@ -501,7 +529,10 @@ def parser_clust(parser: argparse.ArgumentParser):
                       help="When reading a clustering result with --clusters, if a gene is not in the provided file"
                            " it will be placed in a cluster where the gene is the only member.")
     optional = parser.add_argument_group(title="Optional arguments")
-    optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
+    optional.add_argument("--translation_table", required=False, default="11",
+                          help="Translation table (genetic code) to use.")
+    optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
+    optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
                           help="directory for storing temporary files")
     optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
                           help="Keeping temporary files (useful for debugging).")

diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py
@@ -188,6 +188,13 @@ def remove(self, identifier):
         del self[identifier]
 
 
+    @property
+    def representative(self) -> Gene:
+        """Get the representative gene of the family
+        :return: The representative gene of the family
+        """
+        return self.get(self.name)
+
     def contains_gene_id(self, identifier):
         """
         Check if the family contains already a gene id

diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py
@@ -591,7 +591,7 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False)
             raise TypeError(f"Expected type int for 'begin' and 'end', "
                             f"but received types '{type(begin)}' and '{type(end)}'.")
 
-        if begin >= end:
+        if begin > end:
             raise ValueError("The 'begin' position must be less than the 'end' position.")
 
         if end > self._genes_position[-1].position:
@@ -603,7 +603,10 @@ def get_genes(self, begin: int = 0, end: int = None, outrange_ok: bool = False)
         if end == self._genes_position[-1].position:
             return self._genes_position[begin:]
         else:
-            return self._genes_position[begin: end]
+            if begin == end:
+                return self._genes_position[begin]
+            else:
+                return self._genes_position[begin: end]
 
     @property
     def number_of_genes(self) -> int:

diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py
@@ -63,8 +63,9 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True,
 
         if args.clusters is not None:
             start_clust = time.time()
-            read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar,
-                            infer_singleton=args.cluster.infer_singletons)
+            read_clustering(pangenome, args.clusters, infer_singleton=args.cluster.infer_singletons,
+                            code=args.cluster.translation_table, cpu=args.cluster.cpu, tmpdir=args.tmpdir,
+                            keep_tmp=args.cluster.keep_tmp, force=args.force, disable_bar=args.disable_prog_bar)
         else:  # args.cluster is None
             if pangenome.status["geneSequences"] == "No":
                 if args.fasta is None:
@@ -78,7 +79,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True,
                        disable_bar=args.disable_prog_bar,
                        defrag=not args.cluster.no_defrag, code=args.cluster.translation_table,
                        coverage=args.cluster.coverage, identity=args.cluster.identity, mode=args.cluster.mode,
-                       keep_tmp_files=True)
+                       keep_tmp_files=args.cluster.keep_tmp)
         clust_time = time.time() - start_clust
 
     elif args.fasta is not None: