In [None]:
import shutil
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import slim_conservation_scoring.pipeline.group_conservation_objects as group_tools
from pathlib import Path
import slim_conservation_scoring.seqtools.general_utils as tools

# Annotated benchmark table; only the columns listed below are carried forward.
table_file = "../../../benchmark/benchmark_v4/MSA_comparison/benchmark_table_MSA_comparison_ANNOTATED.csv"
KEEP_COLUMNS = [
    "reference_index",
    "Organism",
    "Primary_Acc",
    "Accessions",
    "UniprotID",
    "regex",
    "hit_sequence",
    "gene_id",
    "hit start position",
    "hit end position",
    "verified interaction",
    "ELM_motif_class",
    "name",
    "critical_error",
    "json_file",
]
table = pd.read_csv(table_file)[KEEP_COLUMNS]

# Restrict to rows of the "enah_LPPPP_FPPPP" motif class whose
# "verified interaction" flag is set. The .copy() makes enah_df an
# independent frame so columns can be added to it later without
# touching `table`.
is_enah_class = table["ELM_motif_class"] == "enah_LPPPP_FPPPP"
is_verified = table["verified interaction"]
enah_df = table.loc[is_enah_class & is_verified].copy()

# Destination root for the copied alignment files (created if missing).
OUTPUT_DIR = Path("./ena_motif_alignments")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# %%
# ==============================================================================
# // copy over alignments and slices
# ==============================================================================

# Phylogenetic levels whose alignments are collected for each hit.
LEVELS = ["Metazoa", "Vertebrata"]

for row_idx, row in enah_df.iterrows():
    # One folder per benchmark row: <reference_index>_<name>_<gene_id>
    folder = OUTPUT_DIR / f"{row['reference_index']}_{row['name']}_{row['gene_id']}"
    folder.mkdir(parents=True, exist_ok=True)

    # Load the conservation object described by this row's json file.
    og = group_tools.ConserGene(Path(row["json_file"]))

    for level in LEVELS:
        # Only levels that passed the pipeline's filters are copied.
        if level in og.levels_passing_filters:
            level_obj = og.get_level_obj(level)

            # Full alignment for this level.
            alignment_src = Path(level_obj.alignment_file)
            shutil.copy(alignment_src, folder / alignment_src.name)

            # Alignment slice file recorded in the level's info dict.
            slice_src = level_obj.info_dict["aln_slice_file"]
            shutil.copy(slice_src, folder / Path(slice_src).name)

    # Record where this row's files were placed.
    enah_df.loc[row_idx, "folder"] = str(folder)

# 2333

I will use reference index 2333 (AB1IP_HUMAN, also known as RIAM) as an example where the alignment is particularly difficult.

In [2]:
import slim_conservation_scoring.seqtools.general_utils as tools
import pandas as pd
import numpy as np
import slim_conservation_scoring.pipeline.group_conservation_objects as group_tools
import json
import copy
import os
import re
import sys
from pathlib import Path
from Bio import AlignIO, Seq, SeqIO, Align

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Build a subsampled full-length alignment for one benchmark hit:
#   1. cut a window around the hit out of the level's alignment,
#   2. sort the windowed sequences with tools.sort_aln_by_pid2ref against the query,
#   3. subsample them with tools.subsample_seqrec_list_target_num,
#   4. write the *full-length* aligned records of the selected ids to FASTA.
REFERENCE_INDEX = 2333  # benchmark row used as the worked example
LEVEL = "Vertebrata"  # level whose alignment is subsampled
FLANK = 30  # alignment columns kept on each side of the hit
TARGET_NUM_SEQS = 20  # target_num passed to the subsampler

# Path of the copied alignment for this hit (not used further in this cell).
aln_file = "./ena_motif_alignments/2333_AB1IP_HUMAN_9606_0:00294e/9606_0_00294e_Vertebrata_1587055at7742_clustered_ldos_aln.fasta"

# NOTE(review): ./enah_table.csv is not written by any cell visible here —
# confirm where it is produced so Restart & Run All works.
table = pd.read_csv("./enah_table.csv")

# Single .loc call (row mask + column) instead of chained indexing.
json_files = table.loc[table["reference_index"] == REFERENCE_INDEX, "json_file"]
if json_files.empty:
    raise IndexError(
        f"reference_index {REFERENCE_INDEX} not found in ./enah_table.csv"
    )
og = group_tools.ConserGene(json_files.iloc[0])
lvl = og.get_level_obj(LEVEL)

# Clamp the window start at 0: a negative slice start would be interpreted
# relative to the END of the alignment and silently yield a wrong (often
# empty) window when the hit lies within FLANK columns of the alignment start.
# An end past the last column is already truncated by normal slicing.
window = slice(max(0, lvl.hit_aln_start - FLANK), lvl.hit_aln_end + FLANK)
x = lvl.aln[:, window]
query_aln = lvl.query_aln_sequence[window]  # same window as the alignment slice

x = tools.sort_aln_by_pid2ref(x, query_aln)
subsampled_list = tools.subsample_seqrec_list_target_num(
    x, target_num=TARGET_NUM_SEQS
)
subsampled_id_list = [rec.id for rec in subsampled_list]

# Map the ids chosen on the windowed alignment back to the full-length records.
aln_dict = {rec.id: rec for rec in lvl.aln}
subsampled_fl_aln = [aln_dict[seq_id] for seq_id in subsampled_id_list]

with open(f"./{REFERENCE_INDEX}_subsampled_aln_{LEVEL}.fasta", "w") as f:
    SeqIO.write(subsampled_fl_aln, f, "fasta")