# plan

- Get hit scores from the jsons.
  - need to get the sequences (from alignment file or matrix files) from there as well to build the logo.
- apply the position weighting to the scores.


steps:
- construct table with score lists


- import table
- add flanking sequence
- add score lists (by score_key/level)
- use the position weights to get weighted-average scores from the score lists
- get difference between aln and pairwise
- apply a few filters to get likely interesting hits
- plot logos with 5 flanking residues
  - aln w/ gaps | aln w/o gaps | pairwise
-





In [1]:
#
import json
import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path

import local_conservation_analysis_pipeline.group_conservation_objects as group_tools
import local_conservation_scores.tools.pairwise_tools as pairwise_tools
import local_conservation_scores.tools.score_plots as score_plots
import local_seqtools.general_utils as tools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import Align, AlignIO, Seq, SeqIO
from local_seqtools import pssms


plt.style.use("custom_standard")
# plt.style.use('custom_small')
import seaborn as sns

# pd.options.plotting.backend = "plotly"

%load_ext autoreload
%autoreload 2

In [2]:
def convert_jsonfile_to_relative(json_file):
    """Rewrite an absolute project path so it is relative to this notebook.

    The hard-coded prefix ``/home/jch/Documents/07-pt1_pt2_combined/`` is
    replaced with ``../../``; paths without that prefix are returned unchanged.

    Args:
        json_file: Path as a ``str`` or any ``os.PathLike`` (e.g. ``pathlib.Path``).

    Returns:
        str: The rewritten path.

    Raises:
        TypeError: If ``json_file`` is neither ``str`` nor path-like.
    """
    # PairwiseScoreResults.matrix_file is typed `str | Path` and is passed here;
    # Path.replace() is a file rename, not a substring replace, so normalise to
    # a plain string first.
    path_text = os.fspath(json_file)
    return path_text.replace("/home/jch/Documents/07-pt1_pt2_combined/", "../../")

In [3]:
# Per-position weights for each motif class, keyed by the value found in the
# "ELM_motif_class" column. Each array is later passed as `weights=` to
# np.average over that hit's per-position z-score list, so a 0 excludes a
# position from the average. The trailing comment on each entry is the motif regex.
# NOTE(review): the first position of "enah_LPPPP_FPPPP" carries weight 2 while
# every other non-zero weight is 1 — confirm this emphasis is intentional.
position_weights = {
    "DOC_WW_Pin1_4": np.array([0, 0, 0, 1, 1, 0]),  # ...([ST])P.
    "LIG_AP2alpha_2": np.array([1, 1, 1]),  # DP[FW]
    "LIG_EH_1": np.array([0, 1, 1, 1, 0]),  # .NPF.
    "LIG_SH2_GRB2like": np.array([1, 1, 1, 0]),  # (Y)([EDST]|[MLIVAFYHQW])N.
    "LIG_SH3_CIN85_PxpxPR_1": np.array([1, 0, 1, 0, 1, 1]),  # P.[AP].PR
    "enah_LPPPP_FPPPP": np.array([2, 1, 0, 1, 1]),  # [FWYL]P.[AFILTVYWP]P
    "TRAF6": np.array([0, 0, 0, 1, 0, 1, 0, 0, 1]),  # ...P.E..[FYWDE]
}

In [4]:
# Sanity check: zero-weighted entries drop out, so this is the mean of the last two values.
np.average([3, 4, 2, 2], weights=[0, 0, 1, 1])
# np.average([3,4,2,2], weights=[1,1,1,1])

2.0

In [15]:
# Load the annotated benchmark table, keep only the columns used below, drop
# rows flagged with a critical error and make the json paths notebook-relative.
table_file = (
    "../../benchmark/benchmark_v3/p3_conservation/benchmark_table_renamed_ANNOTATED.csv"
)
df = pd.read_csv(table_file)
# this motif has a variable length regex and so it's more difficult to apply any position weighting
df = df[df["ELM_motif_class"] != "LIG_14-3-3_CanoR_1"]
df = df[
    [
        "reference_index",
        "ELM_motif_class",
        "Organism",
        "UniprotID",
        "regex",
        "hit_sequence",
        "gene_id",
        "hit start position",
        "hit end position",
        "verified interaction",
        "name",
        "json_file",
        "critical_error",
    ]
]
# Take an explicit copy: columns are assigned on this filtered frame (here and
# in later cells), which otherwise triggers pandas' SettingWithCopyWarning.
df = df[df["critical_error"].isna()].copy()
df["json_file"] = df["json_file"].apply(convert_jsonfile_to_relative)

## add flanking sequence

In [16]:
# Attach each hit's full sequence and a flanked version of the hit to the table.
import local_env_variables.env_variables as env

data_all_seqrecords_dict = env.load_data_all_odb_seqs()
# Look up the sequence record by gene_id and store its sequence as a string;
# rows whose gene_id is not in the dict get the sentinel False.
# NOTE(review): that False is passed straight to tools.pad_with_aas_or_gaps
# below — confirm the helper tolerates it, or filter those rows out first.
df["odb_seq"] = df["gene_id"].apply(
    lambda x: (
        str(data_all_seqrecords_dict[x].seq)
        if x in data_all_seqrecords_dict.keys()
        else False
    )
)
# The end position is passed as "hit end position" + 1 — presumably converting
# an inclusive end to an exclusive one; verify against the helper's contract.
df["flanked_hit"] = df.apply(
    lambda x: tools.pad_with_aas_or_gaps(
        x["odb_seq"], x["hit start position"], x["hit end position"] + 1, flank=5
    ),
    axis=1,
    result_type="expand",
)

In [7]:
# Exploratory: for the row labelled 0, print every (level, score key) pair
# stored in its conservation json.
temp = df.loc[0]
# NOTE(review): other cells pass filepath_converter to the ConserGene
# constructor; here it is only given to load_levels — confirm both work.
og = group_tools.ConserGene(temp["json_file"])
og.load_levels(filepath_converter=convert_jsonfile_to_relative)
# lvlaln=og.get_aln_score_obj('Metazoa', 'aln_shannon_entropy')
# for aa, s in zip(lvlaln.hit_aln_sequence, lvlaln.hit_aln_scores):
#     print(aa, s)
for level, lvlo in og.level_objects.items():
    for scorekey in lvlo.conservation_scores:
        print(level, scorekey)

Eukaryota aln_property_entropy
Eukaryota aln_shannon_entropy
Eukaryota fragpair_gapless_lf5_rf5_edssmat50
Eukaryota fragpair_gapless_lf0_rf0_edssmat50
Eukaryota frag_pairwise_gapless_embedding_lf5_rf5
Eukaryota frag_pairwise_gapless_embedding_lf0_rf0
Metazoa aln_property_entropy
Metazoa aln_shannon_entropy
Metazoa fragpair_gapless_lf5_rf5_edssmat50
Metazoa fragpair_gapless_lf0_rf0_edssmat50
Metazoa frag_pairwise_gapless_embedding_lf5_rf5
Metazoa frag_pairwise_gapless_embedding_lf0_rf0
Vertebrata aln_property_entropy
Vertebrata aln_shannon_entropy
Vertebrata fragpair_gapless_lf5_rf5_edssmat50
Vertebrata fragpair_gapless_lf5_rf5_blosum62
Vertebrata fragpair_gapless_lf2_rf2_edssmat50
Vertebrata fragpair_gapless_lf10_rf10_edssmat50
Vertebrata fragpair_gapless_lf0_rf0_edssmat50
Vertebrata frag_pairwise_gapless_embedding_lf5_rf5
Vertebrata frag_pairwise_gapless_embedding_lf0_rf0
Tetrapoda aln_property_entropy
Tetrapoda aln_shannon_entropy
Tetrapoda fragpair_gapless_lf5_rf5_edssmat50
Tetrapod

## add score lists for a pairwise score and alignment score at a specific phylogenetic level
- level - Metazoa
- score_keys
  - aln_property_entropy
  - fragpair_gapless_lf5_rf5_edssmat50

In [8]:
def json_2_z_score_list(json_file, level, scorekey):
    """Fetch the hit z-score list for one score key at one level.

    Args:
        json_file: Path to a conservation json readable by ``group_tools.ConserGene``.
        level: Level name; must be in the gene's ``levels_passing_filters``.
        scorekey: Key into the level object's ``conservation_scores``.

    Returns:
        The ``"hit_z_scores"`` entry for ``scorekey``, or ``None`` when the
        level did not pass filters, the score key is absent, or the score
        entry has no ``"hit_z_scores"``.
    """
    gene = group_tools.ConserGene(
        json_file, filepath_converter=convert_jsonfile_to_relative
    )
    if level in gene.levels_passing_filters:
        level_obj = gene.get_level_obj(
            level, filepath_converter=convert_jsonfile_to_relative
        )
        scores = level_obj.conservation_scores
        if scorekey in scores and "hit_z_scores" in scores[scorekey]:
            return scores[scorekey]["hit_z_scores"]
    return None

def add_scorelist_2_df(df, level, scorekey):
    """Add a ``{level}_{scorekey}_z_scores`` column of per-hit z-score lists.

    Each value comes from ``json_2_z_score_list`` applied to the row's
    ``"json_file"``. The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``"json_file"`` column.
        level: Level name forwarded to ``json_2_z_score_list``.
        scorekey: Score key forwarded to ``json_2_z_score_list``.

    Returns:
        The same DataFrame object, with the new column added.
    """

    def _scores_for(json_file):
        return json_2_z_score_list(json_file, level, scorekey)

    df[f"{level}_{scorekey}_z_scores"] = df["json_file"].apply(_scores_for)
    return df

In [28]:
from attrs import asdict, define, field, validators


@define
class PairwiseScoreResults:
    """Typed view of one pairwise-score entry from a level's ``conservation_scores``.

    ``json2logoplot`` builds instances with
    ``PairwiseScoreResults(**lvlo.conservation_scores[score_key])``, so the
    field names must match the keys of that stored dict exactly.
    """

    # ``json2logoplot`` uses len(flanked_hit) to size its all-gap filler string.
    flanked_hit: str
    # Used by ``json2logoplot`` as the row label into the pairwise matrix dataframes.
    flanked_hit_start_position_in_idr: int
    original_hit_st_in_flanked_hit: int
    original_hit_end_in_flanked_hit: int
    score_function_name: str
    score_params: dict
    lflank: int
    rflank: int
    # Rewritten to a notebook-relative path in __attrs_post_init__.
    matrix_file: str | Path
    # Tick labels and bar heights for the pairwise panel in ``json2logoplot``.
    flanked_hit_sequence: str
    flanked_hit_scores: list
    flanked_hit_z_scores: list
    hit_sequence: str
    hit_scores: list
    hit_z_scores: list
    mat2score_params: dict

    def __attrs_post_init__(self):
        """Convert the stored matrix path with ``convert_jsonfile_to_relative``."""
        self.matrix_file = convert_jsonfile_to_relative(self.matrix_file)


@dataclass
class AlnScoreResults:
    """Plain record for alignment-based score results.

    NOTE(review): not instantiated anywhere in the visible cells — confirm it
    is still needed.
    """

    file: str
    score_function_name: str
    score_params: dict
    hit_scores: list
    hit_z_scores: list


def slice_aln_scores(lvlo: group_tools.LevelAlnScore, aln_start, aln_end):
    """Cut the window ``aln_start..aln_end`` (end inclusive) out of an alignment score object.

    Args:
        lvlo: Object exposing ``scores``, ``z_scores`` and ``query_aln_sequence``.
        aln_start: First index of the window.
        aln_end: Last index of the window (included).

    Returns:
        Tuple ``(scores, z_scores, query_aln_sequence)``, each sliced to the window.
    """
    window = slice(aln_start, aln_end + 1)
    return (
        lvlo.scores[window],
        lvlo.z_scores[window],
        lvlo.query_aln_sequence[window],
    )


def json2logoplot_alnscore(
    jsonfile, score_key, with_gaps=False, axes=None, level="Vertebrata", flank=5
):
    """Plot z-score bars and a sequence logo for the flanked hit from an alignment score.

    The hit is padded by ``flank`` residues on each side within the query IDR
    (``tools.pad_hit``), the padded bounds are looked up in the index returned
    by ``tools.reindex_alignment_str`` for the IDR part of the query alignment
    row, and that column window is cut from both the scores and the alignment.

    Args:
        jsonfile: Conservation json path for ``group_tools.ConserGene``.
        score_key: Alignment score key passed to ``get_aln_score_obj``.
        with_gaps: If False, the window is passed through
            ``score_plots.strip_gaps_from_slice`` and only the z-scores at the
            indices it returns are kept; if True, every window column is plotted.
        axes: Two axes (bar plot, logo). A new 2x1 figure is created when None.
        level: Level name passed to ``get_aln_score_obj``.
        flank: Number of residues of padding on each side of the hit.

    Returns:
        The result of ``pssms.alignment_2_counts`` for the plotted sequences.
    """
    gene = group_tools.ConserGene(
        jsonfile, filepath_converter=convert_jsonfile_to_relative
    )
    aln_scores = gene.get_aln_score_obj(
        level, score_key, filepath_converter=convert_jsonfile_to_relative
    )
    idr_start = aln_scores.idr_aln_start
    idr_stop = aln_scores.idr_aln_end + 1

    padded_start, padded_end, _ = tools.pad_hit(
        gene.query_idr_sequence,
        gene.hit_st_in_idr,
        gene.hit_end_in_idr,
        l_flank=flank,
        r_flank=flank,
    )
    # presumably maps ungapped IDR positions to alignment columns — TODO confirm
    _, position_index = tools.reindex_alignment_str(
        aln_scores.query_aln_sequence[idr_start:idr_stop]
    )
    col_first = position_index[padded_start]
    col_last = position_index[padded_end]

    # The index is relative to the IDR slice, so shift back to full-alignment coordinates.
    _, z_scores, query_window = slice_aln_scores(
        aln_scores, col_first + idr_start, col_last + idr_start
    )
    aln_window = aln_scores.aln[:, idr_start:idr_stop][:, col_first : col_last + 1]

    if with_gaps:
        sequences = [str(record.seq) for record in list(aln_window)]
        tick_labels = query_window
        bar_scores = z_scores
    else:
        sequences, tick_labels, kept_columns = score_plots.strip_gaps_from_slice(
            aln_window, query_window
        )
        bar_scores = list(np.array(z_scores)[kept_columns])

    if axes is None:
        _, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 4))
    score_plots.plot_score_bar_plot(
        ax=axes[0], score_list=bar_scores, query_seq=tick_labels
    )
    score_plots.plot_logo(
        ax=axes[1], str_list=sequences, tick_label_str=tick_labels
    )
    return pssms.alignment_2_counts(sequences, show_plot=False, heatmap=False)


def json2logoplot(
    jsonfile, score_key, rbm: bool = False, axes=None, level="Vertebrata"
):
    """Plot z-score bars and a sequence logo for the flanked hit from a pairwise score.

    Sequences come from the row of the pairwise ``subseq_dataframe`` labelled
    with the flanked hit's start position in the IDR; missing entries are
    replaced by an all-gap string as long as the flanked hit.

    Args:
        jsonfile: Conservation json path for ``group_tools.ConserGene``.
        score_key: Pairwise score key in the level's ``conservation_scores``.
        rbm: If True, keep only sequences flagged in the
            ``reciprocal_best_match_dataframe`` (the ``reference_kmer`` entry is
            always kept) and skip the score bar plot.
        axes: Two axes (bar plot, logo). A new 2x1 figure is created when None.
        level: Level name passed to ``get_level_obj``.

    Returns:
        Tuple ``(counts, result)``: the output of ``pssms.alignment_2_counts``
        and the ``PairwiseScoreResults`` built from the stored score entry.
    """
    gene = group_tools.ConserGene(
        jsonfile, filepath_converter=convert_jsonfile_to_relative
    )
    level_obj = gene.get_level_obj(
        level, filepath_converter=convert_jsonfile_to_relative
    )
    result = PairwiseScoreResults(**level_obj.conservation_scores[score_key])
    matrices = pairwise_tools.import_pairwise_matrices(result.matrix_file)

    row_label = result.flanked_hit_start_position_in_idr
    gap_filler = "-" * len(result.flanked_hit)
    subseqs = matrices["subseq_dataframe"].fillna(gap_filler)

    if rbm:
        rbm_flags = matrices["reciprocal_best_match_dataframe"].fillna(False)
        hit_table = pd.concat(
            [subseqs.loc[row_label], rbm_flags.loc[row_label]],
            axis=1,
            keys=["subseq", "rbm"],
        )
        # The reference k-mer is always part of the logo.
        hit_table.loc["reference_kmer", "rbm"] = True
        seqlist = hit_table[hit_table["rbm"]]["subseq"].to_list()
    else:
        seqlist = subseqs.loc[row_label].to_list()

    if axes is None:
        _, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 4))
    if not rbm:
        score_plots.plot_score_bar_plot(
            ax=axes[0],
            score_list=result.flanked_hit_z_scores,
            query_seq=result.flanked_hit_sequence,
        )
    score_plots.plot_logo(
        ax=axes[1], str_list=seqlist, tick_label_str=result.flanked_hit_sequence
    )
    return pssms.alignment_2_counts(seqlist, show_plot=False, heatmap=False), result


def plots_from_df(
    df,
    level,
    pairkey="fragpair_gapless_lf5_rf5_edssmat50",
    output_folder=None,
    flank=5,
    alnkey="aln_property_entropy",
):
    """Draw one 2x3 figure per row: aln with gaps | aln without gaps | pairwise.

    The top row of each figure holds z-score bar plots (y-limits fixed to
    [-4, 4]); the bottom row holds the sequence logos.

    Args:
        df: Table with a ``"json_file"`` column. When ``output_folder`` is
            given, rows must also provide ``"reference_index"``, ``"name"``,
            ``"UniprotID"``, ``"pair-aln"`` and ``"gene_id"`` for the file name.
        level: Level name forwarded to the plotting helpers.
        pairkey: Pairwise score key plotted in the third column.
        output_folder: Folder (``str`` or ``Path``) to save PNGs into; it is
            created if missing. When None, nothing is saved and the figures
            are left open.
        flank: Flank size for the two alignment panels only; ``json2logoplot``
            takes no flank argument, so the pairwise panel is set by ``pairkey``.
        alnkey: Alignment score key plotted in the first two columns.

    Returns:
        None.
    """
    if output_folder is not None:
        # Accept plain strings and make sure savefig has somewhere to write.
        output_folder = Path(output_folder)
        output_folder.mkdir(parents=True, exist_ok=True)

    counter = 0  # running prefix of saved file names, in row order
    for _, row in df.iterrows():
        jsonfile = row["json_file"]
        fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 4))
        # automatically uses whatever scoring parameters are in the json files
        for column, with_gaps in enumerate((True, False)):
            json2logoplot_alnscore(
                jsonfile,
                alnkey,
                axes=ax[:, column],
                level=level,
                with_gaps=with_gaps,
                flank=flank,
            )
        json2logoplot(
            jsonfile,
            pairkey,
            rbm=False,
            axes=ax[:, 2],
            level=level,
        )
        for axi in ax[0, :]:
            axi.set_ylim([-4, 4])
        plt.tight_layout()
        if output_folder is None:
            continue
        fig.savefig(
            output_folder
            / f'{counter}-{row["reference_index"]}-{row["name"]}-{row["UniprotID"]}-{row["pair-aln"]:.2f}-{row["gene_id"]}.png',
            bbox_inches="tight",
            dpi=300,
        )
        plt.close(fig)
        counter += 1

We lost a lot of motifs, mostly because there weren't enough points in the z-score backgrounds: either there are too many gaps in the alignments, or there are not enough k-mers in the pairwise method.

In [10]:
# Build the per-hit score table at one level: attach the alignment and pairwise
# z-score lists, reduce each to a position-weighted mean, and take their difference.
LEVEL = "Metazoa"
dftemp = df.copy()
dftemp = add_scorelist_2_df(dftemp, LEVEL, "aln_property_entropy")
dftemp = add_scorelist_2_df(dftemp, LEVEL, "fragpair_gapless_lf5_rf5_edssmat50")
# Keep only hits that have BOTH score lists (the helper yields None when a level
# or score is unavailable).
df2 = dftemp[
    ~dftemp[
        [
            f"{LEVEL}_aln_property_entropy_z_scores",
            f"{LEVEL}_fragpair_gapless_lf5_rf5_edssmat50_z_scores",
        ]
    ]
    .isna()
    .any(axis=1)
].copy()
# Columns holding z-score lists.
cols = [i for i in df2.columns if "_scores" in i]
# Motif classes missing from position_weights map to NaN here.
df2["weight_array"] = df2["ELM_motif_class"].map(position_weights)
df2 = df2.drop("odb_seq", axis=1)
# NOTE(review): the result of this expression is not used or displayed mid-cell.
df2["weight_array"].value_counts()
# df2['weight_array'].isna().sum()
# Weighted mean per hit; assumes each score list has the same length as the
# motif's weight array — TODO confirm.
for col in cols:
    df2[col + "_weighted_mean"] = df2.apply(
        lambda x: np.average(x[col], weights=x["weight_array"]), axis=1
    )
cols = [i for i in df2.columns if "weighted_mean" in i]
# Short names for the two weighted means.
rn = {
    f"{LEVEL}_aln_property_entropy_z_scores_weighted_mean": "aln",
    f"{LEVEL}_fragpair_gapless_lf5_rf5_edssmat50_z_scores_weighted_mean": "pair",
}
df2 = df2.rename(columns=rn)
# Positive values: the pairwise score is higher than the alignment score.
df2["pair-aln"] = df2["pair"] - df2["aln"]
enadf = df2[df2["ELM_motif_class"] == "enah_LPPPP_FPPPP"].copy()
df2.to_csv("score_table.csv", index=False)

In [13]:
# Plot ENAH-motif hits whose pairwise weighted mean exceeds the alignment
# weighted mean by more than 2, ordered by pairwise score (highest first).
output_folder = Path(f"./plots_ena/")
output_folder.mkdir(exist_ok=True)
temp = enadf[enadf["pair-aln"] > 2].copy()
print(len(temp))
# temp=temp[~temp['UniprotID'].isna()]
# temp[temp['verified interaction']].sort_values('pair', ascending=False).head(10)f
temp = temp.sort_values("pair", ascending=False)
plt.rcParams.update({"font.size": 14})
# NOTE(review): the level is hard-coded here; it must match the LEVEL used to build enadf.
plots_from_df(temp, "Metazoa", output_folder=output_folder, flank=5)

35


In [17]:
# Build the per-hit score table at one level: attach the alignment and pairwise
# z-score lists, reduce each to a position-weighted mean, and take their difference.
LEVEL = "Vertebrata"
dftemp = df.copy()
dftemp = add_scorelist_2_df(dftemp, LEVEL, "aln_property_entropy")
dftemp = add_scorelist_2_df(dftemp, LEVEL, "fragpair_gapless_lf5_rf5_edssmat50")
# Keep only hits that have BOTH score lists.
df2 = dftemp[
    ~dftemp[
        [
            f"{LEVEL}_aln_property_entropy_z_scores",
            f"{LEVEL}_fragpair_gapless_lf5_rf5_edssmat50_z_scores",
        ]
    ]
    .isna()
    .any(axis=1)
].copy()
# Columns holding z-score lists.
cols = [i for i in df2.columns if "_scores" in i]
# Motif classes missing from position_weights map to NaN here.
df2["weight_array"] = df2["ELM_motif_class"].map(position_weights)
df2 = df2.drop("odb_seq", axis=1)
df2["weight_array"].value_counts()
# df2['weight_array'].isna().sum()
# Weighted mean per hit; assumes each score list has the same length as the
# motif's weight array — TODO confirm.
for col in cols:
    df2[col + "_weighted_mean"] = df2.apply(
        lambda x: np.average(x[col], weights=x["weight_array"]), axis=1
    )
cols = [i for i in df2.columns if "weighted_mean" in i]
rn = {
    f"{LEVEL}_aln_property_entropy_z_scores_weighted_mean": "aln",
    f"{LEVEL}_fragpair_gapless_lf5_rf5_edssmat50_z_scores_weighted_mean": "pair",
}
df2 = df2.rename(columns=rn)
# Positive values: the pairwise score is higher than the alignment score.
df2["pair-aln"] = df2["pair"] - df2["aln"]
enadf = df2[df2["ELM_motif_class"] == "enah_LPPPP_FPPPP"].copy()
# Save only after the weighted means exist, as the Metazoa cell does; writing
# earlier produced a table without the "aln", "pair" and "pair-aln" columns.
# NOTE(review): this is the same file name the Metazoa cell writes, so it
# overwrites that table — consider a level-specific name.
df2.to_csv("score_table.csv", index=False)

In [29]:
# Plot ENAH-motif hits with pair-aln > 1 and a pairwise weighted mean > 0.5,
# ordered by pairwise score (highest first).
output_folder = Path(f"./plots_ena_Vertebrata/")
output_folder.mkdir(exist_ok=True)
temp = enadf[enadf["pair-aln"] > 1].copy()
temp = temp[temp["pair"] > 0.5].copy()
print(len(temp))
# temp=temp[~temp['UniprotID'].isna()]
# temp[temp['verified interaction']].sort_values('pair', ascending=False).head(10)f
temp = temp.sort_values("pair", ascending=False)
# plt.rcParams.update({"font.size": 14})
# NOTE(review): the level is hard-coded here; it must match the LEVEL used to build enadf.
plots_from_df(temp, "Vertebrata", output_folder=output_folder, flank=5)

24


In [21]:
cols

['Metazoa_aln_property_entropy_z_scores',
 'Metazoa_fragpair_gapless_lf5_rf5_edssmat50_z_scores',
 'Vertebrata_aln_property_entropy_z_scores',
 'Vertebrata_fragpair_gapless_lf5_rf5_edssmat50_z_scores']

In [20]:
df2

Unnamed: 0,reference_index,ELM_motif_class,Organism,UniprotID,regex,hit_sequence,gene_id,hit start position,hit end position,verified interaction,name,json_file,critical_error,flanked_hit,Metazoa_aln_property_entropy_z_scores,Metazoa_fragpair_gapless_lf5_rf5_edssmat50_z_scores,Vertebrata_aln_property_entropy_z_scores,Vertebrata_fragpair_gapless_lf5_rf5_edssmat50_z_scores,weight_array
0,0,LIG_AP2alpha_2,Rattus norvegicus,O08838,DP[FW],DPF,10116_0:004cdf,356,358,True,,../../benchmark/benchmark_v3/p3_conservation/c...,,LDLDFDPFKPDVT,"[-0.7849110575078907, -1.1470965302653118, -0....","[1.2646695131128232, -0.6230319877472846, 0.10...","[1.2785820647214632, 0.03832446171214316, 1.27...","[1.7302602704326209, 0.30070041842870615, 1.54...","[1, 1, 1]"
1,1,LIG_AP2alpha_2,Rattus norvegicus,Q05140,DP[FW],DPF,10116_0:002e5e,399,401,True,,../../benchmark/benchmark_v3/p3_conservation/c...,,EAPISDPFAPEPS,"[-0.3065326707116909, -1.5818396760602311, -1....","[0.8068631016990869, 0.6504573818486616, 1.619...","[-0.3757825927989007, -1.2417344591634956, -1....","[0.22488942726408537, 0.02772335993562546, -0....","[1, 1, 1]"
2,2,LIG_AP2alpha_2,Rattus norvegicus,Q05140,DP[FW],DPF,10116_0:002e5e,473,475,True,,../../benchmark/benchmark_v3/p3_conservation/c...,,ACSGNDPFAPSEG,"[0.017487595566555075, 1.2401414924355165, 1.2...","[0.306919128843599, 0.5543636025383214, 2.1653...","[1.0671007661976506, 0.88663452426079, 1.06710...","[1.638242435956166, 1.3967010985501278, 1.4820...","[1, 1, 1]"
3,3,LIG_AP2alpha_2,Homo sapiens,P98082,DP[FW],DPF,9606_0:0016b2,292,294,True,,../../benchmark/benchmark_v3/p3_conservation/c...,,PTPNPDPFRDDPF,"[0.6769662402228052, 0.372685432141188, 1.7414...","[3.662421085771397, 3.191840346290177, 4.11101...","[1.2045098328535813, 1.6961993321774382, 1.631...","[1.7308183030776918, 2.0581029493236827, 2.202...","[1, 1, 1]"
4,4,LIG_AP2alpha_2,Homo sapiens,P98082,DP[FW],DPF,9606_0:0016b2,297,299,True,,../../benchmark/benchmark_v3/p3_conservation/c...,,DPFRDDPFTQPDQ,"[-0.48778871074170704, 0.5659504569320113, 1.3...","[2.7705513589276904, 1.0505947964105804, 2.262...","[1.4522314930727687, 1.5176974743943668, 0.480...","[2.074211827645784, 1.8577131695222864, 0.8742...","[1, 1, 1]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2882,2882,TRAF6,Homo sapiens,A0A384MDR3,...P.E..[FYWDE],IKKPAEDEW,9606_0:004627,81,89,False,,../../benchmark/benchmark_v3/p3_conservation/c...,,ALFQDIKKPAEDEWGKTPD,"[1.6241450780345585, -1.4089095965466998, -1.0...","[0.8050348655032681, -1.0921869671063063, -0.5...","[1.154489476836621, -1.007434199238479, 1.1817...","[0.5451229600801136, -0.8964587731723821, 1.16...","[0, 0, 0, 1, 0, 1, 0, 0, 1]"
2883,2883,TRAF6,Homo sapiens,Q14209,...P.E..[FYWDE],SLVPLEATD,9606_0:00055f,363,371,False,,../../benchmark/benchmark_v3/p3_conservation/c...,,APPPPSLVPLEATDSLLEL,"[-1.180650083469508, -1.331294073817295, -1.46...","[-1.0958606737149268, -0.04076692679728417, -0...","[-1.048000273450755, -1.057049107954081, -1.12...","[-1.0663432621108706, -0.29635935974563504, -0...","[0, 0, 0, 1, 0, 1, 0, 0, 1]"
2885,2885,TRAF6,Homo sapiens,Q9H816,...P.E..[FYWDE],FESPEESAD,9606_0:000204,353,361,False,,../../benchmark/benchmark_v3/p3_conservation/c...,,TQGVVFESPEESADQSQAD,,"[3.644601901565253, 3.0552242871772446, 2.8860...","[2.2689217162116884, 2.06794344628975, 1.70054...","[3.952637944651848, 4.207351227077069, 3.17172...","[0, 0, 0, 1, 0, 1, 0, 0, 1]"
2886,2886,TRAF6,Homo sapiens,A4PB67,...P.E..[FYWDE],FHQPEEEIE,9606_0:00057e,215,223,False,,../../benchmark/benchmark_v3/p3_conservation/c...,,PLRTLFHQPEEEIEDGGLF,"[-1.7556914529906844, -1.7727575922158432, -1....","[-0.6420363343827739, -1.1415426882540252, -1....","[-2.2478124490435225, -2.287941325460294, -2.0...","[-0.9342903186837597, -1.1217520698203998, -1....","[0, 0, 0, 1, 0, 1, 0, 0, 1]"


In [None]:
# TRAF6 hits from whichever level's score table (df2) was built last.
traf = df2[df2["ELM_motif_class"] == "TRAF6"].copy()

In [None]:

# Plot TRAF6 hits with pair-aln > 1 and a pairwise weighted mean > 1, ordered
# by pairwise score (highest first), with no flanking residues.
output_folder = Path(f"./plots_{LEVEL}_traf")
output_folder.mkdir(exist_ok=True)
temp = traf[traf["pair-aln"] > 1].copy()
temp = temp[temp["pair"] > 1].copy()
temp = temp.sort_values("pair", ascending=False)
# Plot at LEVEL (previously hard-coded "Metazoa") so the figures agree with the
# folder name and with the level the "pair"/"aln" filters were computed at.
# NOTE(review): the filters use the lf5_rf5 pairwise score while the plotted
# pairwise panel uses the lf0_rf0 key — confirm this mix is intended.
plots_from_df(
    temp,
    LEVEL,
    output_folder=output_folder,
    pairkey="fragpair_gapless_lf0_rf0_edssmat50",
    flank=0,
)

Q9Y6W5
WASF2_HUMAN
Part of the WAVE complex that regulates lamellipodia formation

In [None]:
# Show (without saving) the ten highest-scoring hits lacking a verified interaction.
# NOTE(review): stale cell — the visible cells name the mean pairwise score
# "pair" and never create "mean_score_fragpair_gapless_lf5_rf5_edssmat50", so
# this sort fails with KeyError unless that column is added elsewhere.
plots_from_df(
    temp[~temp["verified interaction"]]
    .sort_values("mean_score_fragpair_gapless_lf5_rf5_edssmat50", ascending=False)
    .head(10),
    "Metazoa",
)

In [None]:
# Show (without saving) the ten highest-scoring verified hits at Vertebrata.
# NOTE(review): stale cell — `vert_df` is not defined in any visible cell, and
# the visible score tables use a "pair-aln" column (pair minus aln), not
# "aln-pair", and "pair" rather than "mean_score_fragpair_gapless_lf5_rf5_edssmat50".
# The result of the first line is discarded.
vert_df.sort_values("aln-pair", ascending=False).head(10)
temp = vert_df[vert_df["aln-pair"] > 2].copy()
temp = temp[~temp["UniprotID"].isna()]
plt.rcParams.update({"font.size": 16})
plots_from_df(
    temp[temp["verified interaction"]]
    .sort_values("mean_score_fragpair_gapless_lf5_rf5_edssmat50", ascending=False)
    .head(10),
    "Vertebrata",
)

In [None]:
# Alignment-score bars and logo (gaps kept, no flanks) for reference_index 2243.
# NOTE(review): `vert_df` is not defined in any visible cell.
temp = vert_df[vert_df["reference_index"] == 2243].copy()
fig, ax = plt.subplots(nrows=2, figsize=(8, 4))
_ = json2logoplot_alnscore(
    temp["json_file"].values[0],
    "aln_property_entropy",
    axes=ax,
    level="Vertebrata",
    with_gaps=True,
    flank=0,
)
ax[0].set_ylim([-3, 3])
plt.tight_layout()

In [79]:
# Dump the row labelled 2887 to ./test.json for inspection.
df2.loc[2887].to_json("./test.json", orient="records")

In [61]:
# Unweighted mean of a nine-position z-score list, for comparison with the
# weighted version in the next cell.
np.average(
    [
        -0.2100420216,
        -0.7622876615,
        0.2637430374,
        -0.3815422762,
        -0.9570456609,
        -1.3029297428,
        -1.2251521697,
        -1.4368219208,
        -1.327423687,
    ]
)

-0.8155002336777777

In [63]:
# Same nine scores with the TRAF6 weight array: only positions 3, 5 and 8
# (0-based) contribute.
np.average(
    [
        -0.2100420216,
        -0.7622876615,
        0.2637430374,
        -0.3815422762,
        -0.9570456609,
        -1.3029297428,
        -1.2251521697,
        -1.4368219208,
        -1.327423687,
    ],
    weights=[0, 0, 0, 1, 0, 1, 0, 0, 1],
)

-1.0039652353333333

In [64]:
# Cross-check: the plain mean of the three weighted positions equals the
# weighted average above. Only the last expression's value is displayed.
np.average([-0.3815422762, -1.3029297428, -1.327423687])
np.mean([-0.3815422762, -1.3029297428, -1.327423687])

-1.0039652353333333