# 绘图-10折同源性与2018later 同源性
> 2025-05-11     
> Author: zhenkun.shi@tib.cas.cn

## 1. 导入必要的包

In [24]:
import sys,os
sys.path.insert(0, os.path.dirname(os.path.realpath('__file__')))
sys.path.insert(1,'../')
from config import conf as cfg
import pandas as pd
import numpy as np
import json
import plotly.graph_objects as go
import tools.bioFunctionLib as bfl
from tools import btools
from IPython.display import HTML
from pandarallel import pandarallel # 导入pandaralle
from  evaluation import evTools


pandarallel.initialize(progress_bar=False)
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 192 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. 读取数据

In [8]:
ds_train_fold10 = pd.read_feather(f'{cfg.DIR_DATASET}validation/fold1/train.feather')
ds_test_fold10 = pd.read_feather(f'{cfg.DIR_DATASET}validation/fold1/valid.feather')


In [10]:
ds_train_2018later = pd.read_feather(cfg.FILE_DS_TRAIN)
ds_test_2018later = pd.read_feather(cfg.FILE_DS_TEST)

In [None]:
blast_res_fold10 = bfl.getblast(train=ds_train_fold10[['uniprot_id', 'seq']], test=ds_test_fold10[['uniprot_id', 'seq']], k=1)
blast_res_2018later = bfl.getblast(train=ds_train_2018later[['uniprot_id', 'seq']], test=ds_test_2018later[['uniprot_id', 'seq']], k=1)

In [41]:
def compute_pident_distribution(df, column='pident', bin_width=10, range_max=100, merge_threshold=30):
    """
    按百分比区间统计 pident 分布频率和占比（cat_prob），并将小于 merge_threshold 的区间合并。

    参数:
        df: 输入的 pandas DataFrame
        column: 要处理的列名，默认为 'pident'
        bin_width: 每个区间的宽度，默认为10
        range_max: 最大值上限（如100）
        merge_threshold: 小于该百分比的区间将合并为一个 bin（默认合并<30%）

    返回:
        pandas DataFrame，包含：
            - pident_range: 区间标签（合并后含 <xx%）
            - count: 每个区间内的频数
            - cat_prob: 每个区间的占比（百分比，保留两位小数）
    """
    bins = np.arange(0, range_max + bin_width, bin_width)
    labels = [f'{i}–{i + bin_width}' for i in range(0, range_max, bin_width)]

    df = df.copy()
    df['pident_range'] = pd.cut(df[column], bins=bins, labels=labels, right=False)
    freq_series = df['pident_range'].value_counts().sort_index()

    result = pd.DataFrame({
        'pident_range': freq_series.index,
        'count': freq_series.values,
    })
    result['cat_prob'] = (result['count'] / len(df) * 100).round(2)

    # 提取区间起始值
    range_start = result['pident_range'].str.extract(r'(\d+)', expand=False).astype(int)

    # 拆分高低区间
    low_bins = result[range_start < merge_threshold]
    high_bins = result[range_start >= merge_threshold]

    if not low_bins.empty:
        merged = pd.DataFrame({
            'pident_range': [f'<{merge_threshold}%'],
            'count': [low_bins['count'].sum()],
            'cat_prob': [round(low_bins['count'].sum() / len(df) * 100, 2)]
        })
        result = pd.concat([merged, high_bins], ignore_index=True)

    return result

In [46]:
freq_10fold = compute_pident_distribution(blast_res_fold10)
freq_2018later = compute_pident_distribution(blast_res_2018later)

print(freq_10fold)
print(freq_2018later)

  pident_range  count  cat_prob
0         <30%    333      0.69
1        30–40   1170      2.42
2        40–50   1964      4.06
3        50–60   2812      5.81
4        60–70   3807      7.86
5        70–80   4647      9.59
6        80–90   6397     13.21
7       90–100  15984     33.00
  pident_range  count  cat_prob
0         <30%    870      9.42
1        30–40   1838     19.90
2        40–50   1380     14.94
3        50–60   1100     11.91
4        60–70    921      9.97
5        70–80    889      9.63
6        80–90    748      8.10
7       90–100   1097     11.88


In [50]:
# 示例 x 轴（索引）
x_vals = list(range(len(freq_10fold)))

# 拟合趋势线
trend_10fold = np.polyval(np.polyfit(x_vals, freq_10fold['cat_prob'], deg=1), x_vals)
trend_2018 = np.polyval(np.polyfit(x_vals, freq_2018later['cat_prob'], deg=1), x_vals)

# 创建图表
fig = go.Figure()

# 添加柱状图（10-Fold，含数值）
fig.add_trace(go.Bar(
    x=freq_10fold['pident_range'],
    y=freq_10fold['cat_prob'],
    name='10-Fold (Random)',
    marker_color='#8ECFC9',
    text=freq_10fold['cat_prob'],
    textposition='outside'
))

# 添加柱状图（Post-2018，含数值）
fig.add_trace(go.Bar(
    x=freq_2018later['pident_range'],
    y=freq_2018later['cat_prob'],
    name='Post-2018',
    marker_color='#FA7F6F',
    text=freq_2018later['cat_prob'],
    textposition='outside'
))

# 添加趋势线（10fold）
fig.add_trace(go.Scatter(
    x=freq_10fold['pident_range'],
    y=trend_10fold,
    mode='lines',
    name='Trend (10-Fold)',
    line=dict(color='#0072B5', dash='dash')
))

# 添加趋势线（2018 later）
fig.add_trace(go.Scatter(
    x=freq_2018later['pident_range'],
    y=trend_2018,
    mode='lines',
    name='Trend (Post-2018)',
    line=dict(color='#BC3C29', dash='dot')
))

# 图形样式设置
fig.update_layout(
    title='Cat_prob Distribution across Identity Intervals with Trendlines',
    xaxis_title='Identity Range (%)',
    yaxis=dict(
        title='cat_prob (%)',
        range=[0, 35],
        showline=True,
        linecolor='black',
        gridcolor='gray'
    ),
    barmode='group',  # 并排显示
    xaxis_tickangle=-45,
    width=1200,
    height=600,
    template='plotly_white'
)

fig.show()

In [51]:
ds_test_2018later

Unnamed: 0,uniprot_id,seq,reaction_id,ec_number,functionCounts,ec_specific_level,isenzyme,label
0,A9JLI2,MLGLQIFTLLSIPTLLYTYEIEPLERTSTPPEKEFGYWCTYANHCR...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,A9JLI3,MRFFSYLGLLLAGLTSLQGFSTDNLLEEELRYWCQYVKNCRFCWTC...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A9JLI5,MLVIFLGILGLLANQVLGLPTQAEGHLRSTDNPPQEELGYWCTYME...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,A9JLI7,MLVIILGVIGLLANQVLGLPTQAGGHLRSTDNPPQEELGYWCTYME...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,B5KVH4,MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
13510,P0DW91,MSGAEEAGGGGPAAGPAGSVPAGVGVGAGAGAGVGVGAGPGAAAGP...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13511,P0DTL6,MSGAEEAGGGGPAAGPAGSVPAGVGVGVGAGPGAAAGQAAAAALGE...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13512,P0DW87,MSGAEEAGGGGPAAGPAGAVPAGVGVGAGPGAAAGPAAAALGEAAG...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
13513,P0DW89,MSGAEEAGGGGPAAGPAGAVPAGVGVGVGPGAAAGPAAAALGEAAG...,-,-,0,0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [52]:
# 提取酶 / 非酶 的数量
enzyme_count = len(ds_test_2018later[ds_test_2018later.reaction_id != '-'])
non_enzyme_count = len(ds_test_2018later[ds_test_2018later.reaction_id == '-'])
total_count = enzyme_count + non_enzyme_count

# 数据和颜色
labels = ['Enzyme', 'Non-Enzyme']
values = [enzyme_count, non_enzyme_count]
colors = ['#8ECFC9', '#FA7F6F']

# 创建饼图（环形）
fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=values,
    marker=dict(colors=colors),
    textinfo='label+percent+value',
    hoverinfo='label+percent+value',
    hole=0.4  # 环形饼图
)])

# 添加中央注释（总数）
fig.update_layout(
    title=dict(
        text='Enzyme vs Non-Enzyme Distribution',
        x=0.5,
        font=dict(size=20)
    ),
    annotations=[dict(
        text=f'Total<br>{total_count:,}',  # 加逗号分隔
        x=0.5, y=0.5,
        font_size=16,
        showarrow=False
    )],
    showlegend=True,
    width=600,
    height=600,
    template='plotly_white'
)

fig.show()