# 🧬 P450 生成序列可视化分析

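使用 Paddle 框架对模型生成的蛋白质序列进行统计分析与可视化。注意：本节可见的统计与绘图代码实际只用到 matplotlib 和标准库 collections；若运行环境未安装 Paddle，首个单元格会在 `import paddle` 处报 ModuleNotFoundError。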

In [1]:

import paddle
import matplotlib.pyplot as plt
from collections import Counter

def read_sequences(filepath):
    """Parse a FASTA file and return its sequences with gap characters removed.

    Lines starting with ``>`` are treated as record headers; their text is
    discarded. Every other line is stripped of surrounding whitespace and
    appended to the sequence of the current record. When a record ends (at
    the next header or at end of file) all ``-`` characters are removed
    from its sequence.

    A record whose sequence lines are missing or blank is skipped. A record
    made up solely of ``-`` characters is kept and yields an empty string.
    Sequence lines that appear before the first header form a record too.

    Args:
        filepath: Path of the FASTA file to read.

    Returns:
        A list of sequence strings, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    sequences = []
    parts = []  # stripped sequence lines of the record currently being read
    # Explicit encoding: the default would otherwise depend on the platform
    # locale, so the same file could parse on one machine and fail on another.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith(">"):
                # A header closes the previous record (if it had any content).
                seq = "".join(parts)
                if seq:
                    sequences.append(seq.replace("-", ""))
                parts = []
            else:
                parts.append(line.strip())
    # The last record has no trailing header to close it.
    seq = "".join(parts)
    if seq:
        sequences.append(seq.replace("-", ""))
    return sequences

# Load the generated sequences from the FASTA file and report how many
# records were read. `sequences` is reused by the plotting cells below.
sequences = read_sequences("./data/gen_0.fasta")
print(f"共读取序列数: {len(sequences)}")


ModuleNotFoundError: No module named 'paddle'

## 📊 序列长度分布

In [None]:

# Length of every loaded sequence.
lengths = list(map(len, sequences))

# Histogram of the sequence lengths, drawn on an explicit Axes object.
# NOTE(review): the title and axis labels are Chinese; matplotlib presumably
# needs a font with CJK glyphs configured or they may render as empty
# boxes -- confirm in the target environment.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(lengths, bins=20, color='cornflowerblue', edgecolor='black')
ax.set_title("序列长度分布（去除填充符号 '-'）")
ax.set_xlabel("长度")
ax.set_ylabel("数量")
ax.grid(True)
fig.tight_layout()
plt.show()


## 🔤 氨基酸频率分布

In [None]:

# Count every residue character across all sequences.
aa_counter = Counter(residue for seq in sequences for residue in seq)
# Same counts, re-keyed in sorted residue order so the bars are ordered by letter.
aa_freq_sorted = {residue: aa_counter[residue] for residue in sorted(aa_counter)}

# Bar chart of raw occurrence counts per residue, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(aa_freq_sorted.keys(), aa_freq_sorted.values(), color='salmon')
ax.set_title("氨基酸频率分布")
ax.set_xlabel("氨基酸")
ax.set_ylabel("出现次数")
ax.grid(True)
fig.tight_layout()
plt.show()
