In [None]:
# ライブラリのインポート

import re

import numpy as np
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [None]:
# Build the dataset (aequorea-victoria.csv)

# Load the raw CSV.
df = pd.read_csv(
    "../data/raw/aequorea-victoria.csv"
)

# Columns of the raw file that are kept.
columns = [
    "name",
    "seq",
    "states.0.ext_coeff",
    "states.0.qy",
    "states.0.brightness",
    "states.0.ex_max",
    "states.0.em_max",
]

df = df[columns]

# Rename the raw columns to the short names used from here on.
df = df.rename(
    columns={
        "name": "name",
        "seq": "sequence",
        "states.0.ext_coeff": "ec",
        "states.0.qy": "qy",
        "states.0.brightness": "brightness",
        "states.0.ex_max": "ex_max",
        "states.0.em_max": "em_max",
    }
)

# Drop every row that has a missing value in any kept column.
df = df.dropna()

# Add a temporary quantile-bin column over brightness (at most 10 bins;
# duplicate bin edges are dropped). assign() returns a new frame, so nothing
# is written into the result of dropna() in place.
df = df.assign(
    bin=pd.qcut(df["brightness"], q=10, labels=False, duplicates="drop")
)

# Draw 8 rows from each bin with a fixed seed so the subset is reproducible.
# Sampling is without replacement, so pandas raises ValueError if any bin
# holds fewer than 8 rows.
df = df.groupby("bin").sample(n=8, random_state=42)

# Remove the temporary quantile-bin column.
df = df.drop(columns=["bin"])

# Save the processed CSV.
df.to_csv("../data/processed/aequorea-victoria.csv", index=False)

# Check the data. The preview must be the last expression of the cell,
# otherwise the notebook does not display it.
print(len(df))
df.head()

In [None]:
# Build the FASTA file (aequorea-victoria.fasta)

df = pd.read_csv("../data/processed/aequorea-victoria.csv")

# One record per row: the "name" column becomes the record id and the
# brightness value is carried in the description.
records = [
    SeqRecord(
        Seq(entry["sequence"]),
        id=entry["name"],
        description=f"brightness={entry['brightness']}",
    )
    for _, entry in df.iterrows()
]

with open("../data/processed/aequorea-victoria.fasta", "w") as fasta_file:
    SeqIO.write(records, fasta_file, "fasta")

In [None]:
# Build the dataset (beta-lactamase.csv)

# TEM-1 sequence used as the wild type that mutations are applied to.
wt_sequence = "MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW"

# Load the raw CSV.
df = pd.read_csv("../data/raw/beta-lactamase.csv")

# Columns of the raw file that are kept.
columns = [
    "AA Position",
    "WT AA",
    "Mutant AA",
    "Fitness",
]

df = df[columns]

# Rename the raw columns to the snake_case names used from here on.
df = df.rename(
    columns={
        "AA Position": "aa_pos",
        "WT AA": "wt_aa",
        "Mutant AA": "mutant_aa",
        "Fitness": "fitness",
    }
)

# Remove stop-codon variants. The explicit copy turns the filtered result into
# an independent frame, so the column assignment below does not write into a
# slice of the previous frame (pandas' chained-assignment warning).
df = df[df["mutant_aa"] != "*"].copy()

# Shift positions down by one so they can be used as 0-based string indices.
# NOTE(review): assumes "AA Position" is 1-based relative to wt_sequence —
# confirm against the raw data.
df["aa_pos"] = df["aa_pos"].apply(lambda i: int(i) - 1)


# Apply a single substitution
def mutate(pos, aa):
    """Return the wild-type sequence with the residue at ``pos`` replaced by ``aa``.

    Args:
        pos: 0-based index into the module-level ``wt_sequence``.
        aa: Replacement residue, as a string, placed at that index.

    Returns:
        A new string; ``wt_sequence`` itself is not modified.

    Raises:
        IndexError: If ``pos`` does not index a residue of ``wt_sequence``.
    """
    # Without this check the slicing below accepts any integer: an index past
    # the end appends ``aa`` and a negative index duplicates part of the
    # sequence, so the result would not be a substitution at all.
    if not 0 <= pos < len(wt_sequence):
        raise IndexError(
            f"position {pos} is outside the wild-type sequence "
            f"(length {len(wt_sequence)})"
        )
    return wt_sequence[:pos] + aa + wt_sequence[pos + 1 :]


# Drop rows with missing values first, so mutate() is never called with a
# missing residue. The copy makes the frame independent, so the column
# assignments below do not write into a slice of an earlier frame.
df = df.dropna().copy()

# Apply each row's substitution to obtain the full variant sequence.
df["sequence"] = df.apply(lambda row: mutate(row["aa_pos"], row["mutant_aa"]), axis=1)

# Keep only rows whose fitness is above 1.0 (again as an independent copy).
df = df[df["fitness"] > 1.0].copy()

# Add a temporary log1p-transformed fitness column.
df["log_fitness"] = np.log1p(df["fitness"])

# Add a temporary quantile-bin column over the log fitness (at most 10 bins;
# duplicate bin edges are dropped).
df["bin"] = pd.qcut(df["log_fitness"], q=10, labels=False, duplicates="drop")

# Draw 8 rows from each bin with a fixed seed so the subset is reproducible.
# Sampling is without replacement, so pandas raises ValueError if any bin
# holds fewer than 8 rows.
df = df.groupby("bin").sample(n=8, random_state=42)

# Remove the temporary log-transform column.
df = df.drop(columns=["log_fitness"])

# Remove the temporary quantile-bin column.
df = df.drop(columns=["bin"])

# Save the processed CSV.
df.to_csv(
    "../data/processed/beta-lactamase.csv", index=False
)

# Check the data. The preview must be the last expression of the cell,
# otherwise the notebook does not display it.
print(len(df))
df.head()

In [None]:
# Build the FASTA file (beta-lactamase.fasta)

df = pd.read_csv("../data/processed/beta-lactamase.csv")

# One record per row. The id is "<aa_pos + 1><wt_aa>><mutant_aa>" and the
# fitness value is carried in the description.
records = [
    SeqRecord(
        Seq(entry["sequence"]),
        id=f"{entry['aa_pos']+1}{entry['wt_aa']}>{entry['mutant_aa']}",
        description=f"fitness={entry['fitness']}",
    )
    for _, entry in df.iterrows()
]

with open("../data/processed/beta-lactamase.fasta", "w") as fasta_file:
    SeqIO.write(records, fasta_file, "fasta")

In [None]:
# Build the dataset (amino-acid-genotypes-to-brightness.csv)

# Wild-type sequence that the mutation strings are applied to.
wt_sequence = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"

# Columns of the raw file that are kept.
columns = ["aaMutations", "medianBrightness"]

# Load the raw CSV, keep only those columns and give them the short names
# used from here on.
df = pd.read_csv("../data/raw/amino-acid-genotypes-to-brightness.csv")[columns].rename(
    columns={"aaMutations": "mutation", "medianBrightness": "brightness"}
)

# Apply mutations
def mutate(mutation: str) -> str:
    """Apply a colon-separated list of substitutions to ``wt_sequence``.

    Each token is expected to look like ``S<wt><pos><mut>`` (for example
    ``SK3R``): a literal ``S``, the wild-type residue, a position and the
    replacement residue. Tokens that do not match this pattern are skipped.

    Args:
        mutation: Mutation string such as ``"SK3R:SE5D"``. An empty or missing
            (NaN/None) value means "no mutation".

    Returns:
        The mutated sequence, or ``wt_sequence`` unchanged when no mutation
        is given.

    Raises:
        ValueError: If the wild-type residue named in a token does not match
            ``wt_sequence`` at that position, or the position is below 1.
        IndexError: If a position lies beyond the end of ``wt_sequence``.
    """
    if not mutation or pd.isna(mutation):
        print("No mutation provided, returning wild-type sequence.")
        return wt_sequence

    sequence = list(wt_sequence)
    for mut in mutation.split(":"):
        m = re.match(r"^S([A-Z])(\d+)([A-Z])$", mut)
        if not m:
            # Anything that is not a plain residue-to-residue substitution is
            # ignored.
            continue
        wt_aa, mutant_aa = m.group(1), m.group(3)
        # NOTE(review): the -1 assumes positions are 1-based relative to
        # wt_sequence — confirm against the dataset. A numbering mismatch now
        # surfaces as the ValueError below instead of passing silently.
        aa_pos = int(m.group(2)) - 1

        # Position "0" would become index -1 and silently address the last
        # residue, so it is rejected together with residue mismatches.
        if aa_pos < 0 or sequence[aa_pos] != wt_aa:
            raise ValueError(
                f"Mutation {mut!r} expects {wt_aa!r} at position {aa_pos + 1}, "
                "which does not match the wild-type sequence."
            )
        sequence[aa_pos] = mutant_aa
    return "".join(sequence)


# Build the full sequence for every row from its mutation string.
df["sequence"] = df.apply(lambda row: mutate(row["mutation"]), axis=1)

# Drop rows with missing values, except row 0, which is always kept.
# NOTE(review): presumably row 0 is the wild-type entry whose mutation field
# is empty — confirm against the raw data.
df = pd.concat([df.iloc[[0]], df.iloc[1:].dropna()], ignore_index=True)

# Save the processed CSV.
df.to_csv("../data/processed/amino-acid-genotypes-to-brightness.csv", index=False)

# Check the data. The preview must be the last expression of the cell,
# otherwise the notebook does not display it.
print(len(df))
df.head()