In [3]:
pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd
import requests
import re
import os
from tqdm import tqdm

# ========== CONFIG ==========

AAINDEX_FILE = "aaindex2.txt"  # This must be downloaded from https://www.genome.jp/ftp/db/community/aaindex/
INPUT_EXCEL = "mutations.xlsx"
OUTPUT_CSV = "mutation_matrix_features.csv"

# ============================

AA_LETTERS = 'ARNDCQEGHILKMFPSTWYV'


def get_uniprot_sequence(uniprot_id):
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    response = requests.get(url)
    if response.ok:
        lines = response.text.strip().split('\n')
        return ''.join(lines[1:])
    return None


def parse_mutation(mutation_str):
    match = re.match(r"([A-Z])(\d+)([A-Z])", mutation_str)
    if match:
        wt, pos, mut = match.groups()
        return wt, int(pos), mut
    return None, None, None


def get_neighbors(sequence, pos):
    n = sequence[pos - 2] if pos > 1 else None
    c = sequence[pos] if pos < len(sequence) else None
    return n, c


def load_aaindex2(filename):
    matrices = {}
    with open(filename) as f:
        blocks = f.read().split("//\n")
        for block in blocks:
            lines = block.strip().split('\n')
            if not lines or not lines[0].startswith('H'):
                continue
            mat_id = lines[0].split()[1]
            matrix = {}
            values = []
            for line in lines:
                if line.startswith('M'):
                    continue
                if all(c.isdigit() or c.isspace() or c == '.' or c == '-' for c in line.strip()):
                    values.extend([float(x) if x != '-' else 0.0 for x in line.strip().split()])

            if len(values) == 210:
                index = 0
                for i in range(20):
                    for j in range(i + 1):
                        a1, a2 = AA_LETTERS[i], AA_LETTERS[j]
                        val = values[index]
                        matrix[(a1, a2)] = val
                        matrix[(a2, a1)] = val
                        index += 1
                matrices[mat_id] = matrix
    return matrices


def compute_matrix_features(matrices, wt, mut, n, c):
    features = {}
    if not all([wt, mut, n, c]):
        return {f"{mat_id}_mutation_delta": 0.0 for mat_id in matrices}

    for mat_id, matrix in matrices.items():
        wt_n = matrix.get((wt, n), 0.0)
        wt_c = matrix.get((wt, c), 0.0)
        mut_n = matrix.get((mut, n), 0.0)
        mut_c = matrix.get((mut, c), 0.0)
        delta = (mut_n + mut_c) - (wt_n + wt_c)
        features[f"{mat_id}_mutation_delta"] = delta
    return features


def main():
    if not os.path.exists(AAINDEX_FILE):
        print(f"❌ AAindex2 file not found: {AAINDEX_FILE}")
        print("Please download it from: https://www.genome.jp/ftp/db/community/aaindex/aaindex2")
        return

    df = pd.read_excel(INPUT_EXCEL)
    print(df.columns)

    matrices = load_aaindex2(AAINDEX_FILE)
    print(f"✅ Loaded {len(matrices)} mutation matrices.")

    all_rows = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        uniprot_id = row['UniProt ID']
        mutation = row['Mutation']
        wt, pos, mut = parse_mutation(mutation)
        sequence = get_uniprot_sequence(uniprot_id)

        if not sequence or pos > len(sequence):
            print(f"⚠️ Skipping {uniprot_id} - {mutation}")
            continue

        n, c = get_neighbors(sequence, pos)
        features = compute_matrix_features(matrices, wt, mut, n, c)

        all_rows.append({
            'UniProt ID': uniprot_id,
            'Mutation': mutation,
            'WildType': wt,
            'Position': pos,
            'Mutant': mut,
            'Neighbor_N': n,
            'Neighbor_C': c,
            **features
        })

    out_df = pd.DataFrame(all_rows)
    out_df.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Features saved to {OUTPUT_CSV}")


if __name__ == "__main__":
    main()


Index(['Unnamed: 0', 'UniProt ID', 'Gene', 'Mutation', 'Class'], dtype='object')
✅ Loaded 67 mutation matrices.


  9%|██████▋                                                                    | 1615/18114 [08:38<1:29:40,  3.07it/s]

⚠️ Skipping Q6RUI8 - R109I


 11%|████████▌                                                                  | 2071/18114 [11:08<1:23:53,  3.19it/s]

⚠️ Skipping Q17RH7 - S212P


 20%|███████████████▏                                                           | 3656/18114 [19:39<1:18:40,  3.06it/s]

⚠️ Skipping Q5T1J6 - Q118H


 20%|███████████████▏                                                           | 3657/18114 [19:39<1:20:46,  2.98it/s]

⚠️ Skipping Q5T1J6 - Q59H


 24%|█████████████████▉                                                         | 4323/18114 [23:23<1:13:20,  3.13it/s]

⚠️ Skipping Q96L96 - G1777R


 41%|███████████████████████████████▊                                             | 7474/18114 [40:27<57:07,  3.10it/s]

⚠️ Skipping Q8N1N5 - C89Y


 41%|███████████████████████████████▊                                             | 7475/18114 [40:28<56:45,  3.12it/s]

⚠️ Skipping Q8N1N5 - C236R


 41%|███████████████████████████████▊                                             | 7476/18114 [40:28<56:33,  3.13it/s]

⚠️ Skipping Q8N1N5 - H95P


 41%|███████████████████████████████▊                                             | 7477/18114 [40:28<58:44,  3.02it/s]

⚠️ Skipping Q8N1N5 - V164A


 41%|███████████████████████████████▊                                             | 7478/18114 [40:29<59:52,  2.96it/s]

⚠️ Skipping Q8N1N5 - P173R


 41%|███████████████████████████████▊                                             | 7479/18114 [40:29<58:56,  3.01it/s]

⚠️ Skipping Q8N1N5 - M286T


 41%|███████████████████████████████▊                                             | 7480/18114 [40:29<58:33,  3.03it/s]

⚠️ Skipping Q8N1N5 - M153T


 41%|███████████████████████████████▊                                             | 7481/18114 [40:30<58:56,  3.01it/s]

⚠️ Skipping Q8N1N5 - C27Y


 41%|██████████████████████████████▉                                            | 7482/18114 [40:30<1:00:40,  2.92it/s]

⚠️ Skipping Q8N1N5 - C174R


 41%|██████████████████████████████▉                                            | 7483/18114 [40:31<1:00:10,  2.94it/s]

⚠️ Skipping Q8N1N5 - R154W


 41%|███████████████████████████████▊                                             | 7484/18114 [40:31<58:07,  3.05it/s]

⚠️ Skipping Q8N1N5 - P142R


 61%|█████████████████████████████████████████████▏                            | 11066/18114 [1:00:57<37:57,  3.09it/s]

⚠️ Skipping Q9BYR0 - S168C


 62%|█████████████████████████████████████████████▉                            | 11255/18114 [1:01:59<38:46,  2.95it/s]

⚠️ Skipping P08519 - L3880V


 62%|█████████████████████████████████████████████▉                            | 11256/18114 [1:02:00<38:14,  2.99it/s]

⚠️ Skipping P08519 - L3866V


 62%|█████████████████████████████████████████████▉                            | 11257/18114 [1:02:00<37:46,  3.03it/s]

⚠️ Skipping P08519 - M4106T


 62%|█████████████████████████████████████████████▉                            | 11258/18114 [1:02:00<38:13,  2.99it/s]

⚠️ Skipping P08519 - M4187T


 62%|█████████████████████████████████████████████▉                            | 11259/18114 [1:02:01<37:22,  3.06it/s]

⚠️ Skipping P08519 - R3929Q


 80%|███████████████████████████████████████████████████████████▎              | 14517/18114 [1:19:59<19:47,  3.03it/s]

⚠️ Skipping P57071 - C1230Y


 80%|███████████████████████████████████████████████████████████▎              | 14520/18114 [1:20:00<19:06,  3.14it/s]

⚠️ Skipping P57071 - S1481P


100%|██████████████████████████████████████████████████████████████████████████| 18114/18114 [1:39:53<00:00,  3.02it/s]


✅ Features saved to mutation_matrix_features.csv
