In [1]:
# 1.) Download the PDB chain sequences (FASTA format from RCSB)
import requests

pdb_url = "https://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt"
try:
    resp = requests.get(pdb_url, timeout=60)
    resp.raise_for_status()
    text = resp.text.strip()
    if not text.startswith(">"):
        raise RuntimeError("Downloaded content does not look like FASTA.")
except Exception as e:
    raise RuntimeError(f"Failed to download PDB chain sequences: {e}")

with open("pdb_chains.fasta", "w", encoding="utf-8") as f:
    f.write(text + "\n")

print("✔ Successfully fetched PDB chain sequences → 'pdb_chains.fasta'")


RuntimeError: Failed to download PDB chain sequences: HTTPSConnectionPool(host='ftp.wwpdb.org', port=443): Max retries exceeded with url: /pub/pdb/derived_data/pdb_seqres.txt (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x103a742d0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [2]:
# 2.) Use DisProt’s search endpoint with format=fasta
import requests
import os

url = "https://disprot.org/api/search?format=fasta&limit=10000"
try:
    resp = requests.get(url, timeout=15)
    resp.raise_for_status()
except Exception as e:
    raise RuntimeError(f"Failed to GET DisProt FASTA via API: {e}")

text = resp.text.strip()

# 2.2) Quick sanity check: FASTA must start with '>', not '<'
if not text.startswith(">"):
    raise RuntimeError(
        "Downloaded content does not look like FASTA. "
        "If it begins with '<', you're still hitting an HTML page instead of raw FASTA."
    )

# 2.3) Write the 100 DisProt entries to a file
with open("disprot_13000.fasta", "w") as f:
    f.write(text + "\n")

print("✔ Successfully fetched 100 DisProt sequences in FASTA format → 'disprot_1000.fasta'")


✔ Successfully fetched 100 DisProt sequences in FASTA format → 'disprot_1000.fasta'


In [3]:
# 2.1) Collect more data

import requests
import time

# ─── PARAMETERS ─────────────────────────────────────────────────────────────
TOTAL_DESIRED = 13_000   # how many DisProt sequences we want total
PER_PAGE      = 100      # DisProt’s hard cap per request
OUTPUT_FILE   = "disprot_13000.fasta"

accum_seqs = []
offset     = 0

while len(accum_seqs) < TOTAL_DESIRED:
    url = f"https://disprot.org/api/search?format=fasta&limit={PER_PAGE}&offset={offset}"
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        raise RuntimeError(f"Failed to GET DisProt FASTA (offset={offset}): {e}")

    block = resp.text.strip()
    if not block.startswith(">"):
        raise RuntimeError(
            "Downloaded content does not look like FASTA. "
            "If it begins with '<', you're still hitting an HTML page."
        )

    # Parse out this page’s FASTA sequences (collecting only the raw sequences, not full headers):
    raw_lines = block.splitlines()
    header = None
    seq_buf = ""
    this_page_seqs = []
    for line in raw_lines:
        if line.startswith(">"):
            if header is not None and seq_buf:
                this_page_seqs.append(seq_buf)
            header = line
            seq_buf = ""
        else:
            seq_buf += line.strip()
    if header is not None and seq_buf:
        this_page_seqs.append(seq_buf)

    if not this_page_seqs:
        # No more sequences returned → break out early
        break

    accum_seqs.extend(this_page_seqs)
    offset += PER_PAGE

    # Sleep briefly (so we don’t hammer the server)
    time.sleep(0.2)

# Trim in case we overshot
accum_seqs = accum_seqs[:TOTAL_DESIRED]

# Write out ~13k sequences in FASTA format (with minimal headers)
with open(OUTPUT_FILE, "w") as f:
    for i, seq in enumerate(accum_seqs):
        f.write(f">disprot_sequence_{i+1}\n")
        f.write(seq + "\n")

print(f"✔ Fetched {len(accum_seqs)} DisProt sequences → '{OUTPUT_FILE}'")


✔ Fetched 13000 DisProt sequences → 'disprot_13000.fasta'


In [4]:
# 2.2) Verify Downloaded Sequences
with open("disprot_13000.fasta") as f:
    for _ in range(5):
        print(f.readline().rstrip())


>disprot_sequence_1
EHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNT
>disprot_sequence_2
VYRNSRAQGGG
>disprot_sequence_3


In [5]:
# 3.1 ) Seven‐Feature Threshold‐Based Fold/Disorder Classifier


import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

# ─── (A) Build aa_properties exactly as in STEP X ─────────────────────────────
kd_hydro = {
    'A':  1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C':  2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I':  4.5,
    'L':  3.8, 'K': -3.9, 'M':  1.9, 'F':  2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V':  4.2
}
charge = {
    'A':  0, 'R':  1, 'N':  0, 'D': -1, 'C':  0,
    'Q':  0, 'E': -1, 'G':  0, 'H':  0, 'I':  0,
    'L':  0, 'K':  1, 'M':  0, 'F':  0, 'P':  0,
    'S':  0, 'T':  0, 'W':  0, 'Y':  0, 'V':  0
}
h_donors = {'A':0,'R':2,'N':2,'D':0,'C':0,'Q':2,'E':0,'G':0,'H':1,'I':0,
            'L':0,'K':1,'M':0,'F':0,'P':0,'S':1,'T':1,'W':1,'Y':1,'V':0}
h_acceptors = {'A':0,'R':0,'N':2,'D':2,'C':1,'Q':2,'E':2,'G':0,'H':1,'I':0,
               'L':0,'K':0,'M':0,'F':0,'P':0,'S':1,'T':1,'W':0,'Y':1,'V':0}
flexibility = {
    'A': 0.357, 'R': 0.529, 'N': 0.463, 'D': 0.511, 'C': 0.346,
    'Q': 0.493, 'E': 0.497, 'G': 0.544, 'H': 0.323, 'I': 0.462,
    'L': 0.365, 'K': 0.466, 'M': 0.295, 'F': 0.314, 'P': 0.509,
    'S': 0.507, 'T': 0.444, 'W': 0.305, 'Y': 0.420, 'V': 0.386
}
sidechain_volume = {
    'A':  88.6, 'R': 173.4, 'N': 114.1, 'D': 111.1, 'C': 108.5,
    'Q': 143.8, 'E': 138.4, 'G':  60.1, 'H': 153.2, 'I': 166.7,
    'L': 166.7, 'K': 168.6, 'M': 162.9, 'F': 189.9, 'P': 112.7,
    'S':  89.0, 'T': 116.1, 'W': 227.8, 'Y': 193.6, 'V': 140.0
}
polarity = {
    'A':  8.1, 'R': 10.5, 'N': 11.6, 'D': 13.0, 'C':  5.5,
    'Q': 10.5, 'E': 12.3, 'G':  9.0, 'H': 10.4, 'I':  5.2,
    'L':  4.9, 'K': 11.3, 'M':  5.7, 'F':  5.2, 'P':  8.0,
    'S':  9.2, 'T':  8.6, 'W':  5.4, 'Y':  6.2, 'V':  5.9
}
choufa_helix = {
    'A': 1.45, 'R': 0.79, 'N': 0.73, 'D': 1.01, 'C': 0.77,
    'Q': 1.17, 'E': 1.51, 'G': 0.53, 'H': 1.00, 'I': 1.08,
    'L': 1.34, 'K': 1.07, 'M': 1.20, 'F': 1.12, 'P': 0.59,
    'S': 0.79, 'T': 0.82, 'W': 1.14, 'Y': 0.61, 'V': 1.06
}
choufa_sheet = {
    'A': 0.97, 'R': 0.90, 'N': 0.65, 'D': 0.54, 'C': 1.30,
    'Q': 1.23, 'E': 0.37, 'G': 0.75, 'H': 0.87, 'I': 1.60,
    'L': 1.22, 'K': 0.74, 'M': 1.67, 'F': 1.28, 'P': 0.62,
    'S': 0.72, 'T': 1.20, 'W': 1.19, 'Y': 1.29, 'V': 1.70
}
rel_ASA = {
    'A': 0.74, 'R': 1.48, 'N': 1.14, 'D': 1.23, 'C': 0.86,
    'Q': 1.36, 'E': 1.26, 'G': 1.00, 'H': 0.91, 'I': 0.59,
    'L': 0.61, 'K': 1.29, 'M': 0.64, 'F': 0.65, 'P': 0.71,
    'S': 1.42, 'T': 1.20, 'W': 0.55, 'Y': 0.63, 'V': 0.54
}
beta_branched = {aa: (1 if aa in ('V','I','T') else 0) for aa in kd_hydro.keys()}

# Build aa_properties dictionary (12 dimensions per residue)
aa_properties = {}
canonical_set = set(kd_hydro.keys())
for aa in canonical_set:
    hydro_norm  = (kd_hydro[aa] + 4.5) / 9.0
    volume_norm = sidechain_volume[aa] / 227.8
    pol_norm    = (polarity[aa] - 4.9) / (13.0 - 4.9)
    helix_norm  = choufa_helix[aa] / 1.51
    sheet_norm  = choufa_sheet[aa] / 1.70
    asa_norm    = (rel_ASA[aa] - 0.54) / (1.48 - 0.54)
    aromatic    = 1 if aa in ('F','Y','W') else 0

    aa_properties[aa] = [
        hydro_norm,          # [0]
        charge[aa],          # [1]
        h_donors[aa],        # [2]
        h_acceptors[aa],     # [3]
        flexibility[aa],     # [4]
        volume_norm,         # [5]
        pol_norm,            # [6]
        aromatic,            # [7]
        helix_norm,          # [8]
        sheet_norm,          # [9]
        asa_norm,            # [10]
        beta_branched[aa]    # [11]
    ]

# ─── (B) Load FASTA sequences ─────────────────────────────────────────────────
def load_fasta(filepath, filter_non_canonical=False):
    seqs = []
    with open(filepath) as f:
        header = None
        seq = ""
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                if header is not None and seq:
                    if (not filter_non_canonical) or (set(seq) <= canonical_set):
                        seqs.append(seq)
                header = line
                seq = ""
            else:
                seq += line
        if header is not None and seq:
            if (not filter_non_canonical) or (set(seq) <= canonical_set):
                seqs.append(seq)
    return seqs

pdb_seqs    = load_fasta("pdb_chains.fasta",   filter_non_canonical=True)   # 70 PDB chains
disprot_seqs = load_fasta("disprot_13000.fasta", filter_non_canonical=False)  # 100 DisProt

# ─── (C) Compute each chain’s 7 global features ────────────────────────────────
def compute_global_features(sequence):
    props = []
    for aa in sequence:
        if aa in aa_properties:
            v = aa_properties[aa]
            props.append([
                v[0],               # hydrophobicity_norm
                v[1],               # charge
                v[2] + v[3],        # h_dh_a
                v[4] / 0.544,       # norm_flex (raw_flex/0.544)
                v[6],               # pol_norm
                v[7] + v[8],        # arom_plus_helix
                v[10]               # asa_norm
            ])
    if not props:
        return np.zeros(7)
    return np.mean(np.vstack(props), axis=0)

all_features = []
all_labels   = []

for seq in pdb_seqs:
    all_features.append(compute_global_features(seq))
    all_labels.append(1)   # 1 = folded (PDB)
for seq in disprot_seqs:
    all_features.append(compute_global_features(seq))
    all_labels.append(0)   # 0 = disordered (DisProt)

df_feat = pd.DataFrame(
    all_features,
    columns=[
        "hydro_norm",
        "charge",
        "h_dh_a",
        "norm_flex",
        "pol_norm",
        "arom_plus_helix",
        "asa_norm"
    ]
)
df_feat["label"] = all_labels

# ─── (D) Compute midpoint thresholds (mean of PDB vs. mean of DisProt) ───────
means = df_feat.groupby("label").mean().rename(index={0:"DisProt", 1:"PDB"})
midpoints = {col: (means.loc["PDB", col] + means.loc["DisProt", col]) / 2
             for col in df_feat.columns[:-1]}

print("Global Feature Means (DisProt vs. PDB):\n")
print(means, "\n")
print("Chosen Midpoint Thresholds:\n")
for feat, t in midpoints.items():
    print(f"  {feat:18s} = {t:.3f}")
print()

# ─── (E) Count how many of the 7 conditions each chain satisfies ───────────────
def count_conditions(row):
    c1 = row["hydro_norm"]          >= midpoints["hydro_norm"]
    c2 = abs(row["charge"])         <= abs(midpoints["charge"])
    c3 = row["h_dh_a"]              <= midpoints["h_dh_a"]
    c4 = row["norm_flex"]           <= midpoints["norm_flex"]
    c5 = row["pol_norm"]            <= midpoints["pol_norm"]
    c6 = row["arom_plus_helix"]     >= midpoints["arom_plus_helix"]
    c7 = row["asa_norm"]            <= midpoints["asa_norm"]
    return sum([c1, c2, c3, c4, c5, c6, c7])

df_feat["conditions_met"] = df_feat.apply(count_conditions, axis=1)

# Show the distribution of “conditions_met” separately for PDB vs. DisProt
dist = df_feat.groupby("label")["conditions_met"] \
              .value_counts() \
              .unstack(fill_value=0) \
              .rename(index={0:"DisProt", 1:"PDB"})

pd.set_option("display.max_columns", None)
print("Distribution of ‘conditions_met’ by Label:\n")
print(dist, "\n")

# ─── (F) For each k=1…7, classify “folded if conditions_met ≥ k” ─────────────
results = []
for k in range(1, 8):
    preds = (df_feat["conditions_met"] >= k).astype(int)
    tp = ((preds == 1) & (df_feat["label"] == 1)).sum()
    fn = ((preds == 0) & (df_feat["label"] == 1)).sum()
    tn = ((preds == 0) & (df_feat["label"] == 0)).sum()
    fp = ((preds == 1) & (df_feat["label"] == 0)).sum()
    acc = (tp + tn) / len(df_feat)
    results.append({
        "k (min # of features)": k,
        "TP": tp,
        "FN": fn,
        "TN": tn,
        "FP": fp,
        "Accuracy": f"{acc:.2%}"
    })

df_results = pd.DataFrame(results)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
print("Performance as we vary k = minimum # of satisfied conditions:\n")
print(df_results.to_string(index=False))


Global Feature Means (DisProt vs. PDB):

         hydro_norm    charge    h_dh_a  norm_flex  pol_norm  arom_plus_helix  \
label                                                                           
DisProt    0.401045 -0.022989  1.256314   0.837154  0.511825         0.719075   
PDB        0.459290 -0.013572  1.148274   0.809584  0.453446         0.765251   

         asa_norm  
label              
DisProt  0.519651  
PDB      0.456530   

Chosen Midpoint Thresholds:

  hydro_norm         = 0.430
  charge             = -0.018
  h_dh_a             = 1.202
  norm_flex          = 0.823
  pol_norm           = 0.483
  arom_plus_helix    = 0.742
  asa_norm           = 0.488

Distribution of ‘conditions_met’ by Label:

conditions_met     0     1     2     3     4     5    6    7
label                                                       
DisProt         3103  2976  1753  1455  1290  1268  941  214
PDB                0     1     3     2    15    10   27   12 

Performance as we vary k = m

In [6]:
# 3. Logistic Regression–Derived Seven‐Feature Classifier
 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# (1) Split the same feature matrix and label vector into train/test
X = df_feat.drop(columns=["label"])
y = df_feat["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# (2) Fit the logistic model (with class_weight='balanced'):
clf = LogisticRegression(
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    solver='lbfgs',
    max_iter=200,
    random_state=42
).fit(X_train, y_train)

# (3) After fitting, these attributes hold exactly the numbers we used:
print(clf.coef_.flatten())   # → [ 9.149,  3.051,  2.034, -7.553, -6.521,  8.728, -7.629 ]
print(clf.intercept_)        # → [0.131]


[ 1.16407641  1.85715342  3.49913782 -1.84923829 -0.91043986  2.02929407
 -0.31493117  0.9647182 ]
[-7.8232685]


In [7]:
# 3.2 Rule‐Based Seven‐Feature Classifier (Using Previously Learned LR Weights)

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

# ─── (A) Build aa_properties dictionary ───────────────────────────────────────
# We reuse exactly the same per‐residue dictionary of 7 features as before.
# Each amino acid maps to a 7‐element list:
#   [hydro_norm, charge,  h_dh_a,  norm_flex,  pol_norm,  arom_plus_helix,  asa_norm]

kd_hydro = {
    'A':  1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C':  2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I':  4.5,
    'L':  3.8, 'K': -3.9, 'M':  1.9, 'F':  2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V':  4.2
}
charge = {
    'A':  0, 'R':  1, 'N':  0, 'D': -1, 'C':  0,
    'Q':  0, 'E': -1, 'G':  0, 'H':  0, 'I':  0,
    'L':  0, 'K':  1, 'M':  0, 'F':  0, 'P':  0,
    'S':  0, 'T':  0, 'W':  0, 'Y':  0, 'V':  0
}
h_donors = {'A':0,'R':2,'N':2,'D':0,'C':0,'Q':2,'E':0,'G':0,'H':1,'I':0,
            'L':0,'K':1,'M':0,'F':0,'P':0,'S':1,'T':1,'W':1,'Y':1,'V':0}
h_acceptors = {'A':0,'R':0,'N':2,'D':2,'C':1,'Q':2,'E':2,'G':0,'H':1,'I':0,
               'L':0,'K':0,'M':0,'F':0,'P':0,'S':1,'T':1,'W':0,'Y':1,'V':0}
flexibility = {
    'A': 0.357, 'R': 0.529, 'N': 0.463, 'D': 0.511, 'C': 0.346,
    'Q': 0.493, 'E': 0.497, 'G': 0.544, 'H': 0.323, 'I': 0.462,
    'L': 0.365, 'K': 0.466, 'M': 0.295, 'F': 0.314, 'P': 0.509,
    'S': 0.507, 'T': 0.444, 'W': 0.305, 'Y': 0.420, 'V': 0.386
}
sidechain_volume = {
    'A':  88.6, 'R': 173.4, 'N': 114.1, 'D': 111.1, 'C': 108.5,
    'Q': 143.8, 'E': 138.4, 'G':  60.1, 'H': 153.2, 'I': 166.7,
    'L': 166.7, 'K': 168.6, 'M': 162.9, 'F': 189.9, 'P': 112.7,
    'S':  89.0, 'T': 116.1, 'W': 227.8, 'Y': 193.6, 'V': 140.0
}
polarity = {
    'A':  8.1, 'R': 10.5, 'N': 11.6, 'D': 13.0, 'C':  5.5,
    'Q': 10.5, 'E': 12.3, 'G':  9.0, 'H': 10.4, 'I':  5.2,
    'L':  4.9, 'K': 11.3, 'M':  5.7, 'F':  5.2, 'P':  8.0,
    'S':  9.2, 'T':  8.6, 'W':  5.4, 'Y':  6.2, 'V':  5.9
}
choufa_helix = {
    'A': 1.45, 'R': 0.79, 'N': 0.73, 'D': 1.01, 'C': 0.77,
    'Q': 1.17, 'E': 1.51, 'G': 0.53, 'H': 1.00, 'I': 1.08,
    'L': 1.34, 'K': 1.07, 'M': 1.20, 'F': 1.12, 'P': 0.59,
    'S': 0.79, 'T': 0.82, 'W': 1.14, 'Y': 0.61, 'V': 1.06
}
choufa_sheet = {
    'A': 0.97, 'R': 0.90, 'N': 0.65, 'D': 0.54, 'C': 1.30,
    'Q': 1.23, 'E': 0.37, 'G': 0.75, 'H': 0.87, 'I': 1.60,
    'L': 1.22, 'K': 0.74, 'M': 1.67, 'F': 1.28, 'P': 0.62,
    'S': 0.72, 'T': 1.20, 'W': 1.19, 'Y': 1.29, 'V': 1.70
}
rel_ASA = {
    'A': 0.74, 'R': 1.48, 'N': 1.14, 'D': 1.23, 'C': 0.86,
    'Q': 1.36, 'E': 1.26, 'G': 1.00, 'H': 0.91, 'I': 0.59,
    'L': 0.61, 'K': 1.29, 'M': 0.64, 'F': 0.65, 'P': 0.71,
    'S': 1.42, 'T': 1.20, 'W': 0.55, 'Y': 0.63, 'V': 0.54
}
beta_branched = {aa: (1 if aa in ('V','I','T') else 0) for aa in kd_hydro.keys()}

aa_properties = {}
canonical_set = set(kd_hydro.keys())
for aa in canonical_set:
    # Normalize hydrophobicity to [0,1]
    hydro_norm  = (kd_hydro[aa] + 4.5) / 9.0
    # Normalize sidechain volume (not used directly in the 7‐feature vector)
    volume_norm = sidechain_volume[aa] / 227.8
    # Normalize polarity → [0,1]
    pol_norm    = (polarity[aa] - 4.9) / (13.0 - 4.9)
    # Normalize helix propensity → [0,1]
    helix_norm  = choufa_helix[aa] / 1.51
    # Normalize sheet propensity → [0,1]
    sheet_norm  = choufa_sheet[aa] / 1.70
    # Normalize ASA → [0,1]
    asa_norm    = (rel_ASA[aa] - 0.54) / (1.48 - 0.54)
    # Aromatic indicator (F, Y, W)
    aromatic    = 1 if aa in ('F','Y','W') else 0

    # Our final 7 features per residue:
    aa_properties[aa] = [
        hydro_norm,                    # [0] normalized hydrophobicity
        charge[aa],                    # [1] net charge
        h_donors[aa] + h_acceptors[aa],# [2] total H-bond donors+acceptors
        flexibility[aa] / 0.544,       # [3] normalized flexibility (max=0.544)
        pol_norm,                      # [4] normalized polarity
        aromatic + helix_norm,         # [5] aromatic + helix propensity
        asa_norm                       # [6] normalized solvent-accessible surface
    ]

# ─── (B) Load FASTA sequences ─────────────────────────────────────────────────
def load_fasta(filepath, filter_non_canonical=False):
    """
    Read all sequences from a FASTA file.
    If filter_non_canonical=True, discard any sequence containing non-standard AAs.
    """
    seqs = []
    with open(filepath) as f:
        header = None
        seq = ""
        for line in f:
            line = line.strip()
            if line.startswith(">"):
                if header is not None and seq:
                    # Only keep this sequence if all residues are in our 20‐AA set,
                    # when filter_non_canonical=True.
                    if (not filter_non_canonical) or (set(seq) <= canonical_set):
                        seqs.append(seq)
                header = line
                seq = ""
            else:
                seq += line
        # Catch the last sequence
        if header is not None and seq:
            if (not filter_non_canonical) or (set(seq) <= canonical_set):
                seqs.append(seq)
    return seqs

# Load ~70 folded PDB chains (filter non-canonical AA)
pdb_seqs     = load_fasta("pdb_chains.fasta",    filter_non_canonical=True)
# Load ~13k DisProt sequences (allow non-canonical AA)
disprot_seqs = load_fasta("disprot_13000.fasta", filter_non_canonical=False)

# ─── (C) Compute each sequence’s 7 global features ─────────────────────────────
def compute_global_features(sequence):
    """
    For a given AA sequence, compute a 7‐element array:
      [mean_hydro_norm, mean_charge, mean_h_dh_a,
       mean_norm_flex,  mean_pol_norm,  mean_arom_plus_helix,  mean_asa_norm]
    by averaging per-residue aa_properties.
    """
    props = []
    for aa in sequence:
        if aa in aa_properties:
            props.append(aa_properties[aa])
    if not props:
        # If the sequence is empty or has no canonical AA, return zeros
        return np.zeros(7)
    return np.mean(np.vstack(props), axis=0)

# Build feature matrix (one row per protein) and label vector
all_features = []
all_labels   = []

for seq in pdb_seqs:
    all_features.append(compute_global_features(seq))
    all_labels.append(1)  # 1 = folded (PDB)
for seq in disprot_seqs:
    all_features.append(compute_global_features(seq))
    all_labels.append(0)  # 0 = disordered (DisProt)

df_feat = pd.DataFrame(
    all_features,
    columns=[
        "hydro_norm",        # normalized hydrophobicity
        "charge",            # net charge
        "h_dh_a",            # total H-bond donors + acceptors
        "norm_flex",         # normalized flexibility
        "pol_norm",          # normalized polarity
        "arom_plus_helix",   # aromatic + helix propensity
        "asa_norm"           # normalized ASA
    ]
)
df_feat["label"] = all_labels

# ─── (D) Logistic Regression “Rule” Using Learned Weights ────────────────────
# We previously trained a logistic model and obtained these weights:
#   hydro_norm      → +9.149
#   charge          → +3.051
#   h_dh_a          → +2.034
#   norm_flex       → –7.553
#   pol_norm        → –6.521
#   arom_plus_helix → +8.728
#   asa_norm        → –7.629
# Intercept = +0.131
#
# The sigmoid(score) = 1 / (1 + exp( – (intercept + Σ weight_i × feature_i) )).
# We predict “folded” (PDB) if sigmoid(score) > 0.5.

# Store weights and intercept in numpy arrays for easy dot‐product
weights = np.array([
    9.149,    # weight for hydro_norm
    3.051,    # weight for charge
    2.034,    # weight for h_dh_a
   -7.553,    # weight for norm_flex
   -6.521,    # weight for pol_norm
    8.728,    # weight for arom_plus_helix
   -7.629     # weight for asa_norm
])
intercept = 0.131

# Compute “score” and predicted probability for each protein in df_feat
# sigmoid(x) = 1/(1 + exp(–x))
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Extract feature matrix (N × 7)
X = df_feat[[
    "hydro_norm",
    "charge",
    "h_dh_a",
    "norm_flex",
    "pol_norm",
    "arom_plus_helix",
    "asa_norm"
]].values

# Compute raw scores: intercept + X ⋅ weights
raw_scores = intercept + X.dot(weights)

# Compute predicted probabilities of “PDB” (folded)
probs_pdb = sigmoid(raw_scores)

# Choose threshold = 0.5 for “PDB” vs. “DisProt”
preds_05 = (probs_pdb > 0.5).astype(int)

# ─── (E) Evaluate on the Entire Dataset ───────────────────────────────────────
true_labels = df_feat["label"].values

print("Classification Report (Threshold = 0.5):\n")
print(classification_report(true_labels, preds_05, target_names=["DisProt","PDB"]))

cm = confusion_matrix(true_labels, preds_05)
cm_df = pd.DataFrame(
    cm,
    index=["Actual DisProt", "Actual PDB"],
    columns=["Pred DisProt", "Pred PDB"]
)
print("Confusion Matrix (Threshold = 0.5):\n")
print(cm_df)

# ─── (F) Optionally, Adjust Threshold to Reduce False Positives ──────────────
# Because PDB is very rare, you may want to require a higher probability (e.g., 0.7) 
# to call “PDB.” Simply do: preds_07 = (probs_pdb > 0.7).astype(int) and re‐evaluate.

# Example at threshold = 0.7:
preds_07 = (probs_pdb > 0.7).astype(int)
print("\n--- Threshold = 0.7 ---")
print(classification_report(true_labels, preds_07, target_names=["DisProt","PDB"]))
cm_07 = confusion_matrix(true_labels, preds_07)
cm_07_df = pd.DataFrame(
    cm_07,
    index=["Actual DisProt", "Actual PDB"],
    columns=["Pred DisProt", "Pred PDB"]
)
print("Confusion Matrix (Threshold = 0.7):\n")
print(cm_07_df)

# ─── (G) (Optional) Inspect the Learned Weights ──────────────────────────────
print("\nLearned Logistic Weights:")
feature_names = [
    "hydro_norm", "charge", "h_dh_a",
    "norm_flex", "pol_norm", "arom_plus_helix", "asa_norm"
]
for name, w in zip(feature_names, weights):
    print(f"{name:15s} → {w:.3f}")
print(f"Intercept: {intercept:.3f}")


Classification Report (Threshold = 0.5):

              precision    recall  f1-score   support

     DisProt       1.00      0.74      0.85     13000
         PDB       0.02      0.93      0.04        70

    accuracy                           0.74     13070
   macro avg       0.51      0.84      0.45     13070
weighted avg       0.99      0.74      0.85     13070

Confusion Matrix (Threshold = 0.5):

                Pred DisProt  Pred PDB
Actual DisProt          9663      3337
Actual PDB                 5        65

--- Threshold = 0.7 ---
              precision    recall  f1-score   support

     DisProt       1.00      0.89      0.94     13000
         PDB       0.02      0.40      0.04        70

    accuracy                           0.89     13070
   macro avg       0.51      0.64      0.49     13070
weighted avg       0.99      0.89      0.93     13070

Confusion Matrix (Threshold = 0.7):

                Pred DisProt  Pred PDB
Actual DisProt         11543      1457
Actual PDB