import requests
import os

# 2.1) Use DisProt's search endpoint with format=fasta.
# The request is bounded by a 15 s timeout; any transport failure or non-2xx
# status is re-raised as RuntimeError with the original exception chained so
# the underlying traceback is not lost.
url = "https://disprot.org/api/search?format=fasta&limit=100"
try:
    resp = requests.get(url, timeout=15)
    resp.raise_for_status()
except requests.RequestException as e:
    raise RuntimeError(f"Failed to GET DisProt FASTA via API: {e}") from e

text = resp.text.strip()

# 2.2) Quick sanity check: FASTA must start with '>', not '<'
if not text.startswith(">"):
    raise RuntimeError(
        "Downloaded content does not look like FASTA. "
        "If it begins with '<', you're still hitting an HTML page instead of raw FASTA."
    )

# 2.3) Write the downloaded DisProt records to a file.
# NOTE(review): `limit=100` does not guarantee 100 FASTA records (the headers
# carry per-region ids such as DP00003r002), so count the records instead of
# assuming a fixed number.
n_records = sum(1 for line in text.splitlines() if line.startswith(">"))
with open("disprot_100.fasta", "w") as f:
    f.write(text + "\n")

print(f"✔ Successfully fetched {n_records} DisProt FASTA records → 'disprot_100.fasta'")


In [1]:
# Preview the first five lines of the downloaded FASTA file
# (lines past end-of-file show up as empty lines).
with open("disprot_100.fasta") as fasta_file:
    head_lines = [fasta_file.readline().rstrip() for _ in range(5)]
print("\n".join(head_lines))


>disprot|DP00003r002 pos=294-334 term=IDPO:00076 ec=ECO:0006220 pmid=8632448
EHVIEMDVTSENGQRALKEQSSKAKIVKNRWGRNVVQISNT

>disprot|DP00003r004 pos=454-464 term=IDPO:00076 ec=ECO:0006220 pmid=8632448
VYRNSRAQGGG


In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

# ─── (A) Build aa_properties exactly as in STEP X ─────────────────────────────
# Each table below maps the 20 one-letter amino-acid codes to one numeric
# property.  The key set of `kd_hydro` is reused as the reference alphabet.

# Hydropathy per residue; values span -4.5 (R) to 4.5 (I).
# NOTE(review): presumably the Kyte–Doolittle scale, judging by the name — confirm source.
kd_hydro = {
    'A':  1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C':  2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I':  4.5,
    'L':  3.8, 'K': -3.9, 'M':  1.9, 'F':  2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V':  4.2
}
# Integer charge per residue: +1 for R and K, -1 for D and E, 0 otherwise
# (H is treated as neutral here).
charge = {
    'A':  0, 'R':  1, 'N':  0, 'D': -1, 'C':  0,
    'Q':  0, 'E': -1, 'G':  0, 'H':  0, 'I':  0,
    'L':  0, 'K':  1, 'M':  0, 'F':  0, 'P':  0,
    'S':  0, 'T':  0, 'W':  0, 'Y':  0, 'V':  0
}
# Small integer counts (0–2) per residue.
# NOTE(review): presumably side-chain hydrogen-bond donor / acceptor counts — confirm source.
h_donors = {'A':0,'R':2,'N':2,'D':0,'C':0,'Q':2,'E':0,'G':0,'H':1,'I':0,
            'L':0,'K':1,'M':0,'F':0,'P':0,'S':1,'T':1,'W':1,'Y':1,'V':0}
h_acceptors = {'A':0,'R':0,'N':2,'D':2,'C':1,'Q':2,'E':2,'G':0,'H':1,'I':0,
               'L':0,'K':0,'M':0,'F':0,'P':0,'S':1,'T':1,'W':0,'Y':1,'V':0}
# Flexibility index per residue; the largest value in the table is 0.544 (G).
flexibility = {
    'A': 0.357, 'R': 0.529, 'N': 0.463, 'D': 0.511, 'C': 0.346,
    'Q': 0.493, 'E': 0.497, 'G': 0.544, 'H': 0.323, 'I': 0.462,
    'L': 0.365, 'K': 0.466, 'M': 0.295, 'F': 0.314, 'P': 0.509,
    'S': 0.507, 'T': 0.444, 'W': 0.305, 'Y': 0.420, 'V': 0.386
}
# Volume per residue; the largest value in the table is 227.8 (W).
# NOTE(review): units are not stated in this file (presumably Å^3) — confirm.
sidechain_volume = {
    'A':  88.6, 'R': 173.4, 'N': 114.1, 'D': 111.1, 'C': 108.5,
    'Q': 143.8, 'E': 138.4, 'G':  60.1, 'H': 153.2, 'I': 166.7,
    'L': 166.7, 'K': 168.6, 'M': 162.9, 'F': 189.9, 'P': 112.7,
    'S':  89.0, 'T': 116.1, 'W': 227.8, 'Y': 193.6, 'V': 140.0
}
# Polarity per residue; values span 4.9 (L) to 13.0 (D).
polarity = {
    'A':  8.1, 'R': 10.5, 'N': 11.6, 'D': 13.0, 'C':  5.5,
    'Q': 10.5, 'E': 12.3, 'G':  9.0, 'H': 10.4, 'I':  5.2,
    'L':  4.9, 'K': 11.3, 'M':  5.7, 'F':  5.2, 'P':  8.0,
    'S':  9.2, 'T':  8.6, 'W':  5.4, 'Y':  6.2, 'V':  5.9
}
# Helix propensity per residue; the largest value in the table is 1.51 (E).
# NOTE(review): presumably Chou–Fasman parameters, judging by the name — confirm source.
choufa_helix = {
    'A': 1.45, 'R': 0.79, 'N': 0.73, 'D': 1.01, 'C': 0.77,
    'Q': 1.17, 'E': 1.51, 'G': 0.53, 'H': 1.00, 'I': 1.08,
    'L': 1.34, 'K': 1.07, 'M': 1.20, 'F': 1.12, 'P': 0.59,
    'S': 0.79, 'T': 0.82, 'W': 1.14, 'Y': 0.61, 'V': 1.06
}
# Sheet propensity per residue; the largest value in the table is 1.70 (V).
choufa_sheet = {
    'A': 0.97, 'R': 0.90, 'N': 0.65, 'D': 0.54, 'C': 1.30,
    'Q': 1.23, 'E': 0.37, 'G': 0.75, 'H': 0.87, 'I': 1.60,
    'L': 1.22, 'K': 0.74, 'M': 1.67, 'F': 1.28, 'P': 0.62,
    'S': 0.72, 'T': 1.20, 'W': 1.19, 'Y': 1.29, 'V': 1.70
}
# Relative accessible surface area per residue; values span 0.54 (V) to 1.48 (R).
rel_ASA = {
    'A': 0.74, 'R': 1.48, 'N': 1.14, 'D': 1.23, 'C': 0.86,
    'Q': 1.36, 'E': 1.26, 'G': 1.00, 'H': 0.91, 'I': 0.59,
    'L': 0.61, 'K': 1.29, 'M': 0.64, 'F': 0.65, 'P': 0.71,
    'S': 1.42, 'T': 1.20, 'W': 0.55, 'Y': 0.63, 'V': 0.54
}
# Binary flag: 1 for the beta-branched residues V, I and T, 0 for every other residue.
beta_branched = {aa: (1 if aa in ('V','I','T') else 0) for aa in kd_hydro.keys()}

# Assemble the per-residue property table: every canonical residue maps to a
# 12-element list.  Scaled entries use the extreme values of the raw tables
# (e.g. hydropathy spans -4.5..4.5, the largest volume is 227.8).
canonical_set = set(kd_hydro.keys())
aa_properties = {}
for aa in canonical_set:
    aa_properties[aa] = [
        (kd_hydro[aa] + 4.5) / 9.0,               # [0]  hydropathy scaled to 0..1
        charge[aa],                               # [1]  integer charge
        h_donors[aa],                             # [2]  donor count
        h_acceptors[aa],                          # [3]  acceptor count
        flexibility[aa],                          # [4]  raw flexibility
        sidechain_volume[aa] / 227.8,             # [5]  volume / largest volume
        (polarity[aa] - 4.9) / (13.0 - 4.9),      # [6]  polarity scaled to 0..1
        1 if aa in "FYW" else 0,                  # [7]  aromatic flag
        choufa_helix[aa] / 1.51,                  # [8]  helix propensity / largest
        choufa_sheet[aa] / 1.70,                  # [9]  sheet propensity / largest
        (rel_ASA[aa] - 0.54) / (1.48 - 0.54),     # [10] relative ASA scaled to 0..1
        beta_branched[aa],                        # [11] beta-branched flag
    ]

# ─── (B) Load FASTA sequences ─────────────────────────────────────────────────
def load_fasta(filepath, filter_non_canonical=False):
    """Read the sequences (without headers) from a FASTA file.

    Lines are stripped of surrounding whitespace; a line starting with ">"
    opens a new record and all following lines are concatenated into that
    record's sequence.  Text before the first header is ignored and records
    with an empty sequence are dropped.

    Args:
        filepath: Path of the FASTA file to read.
        filter_non_canonical: When true, keep only sequences whose characters
            all belong to the module-level ``canonical_set``.

    Returns:
        A list of sequence strings in file order.
    """
    record_chunks = []  # one list of sequence-line fragments per header seen
    with open(filepath) as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith(">"):
                record_chunks.append([])
            elif record_chunks:
                record_chunks[-1].append(text)

    sequences = ("".join(chunks) for chunks in record_chunks)
    return [
        s for s in sequences
        if s and (not filter_non_canonical or set(s) <= canonical_set)
    ]

# PDB chains are kept only if every character is one of the canonical residues;
# DisProt records are loaded without that filter.
# NOTE(review): the recorded output further down tallies 70 PDB chains but
# 12,787 DisProt records, so "disprot_100.fasta" does not hold exactly 100
# sequences — confirm what the file is expected to contain.
pdb_seqs    = load_fasta("pdb_chains.fasta",   filter_non_canonical=True)   # 70 PDB chains
disprot_seqs = load_fasta("disprot_100.fasta", filter_non_canonical=False)  # DisProt records (count is not fixed at 100)

# ─── (C) Compute each chain’s 7 global features ────────────────────────────────
def compute_global_features(sequence):
    """Average seven per-residue descriptors over a sequence.

    Residues missing from the module-level ``aa_properties`` table are
    skipped.  The seven values are, in order: scaled hydropathy, charge,
    donors + acceptors, flexibility / 0.544, scaled polarity,
    aromatic flag + scaled helix propensity, and scaled relative ASA.

    Args:
        sequence: Iterable of one-letter residue codes.

    Returns:
        A length-7 NumPy array of column means, or seven zeros when no
        residue of the sequence is in the table.
    """
    known_vectors = (aa_properties[res] for res in sequence if res in aa_properties)
    rows = [
        [
            vec[0],             # hydrophobicity_norm
            vec[1],             # charge
            vec[2] + vec[3],    # h_dh_a
            vec[4] / 0.544,     # norm_flex (raw_flex/0.544)
            vec[6],             # pol_norm
            vec[7] + vec[8],    # arom_plus_helix
            vec[10],            # asa_norm
        ]
        for vec in known_vectors
    ]
    if rows:
        return np.vstack(rows).mean(axis=0)
    return np.zeros(7)

# Feature rows: PDB chains first (label 1 = folded), then DisProt records
# (label 0 = disordered); labels are built in the same order.
all_features = [compute_global_features(chain) for chain in pdb_seqs]
all_features += [compute_global_features(region) for region in disprot_seqs]
all_labels = [1] * len(pdb_seqs) + [0] * len(disprot_seqs)

# One row per sequence, seven feature columns plus the class label.
df_feat = pd.DataFrame(
    all_features,
    columns=[
        "hydro_norm", "charge", "h_dh_a", "norm_flex",
        "pol_norm", "arom_plus_helix", "asa_norm",
    ],
)
df_feat["label"] = all_labels

# ─── (D) Compute midpoint thresholds (mean of PDB vs. mean of DisProt) ───────
# Pick the feature columns by name instead of by position: once later cells
# append columns such as "conditions_met", `df_feat.columns[:-1]` would include
# "label" (KeyError below) and the means table would gain a non-feature column.
non_feature_cols = {"label", "conditions_met"}
feature_cols = [col for col in df_feat.columns if col not in non_feature_cols]

# Per-class mean of every feature; rows are relabelled 0 -> "DisProt", 1 -> "PDB".
means = (
    df_feat.groupby("label")[feature_cols]
    .mean()
    .rename(index={0: "DisProt", 1: "PDB"})
)
# Threshold for each feature = midpoint between the two class means.
midpoints = {col: (means.loc["PDB", col] + means.loc["DisProt", col]) / 2
             for col in feature_cols}

print("Global Feature Means (DisProt vs. PDB):\n")
print(means, "\n")
print("Chosen Midpoint Thresholds:\n")
for feat, t in midpoints.items():
    print(f"  {feat:18s} = {t:.3f}")
print()

# ─── (E) Count how many of the 7 conditions each chain satisfies ───────────────
def count_conditions(row):
    """Count how many of the seven threshold conditions a feature row meets.

    Thresholds come from the module-level ``midpoints`` mapping.  A condition
    holds when the value is at or above the threshold for ``hydro_norm`` and
    ``arom_plus_helix``, at or below it for ``h_dh_a``, ``norm_flex``,
    ``pol_norm`` and ``asa_norm``, and when ``|charge|`` is at most the
    absolute value of the charge threshold.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by feature name.

    Returns:
        The number of satisfied conditions (0–7).
    """
    at_or_above = ("hydro_norm", "arom_plus_helix")
    at_or_below = ("h_dh_a", "norm_flex", "pol_norm", "asa_norm")

    satisfied = [row[name] >= midpoints[name] for name in at_or_above]
    satisfied += [row[name] <= midpoints[name] for name in at_or_below]
    # Charge is compared by magnitude on both sides.
    satisfied.append(abs(row["charge"]) <= abs(midpoints["charge"]))
    return sum(satisfied)

# Score every row with the number of threshold conditions it satisfies.
df_feat["conditions_met"] = df_feat.apply(count_conditions, axis=1)

# Tabulate the scores per class: one row per label, one column per score,
# with 0 where a score never occurs for that label.
score_counts = df_feat.groupby("label")["conditions_met"].value_counts()
dist = score_counts.unstack(fill_value=0).rename(index={0: "DisProt", 1: "PDB"})

pd.set_option("display.max_columns", None)
print("Distribution of ‘conditions_met’ by Label:\n")
print(dist, "\n")

# ─── (F) For each k=1…7, classify “folded if conditions_met ≥ k” ─────────────
# Sweep the vote threshold k: a row is called "folded" when it satisfies at
# least k conditions.  Label 1 is the positive (folded) class.
is_folded = df_feat["label"] == 1
is_disordered = df_feat["label"] == 0

results = []
for k in range(1, 8):
    called_folded = df_feat["conditions_met"] >= k
    tp = (called_folded & is_folded).sum()
    fn = (~called_folded & is_folded).sum()
    tn = (~called_folded & is_disordered).sum()
    fp = (called_folded & is_disordered).sum()
    acc = (tp + tn) / len(df_feat)
    results.append(
        {
            "k (min # of features)": k,
            "TP": tp,
            "FN": fn,
            "TN": tn,
            "FP": fp,
            "Accuracy": f"{acc:.2%}",
        }
    )

df_results = pd.DataFrame(results)
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)
print("Performance as we vary k = minimum # of satisfied conditions:\n")
print(df_results.to_string(index=False))


Global Feature Means (DisProt vs. PDB):

         hydro_norm    charge    h_dh_a  norm_flex  pol_norm  arom_plus_helix  \
label                                                                           
DisProt     0.40117 -0.023269  1.255558   0.837162  0.511919         0.718951   
PDB         0.45929 -0.013572  1.148274   0.809584  0.453446         0.765251   

         asa_norm  
label              
DisProt  0.519567  
PDB      0.456530   

Chosen Midpoint Thresholds:

  hydro_norm         = 0.430
  charge             = -0.018
  h_dh_a             = 1.202
  norm_flex          = 0.823
  pol_norm           = 0.483
  arom_plus_helix    = 0.742
  asa_norm           = 0.488

Distribution of ‘conditions_met’ by Label:

conditions_met     0     1     2     3     4     5    6    7
label                                                       
DisProt         3048  2892  1745  1438  1273  1249  931  211
PDB                0     1     3     2    15    10   27   12 

Performance as we vary k = m