In [8]:
import requests

In [9]:
test_runs = ["1lo6", "2q7n", "3c98", "4qum", "6md5"]

In [10]:
def get_fasta(pdb_id, output_file):
        url = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
        
        try:
            response = requests.get(url)
            response.raise_for_status()  # Raise an exception for any HTTP error
        except requests.exceptions.RequestException as e:
            print(f"Failed to retrieve FASTA for PDB ID {pdb_id}: {e}")
            return
        
        fasta_text = response.text.strip()
        if fasta_text:
            with open(output_file, "w") as f:
                f.write(fasta_text)
        else:
            print(f"No FASTA data found for PDB ID {pdb_id}")

In [11]:
for tests in test_runs:
    get_fasta(tests, f"example_run/{tests}.fasta")

In [17]:
import pandas as pd

def analyze(name, i, native, my_seq, benchmark):
    my_seq = my_seq.replace("<","").replace(">","")
    to_csv = f"{name}/{name}_ver_{i}.csv"

    to_add = []
    for i in range(len(native)):
        pos = i + 1
        if native[i] != my_seq[i] or native[i] != benchmark[i]:
            to_add.append([pos, native[i], my_seq[i], benchmark[i]])
            
    df = pd.DataFrame(to_add,columns=["Position", "Native", "prossmpnn_seq", "benchmarked"])

    same_col1_col2 = df[df['Native'] == df['prossmpnn_seq']]
    same_col2_col3 = df[df['prossmpnn_seq'] == df['benchmarked']]
    same_col1_col3 = df[df['Native'] == df['benchmarked']]
    all_different = df[(df['Native'] != df['prossmpnn_seq']) & (df['Native'] != df['benchmarked']) & (df['prossmpnn_seq'] != df['benchmarked'])]

    same_col1_col2["Decription"] = f"Only benchmarked differs"
    same_col1_col3["Decription"] = f"Only prossmpnn differs"
    all_different["Decription"] = f"Mutated position but different residues for prossmpnn and native"
    same_col2_col3["Decription"] = f"Both benchmarked and prossmpnn sequences agree on the mutation."
    
    combined_df = pd.concat([same_col1_col2, same_col2_col3, same_col1_col3, all_different])
    combined_df.to_csv(to_csv, index=False)


In [20]:
#benchmarking and comparing for 1lo6

native = "LVHGGPCDKTSHPYQAALYTSGHLLCGGVLIHPLWVLTAAHCKKPNLQVFLGKHNLRQRESSQEQSSVVRAVIHPDYDAASHDQDIMLLRLARPAKLSELIQPLPLERDCSANTTSCHILGWGKTADGDFPDTIQCAYIHLVSREECEHAYPGQITQNMLCAGDEKYGKDSCQGDSGGPLVCGDHLRGLVSWGNIPCGSKEKPGVYTNVCRYTNWIQKTIQAK"
my_seq = "LVHG<Y>PCDK<H>SHPYQAALYTSGHLLCGGVLIHPLWVLTAAHCKKPNLQVFLGKHNLRQ<Q>ESSQEQSSVVRAVIHP<G>YDA<S>SHDQDIMLLRLARPAKLSELIQPLPLERDCSANTTSCHILGWGKT<T>DGDFPDTIQCAYIHLVSREECEHAYPGQITQNMLCAGDEKYGKDSCQGDSGGPLVCGDHLRGLVSWGNIPCG<Q>K<N>KPGVYT<K>VCRYTNWIQKTIQA<R>"
benchmark = [
    "LVHGGPCDKTSHPYQAALYTSGHLLCGGVLIHPLWVLTAAHCKKPNLQVFLGKHNLRQRESSQEQSSVVRAVIHPDYDAASHDNDIMLLRLARPAKLSELIQPLPLERDCSANTTSCHILGWGKTADGDFPDTIQCAYIHLVSDEECEKAYPGQITDNMLCAGDEKYGKDSCQGDSGGPLVCGGHLRGLVSWGNIPCGSKEKPGVYTNVCRYTDWIQKTIQAK",
    "LVHGGPCDKTSHPYQAALYTSGHLLCGGVLIHPLWVLTAAHCKKPNLQVFLGKHNLGQQESSQEQSSVVRAVIHPDYDAASHDNDIMLLRLARPAKLSELIQPLPLERDCSANTTSCHILGWGKTADGDFPDTIQCAYIHLVSDEECEKAYPGQITDNMLCAGDEKYGKDSCQGDSGGPLVCGGHLRGLVSWGNIPCGSKEKPGVYTNVCRYTDWIQKTIQAK",
    "LVHGGPCDKTSHPYQAALYTSGHLLCGGVLIHPLWVLTAAHCKKPNLQVFLGKHNLRQRESSQEQSSVVRAVIHPDYDAATHDQDIMLLRLARPAKLSEHIQPLPLERDCSANDTSCHILGWGKTADGDFPDTIQCAYIYLLSDEECERAYPGQITDNMLCAGDEKYGKDSCQGDSGGPLVCGGHLRGLVSWGNIPCGSKEKPGVYTNVCRFVDWIQKTIQAK",
    "LVHGGPCDKTSHPYQAALYTSGHLLCGGVLIHPLWVLTAAHCKKPNLQVFLGKHNLGQQESSQEQSSVVRAVIHPDYDAATHDQDIMLLRLARPAKLSEHIQPLPLERDCSANDTSCHILGWGKTADGDFPDTIQCAYIYLLSDEECERAYPGQITDNMLCAGDEKYGKDSCQGDSGGPLVCGGHLRGLVSWGNIPCGSKEKPGVYTNVCRFVDWIQKTIQAK",
    "LVHGGPCDKTSHPYQAALYTSGHLLCGGVLIHPLWVLTAAHCKKPNLQVFLGKHNLRQRESSQEQSSVVRTVVHPGYDAATHDNDIMLLRLARPAKLSEHIQPLPLERDCSANHTSCHILGWGKTADGDFPDTIQCAYIYLVSQEECEKAYPGQITDNMLCAGDEKYGKDSCQGDSGGPLVCGGHLRGLVSWGHVPCGSKEKPGVYTNVCRYVDWIQKTIQAK",
    "LVHGGPCDKTSHPYQAALYTSGHLLCGGVLIHPLWVLTAAHCKKPNLQVFLGKHNLGQQESSQEQSSVVRTVVHPGYDAATHDNDIMLLRLARPAKLSEHIQPLPLERDCSANHTSCHILGWGKTADGDFPDTIQCAYIYLVSQEECEKAYPGQITDNMLCAGDEKYGKDSCQGDSGGPLVCGGHLRGLVSWGHVPCGSKEKPGVYTNVCRYVDWIQKTIQAK"]

for i in range(len(benchmark)):
    analyze("1lo6", i+1, native, my_seq, benchmark[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_col1_col2["Decription"] = f"Only benchmarked differs"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_col1_col3["Decription"] = f"Only prossmpnn differs"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_col1_col2["Decription"] = f"Only benchmarked differs"
A value is trying to be set on

In [21]:
#benchmarking and comparing for 2q7n

native =  "SPLPITPVNATCAIRHPCHNNLMNQIRSQLAQLNGSANALFILYYTAQGEPFPNNLDKLCGPNVTDFPPFHANGTEKAKLVELYRIVVYLGTSLGNITRDQKILNPSALSLHSKLNATADILRGLLSNVLCRLCSKYHVGHVDVTYGPDTSGKDVFQKKKLGCQLLGKYKQIIAVLAQAF"
my_seq = "SPLP<A>TPVNA<S>C<T>I<S><T>PCH<S>NLMNQIR<T>QLAQLNGSANALFILYYTAQGEPFPNNLDKLCGPNVTDFPPFHANGTEKAKLVELYRIVVYLGTSLGNITRDQKILNPSALSLH<T>KLNATADILR<S>LLSNV<R>CRLCSKYHVGHVDVTYGPDTSGKDVFQKKKLGCQLLGKYKQIIAVLAQAF"
benchmark = [
    "SPLPITPVNATCAIRHPCHNNLMNQIRSQLAQLNGSANDLFILYYTAQGEPFPNNLDKLCGPNVTDFPPFHANGTEKDKLVELYRIVVYLGTALGNITRDQKILNPSAQNLHSKLNATADILRGLLSNVLCRLCSKYHVGHVDVTYGPDTSGKDVFQKKKLGCQLLGKYKQIIAVLAQAF",
    "SPLPITPVNATCAIRHPCHNNLMNQIRSQLAQLNGEANDLFILYYTAQGEPFPNNLDKLCGPNVTDFPPFHPNGTEKDKLVELYRIIVYLGTALGNIWRDQKILNPSAQNLHSKLNATADILRGLLSNVLCRLCSKYHVGHVDVTYGPNTSGKDVFQKKKLGCQLLGKYKQVIAELAQAF"
]

for i in range(len(benchmark)):
    analyze("2q7n", i+1, native, my_seq, benchmark[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_col1_col2["Decription"] = f"Only benchmarked differs"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_col1_col3["Decription"] = f"Only prossmpnn differs"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_col1_col2["Decription"] = f"Only benchmarked differs"
A value is trying to be set on

In [None]:
#benchmarking and comparing for 4qum