# SCOP to UniProt

The main purpose of this notebook is to read and parse the entire SCOP dataset. For each protein, extract its amino acid sequence using the UniProt API. We then store all the information to a Pandas Dataframe which gets saved as a CSV.

#### Information Stored (Columns)
- UniProt ID
- Protein's Family
- Protein's Superfamily
- Sequence (in text)

## Loading the SCOP Dataset

In [1]:
from tqdm import tqdm 
import pandas as pd
from io import StringIO
from concurrent.futures import ThreadPoolExecutor
from Bio import SeqIO
import requests
import pandas as pd

In [2]:
gpfs_path = "/scratch/gpfs/jr8867"
scop_path = f"{gpfs_path}/datasets/scop/scop-cla.txt"

In [3]:
with open(scop_path, "r") as f:
    # First loop to skip lines starting with #
    for line in f:
        if line.startswith("#"): continue
        else: break

    # Second loop to process the remaining lines
    processed_scop_data = [line.strip().split() for line in f]
    
    # Create list of dictionaries for the main data
    records = []
    for i, parts in enumerate(processed_scop_data):
        record = {
            'i': i,
            "FA-DOMID": parts[0],
            "FA-PDBID": parts[1], 
            "FA-PDBREG": parts[2],
            "FA-UNIID": parts[3],
            "FA-UNIREG": parts[4],
            "SF-DOMID": parts[5],
            "SF-PDBID": parts[6],
            "SF-PDBREG": parts[7],
            "SF-UNIID": parts[8],
            "SF-UNIREG": parts[9]
        }
        # Parse SCOPCLA separately
        scopcla = {part.split('=')[0]: part.split('=')[1] for part in parts[10].split(',')}
        record.update(scopcla)
        records.append(record)
    
    scop_df = pd.DataFrame.from_records(records)

In [4]:
scop_df

Unnamed: 0,i,FA-DOMID,FA-PDBID,FA-PDBREG,FA-UNIID,FA-UNIREG,SF-DOMID,SF-PDBID,SF-PDBREG,SF-UNIID,SF-UNIREG,TP,CL,CF,SF,FA
0,0,8000376,2FR0,A:1448-1915,Q03131,1395-1862,8000377,2FR0,A:1657-1915,Q03131,1604-1862,1,1000002,2000148,3000038,4000119
1,1,8000693,1UDC,A:1-338,P09147,1-338,8000694,1UDC,A:1-263,P09147,1-263,1,1000002,2000148,3000038,4000088
2,2,8000768,2PWZ,A:1-312,P61889,1-312,8000769,2PWZ,A:1-312,P61889,1-312,1,1000002,2000005,3000039,4000045
3,3,8000794,1MG5,A:1-255,P00334,2-256,8001233,1MG5,A:1-240,P00334,2-241,1,1000002,2000148,3000038,4000029
4,4,8000805,1OBB,A:2-480,O33830,2-480,8001235,1OBB,A:2-480,O33830,2-480,1,1000002,2000005,3000039,4000089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36894,36894,8018581,3THX,B:217-1120,P20585,226-1129,8056712,3THX,B:357-507,P20585,366-516,1,1000002,2001251,3000587,4004015
36895,36895,8018581,3THX,B:217-1120,P20585,226-1129,8056714,3THX,B:845-1053,P20585,854-1062,1,1000002,2001251,3002020,4004015
36896,36896,8018583,2O8B,B:362-1335,P52701,362-1335,8056718,2O8B,B:423-487,P52701,423-487,1,1000002,2001251,3001688,4004015
36897,36897,8018583,2O8B,B:362-1335,P52701,362-1335,8056719,2O8B,B:537-697,P52701,537-697,1,1000002,2001251,3000587,4004015


In [11]:
def fetch_single_uid(entry):
    base_url="https://rest.uniprot.org/uniprotkb/"
    try:
        seq = StringIO(''.join(requests.post(f'{base_url}/{entry["FA-UNIID"]}.fasta').text))
        seq = list(SeqIO.parse(seq,'fasta'))
        return (entry['FA-UNIID'], str(seq[0].seq))
    except Exception as e:
        # print(f'failed to retrieve uid {entry["FA-UNIID"]}')
        return (entry['FA-UNIID'], None)

def download_uniprot(scop_data):    
    with ThreadPoolExecutor() as executor:
        retrieved_seqs = list(tqdm(executor.map(fetch_single_uid, scop_data), total=len(scop_data), ncols=100))
   
    return {uid: seq for (uid, seq) in retrieved_seqs if seq is not None}
def download_uniprot_scop_unified(scop_data):
    # Convert DataFrame to list of dicts for download_uniprot
    scop_records = scop_data.to_dict('records')
    seqs = download_uniprot(scop_records)
    
    data = []
    for entry in scop_records:
        if entry['FA-UNIID'] in seqs:
            data.append({
                'uid': entry['FA-UNIID'],
                'fa': entry['FA'],  # Direct column access
                'sf': entry['SF'],  # Direct column access
                'seq': seqs[entry['FA-UNIID']]
            })
    return pd.DataFrame(data)

In [None]:
df = download_uniprot_scop_unified(scop_df)

In [6]:
df

Unnamed: 0_level_0,uid,fa,sf,seq
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Q03131,4000119,3000038,MSGPRSRTTSRRTPVRIGAVVVASSTSELLDGLAAVADGRPHASVV...
1,P09147,4000088,3000038,MRVLVTGGSGYIGSHTCVQLLQNGHDVIILDNLCNSKRSVLPVIER...
2,P61889,4000045,3000039,MKVAVLGAAGGIGQALALLLKTQLPSGSELSLYDIAPVTPGVAVDL...
3,P00334,4000029,3000038,MSFTLTNKNVIFVAGLGGIGLDTSKELLKRDLKNLVILDRIENPAA...
4,O33830,4000089,3000039,MPSVKIGIIGAGSAVFSLRLVSDLCKTPGLSGSTVTLMDIDEERLD...
...,...,...,...,...
35972,P20585,4004015,3000587,MSRRKPASGGLAASSSAPARQAVLSRFFQSTGSLKSTSSSTGAADQ...
35973,P20585,4004015,3002020,MSRRKPASGGLAASSSAPARQAVLSRFFQSTGSLKSTSSSTGAADQ...
35974,P52701,4004015,3001688,MSRQSTLYSFFPKSPALSDANKASARASREGGRAAAAPGASPSPGG...
35975,P52701,4004015,3000587,MSRQSTLYSFFPKSPALSDANKASARASREGGRAAAAPGASPSPGG...


In [3]:
# Read the saved data back from CSV
df = pd.read_csv('/scratch/gpfs/jr8867/datasets/scop/scop_data.csv')
print(f"Loaded {len(df)} records from CSV file")

Loaded 35977 records from CSV file


In [7]:
df.reset_index(drop=True, inplace=True)
df.index.name = 'index'

In [8]:
df.to_csv('/scratch/gpfs/jr8867/datasets/scop/scop_data.csv', index=True)