In [1]:
import pandas as pd

# Mount Google Drive if using Colab
from google.colab import drive
drive.mount('/content/drive')



import pandas as pd

# Path to your BindingDB file
file_path = '/content/drive/MyDrive/ProteinMO/Dataset/BindingDB_All.tsv'
output_path = '/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_subset.parquet'

# Columns we actually need
use_cols = [
    'Ligand SMILES',
    'IC50 (nM)',
    'UniProt (SwissProt) Primary ID of Target Chain 1',
    'UniProt (SwissProt) Recommended Name of Target Chain 1',
    'BindingDB Target Chain Sequence 1'
]

chunk_size = 300000  # Adjust if needed (lower if you still hit RAM issues)
chunks = []

print("Converting TSV to Parquet... this may take a few minutes.")

for i, chunk in enumerate(pd.read_csv(file_path, sep='\t', usecols=use_cols, chunksize=chunk_size, low_memory=False)):
    print(f"Processing chunk {i+1}...")
    chunks.append(chunk)

# Combine all chunks into a single dataframe
df = pd.concat(chunks, ignore_index=True)

# Save as Parquet
df.to_parquet(output_path, index=False)
print(f"Saved Parquet file to {output_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Converting TSV to Parquet... this may take a few minutes.
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Saved Parquet file to /content/drive/MyDrive/ProteinMO/Dataset/bindingdb_subset.parquet


In [4]:
import pandas as pd

df = pd.read_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_subset.parquet')

print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())

# Basic types & memory
print("\nInfo:")
print(df.info(memory_usage='deep'))

# How many are missing in each column?
print("\nNA counts:")
print(df.isna().sum())

# IC50 sanity: how many numeric / non-numeric?
df['IC50 (nM)'] = pd.to_numeric(df['IC50 (nM)'], errors='coerce')
print("\nIC50 numeric fraction:", df['IC50 (nM)'].notna().mean())

# % rows with UniProt IDs present
print("\nRows with UniProt ID:",
      (df['UniProt (SwissProt) Primary ID of Target Chain 1'].notna().mean()))

# Quick look
print("\nHead():")
print(df.head(3))



Shape: (3041030, 5)

Columns: ['Ligand SMILES', 'IC50 (nM)', 'BindingDB Target Chain Sequence 1', 'UniProt (SwissProt) Recommended Name of Target Chain 1', 'UniProt (SwissProt) Primary ID of Target Chain 1']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3041030 entries, 0 to 3041029
Data columns (total 5 columns):
 #   Column                                                  Dtype 
---  ------                                                  ----- 
 0   Ligand SMILES                                           object
 1   IC50 (nM)                                               object
 2   BindingDB Target Chain Sequence 1                       object
 3   UniProt (SwissProt) Recommended Name of Target Chain 1  object
 4   UniProt (SwissProt) Primary ID of Target Chain 1        object
dtypes: object(5)
memory usage: 3.0 GB
None

NA counts:
Ligand SMILES                                                 21
IC50 (nM)                                                 996436
BindingDB T

In [7]:
import time
# Rename columns for consistency
df = df.rename(columns={
    'Ligand SMILES': 'smiles',
    'IC50 (nM)': 'ic50_nM',
    'BindingDB Target Chain Sequence 1': 'protein_seq',
    'UniProt (SwissProt) Recommended Name of Target Chain 1': 'protein_name',
    'UniProt (SwissProt) Primary ID of Target Chain 1': 'uniprot_id'
})

# Define UniProt fetch functions
def fetch_fasta(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    try:
        res = requests.get(url, timeout=5)
        if res.ok:
            return ''.join(res.text.split('\n')[1:])  # Remove FASTA header
    except:
        return None
    return None

def fetch_description(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    try:
        res = requests.get(url, timeout=5)
        if res.ok:
            data = res.json()
            return data.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", {}).get("value", "")
    except:
        return None
    return None

# Fill missing data
missing_rows = df[df['protein_seq'].isna() | df['protein_name'].isna()]
print("Rows needing enrichment:", len(missing_rows))

for idx, row in missing_rows.iterrows():
    uid = row['uniprot_id']
    if pd.isna(row['protein_seq']):
        df.at[idx, 'protein_seq'] = fetch_fasta(uid)
    if pd.isna(row['protein_name']):
        df.at[idx, 'protein_name'] = fetch_description(uid)
    if idx % 100 == 0:
        print(f"Processed {idx} rows...")
        time.sleep(0.5)  # Avoid hitting UniProt rate limits

# Save enriched file
df.to_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched.parquet')
print("Step 3 complete! Saved enriched file.")


Rows needing enrichment: 90213
Processed 16000 rows...
Processed 21700 rows...
Processed 32700 rows...
Processed 33000 rows...
Processed 40000 rows...
Processed 40600 rows...
Processed 52400 rows...
Processed 57300 rows...
Processed 59600 rows...
Processed 60900 rows...
Processed 64400 rows...
Processed 64700 rows...
Processed 70500 rows...
Processed 70600 rows...
Processed 70700 rows...
Processed 70800 rows...
Processed 71000 rows...
Processed 71100 rows...
Processed 71200 rows...
Processed 71300 rows...
Processed 71400 rows...
Processed 71500 rows...
Processed 71600 rows...
Processed 71700 rows...
Processed 71800 rows...
Processed 77200 rows...
Processed 80600 rows...
Processed 84500 rows...
Processed 87500 rows...
Processed 90000 rows...
Processed 95600 rows...
Processed 95700 rows...
Processed 96800 rows...
Processed 96900 rows...
Processed 97000 rows...
Processed 97100 rows...
Processed 97200 rows...
Processed 99000 rows...
Processed 99100 rows...
Processed 99200 rows...
Processed

In [8]:
import pandas as pd

# Load enriched file
df = pd.read_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched.parquet')

print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())

# Missing values check
print("\nMissing values:")
print(df[['protein_seq', 'protein_name']].isna().sum())

# Show first 3 rows
print("\nSample rows:")
print(df.head(3))

# How many unique UniProt IDs
print("\nUnique UniProt IDs:", df['uniprot_id'].nunique())



Shape: (3041030, 5)

Columns: ['smiles', 'ic50_nM', 'protein_seq', 'protein_name', 'uniprot_id']

Missing values:
protein_seq        70
protein_name    90213
dtype: int64

Sample rows:
                                              smiles  ic50_nM  \
0  O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...      NaN   
1  O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...      NaN   
2  O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...      NaN   

                                         protein_seq         protein_name  \
0  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...  Gag-Pol polyprotein   
1  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...  Gag-Pol polyprotein   
2  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...  Gag-Pol polyprotein   

  uniprot_id  
0     P03367  
1     P03367  
2     P03367  

Unique UniProt IDs: 6674


In [10]:
import requests
import pandas as pd
import time

df = pd.read_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched.parquet')

def fetch_description(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    try:
        res = requests.get(url, timeout=5)
        if res.ok:
            data = res.json()
            return data.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", {}).get("value", "")
    except:
        return None
    return None

# Map over unique UniProt IDs
desc_map = {}
for i, uid in enumerate(df['uniprot_id'].dropna().unique(), 1):
    desc_map[uid] = fetch_description(uid)
    if i % 100 == 0:
        print(f"Fetched {i}/{len(desc_map)} descriptions...")
        time.sleep(0.2)

df['protein_desc'] = df['uniprot_id'].map(desc_map)

df.to_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched_with_desc.parquet')
print("Descriptions added! New file saved.")


Fetched 100/100 descriptions...
Fetched 200/200 descriptions...
Fetched 300/300 descriptions...
Fetched 400/400 descriptions...
Fetched 500/500 descriptions...
Fetched 600/600 descriptions...
Fetched 700/700 descriptions...
Fetched 800/800 descriptions...
Fetched 900/900 descriptions...
Fetched 1000/1000 descriptions...
Fetched 1100/1100 descriptions...
Fetched 1200/1200 descriptions...
Fetched 1300/1300 descriptions...
Fetched 1400/1400 descriptions...
Fetched 1500/1500 descriptions...
Fetched 1600/1600 descriptions...
Fetched 1700/1700 descriptions...
Fetched 1800/1800 descriptions...
Fetched 1900/1900 descriptions...
Fetched 2000/2000 descriptions...
Fetched 2100/2100 descriptions...
Fetched 2200/2200 descriptions...
Fetched 2300/2300 descriptions...
Fetched 2400/2400 descriptions...
Fetched 2500/2500 descriptions...
Fetched 2600/2600 descriptions...
Fetched 2700/2700 descriptions...
Fetched 2800/2800 descriptions...
Fetched 2900/2900 descriptions...
Fetched 3000/3000 descriptions..

In [11]:
import pandas as pd

# Load the enriched dataset
df = pd.read_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched_with_desc.parquet')

# Basic info
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())

# Show first few rows
print("\nSample rows:")
print(df[['uniprot_id', 'protein_name', 'protein_desc', 'protein_seq']].head(5))

# Check missing values in protein_desc
print("\nMissing descriptions:", df['protein_desc'].isna().sum())


Shape: (3041030, 6)

Columns: ['smiles', 'ic50_nM', 'protein_seq', 'protein_name', 'uniprot_id', 'protein_desc']

Sample rows:
  uniprot_id         protein_name         protein_desc  \
0     P03367  Gag-Pol polyprotein  Gag-Pol polyprotein   
1     P03367  Gag-Pol polyprotein  Gag-Pol polyprotein   
2     P03367  Gag-Pol polyprotein  Gag-Pol polyprotein   
3     P03367  Gag-Pol polyprotein  Gag-Pol polyprotein   
4     P03367  Gag-Pol polyprotein  Gag-Pol polyprotein   

                                         protein_seq  
0  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...  
1  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...  
2  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...  
3  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...  
4  PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...  

Missing descriptions: 90479


In [12]:
# Show 10 unique UniProt IDs and their descriptions
print(df[['uniprot_id', 'protein_name', 'protein_desc']].drop_duplicates().head(10))


    uniprot_id         protein_name         protein_desc
0       P03367  Gag-Pol polyprotein  Gag-Pol polyprotein
34      Q9QBY3  Gag-Pol polyprotein  Gag-Pol polyprotein
142     P08684  Cytochrome P450 3A4  Cytochrome P450 3A4
144     P51570        Galactokinase        Galactokinase
168     P12497  Gag-Pol polyprotein  Gag-Pol polyprotein
198     P42574            Caspase-3            Caspase-3
199     P29466            Caspase-1            Caspase-1
208     P49662            Caspase-4            Caspase-4
257     P55210            Caspase-7            Caspase-7
266     P55212            Caspase-6            Caspase-6


In [13]:
import requests
import time

# Unique UniProt IDs with missing descriptions
missing_ids = df[df['protein_desc'].isna()]['uniprot_id'].dropna().unique()
print(f"Missing description for {len(missing_ids)} unique UniProt IDs.")

def fetch_description(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    try:
        res = requests.get(url, timeout=5)
        if res.ok:
            data = res.json()
            return data.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", {}).get("value", "")
    except:
        return None
    return None

# Fetch and fill
desc_map = {}
for i, uid in enumerate(missing_ids, 1):
    desc_map[uid] = fetch_description(uid)
    if i % 100 == 0:
        print(f"Fetched {i}/{len(missing_ids)} descriptions...")
        time.sleep(0.3)

# Fill missing
df['protein_desc'] = df['protein_desc'].fillna(df['uniprot_id'].map(desc_map))

# Save updated file
df.to_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched_with_desc_v2.parquet')


Missing description for 4 unique UniProt IDs.


In [14]:
import pandas as pd

# Load the latest enriched dataset
df = pd.read_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched_with_desc_v2.parquet')

# Drop rows where protein_desc is missing
df = df.dropna(subset=['protein_desc'])

# Save the cleaned dataset
df.to_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched_clean.parquet')

print("Rows after dropping missing descriptions:", df.shape)
print("Any remaining missing descriptions:", df['protein_desc'].isna().sum())


Rows after dropping missing descriptions: (2950713, 6)
Any remaining missing descriptions: 0


In [16]:
import pandas as pd
import numpy as np

# Load enriched dataset
df = pd.read_parquet('/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched_clean.parquet')

# Ensure correct column names (in case of variations)
df = df.rename(columns={
    'Ligand SMILES': 'smiles',
    'IC50 (nM)': 'ic50_nM',
    'BindingDB Target Chain Sequence 1': 'protein_seq',
    'UniProt (SwissProt) Recommended Name of Target Chain 1': 'protein_name',
    'UniProt (SwissProt) Primary ID of Target Chain 1': 'uniprot_id'
})

# Convert IC50 to numeric
df['ic50_nM'] = pd.to_numeric(df['ic50_nM'], errors='coerce')

# Drop rows with missing essential fields
df_clean = df.dropna(subset=['smiles', 'ic50_nM', 'uniprot_id', 'protein_seq', 'protein_name', 'protein_desc'])

# Create binary label
def get_label_from_ic50(ic50):
    if ic50 < 1000:   # Strong binder
        return 1
    elif ic50 > 10000:  # Non-binder
        return 0
    else:
        return None  # Ambiguous range

df_clean['label'] = df_clean['ic50_nM'].apply(get_label_from_ic50)

# Drop ambiguous entries
df_labeled = df_clean.dropna(subset=['label'])

# Save final dataset
final_path = '/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_final_labeled.csv'
df_labeled.to_csv(final_path, index=False)

# Print summary
print("Step 4 complete!")
print("Final shape:", df_labeled.shape)
print("Binder vs Non-binder counts:")
print(df_labeled['label'].value_counts())
print("Saved to:", final_path)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['label'] = df_clean['ic50_nM'].apply(get_label_from_ic50)


Step 4 complete!
Final shape: (1262353, 7)
Binder vs Non-binder counts:
label
1.0    1074020
0.0     188333
Name: count, dtype: int64
Saved to: /content/drive/MyDrive/ProteinMO/Dataset/bindingdb_final_labeled.csv


In [17]:
import pandas as pd
import numpy as np

path = '/content/drive/MyDrive/ProteinMO/Dataset/bindingdb_enriched_clean.parquet'
df = pd.read_parquet(path).rename(columns={
    'Ligand SMILES': 'smiles',
    'IC50 (nM)': 'ic50_nM',
    'BindingDB Target Chain Sequence 1': 'protein_seq',
    'UniProt (SwissProt) Recommended Name of Target Chain 1': 'protein_name',
    'UniProt (SwissProt) Primary ID of Target Chain 1': 'uniprot_id'
})

report = {}

report['total'] = len(df)

# numeric IC50
df['ic50_nM'] = pd.to_numeric(df['ic50_nM'], errors='coerce')
report['numeric_ic50'] = df['ic50_nM'].notna().sum()

# drop missing essentials
essential = ['smiles', 'ic50_nM', 'uniprot_id', 'protein_seq', 'protein_name', 'protein_desc']
df_ess = df.dropna(subset=essential)
report['after_essentials'] = len(df_ess)

# counts by threshold
lt_1uM   = (df_ess['ic50_nM'] < 1000).sum()
gt_10uM  = (df_ess['ic50_nM'] > 10000).sum()
between  = ((df_ess['ic50_nM'] >= 1000) & (df_ess['ic50_nM'] <= 10000)).sum()

report['<1uM'] = lt_1uM
report['>10uM'] = gt_10uM
report['1-10uM_dropped'] = between
report['final_labeled'] = lt_1uM + gt_10uM

print(pd.Series(report))

# Class imbalance check
print("\nClass ratio (binders/non-binders): {:.1f}/{:.1f}".format(
    lt_1uM / report['final_labeled'] * 100,
    gt_10uM / report['final_labeled'] * 100
))


total               2950713
numeric_ic50        1572036
after_essentials    1572035
<1uM                1074020
>10uM                188333
1-10uM_dropped       309682
final_labeled       1262353
dtype: int64

Class ratio (binders/non-binders): 85.1/14.9
