<a href="https://colab.research.google.com/github/kattens/Protein-Interaction-with-LLMs/blob/main/Part_0_Downloader_for_the_PDB_files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading the PDB Files

The first step of our project is to gather the data we want to work with. We will start by using a list of 10 CSV files containing the names of all the files on the RCSB website. After obtaining these lists, we will perform some necessary cleanups.

This file also contains the steps to separate the chains in a PDB file, since we need to work with separated chains for our analysis.

In [None]:
# Install Biopython; it provides the Bio.PDB module imported in the next cell.
!pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [None]:
import os
import numpy as np
import pandas as pd
import csv
from Bio import PDB
import shutil
import requests


In [None]:
# Path of the PDB-entry ID list -> modify it as needed.
# It may be either a single CSV file or a folder that holds several CSV files.
folder_path = "/content/drive/MyDrive/pdb_entry_files.csv"

# os.listdir() raises NotADirectoryError for a file path, so the two cases
# are told apart up front.
if os.path.isdir(folder_path):
    # Sorted so the resulting ID order is the same on every run;
    # os.listdir() returns entries in arbitrary order.
    csv_paths = [
        os.path.join(folder_path, filename)
        for filename in sorted(os.listdir(folder_path))
        if filename.endswith(".csv")  # Check if the file is a CSV
    ]
else:
    csv_paths = [folder_path]

first_column_values = []

for file_path in csv_paths:
    # newline='' is what the csv module expects for files it reads.
    with open(file_path, 'r', newline='') as csv_in:
        reader = csv.reader(csv_in)
        next(reader, None)  # Skip the header; an empty file is tolerated

        for row in reader:
            if row:  # csv.reader yields [] for blank lines
                first_column_values.append(row[0])  # Value from the first column

print(first_column_values)  # names of the PDB files
print(len(first_column_values))  # number of PDB files

NotADirectoryError: [Errno 20] Not a directory: '/content/drive/MyDrive/pdb_entry_files.csv'

As shown above, we have the names of each PDB file in the list. We will send requests to download these files. Due to the large number of files, we will process them in 10 batches.

In [None]:
class PDBDownloader:
    """Download PDB entries from files.rcsb.org into a local folder, batch by batch."""

    # Seconds to wait on a single request; without a timeout one stalled
    # connection would block the rest of the batch indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, output_directory):
        # Folder that receives the "<ID>.pdb" files.
        self.output_directory = output_directory

    def download_pdb_batch(self, protein_name_list, batch_number, batch_size=7500):
        """Download one batch of PDB files, skipping those already on disk.

        Batch ``k`` covers list positions ``(k - 1) * batch_size`` up to, but
        not including, ``k * batch_size`` (clipped to the list length).

        Args:
            protein_name_list: Sequence of PDB entry IDs.
            batch_number: 1-based number of the batch to download.
            batch_size: Entries per batch. The default of 7500 assumes
                75000 files divided into 10 batches.

        Raises:
            ValueError: If ``batch_number`` or ``batch_size`` is smaller than 1.
                A non-positive batch number would otherwise turn into negative
                indices and silently pick entries from the end of the list.

        Download and write failures for a single entry are printed and the
        batch continues with the next entry.
        """
        if batch_number < 1:
            raise ValueError(f"batch_number must be >= 1, got {batch_number}")
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        start_index = (batch_number - 1) * batch_size
        end_index = min(batch_number * batch_size, len(protein_name_list))

        os.makedirs(self.output_directory, exist_ok=True)

        for protein_name in protein_name_list[start_index:end_index]:
            filename = f"{protein_name}.pdb"
            file_path = os.path.join(self.output_directory, filename)

            if os.path.exists(file_path):
                print(f"File already exists for {protein_name}, skipping download.")
                continue

            url = f"https://files.rcsb.org/download/{protein_name}.pdb"

            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                pdb_content = response.text

                if "HEADER    " not in pdb_content:
                    print(f"No PDB file found for {protein_name}")
                    continue

                # Write to a side file and rename afterwards: the existence
                # check above is the resume mechanism, so a half-written
                # "<ID>.pdb" must never be left behind.
                tmp_path = file_path + ".part"
                with open(tmp_path, "w") as file:
                    file.write(pdb_content)
                os.replace(tmp_path, file_path)
                print(f"Downloaded PDB file for {protein_name} to {self.output_directory}")
            except requests.HTTPError as e:
                print(f"Failed to download PDB file for {protein_name}")
                print(f"HTTP Error: {e}")
            except Exception as e:
                # Deliberately broad: one bad entry must not stop the batch.
                print(f"Failed to download PDB file for {protein_name}")
                print(f"Error: {e}")


# Notes for handling this part:

Since we don't want to overload the memory and we need to keep enough free disk space, we will download everything in 10 batches: clear the previous directory, process the files as needed, then move on to the next batch. This has to be done manually by changing the batch number, which acts as a hyperparameter.

In [None]:
# Clear the folder that we are downloading the PDB files into, to make sure
# we aren't duplicating anything.
def clear_directory(directory):
    """Delete every file, symlink and sub-directory inside *directory*.

    The directory itself is kept. A directory that does not exist yet is
    treated as already empty, so the first run does not fail. Deletion is
    best effort: an entry that cannot be removed is reported and skipped.

    Args:
        directory: Path of the folder whose contents are removed.
    """
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        return  # Nothing to clear

    for item in entries:
        item_path = os.path.join(directory, item)
        try:
            # Symlinks are unlinked rather than followed, so a link to a
            # folder never causes the link target to be deleted.
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.unlink(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        except OSError as e:
            print(f"Failed to delete {item_path}. Reason: {e}")

In [None]:
# Folder containing the CSV files  -> change as needed
folder_path = "/home/f.ensafitakaldani001/Downloader_code/IDs"
first_column_values = []

# Sorted so that the ID order -- and therefore which IDs fall into which
# batch -- is the same on every run; os.listdir() gives no ordering guarantee.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', newline='') as csv_in:
        reader = csv.reader(csv_in)
        next(reader, None)  # Skip the header; an empty file is tolerated
        for row in reader:
            if row:  # csv.reader yields [] for blank lines
                first_column_values.append(row[0])

# Specify the output directory -> change as needed
output_directory = "/home/f.ensafitakaldani001/Downloader_code/newoutput"
# Make sure the folder exists before listing/clearing it (first run).
os.makedirs(output_directory, exist_ok=True)
clear_directory(output_directory)  # Clear the directory before downloading

# Create an instance of PDBDownloader
downloader = PDBDownloader(output_directory)

# Define the batch number you want to download (1-based)
batch_number = 10  # Adjust this number to download different batches

# Download the PDB files for the current batch
downloader.download_pdb_batch(first_column_values, batch_number)

print(f"Batch {batch_number} download complete.")

# Chain Separator:
One of the most important parts of this project is separating the chains of each PDB file. We want to train a model based on the closest amino acids across different chains of one protein complex, in order to understand the dynamics of amino acids that tend to get closer to one another.

In [None]:
def separate_chains(source_dir, target_dir, limit=500):
    """Write each chain of every multi-chain PDB file to its own PDB file.

    Files in *source_dir* whose name ends in ``.pdb`` are visited in sorted
    name order. A file is processed only when its first model contains at
    least two chains; each of those chains is saved to *target_dir* as
    ``<original stem>_<chain id>.pdb``.

    Only the first model is used. Counting chains over all models would let
    a single-chain multi-model file (e.g. an NMR ensemble) pass the
    two-chain check, and writing every model would just overwrite the same
    per-chain output file again and again.

    Args:
        source_dir: Folder containing the downloaded ``.pdb`` files.
        target_dir: Folder for the per-chain files; created if missing.
        limit: Maximum number of multi-chain files to process. ``None``
            means no limit.

    Files that cannot be parsed are reported and skipped.
    """
    os.makedirs(target_dir, exist_ok=True)  # Ensure the target directory exists
    pdb_parser = PDB.PDBParser()
    pdb_io = PDB.PDBIO()
    processed_files = 0

    for filename in sorted(os.listdir(source_dir)):  # Sort to ensure consistent order
        if limit is not None and processed_files >= limit:
            break  # Stop after processing the limit of files

        if not filename.endswith(".pdb"):
            continue

        file_path = os.path.join(source_dir, filename)
        try:
            structure = pdb_parser.get_structure(filename, file_path)
        except Exception as e:
            # Deliberately broad: one malformed file must not stop the batch.
            print(f"Failed to parse {file_path}. Reason: {e}")
            continue

        # Iterating a structure yields its models; take the first one only.
        first_model = next(iter(structure), None)
        chains = list(first_model) if first_model is not None else []
        if len(chains) < 2:
            continue  # Skip files with less than 2 chains

        file_stem = os.path.splitext(filename)[0]
        for chain in chains:
            output_filename = f"{file_stem}_{chain.id}.pdb"
            output_path = os.path.join(target_dir, output_filename)

            # Set structure for output to just this chain
            pdb_io.set_structure(chain)
            pdb_io.save(output_path)

        processed_files += 1  # Only files that were actually split count


In [None]:
# Use this helper instead of manually cleaning up files once we are done with them.
def clear_directory(directory):
    """Remove all files and directories in the specified directory.

    The directory itself is left in place. If it does not exist there is
    nothing to remove and the function simply returns, so it is safe to call
    before the folder has been created. Entries that cannot be deleted are
    reported and skipped.

    Args:
        directory: Path of the folder to empty.
    """
    try:
        items = os.listdir(directory)
    except FileNotFoundError:
        return  # Folder not created yet -> already "clear"

    for item in items:
        item_path = os.path.join(directory, item)
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.unlink(item_path)  # Remove file or link
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)  # Remove directory and all its contents
        except OSError as e:
            # os.unlink and shutil.rmtree report failures as OSError.
            print(f"Failed to delete {item_path}. Reason: {e}")


In [None]:
# Example usage
import glob

# Folder holding the downloaded PDB files -> update this to your source
# directory path (it should be the folder the download step wrote into).
# It is spelled out here so this cell does not depend on a variable from
# another cell and cannot end up pointing at the target folder on a re-run.
source_dir = "/home/f.ensafitakaldani001/Downloader_code/newoutput"
target_dir = '/home/f.ensafitakaldani001/Downloader_code/seperatedoutput'  # Update this to your target directory path

# Number of downloaded PDB files; used as the processing limit below.
# It must be known before separate_chains is called.
total_files = len(glob.glob(os.path.join(source_dir, '*.pdb')))

# Clear everything in the target directory (create it first on a fresh run)
os.makedirs(target_dir, exist_ok=True)
clear_directory(target_dir)

# Split every multi-chain file in source_dir into per-chain files in target_dir
separate_chains(source_dir, target_dir, total_files)


print(f"Chain separation complete for first {total_files} PDB files.")


# Just to see: count the per-chain files that were written
separated_files = glob.glob(os.path.join(target_dir, '*'))
separated_file_count = len([f for f in separated_files if os.path.isfile(f)])
print(f"{separated_file_count} chain files written to {target_dir}")

NameError: name 'output_directory' is not defined

# End Goal:
At this point, we should have 10 batch CSV files, each containing the protein ID, coordinates, and sequence. We can then move on to creating the dataset/dataframe in Part 1.