# Data Cleaning Procedure:

We are working with a dataset containing protein names from the PDB site, and our goal is to streamline the data cleaning process using automated methods. While the files can be downloaded manually from the site, we aim to write code that handles this task efficiently.

Here's the step-by-step process we'll follow:

1. **Data Retrieval and Initial Filtering:**
   We will initiate the process by extracting protein names from the PDB site. Our code will then automatically download the corresponding files. This approach simplifies data scraping and enhances efficiency, providing a structured dataset. While direct manual downloads are feasible, automation offers more streamlined handling.

2. **Categorization of Protein Types:**
   Next, we will categorize the entries based on their types. Entries that contain only nucleic acids (`nuc`) or that were solved by NMR are dropped, while EM entries and protein–nucleic-acid complexes (`prot-nuc`) are each saved to their own separate CSV file. This segregation ensures that the dataset remains organized and suitable for downstream analyses.

3. **Refining the Dataset:**
   We will proceed by eliminating undesired data points. This involves removing proteins with low resolutions and excluding those that are not based on X-ray experiment types. This refinement step ensures that the dataset consists of high-quality, relevant protein structures for further processing.

4. **Handling Multi-Chain Files:**
   For proteins composed of multiple chains, such as the example 1abc with 2 chains, we will create a distinct entry for each chain, named after its chain ID. This separation results in entries like 1abc_A and 1abc_B. By doing this, we maintain granularity in our dataset, enabling more precise analyses.


By following this refined data cleaning procedure, we aim to establish a well-structured and comprehensive dataset that can be effectively utilized for various analyses and modeling tasks.

In [None]:
%%capture
!pip install biopython
!pip install biopandas

In [None]:
import os
import pandas as pd
import csv
import requests
import numpy as np
from Bio import PDB
from Bio.PDB import PDBParser
import tensorflow as tf
import re


In [None]:
# Location of the CSV file that lists the protein entries; it is loaded into
# a DataFrame for the cleaning steps.
csv_path = "/content/drive/MyDrive/pdb_entry_files.csv"
df = pd.read_csv(csv_path)

#First delete the nucleotide-only entries and those solved by NMR, and separate the EM complexes into another file

In [None]:
def save_and_remove_data(dataframe, csv_path,
                         em_csv_path='em_rows.csv',
                         prot_nuc_csv_path='prot_nuc_rows.csv'):
    """Filter the entry table and write the kept and set-aside rows to CSV.

    Columns are addressed by position: the second column is compared against
    'nuc' and 'prot-nuc', the third against 'NMR' and 'EM'.

    Processing order:
      1. rows with 'nuc' in the second column are dropped;
      2. rows with 'NMR' in the third column are dropped;
      3. of what is left, rows with 'EM' in the third column are written to
         ``em_csv_path`` and removed;
      4. of what is left, rows with 'prot-nuc' in the second column are
         written to ``prot_nuc_csv_path`` and removed;
      5. the remaining rows are written to ``csv_path``.

    Args:
        dataframe: pandas DataFrame with at least three columns. It is not
            modified.
        csv_path: destination of the CSV holding the rows that survive every
            filter.
        em_csv_path: destination of the CSV holding the set-aside 'EM' rows.
        prot_nuc_csv_path: destination of the CSV holding the set-aside
            'prot-nuc' rows.

    Returns:
        A DataFrame with the rows that survive every filter.
    """
    molecule_type = dataframe.iloc[:, 1]
    method = dataframe.iloc[:, 2]

    # Rows that are neither nucleotide-only nor NMR; everything below is a
    # subset of these.
    candidate = (molecule_type != 'nuc') & (method != 'NMR')
    is_em = method == 'EM'
    is_prot_nuc = molecule_type == 'prot-nuc'

    # EM rows are set aside first, so an EM 'prot-nuc' row lands in the EM
    # file only.
    dataframe[candidate & is_em].to_csv(em_csv_path, index=False)

    not_em = candidate & (method != 'EM')
    dataframe[not_em & is_prot_nuc].to_csv(prot_nuc_csv_path, index=False)

    kept = dataframe[not_em & (molecule_type != 'prot-nuc')]
    kept.to_csv(csv_path, index=False)

    return kept

# Read the original CSV file into a DataFrame
original_csv_path = "/content/drive/MyDrive/pdb_entry_files.csv"
df = pd.read_csv(original_csv_path)

# Run the filter: 'nuc' and 'NMR' rows are dropped, 'EM' and 'prot-nuc' rows are
# written to their own CSV files, and the remaining rows go to 'modified_data.csv'
modified_df = save_and_remove_data(df, 'modified_data.csv')

# 'modified_df' now holds the rows that passed every filter; the same rows are saved in 'modified_data.csv'


In [None]:
# Paths of the three CSV files written by the filtering step.
# Reminder: don't forget to remove the first row in all of the datasets.
# NOTE(review): presumably this means the header row, which csv.reader does not
# skip when the name list is built from prot_csv — confirm.
prot_EM = '/content/em_rows.csv'
prot_nuc = '/content/prot_nuc_rows.csv'
prot_csv = '/content/modified_data.csv'

df = pd.read_csv(prot_csv)

#note:
The number of proteins is nearly 167k.
This filtered list is what we will download our PDB files from.


In [None]:
import csv

def create_prot_name_list(csv_path, skip_header=False):
    """Return the first-column value of every non-empty row of a CSV file.

    Args:
        csv_path: path of the CSV file to read.
        skip_header: when True, the first row returned by the CSV reader is
            discarded before names are collected. Defaults to False, which
            keeps every row.

    Returns:
        A list of strings, one per non-empty row, in file order.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(csv_path, 'r', newline='') as csv_file:
        rows = csv.reader(csv_file)
        if skip_header:
            next(rows, None)
        # csv.reader yields [] for a blank line; indexing it would raise
        # IndexError, so such rows are skipped.
        return [row[0] for row in rows if row]

# Build the list of names (first column of prot_csv) used to download the PDB files
protein_names = create_prot_name_list(prot_csv)

#download the pdb files

In [None]:
import os
import requests

class PDBDownloader:
    """Download PDB files from files.rcsb.org into a local folder."""

    def __init__(self, destination_path, timeout=30):
        """Remember where files are stored and how long to wait per request.

        Args:
            destination_path: folder the ``<name>.pdb`` files are written to.
            timeout: seconds passed to ``requests.get`` as its timeout, so a
                stalled connection cannot block a bulk download forever.
        """
        self.destination_path = destination_path
        self.timeout = timeout

    def download_pdb(self, protein_name_list):
        """Download one PDB file per name, skipping names already on disk.

        A failure for one name is printed and the loop moves on to the next
        name; nothing is raised for individual download errors.

        Args:
            protein_name_list: iterable of entry names; each is fetched from
                ``https://files.rcsb.org/download/<name>.pdb``.
        """
        # Writing would fail on a missing folder, so make sure it exists.
        os.makedirs(self.destination_path, exist_ok=True)

        for protein_name in protein_name_list:
            filename = f"{protein_name}.pdb"
            file_path = os.path.join(self.destination_path, filename)

            # Check if the file already exists in the destination path
            if os.path.exists(file_path):
                print(f"File already exists for {protein_name}, skipping download.")
                continue

            url = f"https://files.rcsb.org/download/{protein_name}.pdb"

            try:
                response = requests.get(url, timeout=self.timeout)
                response.raise_for_status()
                pdb_content = response.text

                # A successful response that lacks a HEADER record is not
                # treated as a PDB file and is not saved.
                if "HEADER    " not in pdb_content:
                    print(f"No PDB file found for {protein_name}")
                    continue

                with open(file_path, "w") as file:
                    file.write(pdb_content)

                print(f"Downloaded PDB file for {protein_name}")
            except requests.HTTPError as e:
                print(f"Failed to download PDB file for {protein_name}")
                print(f"HTTP Error: {e}")
            except Exception as e:
                # Deliberately broad: one bad entry must not abort the whole
                # bulk download.
                print(f"Failed to download PDB file for {protein_name}")
                print(f"Error: {e}")

# Example usage: download every entry named in `protein_names` into the Drive folder.
downloader = PDBDownloader(destination_path="/content/drive/MyDrive/NewPDBFiles")
downloader.download_pdb(protein_names)


# Removing Low Resolutions, Non-X-Ray Entries, and DNA Files:
We will exclude proteins with low resolutions and those that were not determined by X-ray experiments. Additionally, we will omit DNA files from the dataset. This step is essential because DNA sequences are written with letters that also occur in protein sequences, which might confuse the model, even though in theory the model could learn to distinguish them. The intricacies of that theoretical capability are beyond the scope of our current discussion.

In [None]:
#RESOLUTION
# Matches a line that starts with REMARK 2 RESOLUTION. <digits>.<digits> ANGSTROMS.
# and captures the number. Compiled once instead of once per line.
_RESOLUTION_PATTERN = re.compile(r'REMARK\s+2\s+RESOLUTION\.\s+(\d+\.\d+)\s+ANGSTROMS\.')


def extract_resolution(pdb_file_path):
    """Return the resolution stated in a PDB file's REMARK 2 record.

    The file is scanned line by line and the first line matching the
    resolution pattern wins.

    Args:
        pdb_file_path: path of the PDB file to read.

    Returns:
        The resolution as a float, or None (after printing a message) when no
        line matches the pattern.
    """
    with open(pdb_file_path, 'r') as pdb_file:
        for line in pdb_file:
            match = _RESOLUTION_PATTERN.match(line)
            if match:
                # The capture group only admits digits '.' digits, so float()
                # cannot fail here and no ValueError handling is needed.
                return float(match.group(1))

    print(f"Resolution information not found in {pdb_file_path}.")
    return None

#EXPERIMENT TYPE
def extract_experiment_type(pdb_file_path):
    """Return the experiment type stated in a PDB file's REMARK 200 record.

    The first line that starts with 'REMARK 200  EXPERIMENT TYPE' and
    contains a colon is used; the text between the first colon and the next
    one (or the end of the line) is returned with surrounding whitespace
    removed.

    Args:
        pdb_file_path: path of the PDB file to read.

    Returns:
        The experiment type as written in the file, or None (after printing a
        message) when no usable line is found.
    """
    with open(pdb_file_path, 'r') as pdb_file:
        for line in pdb_file:
            if not line.startswith('REMARK 200  EXPERIMENT TYPE'):
                continue
            fields = line.split(':')
            # A matching line without a colon carries no value; keep scanning
            # instead of failing with IndexError.
            if len(fields) > 1:
                return fields[1].strip()

    print("Experiment type information not found.")
    return None

#DNA FILES
def remove_pdb_files_with_header(folder_path):
    """Delete every '.pdb' file in a folder that has a 'HEADER    DNA' line.

    Only names ending in '.pdb' directly inside ``folder_path`` are examined.
    A file is deleted when any of its lines starts with 'HEADER    DNA'.

    NOTE(review): the prefix test also matches header classifications that
    merely begin with "DNA" (for example one reading "DNA BINDING PROTEIN");
    confirm that deleting those files is intended.

    Args:
        folder_path: folder whose PDB files are checked.
    """
    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdb'):
            continue

        file_path = os.path.join(folder_path, filename)
        # Stream the file and stop at the first hit instead of loading every
        # line into memory.
        with open(file_path, 'r') as pdb_file:
            has_dna_header = any(line.startswith('HEADER    DNA') for line in pdb_file)

        # Delete only after the handle is closed; removing a file that is
        # still open fails on some platforms.
        if has_dna_header:
            print(f"Removing file: {filename}")
            os.remove(file_path)

def delete_files_not_meeting_criteria(directory_path, threshold_resolution, target_experiment_type):
    """Delete PDB files that fail the resolution, experiment-type or DNA check.

    For every '.pdb' file directly inside ``directory_path``:
      * if either the resolution or the experiment type cannot be read, the
        file is left untouched (and is not checked for a DNA header);
      * otherwise the file is deleted when its resolution value is below
        ``threshold_resolution`` or its experiment type differs from
        ``target_experiment_type``;
      * a file that passes both checks is still deleted when one of its lines
        starts with 'HEADER    DNA'.

    The experiment type is compared case-insensitively: PDB files write the
    method in upper case ("X-RAY DIFFRACTION"), so an exact comparison against
    a differently cased target such as "X-ray diffraction" would never match
    and every file would be deleted.

    NOTE(review): files whose resolution value is numerically *below* the
    threshold are the ones deleted. Smaller Angstrom values normally mean
    finer detail, so confirm this direction is the intended one.

    Args:
        directory_path: folder holding the PDB files.
        threshold_resolution: files with a resolution value below this number
            are deleted.
        target_experiment_type: experiment type a file must have to be kept
            (letter case and surrounding whitespace are ignored).

    The total number of deleted files is printed at the end.
    """
    wanted_type = target_experiment_type.strip().casefold()
    num_deleted_files = 0  # Counter for deleted files

    for filename in os.listdir(directory_path):
        if not filename.endswith(".pdb"):
            continue

        pdb_file_path = os.path.join(directory_path, filename)
        resolution = extract_resolution(pdb_file_path)
        experiment_type = extract_experiment_type(pdb_file_path)

        # Without both values the file cannot be judged; leave it in place.
        if resolution is None or experiment_type is None:
            continue

        if resolution < threshold_resolution or experiment_type.casefold() != wanted_type:
            print(f"Deleting file: {pdb_file_path} with resolution {resolution:.2f} Å and experiment type {experiment_type}")
            os.remove(pdb_file_path)
            num_deleted_files += 1
            continue

        # The file passed the resolution and experiment-type checks; it is
        # still removed if it carries a 'HEADER    DNA' line. The handle is
        # closed before the delete.
        with open(pdb_file_path, 'r') as pdb_file:
            has_dna_header = any(line.startswith('HEADER    DNA') for line in pdb_file)
        if has_dna_header:
            print(f"Removing file: {pdb_file_path}")
            os.remove(pdb_file_path)
            num_deleted_files += 1

    print(f"Total files deleted: {num_deleted_files}")

# Folder containing the downloaded PDB files that will be filtered.
directory_path = "/content/drive/MyDrive/NewPDBFiles/"
threshold_resolution = 2.0  # Files whose resolution value is below this threshold are deleted. NOTE(review): smaller Angstrom values normally mean finer detail - confirm the direction is intended.
target_experiment_type = "X-ray diffraction"  # Desired experiment type. NOTE(review): confirm spelling and letter case agree with what the files' REMARK 200 line contains.

delete_files_not_meeting_criteria(directory_path, threshold_resolution, target_experiment_type)


#Now that the cleanup is done, we have to separate the chains and save them in a new folder.

In [None]:
def split_pdb_chains(input_pdb_path, output_folder):
    """Write every chain of a PDB file to its own file in ``output_folder``.

    Output files are named ``<input base name>_<chain id>.pdb``. All models
    of the structure are visited; because the file name does not include the
    model, a chain id that occurs in several models is written to the same
    path each time, so the last model's version is what remains on disk.

    Args:
        input_pdb_path: path of the PDB file to split.
        output_folder: folder that receives the per-chain files; it is
            created if it does not exist.
    """
    # Saving would fail on a missing folder.
    os.makedirs(output_folder, exist_ok=True)

    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("protein", input_pdb_path)

    # Neither the base name nor the writer depends on the chain, so both are
    # set up once.
    base_filename = os.path.splitext(os.path.basename(input_pdb_path))[0]
    io = PDB.PDBIO()

    for model in structure:
        chains = list(model)
        single_chain = len(chains) == 1

        for chain in chains:
            chain_output_path = os.path.join(output_folder, f"{base_filename}_{chain.id}.pdb")

            io.set_structure(chain)
            io.save(chain_output_path)

            if single_chain:
                print(f"File {chain_output_path} only has 1 chain.")
            else:
                print(f"Separated {chain_output_path}")

if __name__ == "__main__":
    input_folder = "/content/drive/MyDrive/NewPDBFiles"  # Update this to your folder containing PDB files
    output_folder = "/content/drive/MyDrive/outputPDB"  # Update this to where you want to save the separate chain files

    # Scanning a missing folder would raise, so create it on a first run.
    os.makedirs(output_folder, exist_ok=True)

    # An input file counts as processed when the output folder already holds a
    # '.pdb' file whose first 4 characters equal the first 4 characters of its
    # name. Every output file is considered, not only those starting with "4",
    # so resuming works for all entries.
    processed_files = {
        entry.name[:4]
        for entry in os.scandir(output_folder)
        if entry.is_file() and entry.name.endswith(".pdb")
    }

    for pdb_file in os.scandir(input_folder):
        if pdb_file.is_file() and pdb_file.name.endswith(".pdb"):
            base_filename = os.path.splitext(pdb_file.name)[0]
            if base_filename[:4] in processed_files:
                print(f"Skipping {base_filename} as it's already processed.")
                continue

            split_pdb_chains(pdb_file.path, output_folder)


#Part one is DONE!