# PDB Quality Predictor
## Prediction of X-ray Diffraction Data Quality from Experimental Parameters
By Jesse Yoder
April/May/June 2022
The Data Incubator Capstone

This notebook is for initial dataframe assembly. The RCSB PDB is queried, files are downloaded and parsed, then added to a dataframe. The dataframe is saved locally as a feather file.

The PDB fields extracted into the dataframe are: 
1. PDB ID
2. Resolution
3. Completeness
4. I / sigma
5. R_value
6. R_free
7. Detector (model)
8. Detector type
9. Optics
10. Radiation source (synchrotron name / home source)

In [1]:
import glob
import os
import subprocess

import pandas as pd

#
# Run the functions in the final cell to create the initial dataframe
# The PDB_batch_download.sh script was obtained from the RCSB PDB website
#

In [2]:
# Download all ~165k X-ray structures.
# 163,258 entries as of the April 11 2022 query date.
# The full download took about 12 hours.

# Obtain the CSV lists for download from the RCSB PDB website search:
#   Search for "Structural Attribute" > "Experimental Method" > "X-RAY DIFFRACTION"
#   Download the CSVs (multiple) with entry ID information
#   https://www.rcsb.org/search/advanced

# Path to the directory holding the downloaded CSV ID lists.
# Read by initial_library_constructor() to find the lists to download.
pdb_lists_path = "./PDB_list_CSVs"


def initial_library_constructor():
    """Download the PDB entries named in every ID list found under ``pdb_lists_path``.

    Every entry in ``pdb_lists_path`` whose name contains ``rcsb_pdb_ids`` is
    passed, one at a time, to ``./PDB_batch_download.sh`` with the output
    directory ``./PDB_depot``. Each download is run to completion before the
    next one starts, so the final message is only printed once all work is
    actually finished.

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        FileNotFoundError: if ``pdb_lists_path`` does not exist, or if
            ``./PDB_batch_download.sh`` cannot be found in the working directory.
    """
    print(" - Notice: Expect about 1 hour download per 10k structures over a fast network - ")

    # Same selection as `ls <dir> | grep rcsb_pdb_ids`: any name containing the marker.
    id_lists = sorted(name for name in os.listdir(pdb_lists_path) if "rcsb_pdb_ids" in name)

    failed_lists = []
    for list_name in id_lists:
        # Argument list (no shell) so unusual characters in the path cannot be
        # reinterpreted. run() blocks until the script exits and drains stderr,
        # so the child can never stall on a full pipe.
        result = subprocess.run(
            ["./PDB_batch_download.sh",
             "-f", os.path.join(pdb_lists_path, list_name),
             "-o", "./PDB_depot",
             "-p"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
        )
        if result.returncode != 0:
            failed_lists.append(list_name)
            print(f" - Warning: download for {list_name} exited with status {result.returncode}")
            if result.stderr:
                print(result.stderr)

    if failed_lists:
        print(f" - PDB Download finished with {len(failed_lists)} failed list(s): {failed_lists}")
        return

    print(" - PDB Download Complete")


In [3]:
#%%bash
#Create backup directory of PDB_depot
#Use pigz (*parallel* implementation of gzip) from homebrew
#runs in 5 minutes instead of hours

#tar -c --use-compress-program=pigz -f PDB_depot_archive.tar.gz ./PDB_depot/

In [4]:
#Split the ~165k pdb files to 9 subdirectories, based on first character
#Otherwise gunzip won't work (argument list too long)


def _run_shell_step(description, command):
    """Run one shell command to completion; return True when it exits with status 0."""
    result = subprocess.run(command, shell=True,
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.PIPE,
                            text=True)
    if result.returncode != 0:
        print(f" - Warning: step '{description}' exited with status {result.returncode}")
        if result.stderr:
            print(result.stderr)
        return False
    return True


def intial_directory_setup():
    """Sort the downloaded ``.pdb.gz`` files into nine sub-directories and unzip them.

    Runs three shell steps strictly one after another (each step must finish
    before the next begins, because each depends on the previous one's result):

    1. create ``PDB_depot/1xxx_pdbs`` ... ``PDB_depot/9xxx_pdbs``;
    2. move every ``PDB_depot/<n>*.pdb.gz`` into the matching directory;
    3. ``gunzip`` every file inside those directories, one file per call.

    If a step exits with a non-zero status the remaining steps are skipped and
    a warning (with the step's stderr) is printed instead of the completion
    message.

    Returns:
        None.
    """
    # 1st, make the 9 directories (eg. 1xxx_pdbs ... 9xxx_pdbs).
    # `-p` keeps the step from failing when the notebook is re-run.
    make_dirs = "for i in $(seq 1 9); do mkdir -p 'PDB_depot/'$i'xxx_pdbs'; done"

    # 2nd, sort the PDB files, moving them to the matching directory.
    # The trailing slash makes mv fail instead of renaming the file onto the
    # directory name if the target directory is missing.
    sort_files = """for n in $(seq 1 9); do
    for i in `ls PDB_depot/${n}*.pdb.gz`; do    #backticks don't actually need curly braces for $VAR expansion
        mv $i  "PDB_depot/${n}xxx_pdbs/"; #Double quotes do
    done
    done
    """

    # 3rd, unzip everything.
    # -f forces overwriting, which gets rid of any test PDBs/duplicates.
    # A specific file must be passed to gunzip; passing dir/* results in
    # "argument list too long".
    unzip_files = """for i in `ls PDB_depot | grep xxx_pdbs`; do
        for f in `ls ./PDB_depot/${i}/`; do 
            gunzip -d -f "./PDB_depot/${i}/${f}";
        done
        done"""

    steps = (
        ("create sub-directories", make_dirs),
        ("sort .pdb.gz files", sort_files),
        ("gunzip files", unzip_files),
    )
    for description, command in steps:
        if not _run_shell_step(description, command):
            print(" - the PDB library organization did NOT complete")
            return

    print(" - the PDB library organization is complete")
    


In [5]:

# Output columns, in the order they are written to the feather file.
_PDB_COLUMNS = ["Resolution", "Completeness", "I_sigma", "R_value", "R_free",
                "Detector", "Det_type", "Optics", "Synchrotron"]

# Line prefix that identifies the header record for each column.
_PDB_REMARK_PREFIXES = {
    "Resolution": "REMARK   3   RESOLUTION RANGE HIGH (ANGSTROMS)",
    "Completeness": "REMARK   3   COMPLETENESS FOR RANGE        (%)",
    "I_sigma": "REMARK 200  <I/SIGMA(I)> FOR THE DATA SET",
    "R_value": "REMARK   3   R VALUE     (WORKING + TEST SET)",
    "R_free": "REMARK   3   FREE R VALUE",
    "Detector": "REMARK 200  DETECTOR MANUFACTURER",   # detector model (Pilatus 6M, Eiger 16M etc)
    "Det_type": "REMARK 200  DETECTOR TYPE",           # detector type (Pixel, CCD, etc)
    "Optics": "REMARK 200  OPTICS",                    # optics (mirrors)
    "Synchrotron": "REMARK 200  RADIATION SOURCE",     # synchrotron name / home source
}


def _remark_value(line):
    """Return the stripped text between the first and second ':' of ``line``, or "NULL" if there is no ':'."""
    fields = line.split(":")
    return fields[1].strip() if len(fields) > 1 else "NULL"


def _parse_pdb_header(lines):
    """Extract the ``_PDB_COLUMNS`` values from the header lines of one PDB file.

    Every column starts as the string "NULL" so a record missing from the file
    never inherits a value from a previously parsed file. Reading stops at the
    first line starting with "ATOM".
    """
    record = dict.fromkeys(_PDB_COLUMNS, "NULL")

    for line in lines:
        # Exit condition: stop at the first ATOM line so the coordinate
        # section is never scanned.
        if line.startswith("ATOM"):
            break
        # Every prefix of interest starts with "REMARK"; skip other lines cheaply.
        if not line.startswith("REMARK"):
            continue

        for column, prefix in _PDB_REMARK_PREFIXES.items():
            if not line.startswith(prefix):
                continue

            if column == "R_free":
                # Other records share the "FREE R VALUE" prefix (their label
                # continues after it), so the label must END with "VALUE".
                label = line.partition(":")[0].strip()
                if not label.endswith("VALUE"):
                    break

            value = _remark_value(line)
            if column == "Detector" and ";" in value:
                # Some lines contain 2 fields, the 2nd redundant: keep the
                # first. Split the extracted VALUE, not the whole line.
                value = value.split(";")[0].strip()

            record[column] = value
            break

    return record


def intial_dataframe_constructor():
    """Parse every ``PDB_depot/*/*.pdb`` file into one dataframe and save it as feather.

    For each file the header is scanned for resolution, completeness, I/sigma,
    R value, R free, detector model, detector type, optics and radiation
    source. Fields that are absent from a file are stored as the string "NULL".
    All values are kept as strings.

    The PDB ID is taken from the first four characters of the file name
    (assumes files are named ``<pdb_id>.pdb``). After ``reset_index`` it is
    stored in a column named ``index``, because feather cannot store an index.

    Side effects:
        Creates ``./dataframe_feather`` if needed and writes
        ``./dataframe_feather/0_initial_dataframe.feather``.

    Returns:
        None.
    """
    # Collect rows in a dict keyed by PDB ID and build the frame once at the
    # end; assigning rows one at a time with df.loc re-allocates the frame on
    # every insert. A repeated ID overwrites the earlier row, as before.
    records = {}
    for filename in glob.glob('PDB_depot/*/*.pdb'):  # wildcard catches the 9 subdirs
        pdb_id = os.path.basename(filename)[:4]
        with open(filename, 'r') as f:
            records[pdb_id] = _parse_pdb_header(f)

    df = pd.DataFrame(
        [[record[column] for column in _PDB_COLUMNS] for record in records.values()],
        index=list(records.keys()),
        columns=_PDB_COLUMNS,
    )

    ## Write dataframe to feather (can't have index)
    #  Feather loads in a few seconds vs. hours reading all the text files
    df = df.reset_index()

    outdir = './dataframe_feather'
    os.makedirs(outdir, exist_ok=True)

    df.to_feather(f'{outdir}/0_initial_dataframe.feather')

    print(f" - the intial dataframe was writen to this location:\
          \n \t {outdir}/0_initial_dataframe.feather")

In [None]:
# Pipeline entry points, listed in run order: download, organize/unzip, parse.
# The first two calls are commented out, so running this cell only rebuilds the
# dataframe from the files already in PDB_depot; uncomment them to rebuild the
# PDB library from scratch.
# initial_library_constructor()
# intial_directory_setup()
intial_dataframe_constructor()