# Data Aquisition
In this file, I download and format data to be provided to machine learning models. There are several steps:


In [18]:
import sys
!{sys.executable} -m pip install joblib

Collecting joblib
  Using cached https://files.pythonhosted.org/packages/0d/1b/995167f6c66848d4eb7eabc386aebe07a1571b397629b2eac3b7bebdc343/joblib-0.13.0-py2.py3-none-any.whl
[31mmkl-random 1.0.1 requires cython, which is not installed.[0m
Installing collected packages: joblib
Successfully installed joblib-0.13.0
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## 1. Aquire a list of PDB IDs to download.

In [19]:
import datetime
today = datetime.datetime.today()
day = today.day
month = today.month
suffix = "_{0:02d}{1:02d}".format(month, day)
suffix

'_1129'

In [20]:
import requests

In [21]:
url = 'http://www.rcsb.org/pdb/rest/search'

# Retrieves all PDB IDs that have resolution < 3.0 Angstroms.
# Must contain 1 protein.
query = """<orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.EntriesOfEntitiesQuery</queryType>
    <description>Entries of :Secondary structure has:  1 or more Alpha Helices and between 85 and 100 percent of elements are Alpha Helical  and 0 or less Beta Sheets and 0 or less percent of elements are Beta Sheet 
and
Oligomeric state Search : Min Number of oligomeric state=1 Max Number of oligomeric state=1
and
Sequence Length is between 9 and 60 
</description>
    <queryId>72D8FBC1</queryId>
    <resultCount>86</resultCount>
    <runtimeStart>2018-11-29T13:52:29Z</runtimeStart>
    <runtimeMilliseconds>2</runtimeMilliseconds>
    <parent><![CDATA[<orgPdbCompositeQuery version="1.0">
    <resultCount>1890</resultCount>
    <queryId>1A24C4C2</queryId>
 <queryRefinement>
  <queryRefinementLevel>0</queryRefinementLevel>
  <orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.SecondaryStructureQuery</queryType>
    <description>Secondary structure has:  1 or more Alpha Helices and between 85 and 100 percent of elements are Alpha Helical  and 0 or less Beta Sheets and 0 or less percent of elements are Beta Sheet </description>
    <queryId>F9D5DD03</queryId>
    <resultCount>1890</resultCount>
    <runtimeStart>2018-11-29T13:35:16Z</runtimeStart>
    <runtimeMilliseconds>637</runtimeMilliseconds>
    <polyStats.helixPercent.comparator>between</polyStats.helixPercent.comparator>
    <polyStats.helixCount.comparator>between</polyStats.helixCount.comparator>
    <polyStats.sheetPercent.comparator>between</polyStats.sheetPercent.comparator>
    <polyStats.sheetCount.comparator>between</polyStats.sheetCount.comparator>
    <polyStats.helixPercent.min>85</polyStats.helixPercent.min>
    <polyStats.helixPercent.max>100</polyStats.helixPercent.max>
    <polyStats.helixCount.min>1</polyStats.helixCount.min>
    <polyStats.sheetPercent.max>0</polyStats.sheetPercent.max>
    <polyStats.sheetCount.max>0</polyStats.sheetCount.max>
  </orgPdbQuery>
 </queryRefinement>
 <queryRefinement>
  <queryRefinementLevel>1</queryRefinementLevel>
  <conjunctionType>and</conjunctionType>
  <orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.BiolUnitQuery</queryType>
    <description>Oligomeric state Search : Min Number of oligomeric state=1 Max Number of oligomeric state=1</description>
    <queryId>1BB8A37D</queryId>
    <resultCount>59551</resultCount>
    <runtimeStart>2018-11-29T13:35:17Z</runtimeStart>
    <runtimeMilliseconds>1060</runtimeMilliseconds>
    <oligomeric_statemin>1</oligomeric_statemin>
    <oligomeric_statemax>1</oligomeric_statemax>
  </orgPdbQuery>
 </queryRefinement>
 <queryRefinement>
  <queryRefinementLevel>2</queryRefinementLevel>
  <conjunctionType>and</conjunctionType>
  <orgPdbQuery>
    <version>head</version>
    <queryType>org.pdb.query.simple.SequenceLengthQuery</queryType>
    <description>Sequence Length is between 9 and 60 </description>
    <queryId>95F56C17</queryId>
    <resultCount>31099</resultCount>
    <runtimeStart>2018-11-29T13:35:18Z</runtimeStart>
    <runtimeMilliseconds>2545</runtimeMilliseconds>
    <v_sequence.chainLength.min>9</v_sequence.chainLength.min>
    <v_sequence.chainLength.max>60</v_sequence.chainLength.max>
  </orgPdbQuery>
 </queryRefinement>
</orgPdbCompositeQuery>]]></parent>
  </orgPdbQuery>"""

header = {'Content-Type': 'application/x-www-form-urlencoded'}
response = requests.post(url, data=query, headers=header)
if response.status_code != 200:
    print ("Failed to retrieve results.")
    
PDB_IDS = response.text.split("\n")    
print ("Retrieved {0} PDB IDs.".format(len(PDB_IDS)))

Retrieved 87 PDB IDs.


## 2. Set amino acid encoding and angle downloading methods.

In [22]:
import prody as pr
pr.confProDy(verbosity='error')
import numpy as np
import re
import pickle
from joblib import Parallel, delayed
from multiprocessing import Pool
import sys
    
AA_MAP = {'A': 15,'C': 0,'D': 1,'E': 17,'F': 8,'G': 10,'H': 11,'I': 5,'K': 4,'L': 12,'M': 19,'N': 9,'P': 6,'Q': 3,'R': 13,'S': 2,'T': 7,'V': 16,'W': 14,'Y': 18}
CUR_DIR = "/home/jok120/projML/data/"
pr.pathPDBFolder(CUR_DIR + "pdbgz/")
np.set_printoptions(suppress=True) # suppresses scientific notation when printing
np.set_printoptions(threshold=np.nan) # suppresses '...' when printing

In [23]:
def seq_to_onehot(seq):
    """ Given an AA sequence, returns a vector of one-hot vectors."""
    vector_array = []
    for aa in seq:
        one_hot = np.zeros(len(AA_MAP), dtype=bool)
        one_hot[AA_MAP[aa]] = 1
        vector_array.append(one_hot)
    return np.asarray(vector_array)

In [24]:
def get_bond_angles(res, next_res):
    """ Given 2 residues, returns the ncac, cacn, and cnca bond angles between them."""
    atoms = res.backbone.copy()
    atoms_next = next_res.backbone.copy()
    ncac = pr.calcAngle(atoms[0], atoms[1], atoms[2], radian=True)
    cacn = pr.calcAngle(atoms[1], atoms[2], atoms_next[0], radian=True)
    cnca = pr.calcAngle(atoms[2], atoms_next[0], atoms_next[1], radian=True)
    return ncac, cacn, cnca

In [25]:
def get_angles_from_chain(chain, pdb_id):
    """ Given a ProDy Chain object (from a Hierarchical View), return a numpy array of 
        angles. Returns None if the PDB should be ignored due to weird artifacts. Also measures
        the bond angles along the peptide backbone, since they account for significat variation.
        i.e. [[phi, psi, omega, ncac, cacn, cnca, chi1, chi2, chi3, chi4, chi5], [...] ...] """
    PAD_CHAR = 0
    OUT_OF_BOUNDS_CHAR = 0
    dihedrals = []
    sequence = ""
    
    try:
        if chain.nonstdaa:
#             print("Non-standard AAs found.")
            return None
        sequence = chain.getSequence()
        length = len(sequence)
        chain = chain.select("protein and not hetero").copy()
    except Exception as e:
#         print("Problem loading sequence.", e)
        return None

    all_residues = list(chain.iterResidues())
    prev = all_residues[0].getResnum()
    for i, res in enumerate(all_residues):   
        if (not res.isstdaa):
            print("Found a non-std AA. Why didn't you catch this?", chain)
            print(res.getNames())
            return None
        if res.getResnum() != prev:
            print('\rNon-continuous!!', pdb_id, end="")
            return None
        else:
            prev = res.getResnum() + 1
        try:
            phi = pr.calcPhi(res, radian=True, dist=None)
        except:
            phi = OUT_OF_BOUNDS_CHAR
        try:
            psi = pr.calcPsi(res, radian=True, dist=None)
        except:
            psi = OUT_OF_BOUNDS_CHAR
        try:
            omega = pr.calcOmega(res, radian=True, dist=None)
        except:
            omega = OUT_OF_BOUNDS_CHAR
        if phi == 0 and psi == 0 and omega == 0:
            return None
            
        if i == len(all_residues) - 1:
            BONDANGLES = [0, 0, 0]
        else:
            try:
                BONDANGLES = list(get_bond_angles(res, all_residues[i+1]))
            except Exception as e:
                print("Bond angle issue with", pdb_id, e)
                return None

        BACKBONE = [phi,psi,omega]
                  
        def compute_single_dihedral(atoms):
            return pr.calcDihedral(atoms[0],atoms[1],atoms[2],atoms[3],radian=True)
        
        def compute_all_res_dihedrals(atom_names):
            atoms = [res.select("name " + an) for an in atom_names]
            if None in atoms:
                return None
            res_dihedrals = []
            if len(atom_names) > 0:
                for i in range(len(atoms)-3):      
                    a = atoms[i:i+4]
                    res_dihedrals.append(compute_single_dihedral(a))
            return BACKBONE + BONDANGLES + res_dihedrals + (5 - len(res_dihedrals))*[PAD_CHAR]

        if res.getResname()=="ARG":
            atom_names = ["CA","C","CB","CG","CD","NE","CZ","NH1"]             
        elif res.getResname()=="HIS":
            atom_names = ["CA","C","CB","CG","ND1"]            
        elif res.getResname()=="LYS":
            atom_names = ["CA","C","CB","CG","CD","CE","NZ"]                   
        elif res.getResname()=="ASP":
            atom_names = ["CA","C","CB","CG","OD1"]            
        elif res.getResname()=="GLU":
            atom_names = ["CA","C","CB","CG","CD","OE1"]            
        elif res.getResname()=="SER":
            atom_names = ["CA","C","CB", "OG"]       
        elif res.getResname()=="THR":
            atom_names = ["CA","C","CB","CG2"]                    
        elif res.getResname()=="ASN":
            atom_names = ["CA","C","CB","CG","ND2"]                    
        elif res.getResname()=="GLN":
            atom_names = ["CA","C","CB","CG","CD","NE2"]                    
        elif res.getResname()=="CYS":
            atom_names = ["CA","C","CB","SG"]         
        elif res.getResname()=="GLY":
            atom_names = []                    
        elif res.getResname()=="PRO":
            atom_names = []                    
        elif res.getResname()=="ALA":
            atom_names = []            
        elif res.getResname()=="VAL":
            atom_names = ["CA","C","CB","CG1"]        
        elif res.getResname()=="ILE":
            atom_names = ["CA","C","CB","CG1","CD1"]        
        elif res.getResname()=="LEU":
            atom_names = ["CA","C","CB","CG","CD1"]        
        elif res.getResname()=="MET":
            atom_names = ["CA","C","CB","CG","SD","CE"]                    
        elif res.getResname()=="PHE":
            atom_names = ["CA","C","CB","CG", "CD1"]         
        elif res.getResname()=="TRP":
            atom_names = ["CA","C","CB","CG","CD1"]                          
        elif res.getResname()=="TYR":
            atom_names = ["CA","C","CB","CG","CD1"]
        else:
            continue
            
        calculated_dihedrals = compute_all_res_dihedrals(atom_names)
        if calculated_dihedrals == None:
            return None
        dihedrals.append(calculated_dihedrals)

    # No normalization
    dihedrals_np = np.asarray(dihedrals)
    # Check for NaNs - they shouldn't be here, but certainly should be excluded if they are.
    if np.any(np.isnan(dihedrals_np)):
        return None
    return dihedrals_np, sequence

## 3a. Iterate through all chains in `PDB_IDS`, saving all results to disk.

### Remove empty PDB ids.

In [26]:
PDB_IDS = list(filter(lambda x: x != "", PDB_IDS))

### Remove PDB entries that have gaps

In [27]:
NEW_PDB_IDS = []
for p in PDB_IDS:
    pdb_ = pr.parsePDB(p)
    if pdb_ is None:
        print(p, None)
        continue
    pdb_hv = pr.parsePDB(p).getHierView()
    for chain in pdb_hv:
        c = chain
        if c.getResindices()[-1] + 1 != len(set(c.getResindices())):
            print(p, "bad")
        else:
            NEW_PDB_IDS.append(p)

1DEB bad
1DEB bad
1Y47 bad
1Y47 bad
3BBZ bad
3BBZ bad
4MZZ bad
4MZZ bad
4N3X bad
4N3X bad
4N3X bad
4N3X bad
4YV4 bad
4YV4 bad
4YV4 bad
4YV4 bad
4YV4 bad
4YV4 bad
4YV4 bad
4YV4 bad
5MFH bad
5MFH bad
5MFH bad
5MFH bad
5MFH bad
5MFH bad
5V2O bad
5V2O bad
5V2O bad
5V2O bad
5V2O bad
5V2O bad


In [29]:
PDB_IDS = NEW_PDB_IDS.copy()

## 3b. Parallelized method of downloading data (not yet implemented).

In [30]:
import tqdm

In [31]:
%time
def work(pdb_id):
    pdb_dihedrals = []
    pdb_sequences = []
    ids = []
    try:
        pdb_hv = pr.parsePDB(pdb_id).getHierView()
        for chain in pdb_hv:
            chain_id = chain.getChid()
            dihedrals_sequence = get_angles_from_chain(chain, pdb_id)
            if dihedrals_sequence is None:
                continue 
            dihedrals, sequence = dihedrals_sequence
            pdb_dihedrals.append(dihedrals)
            pdb_sequences.append(sequence)
            ids.append(pdb_id + "_" + chain_id)
    except Exception as e:
        print("Whoops, returning where I am.", e)
    return pdb_dihedrals, pdb_sequences, ids

# p = Pool(15)
# results = p.map(work, PDB_IDS)
def _foo(i):
    return work(PDB_IDS[i])

with Pool(16) as p:
    results = list(tqdm.tqdm(p.imap(_foo, range(len(PDB_IDS))), total=len(PDB_IDS)))

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.39 µs


100%|██████████| 78/78 [00:00<00:00, 91.29it/s]


## 4. Save Python lists of data to disk. 

In [33]:
with open("raw_aquired" + suffix + ".pkl", "wb") as F:
    pickle.dump(results, F, 2)
len(results)

78

In [18]:
with open("raw_aquired" + "_1129" + ".pkl", "rb") as F:
    results = pickle.load(F)
len(results)

78

In [19]:
results_onehots = []
for r in results:
    ang, seq, i = r
    if len(i) == 0:
        continue
    results_onehots.append((ang[0], seq_to_onehot(seq[0]), i[0]))

In [35]:
results_onehots[0]

(array([[ 0.        ,  2.10685991,  3.14039919,  1.91003101,  2.0511851 ,
          2.09481361,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ],
        [-2.84690899,  3.11956781, -3.14123585,  1.9115174 ,  2.04962505,
          2.095468  ,  2.11113846,  0.84151237,  0.        ,  0.        ,
          0.        ],
        [-1.43614185, -1.70443975, -3.10168744,  1.91877092,  2.07926199,
          2.0361038 , -0.90541488,  1.79231189,  2.55188067,  1.12747811,
          0.        ],
        [ 0.98096575,  0.40613516, -3.09588627,  1.84636708,  2.04142665,
          2.13146332,  2.73564648,  0.        ,  0.        ,  0.        ,
          0.        ],
        [ 0.26276274,  0.75703315, -3.08993623,  1.93363706,  2.01503707,
          2.12904963, -1.75312631,  1.79243467,  0.        ,  0.        ,
          0.        ],
        [ 0.30216041,  0.90758012, -3.11061607,  1.91900806,  1.93802902,
          2.14637458, -1.30039865,  2.07659783,  0.        ,  0.       

In [20]:
all_ohs = []
all_angs = []
all_ids = []
for r in results_onehots:
    a, oh, i = r
    all_ohs.append(oh)
    all_angs.append(a)
    all_ids.append(i)

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
ohs_ids = list(zip(all_ohs, all_ids))

In [34]:
# To only have a training set, use this cell
X_train, X_test, X_val = ohs_ids, [], ohs_ids
y_train, y_test, y_val = all_angs, [], all_angs

In [39]:
X_train, X_test, y_train, y_test = train_test_split(ohs_ids, all_angs, test_size=0.20, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.14, random_state=42)

In [35]:
list(map(len, [X_train, X_test, y_train, y_test, X_val, y_val]))

[76, 0, 76, 0, 76, 76]

Remove and save ids.

In [36]:
X_train_labels = [x[1] for x in X_train]
X_test_labels = [x[1] for x in X_test]
X_val_labels = [x[1] for x in X_val]

In [37]:
X_train = [x[0] for x in X_train]
X_test = [x[0] for x in X_test]
X_val = [x[0] for x in X_val]

In [38]:
data = {"train": {"seq": X_train,
                  "ang": y_train,
                  "ids": X_train_labels},
        "valid": {"seq": X_val,
                  "ang": y_val,
                  "ids": X_val_labels},
        "test":  {"seq": X_test,
                  "ang": y_test,
                  "ids": X_test_labels},
       "settings": {"max_len": max(map(len, all_ohs))}}

In [44]:
data_file_name = "helix_train_only.pkl"

In [43]:
data

{'train': {'seq': [array([[False, False, False, False, False, False, False, False, False,
           False, False, False, False, False, False,  True, False, False,
           False, False],
          [False,  True, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False,
           False, False],
          [False, False, False, False,  True, False, False, False, False,
           False, False, False, False, False, False, False, False, False,
           False, False],
          [False, False,  True, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False,
           False, False],
          [False,  True, False, False, False, False, False, False, False,
           False, False, False, False, False, False, False, False, False,
           False, False],
          [False, False, False, False, False, False, False, False, False,
           False, False, False,  True, F

In [45]:
with open(data_file_name, "wb") as datafile:
    pickle.dump(data, datafile)

In [3]:
import pickle

In [4]:
with open("data.pkl", "rb") as datafile:
    data = pickle.load(datafile)

In [40]:
import torch

In [41]:
torch.save(data, data_file_name)

In [42]:
X_train_labels

['1DX7_A',
 '1GK7_A',
 '1HGV_A',
 '1HGZ_A',
 '1HH0_A',
 '1IFD_A',
 '1IFI_A',
 '1IFJ_A',
 '1IFK_A',
 '1IFL_A',
 '1IFM_A',
 '1IFN_A',
 '1IFP_A',
 '1O06_A',
 '1QL1_A',
 '1X9B_A',
 '2BP4_A',
 '2C0W_A',
 '2IFM_A',
 '2IFO_A',
 '2IV4_A',
 '2IV5_A',
 '2K10_A',
 '2KMU_A',
 '2KWY_A',
 '2L36_A',
 '2L5R_A',
 '2LBG_A',
 '2LHG_A',
 '2LLM_A',
 '2LMF_A',
 '2LQ0_A',
 '2LQ2_A',
 '2LX4_A',
 '2M0W_A',
 '2M1A_A',
 '2M3F_A',
 '2M8M_A',
 '2M8O_A',
 '2MG1_A',
 '2MMM_A',
 '2MOC_A',
 '2MUA_A',
 '2MUN_A',
 '2MZX_A',
 '2N35_A',
 '2N85_A',
 '2N8O_A',
 '2NB2_A',
 '2NCS_A',
 '2NCT_A',
 '2NCW_A',
 '2NCX_A',
 '2NCY_A',
 '2ND2_A',
 '2ND6_A',
 '2NDK_A',
 '2RRH_A',
 '2XKM_A',
 '3A03_A',
 '3P7K_A',
 '3V1A_A',
 '5KWZ_A',
 '5KX1_A',
 '5LBJ_A',
 '5LWC_A',
 '5MMK_A',
 '5MML_A',
 '5UJQ_A',
 '5V4U_A',
 '5WOD_A',
 '5XDJ_A',
 '5XER_A',
 '6ATS_A',
 '6BJF_A',
 '6FS5_A']