# Create Dataset
This notebook extracts protein secondary structure information for a set of representative protein chains and assigns a fold type. 

In [1]:
# parameters
feature_col = "features" # feature vector
value_col = "foldType" # fold type to be predicted

In [2]:
import pandas as pd
import numpy as np     
import pdbutils 

### Read representative set of PDB chains 
- sequence identity 20%
- resolution 2.5

In [3]:
rep = pdbutils.read_pisces_representatives('./data/cullpdb_pc25_res3.0_R1.0_d180920_chains14051.gz')
print("Number of representative PDB chains:", rep.shape[0])
rep.head()

Number of representative PDB chains: 14051


Unnamed: 0,length,Exptl.,resolution,R-factor,FreeRvalue,pdbChainId
0,330,XRAY,2.2,0.16,0.29,12AS.A
1,366,XRAY,2.1,0.19,0.26,16VP.A
2,348,XRAY,2.6,0.22,0.34,1A0I.A
3,413,XRAY,1.7,0.19,0.22,1A12.C
4,108,XRAY,2.0,0.21,0.25,1A1X.A


### Read protein sequence and secondary structure for all PDB chains
The following file downloaded from rcsb.org contains the sequences and calculated secondary structure using the DSSP method.

The method below reads the file with secondary structure information and returns the results as a Pandas dataframe.

In [4]:
ss = pdbutils.read_secondary_structure('./data/ss_dis.txt.gz')
print("Number of PDB chains:", ss.shape[0])
ss.head()

Number of PDB chains: 402007


Unnamed: 0,pdbChainId,sequence,secondary_structure
0,101M.A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...
1,102L.A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...,CCHHHHHHHHHCCEEEEEECTTSCEEEETTEEEESSSCTTTHHHHH...
2,102M.A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...
3,103L.A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...,CCHHHHHHHHHCCEEEEEECTTSCEEEETTEECCCCCCCCCHHHHH...
4,103M.A,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,CCCCHHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHCGGGGGGCTT...


### Find the intersection between the two data sets
By merging the represenative set with the secondary structure dataframe we obtain the interaction of the two dataframes.

In [5]:
df = ss.merge(rep, left_on='pdbChainId', right_on='pdbChainId', how='inner')
print(df.shape[0])
df.head()

13791


Unnamed: 0,pdbChainId,sequence,secondary_structure,length,Exptl.,resolution,R-factor,FreeRvalue
0,12AS.A,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...,CCCCHHHHHHHHHHHHHHHHHHHHHHHCEEECCCCSEEETTSSCSC...,330,XRAY,2.2,0.16,0.29
1,16VP.A,SRMPSPPMPVPPAALFNRLLDDLGFSAGPALCTMLDTWNEDLFSAL...,CCSCCCCCCCCHHHHHHHHHHHHTCTTHHHHHHHHHHCCCCCSTTS...,366,XRAY,2.1,0.19,0.26
2,1A0I.A,VNIKTNPFKAVSFVESAIKKALDNAGYLIAEIKYDGVRGNICVDNT...,CTTCCCCEEEEECCHHHHHHHHHHHSSEEEEECCCSEEEEEEEETT...,348,XRAY,2.6,0.22,0.34
3,1A12.C,RRSPPADAIPKSKKVKVSHRSHSTEPGLVLTLGQGDVGQLGLGENV...,CCCCCCCCCCCCCCCCCCCTTCCCCCBEEEEEEECTTSTTCSCTTC...,413,XRAY,1.7,0.19,0.22
4,1A1X.A,GSAGEDVGAPPDHLWVHQEGIYRDEYQRTWVAVVEEETSFLRARVQ...,CCCCCCCCCCCSEEEEEETTEEEETTSCEEEEEEEECSSCEEEEEE...,108,XRAY,2.0,0.21,0.25


### Calculate secondary structure percentage  

In [6]:
# helper functions to calculate secondary structure percentage  
helix_percent_calculator = lambda s: (s.count('5') + s.count('H') + s.count('G')) / len(s)                                                   
sheet_percent_calculator = lambda s: (s.count('E') + s.count('B')) / len(s)                                                                
coil_percent_calculator = lambda s: (s.count('S') + s.count('T') + s.count('C')) / len(s)                                                   
                                                                                                                                                             
# calculate each secondary structure percentage                                                                                                              
df['alpha'] = df.secondary_structure.apply(helix_percent_calculator)                                                                                         
df['beta'] = df.secondary_structure.apply(sheet_percent_calculator)                                                                                          
df['coil'] = df.secondary_structure.apply(coil_percent_calculator)                                                                                           

df.head()

Unnamed: 0,pdbChainId,sequence,secondary_structure,length,Exptl.,resolution,R-factor,FreeRvalue,alpha,beta,coil
0,12AS.A,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...,CCCCHHHHHHHHHHHHHHHHHHHHHHHCEEECCCCSEEETTSSCSC...,330,XRAY,2.2,0.16,0.29,0.345455,0.206061,0.448485
1,16VP.A,SRMPSPPMPVPPAALFNRLLDDLGFSAGPALCTMLDTWNEDLFSAL...,CCSCCCCCCCCHHHHHHHHHHHHTCTTHHHHHHHHHHCCCCCSTTS...,366,XRAY,2.1,0.19,0.26,0.469945,0.046448,0.483607
2,1A0I.A,VNIKTNPFKAVSFVESAIKKALDNAGYLIAEIKYDGVRGNICVDNT...,CTTCCCCEEEEECCHHHHHHHHHHHSSEEEEECCCSEEEEEEEETT...,348,XRAY,2.6,0.22,0.34,0.232759,0.318966,0.448276
3,1A12.C,RRSPPADAIPKSKKVKVSHRSHSTEPGLVLTLGQGDVGQLGLGENV...,CCCCCCCCCCCCCCCCCCCTTCCCCCBEEEEEEECTTSTTCSCTTC...,413,XRAY,1.7,0.19,0.22,0.038741,0.418886,0.542373
4,1A1X.A,GSAGEDVGAPPDHLWVHQEGIYRDEYQRTWVAVVEEETSFLRARVQ...,CCCCCCCCCCCSEEEEEETTEEEETTSCEEEEEEEECSSCEEEEEE...,108,XRAY,2.0,0.21,0.25,0.037037,0.472222,0.490741


### Label structure with protein foldtype

In [7]:
def protein_fold_type(data, minThreshold, maxThreshold):
    '''
    Returns fold type with three major secondary structure class:
    "alpha", "beta", "alpha+beta", and "other" based upon the fraction of alpha/beta content.

    Attributes:
        data (DataFrame<Row>): input dataframe with alpha, beta composition
        minThreshold (float): below this threshold, the secondary structure is ignored
        maxThreshold (float): above this threshold, the secondary structure is ignored
    '''
    if data.alpha > maxThreshold and data.beta < minThreshold:                                
        return "alpha"                                                                        
    elif data.beta > maxThreshold and data.alpha < minThreshold:                              
        return "beta"                                                                         
    elif data.alpha > maxThreshold and data.beta > minThreshold:                              
        return "alpha+beta"                                                                   
    else:                                                                                     
        return "other"

## Classify structures by secondary structure content
* alpha: predominantly alpha (>=25%)
* beta: predominantly beta (>=25%)
* alpha+beta: significant alpha (>=25%) and beta (>=25%)

In [8]:
df[value_col] = df.apply(protein_fold_type, minThreshold = 0.05, maxThreshold = 0.25, axis=1)
df = df[(df[value_col] == 'alpha') | (df[value_col] == 'beta') | (df[value_col] == 'alpha+beta')]
df.head(10)

Unnamed: 0,pdbChainId,sequence,secondary_structure,length,Exptl.,resolution,R-factor,FreeRvalue,alpha,beta,coil,foldType
0,12AS.A,MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD...,CCCCHHHHHHHHHHHHHHHHHHHHHHHCEEECCCCSEEETTSSCSC...,330,XRAY,2.2,0.16,0.29,0.345455,0.206061,0.448485,alpha+beta
1,16VP.A,SRMPSPPMPVPPAALFNRLLDDLGFSAGPALCTMLDTWNEDLFSAL...,CCSCCCCCCCCHHHHHHHHHHHHTCTTHHHHHHHHHHCCCCCSTTS...,366,XRAY,2.1,0.19,0.26,0.469945,0.046448,0.483607,alpha
3,1A12.C,RRSPPADAIPKSKKVKVSHRSHSTEPGLVLTLGQGDVGQLGLGENV...,CCCCCCCCCCCCCCCCCCCTTCCCCCBEEEEEEECTTSTTCSCTTC...,413,XRAY,1.7,0.19,0.22,0.038741,0.418886,0.542373,beta
4,1A1X.A,GSAGEDVGAPPDHLWVHQEGIYRDEYQRTWVAVVEEETSFLRARVQ...,CCCCCCCCCCCSEEEEEETTEEEETTSCEEEEEEEECSSCEEEEEE...,108,XRAY,2.0,0.21,0.25,0.037037,0.472222,0.490741,beta
5,1A2X.B,GDEEKRNRAITARRQHLKSVMLQIAATELEKEEGRREAEKQNYLAEH,CCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHTCCCCCCCCCCCCCCC,47,XRAY,2.3,0.22,0.33,0.574468,0.0,0.425532,alpha
6,1A41.A,NAKRDRIFVRVYNVMKRINCFINKNIKKSSTDSNYQLAVFMLMETM...,CHHHHHHHHHHHHHHHHHHHHHHHHTTSTTTCTTHHHHHHHHHHHC...,234,XRAY,2.3,0.23,0.31,0.568376,0.081197,0.350427,alpha+beta
7,1A5T.A,MRWYPWLRPDFEKLVASYQAGRGHHALLIQALPGMGDDALIYALSR...,CCCCGGGHHHHHHHHHHHHTTCCCSEEEEECCTTSCHHHHHHHHHH...,334,XRAY,2.2,0.2,0.27,0.538922,0.086826,0.374251,alpha+beta
8,1A62.A,MNLTELKNTPVSELITLGENMGLENLARMRKQDIIFAILKQHAKSG...,CBHHHHHTSCHHHHHHHHHTTTCCCCTTSCHHHHHHHHHHHHHHTT...,130,XRAY,1.55,0.22,0.25,0.284615,0.261538,0.453846,alpha+beta
10,1A7J.A,MSKKHPIISVTGSSGAGTSTVKHTFDQIFRREGVKAVSIEGDAFHR...,CCTTSCEEEEESCCCCCCCTHHHHHHHHHHHHTCCEEEEEGGGGBS...,290,XRAY,2.5,0.21,0.28,0.393103,0.186207,0.42069,alpha+beta
11,1A8L.A,MGLISDADKKVIKEEFFSKMVNPVKLIVFVRKDHCQYCDQLKQLVQ...,CCSSCHHHHHHHHHHTGGGCCSCEEEEEEECSSSCTTHHHHHHHHH...,226,XRAY,1.9,0.19,0.22,0.411504,0.230088,0.358407,alpha+beta


### Save dataset

In [9]:
df.to_json("./secondaryStructure.json")

## Next step
After you saved the dataset here, go back to the [Overview.ipynb](./Overview.ipynb) to run the next step of the analysis.