In [2]:
import os
import pandas as pd
from tsv_to_df import *
import time
from stability_scores_csv import *

In [3]:
# Switch the kernel's working directory to the project folder.
# NOTE: absolute path specific to this Jupyter host; the tsv_to_* functions
# defined further down change directory again to the proteins.tsv sub-folder.
os.chdir('/home/jupyter/tacc-work/Jan/')

In [4]:
# Absolute path of the aggregated CSV that the tsv_to_* functions load with
# pd.read_table (columns read there: 'name', 'dssp', 'sequence' and 'stable?').
Hamed = '/home/jupyter/sd2e-community/protein-design/data_v1_April_2018/aggregated_v1_data_Hamed_May_23_2018/data_v1_aggregated.csv'

In [5]:
def canonicalize_name(name):
    """Return ``name`` normalised so that it ends with ``'.pdb'``.

    Rules, applied in order:

    * ``'<stem>.pdb<anything>.seq'`` -> everything up to and including the
      last ``'.pdb'`` (e.g. ``'a.pdb.seq'`` -> ``'a.pdb'``).
    * ``'<stem>.seq'`` with no ``'.pdb'`` inside -> ``'<stem>.pdb'``.
    * any other name not ending in ``'.pdb'`` -> ``name + '.pdb'``.
    * a name already ending in ``'.pdb'`` is returned unchanged.

    Parameters
    ----------
    name : str
        Protein / design name, possibly carrying a ``.seq`` or ``.pdb`` suffix.

    Returns
    -------
    str
        The canonical name ending in ``'.pdb'``.
    """
    if name.endswith('.seq'):
        i = name.rfind('.pdb')
        if i == -1:
            # No embedded '.pdb': slicing with -1 would silently chop the last
            # character ('foo.seq' -> 'foo.se.pdb'), so swap the suffix instead.
            return name[:-len('.seq')] + '.pdb'
        return name[:i] + '.pdb'
    if not name.endswith('.pdb'):
        return name + '.pdb'
    return name



In [17]:
def tsv_to_residues_df(tsv_file):
    """Build and save a per-residue table of DSSP codes and Si values for one library.

    Every protein in ``tsv_to_dict(tsv_file)`` is looked up (by canonical name)
    in the aggregated CSV at ``Hamed``.  For each match, one row per residue is
    produced, indexed by ``(protein, sequence)`` with the columns ``dssp`` and
    ``Si_values``.  The combined table is written to
    ``/home/jupyter/tacc-work/Jan/proteins.df/<library_name>.csv``.

    Side effect: the process working directory is switched to the
    ``proteins.tsv`` folder (presumably so ``tsv_file`` resolves there).

    Parameters
    ----------
    tsv_file : str
        File name of the library TSV, e.g. ``'Eva1.tsv'``.

    Returns
    -------
    tuple
        ``(final_df, total_time, count)`` - the combined DataFrame, the elapsed
        wall-clock seconds, and the number of proteins skipped because a lookup
        raised ``KeyError``.
    """
    start_time = time.time()

    # Make sure we are in the TSV directory.  os.getcwd must be *called*, and it
    # never returns a trailing slash, so compare normalised paths.
    tsv_dir = '/home/jupyter/tacc-work/Jan/proteins.tsv/'
    if os.path.normpath(os.getcwd()) != os.path.normpath(tsv_dir):
        os.chdir(tsv_dir)
    library_name = (os.path.splitext(tsv_file))[0]

    # Reference table, indexed by its first selected column, with canonical names
    Hamed_df = pd.read_table(Hamed,sep=',',usecols=['dssp','sequence','name'],index_col=[0])
    Hamed_df.index = Hamed_df.index.map(canonicalize_name)

    # Per-protein frames are collected here and concatenated once at the end;
    # appending to a DataFrame inside the loop re-copies everything each time.
    frames = []
    # counter for proteins skipped
    count = 0
    # protein dictionary: iterated by protein name, each value holds 'Si_values'
    tsv_dict = tsv_to_dict(tsv_file)
    for entry in tsv_dict:
        try:
            # Raises KeyError when the protein is not in the reference table
            Hamed_values = Hamed_df.loc[canonicalize_name(entry)]
            Si_list = tsv_dict[entry]['Si_values']
            sequence = Hamed_values['sequence']
            dssp = Hamed_values['dssp']
            # Pair values positionally, one tuple per residue
            # (assumes dssp, sequence and Si_values have equal length - TODO confirm)
            tuples = list(zip(dssp,Si_list))
            # Index: protein name repeated per residue, plus the residue letter
            index_tuples = list(zip(([entry]*(len(sequence))),sequence))
            index = pd.MultiIndex.from_tuples(index_tuples,names=['protein','sequence'])
            frames.append(pd.DataFrame(tuples,columns=['dssp','Si_values'],index=index))
        except KeyError:
            # adds 1 for every protein it skips
            count+=1
            continue

    # pd.concat rejects an empty list, so fall back to an empty frame
    final_df = pd.concat(frames) if frames else pd.DataFrame()
    final_df.to_csv('/home/jupyter/tacc-work/Jan/proteins.df/'+library_name+'.csv')
    final_time = time.time()
    total_time = (final_time-start_time)
    return final_df, total_time, count
    

In [26]:
def tsv_to_stableresidues_df(tsv_file):
    """Build and save the per-residue table for the *stable* proteins of one library.

    Same procedure as ``tsv_to_residues_df`` but the reference table loaded from
    ``Hamed`` is first restricted to rows whose ``'stable?'`` value equals 1.
    Each matching protein contributes one row per residue, indexed by
    ``(protein, sequence)`` with the columns ``sequence``, ``dssp`` and
    ``Si_values``.  The result is written to
    ``/home/jupyter/tacc-work/Jan/proteins.df/stable_residues/<library_name>_stable.csv``.

    Side effect: the process working directory is switched to the
    ``proteins.tsv`` folder (presumably so ``tsv_file`` resolves there).

    Parameters
    ----------
    tsv_file : str
        File name of the library TSV, e.g. ``'Eva1.tsv'``.

    Returns
    -------
    tuple
        ``(final_df, total_time, count)`` - the combined DataFrame, the elapsed
        wall-clock seconds, and the number of proteins skipped because a lookup
        raised ``KeyError`` (this includes proteins filtered out as not stable).
    """
    start_time = time.time()

    # Make sure we are in the TSV directory.  os.getcwd must be *called*, and it
    # never returns a trailing slash, so compare normalised paths.
    tsv_dir = '/home/jupyter/tacc-work/Jan/proteins.tsv/'
    if os.path.normpath(os.getcwd()) != os.path.normpath(tsv_dir):
        os.chdir(tsv_dir)
    library_name = (os.path.splitext(tsv_file))[0]

    # Reference table, indexed by its first selected column
    Hamed_df = pd.read_table(Hamed,sep=',',usecols=['dssp','sequence','stable?','name'],index_col=[0])
    # Keep only the rows flagged as stable
    values = pd.to_numeric(Hamed_df['stable?'])
    stable = values==1
    Hamed_df = Hamed_df[stable]
    # fix the index names to be canonical
    Hamed_df.index = Hamed_df.index.map(canonicalize_name)

    # Per-protein frames are collected here and concatenated once at the end;
    # appending to a DataFrame inside the loop re-copies everything each time.
    frames = []
    # counter for proteins skipped
    count = 0
    # protein dictionary: iterated by protein name, each value holds 'Si_values'
    tsv_dict = tsv_to_dict(tsv_file)
    for entry in tsv_dict:
        try:
            # Raises KeyError when the protein is absent from the filtered table
            Hamed_values = Hamed_df.loc[canonicalize_name(entry)]
            Si_list = tsv_dict[entry]['Si_values']
            sequence = Hamed_values['sequence']
            dssp = Hamed_values['dssp']
            # Pair values positionally, one tuple per residue
            # (assumes sequence, dssp and Si_values have equal length - TODO confirm)
            tuples = list(zip(sequence,dssp,Si_list))
            # Index: protein name repeated per residue, plus the residue letter
            index_tuples = list(zip(([entry]*(len(sequence))),sequence))
            index = pd.MultiIndex.from_tuples(index_tuples,names=['protein','sequence'])
            frames.append(pd.DataFrame(tuples,columns=['sequence','dssp','Si_values'],index=index))
        except KeyError:
            # adds 1 for every protein it skips
            count+=1
            continue

    # pd.concat rejects an empty list, so fall back to an empty frame
    final_df = pd.concat(frames) if frames else pd.DataFrame()
    final_df.to_csv('/home/jupyter/tacc-work/Jan/proteins.df/stable_residues/'+library_name+'_stable.csv')
    final_time = time.time()
    total_time = (final_time-start_time)
    return final_df, total_time, count
    

In [27]:
def tsv_to_unstableresidues_df(tsv_file):
    """Build and save the per-residue table for the *unstable* proteins of one library.

    Same procedure as ``tsv_to_residues_df`` but the reference table loaded from
    ``Hamed`` is first restricted to rows whose ``'stable?'`` value equals 0.
    Each matching protein contributes one row per residue, indexed by
    ``(protein, sequence)`` with the columns ``sequence``, ``dssp`` and
    ``Si_values``.  The result is written to
    ``/home/jupyter/tacc-work/Jan/proteins.df/unstable_residues/<library_name>_unstable.csv``.

    Side effect: the process working directory is switched to the
    ``proteins.tsv`` folder (presumably so ``tsv_file`` resolves there).

    Parameters
    ----------
    tsv_file : str
        File name of the library TSV, e.g. ``'Eva1.tsv'``.

    Returns
    -------
    tuple
        ``(final_df, total_time, count)`` - the combined DataFrame, the elapsed
        wall-clock seconds, and the number of proteins skipped because a lookup
        raised ``KeyError`` (this includes proteins filtered out as not unstable).
    """
    start_time = time.time()

    # Make sure we are in the TSV directory.  os.getcwd must be *called*, and it
    # never returns a trailing slash, so compare normalised paths.
    tsv_dir = '/home/jupyter/tacc-work/Jan/proteins.tsv/'
    if os.path.normpath(os.getcwd()) != os.path.normpath(tsv_dir):
        os.chdir(tsv_dir)

    library_name = (os.path.splitext(tsv_file))[0]

    # Reference table, indexed by its first selected column
    Hamed_df = pd.read_table(Hamed,sep=',',usecols=['dssp','sequence','stable?','name'],index_col=[0])
    # Keep only the rows flagged as unstable
    values = pd.to_numeric(Hamed_df['stable?'])
    unstable = values==0
    Hamed_df = Hamed_df[unstable]
    # fix the index names to be canonical
    Hamed_df.index = Hamed_df.index.map(canonicalize_name)

    # Per-protein frames are collected here and concatenated once at the end;
    # appending to a DataFrame inside the loop re-copies everything each time.
    frames = []
    # counter for proteins skipped
    count = 0
    # protein dictionary: iterated by protein name, each value holds 'Si_values'
    tsv_dict = tsv_to_dict(tsv_file)
    for entry in tsv_dict:
        try:
            # Raises KeyError when the protein is absent from the filtered table
            Hamed_values = Hamed_df.loc[canonicalize_name(entry)]
            Si_list = tsv_dict[entry]['Si_values']
            sequence = Hamed_values['sequence']
            dssp = Hamed_values['dssp']
            # Pair values positionally, one tuple per residue
            # (assumes sequence, dssp and Si_values have equal length - TODO confirm)
            tuples = list(zip(sequence,dssp,Si_list))
            # Index: protein name repeated per residue, plus the residue letter
            index_tuples = list(zip(([entry]*(len(sequence))),sequence))
            index = pd.MultiIndex.from_tuples(index_tuples,names=['protein','sequence'])
            frames.append(pd.DataFrame(tuples,columns=['sequence','dssp','Si_values'],index=index))
        except KeyError:
            # adds 1 for every protein it skips
            count+=1
            continue

    # pd.concat rejects an empty list, so fall back to an empty frame
    final_df = pd.concat(frames) if frames else pd.DataFrame()
    final_df.to_csv('/home/jupyter/tacc-work/Jan/proteins.df/unstable_residues/'+library_name+'_unstable.csv')
    final_time = time.time()
    total_time = (final_time-start_time)
    return final_df, total_time, count
    

## Eva1 Proteins

In [20]:
# Build the per-residue table for the Eva1 library (the function also writes it
# to CSV), then report elapsed seconds, skipped-protein count and the frame.
# final_df / total_time / count are kernel globals overwritten by each library cell.
final_df, total_time, count = tsv_to_residues_df('Eva1.tsv')

print("process took:",total_time,'seconds \n')
print("skipped",count,'proteins \n')
print('final DF is:\n',final_df)

process took: 3282.8737285137177 seconds 

skipped 233 proteins 

final DF is:
                             dssp Si_values
protein            sequence               
ems_ferrM_7350.pdb G           L     0.933
                   E           E     0.134
                   V           E    -0.379
                   E           E    -0.021
                   V           E    -0.365
                   H           L    -0.107
                   N           L    -0.436
                   V           L    -0.580
                   D           L     0.326
                   E           L     0.358
                   A           H    -0.338
                   R           H    -1.099
                   E           H     0.432
                   F           H    -0.622
                   A           H    -0.300
                   K           H     0.649
                   E           H     0.846
                   A           H    -0.094
                   K           H     0.550
                 

In [28]:
# Stable-only table for Eva1; the returned (DataFrame, seconds, skipped) tuple
# is not assigned, so it is shown as the cell result.
tsv_to_stableresidues_df('Eva1.tsv')

(                          sequence dssp Si_values
 protein          sequence                        
 ems_4hM_4415.pdb D               D    L     1.294
                  L               L    H     0.316
                  E               E    H    -0.151
                  K               K    H     0.634
                  L               L    H    -0.785
                  A               A    H    -0.605
                  E               E    H    -0.854
                  K               K    H     0.680
                  L               L    H    -0.932
                  V               V    H    -1.076
                  E               E    H     0.954
                  E               E    H     1.248
                  I               I    H    -0.014
                  F               F    L    -0.426
                  D               D    L     0.540
                  D               D    L     0.512
                  P               P    H    -1.378
                  N            

In [29]:
# Unstable-only table for Eva1; the returned (DataFrame, seconds, skipped) tuple
# is not assigned, so it is shown as the cell result.
tsv_to_unstableresidues_df('Eva1.tsv')

(                            sequence dssp Si_values
 protein            sequence                        
 ems_ferrM_7350.pdb G               G    L     0.933
                    E               E    E     0.134
                    V               V    E    -0.379
                    E               E    E    -0.021
                    V               V    E    -0.365
                    H               H    L    -0.107
                    N               N    L    -0.436
                    V               V    L    -0.580
                    D               D    L     0.326
                    E               E    L     0.358
                    A               A    H    -0.338
                    R               R    H    -1.099
                    E               E    H     0.432
                    F               F    H    -0.622
                    A               A    H    -0.300
                    K               K    H     0.649
                    E               E    H    

## Eva2 Proteins 

In [21]:
# Build the per-residue table for the Eva2 library (the function also writes it
# to CSV), then report elapsed seconds, skipped-protein count and the frame.
# Overwrites the kernel globals final_df / total_time / count.
final_df, total_time, count = tsv_to_residues_df('Eva2.tsv')

print("process took:",total_time,'seconds')
print("skipped",count,'proteins')
print('final DF is:\n',final_df)

process took: 5802.316943883896 seconds
skipped 260 proteins
final DF is:
                                dssp Si_values
protein               sequence               
ems_3hC_63.pdb        D           L     1.377
                      K           H     1.490
                      S           H    -0.100
                      Q           H     0.735
                      T           H    -0.567
                      V           H    -0.801
                      T           H    -0.872
                      T           H    -0.576
                      L           H    -1.184
                      S           H    -0.577
                      K           H     0.607
                      K           H     0.325
                      A           H    -0.591
                      E           H    -0.537
                      K           H     0.314
                      L           H    -0.784
                      L           H    -1.240
                      K           H    -0.876
     

In [None]:
# Stable-only table for Eva2; the returned tuple is shown as the cell result.
tsv_to_stableresidues_df('Eva2.tsv')

(                               sequence dssp Si_values
 protein               sequence                        
 ems_3hC_1127_0001.pdb E               E    L     1.750
                       Q               Q    H     1.056
                       V               V    H    -0.293
                       E               E    H     0.771
                       E               E    H     0.730
                       W               W    H    -0.784
                       A               A    H    -0.747
                       T               T    H    -1.095
                       T               T    H    -0.599
                       V               V    H    -0.911
                       A               A    H    -0.594
                       E               E    H    -0.083
                       L               L    H    -0.979
                       Y               Y    H    -1.212
                       V               V    H    -0.732
                       K               K    H   

In [None]:
# Unstable-only table for Eva2; the returned tuple is shown as the cell result.
tsv_to_unstableresidues_df('Eva2.tsv')

## Inna Proteins 

In [22]:
# Build the per-residue table for the Inna library (the function also writes it
# to CSV), then report elapsed seconds, skipped-protein count and the frame.
# Overwrites the kernel globals final_df / total_time / count.
final_df, total_time, count = tsv_to_residues_df('Inna.tsv')

print("process took:",total_time,'seconds')
print("skipped",count,'proteins')
print('final DF is:\n',final_df)
# Last expression: number of rows in the table, shown as the cell result.
len(final_df)

process took: 38.632808446884155 seconds
skipped 656 proteins
final DF is:
                                                    dssp Si_values
protein                                   sequence               
p1-15H-GABBL-15H-GBBL-16H_0395_0001.pdb   S           L     1.331
                                          K           H     1.616
                                          E           H     0.429
                                          E           H     0.668
                                          K           H     0.989
                                          I           H    -1.250
                                          K           H    -0.646
                                          R           H     0.337
                                          T           H    -0.798
                                          A           H    -0.673
                                          T           H    -1.025
                                          K           H     0.604


84255

In [None]:
# Stable-only table for Inna; the returned tuple is shown as the cell result.
tsv_to_stableresidues_df('Inna.tsv')

In [None]:
# Unstable-only table for Inna; the returned tuple is shown as the cell result.
tsv_to_unstableresidues_df('Inna.tsv')

In [151]:
# NOTE(review): this is a *column* lookup keyed by a protein name. The frames
# returned by the tsv_to_* functions in this notebook have the columns
# 'dssp' / 'Si_values' (plus 'sequence') and keep proteins in the row index, so
# this cell appears to rely on an earlier, differently shaped final_df left in
# the kernel - confirm, or select rows with final_df.loc[...] instead.
final_df['p1-14H-GBL-14H-GBL-14H_0528_0001_0001.pdb']

sequence,E,S,E.1,E.2,L,K,K.1,R,A,T,...,Y,I,R.1,E.3,L.1,L.2,E.4,R.2,L.3,G
dssp,L,H,H,H,H,H,H,H,H,H,...,H,H,H,H,H,H,H,H,H,L
Si_values,2.065,0.948,0.755,0.553,-1.095,-1.326,0.443,-0.058,-0.613,-0.762,...,-0.915,-1.471,-1.445,0.054,-0.945,-0.904,0.894,0.910,0.099,0.865


## Longxing Proteins

In [23]:
# Build the per-residue table for the Longxing library (the function also writes
# it to CSV), then report elapsed seconds, skipped-protein count and the frame.
# Overwrites the kernel globals final_df / total_time / count.
final_df, total_time, count = tsv_to_residues_df('Longxing.tsv')

print("process took:",total_time,'seconds')
print("skipped",count,'proteins')
print('final DF is:\n',final_df)

process took: 3767.058089733124 seconds
skipped 523 proteins
final DF is:
                          dssp Si_values
protein         sequence               
EEHEE_28024.pdb D           L     1.115
                D           L     0.907
                V           E     0.215
                K           E     1.318
                E           E     0.760
                K           E     0.869
                R           E     1.453
                T           E    -0.017
                N           E    -0.076
                V           E     0.320
                N           L    -0.031
                G           L     0.327
                R           E    -0.315
                E           E     0.744
                Q           E    -0.899
                R           E    -0.371
                H           E    -0.412
                V           E    -0.255
                E           E     0.934
                V           E     0.056
                N           L     0.419
     

In [None]:
# Stable-only table for Longxing; the returned tuple is shown as the cell result.
tsv_to_stableresidues_df('Longxing.tsv')

In [None]:
# Unstable-only table for Longxing; the returned tuple is shown as the cell result.
tsv_to_unstableresidues_df('Longxing.tsv')

(                         sequence dssp Si_values
 protein         sequence                        
 EEHEE_28024.pdb D               D    L     1.115
                 D               D    L     0.907
                 V               V    E     0.215
                 K               K    E     1.318
                 E               E    E     0.760
                 K               K    E     0.869
                 R               R    E     1.453
                 T               T    E    -0.017
                 N               N    E    -0.076
                 V               V    E     0.320
                 N               N    L    -0.031
                 G               G    L     0.327
                 R               R    E    -0.315
                 E               E    E     0.744
                 Q               Q    E    -0.899
                 R               R    E    -0.371
                 H               H    E    -0.412
                 V               V    E    -0.255


## Longxing_untested Proteins

In [None]:
# NOTE(review): pd_tuple_residues is not defined anywhere in this notebook; unless
# one of the star imports provides it, this cell raises NameError. Every other
# library section calls tsv_to_residues_df - confirm which function is intended.
final_df, total_time, count = pd_tuple_residues('Longxing_untested.tsv')

print("process took:",total_time,'seconds')
print("skipped",count,'proteins')
print('final DF is:\n',final_df)

## Rocklin Proteins

In [24]:
# Build the per-residue table for the Rocklin library (the function also writes
# it to CSV), then report elapsed seconds, skipped-protein count and the frame.
# Overwrites the kernel globals final_df / total_time / count.
final_df, total_time, count = tsv_to_residues_df('Rocklin.tsv')

print("process took:",total_time,'seconds')
print("skipped",count,'proteins')
print('final DF is:\n',final_df)

process took: 1768.9460153579712 seconds
skipped 305 proteins
final DF is:
                             dssp Si_values
protein            sequence               
HEEH_rd2_0581.pdb  S           L     1.447
                   D           H     0.757
                   K           H     1.191
                   E           H     0.796
                   K           H    -0.171
                   A           H    -0.309
                   Q           H    -0.012
                   R           H     0.100
                   A           H    -0.373
                   K           H    -0.969
                   E           H     0.158
                   A           H    -0.197
                   Y           H    -1.081
                   K           H     0.810
                   R           H     1.026
                   N           L    -0.352
                   Q           L     0.822
                   P           L    -0.694
                   I           E    -0.448
                   T 

In [None]:
# Stable-only table for Rocklin; the returned tuple is shown as the cell result.
tsv_to_stableresidues_df('Rocklin.tsv')

(                           sequence dssp Si_values
 protein           sequence                        
 EHEE_rd4_0025.pdb R               R    L     1.542
                   V               V    E     0.679
                   T               T    E     0.341
                   M               M    E     1.004
                   T               T    E     0.456
                   V               V    E     0.001
                   H               H    E     0.628
                   G               G    L     0.535
                   E               E    H     0.962
                   E               E    H     1.053
                   Q               Q    H     0.058
                   A               A    H    -0.084
                   R               R    H    -0.108
                   R               R    H     0.242
                   I               I    H    -0.312
                   E               E    H    -0.692
                   E               E    H     0.726
            

In [None]:
# Unstable-only table for Rocklin; the returned tuple is shown as the cell result.
tsv_to_unstableresidues_df('Rocklin.tsv')

(                            sequence dssp Si_values
 protein            sequence                        
 HEEH_rd2_0581.pdb  S               S    L     1.447
                    D               D    H     0.757
                    K               K    H     1.191
                    E               E    H     0.796
                    K               K    H    -0.171
                    A               A    H    -0.309
                    Q               Q    H    -0.012
                    R               R    H     0.100
                    A               A    H    -0.373
                    K               K    H    -0.969
                    E               E    H     0.158
                    A               A    H    -0.197
                    Y               Y    H    -1.081
                    K               K    H     0.810
                    R               R    H     1.026
                    N               N    L    -0.352
                    Q               Q    L    

In [38]:
# Convert whatever final_df currently holds into a nested dict
# ({column: {index label: value}}) and display it in full.
# NOTE(review): the recorded output is keyed by protein name alone, which does not
# match the (protein, sequence) MultiIndex built by the functions in this
# notebook - presumably produced from an earlier final_df; confirm before reuse.
test_dict = final_df.to_dict()
test_dict

{'sequence': {'p1-14H-GBL-16H-GABBL-16H_0040_0001.pdb': 'R',
  'p1-14H-GBL-14H-GABBL-15H_0244_0001_0001.pdb': 'K',
  'p1-15H-GBBL-15H-GABBL-14H_0209_0001_0001.pdb': 'F',
  'p1-15H-GBL-16H-GABBL-15H_0065_0001.pdb': 'K',
  'p1-14H-GBL-16H-GABBL-15H_0265_0001.pdb': 'G',
  'p1-15H-BBL-16H-GABBL-15H_0075_0001_0001.pdb': 'L',
  'p1-15H-GBBL-14H-GBL-16H_0998_0001_0001.pdb': 'G',
  'p1-14H-GBL-14H-GBBL-14H_0231_0001_0001.pdb': 'G',
  'p1-15H-GBL-16H-GABBL-16H_0354_0001.pdb': 'N',
  'p1-15H-GBL-16H-GBL-14H_0215_0001_0001.pdb': 'G',
  'p1-15H-GABBL-15H-GBBL-16H_0929_0001_0001.pdb': 'G',
  'p1-15H-GBL-16H-GBL-15H_0077_0001_0001.pdb': 'S',
  'p1-14H-GABBL-16H-GABBL-14H_0079_0001_0001.pdb': 'R',
  'p1-16H-GABBL-14H-GBL-15H_0182_0001_0001.pdb': 'G',
  'p1-14H-GBL-14H-GABBL-15H_0160_0001_0001.pdb': 'S',
  'p1-14H-GBL-14H-GBL-15H_0603_0001_0001.pdb': 'G',
  'p1-16H-GBL-16H-GABBL-14H_0202_0001_0001.pdb': 'S',
  'p1-15H-BBL-15H-GBL-15H_0744_0001_0001.pdb': 'S',
  'p1-15H-GABBL-14H-GBL-15H_0210_0001_0001