## These functions convert .pdb files into text files with a `.entropy` extension

* runs through about **162 .pdb files** per minute
* assumes the input and output directories passed in already exist
* uses hard-coded absolute paths that must be edited for each computer
* assumes `popcoen_server.py` is already running (the server can only be run with Python 2.x)

#### How to run popcoen_server.py

* install [Popcoen](http://fmc.ub.edu/popcoen/Popcoen_Full_Version1.tar.bz2)
* open a **terminal** and **change directory** into `Popcoen_Full_Version1`
* run `./popcoen_server.py` in a terminal


IMPORTANT: _**This notebook must have a Python2 kernel in order to work since Popcoen was written in Python 2.x**_

In [1]:
#this code assumes the popcoen_server.py script is running in a terminal

import os
from glob import glob

def pdb_to_text(pdb_dir, target_dir,
                pdb_root='/home/jupyter/tacc-work/Jan/pdb_files',
                entropy_root='/home/jupyter/tacc-work/Jan/proteins.entropy',
                client='/home/jupyter/tacc-work/maverick/Popcoen_Full_Version1/client_for_popcoen.py'):
    """
    Run every .pdb file of one dataset through Popcoen and keep the entropy lines.

    Each '<name>.pdb' found in '<pdb_root>/<pdb_dir>' is piped to the Popcoen
    client on stdin. The output lines containing "Si-predict:",
    "S_PC = sum Si =" or "Reliability-number lambda =" are written to
    '<entropy_root>/<target_dir>/<name>.entropy'.

    The Popcoen server must already be running. Both directories must already
    exist. The current working directory is left untouched.

    Args:
        pdb_dir: dataset directory, relative to pdb_root.
        target_dir: output directory, relative to entropy_root.
        pdb_root: directory holding the dataset directories.
        entropy_root: directory holding the output directories.
        client: command used to start the Popcoen client.

    Raises:
        IOError: if the dataset or the output directory does not exist.
        OSError: if the client cannot be started.
    """
    # Imported here so the function does not depend on the import cell.
    import subprocess

    path_name = os.path.join(pdb_root, pdb_dir)
    out_dir = os.path.join(entropy_root, str(target_dir))
    print("path_name = " + path_name)

    # Fail once up front instead of silently failing for every single file.
    for required in (path_name, out_dir):
        if not os.path.isdir(required):
            raise IOError("directory does not exist: " + required)

    # Same selection as glob('*.pdb') (hidden files skipped), without chdir.
    pdb_list = sorted(entry for entry in os.listdir(path_name)
                      if entry.endswith('.pdb') and not entry.startswith('.'))
    print('pdb list contains %d entries' % len(pdb_list))

    # Fixed strings that were previously handed to grep.
    wanted = (b'Si-predict:', b'S_PC = sum Si =', b'Reliability-number lambda =')

    failures = 0
    for protein in pdb_list:
        name_protein = os.path.splitext(protein)[0]
        source = os.path.join(path_name, protein)
        destination = os.path.join(out_dir, name_protein + '.entropy')

        try:
            pdb_file = open(source, 'rb')
        except IOError:
            # Unreadable entry (e.g. a directory named *.pdb): skip, keep going.
            failures += 1
            continue

        # No shell is involved, so spaces or metacharacters in paths are safe.
        with pdb_file:
            process = subprocess.Popen([client], stdin=pdb_file,
                                       stdout=subprocess.PIPE)
            output = process.communicate()[0]
        if process.returncode != 0:
            # Possible cause: the Popcoen server is not running.
            failures += 1

        with open(destination, 'wb') as entropy_file:
            for line in output.split(b'\n'):
                if any(tag in line for tag in wanted):
                    entropy_file.write(line + b'\n')

    if failures:
        print('%d of %d files could not be read or made the client exit with an error'
              % (failures, len(pdb_list)))
        
        
        



In [2]:
def pdb_to_text_v2(pdb_dir, target_dir, glob_list=True, list_of_pdb=None,
                   pdb_root='/home/jupyter/tacc-work/Jan/PDB Files/',
                   entropy_root='/home/jupyter/tacc-work/Jan/proteins.entropy',
                   client='/home/jupyter/tacc-work/maverick/Popcoen_Full_Version1/client_for_popcoen.py'):
    """
    Run PDB files through Popcoen and keep the entropy lines.

    Works on every '*.pdb' file in '<pdb_root>/<pdb_dir>' (glob_list=True) or
    only on the names given in list_of_pdb (glob_list=False). Each file is
    piped to the Popcoen client on stdin. The output lines containing
    "Si-predict:", "S_PC = sum Si =" or "Reliability-number lambda =" are
    written to '<entropy_root>/<target_dir>/<name>.entropy', where <name> is
    the entry without its extension.

    The Popcoen server must already be running. Both directories must already
    exist. The current working directory is left untouched.

    Args:
        pdb_dir: dataset directory, relative to pdb_root.
        target_dir: output directory, relative to entropy_root.
        glob_list: when True, process every '*.pdb' file in the dataset
            directory and ignore list_of_pdb.
        list_of_pdb: file names relative to the dataset directory; required
            when glob_list is False.
        pdb_root: directory holding the dataset directories.
        entropy_root: directory holding the output directories.
        client: command used to start the Popcoen client.

    Raises:
        ValueError: if glob_list is False and list_of_pdb is None.
        IOError: if the dataset or the output directory does not exist.
        OSError: if the client cannot be started.
    """
    # Imported here so the function does not depend on the import cell.
    import subprocess

    if not glob_list and list_of_pdb is None:
        raise ValueError("list_of_pdb is required when glob_list is False")

    path_name = os.path.join(pdb_root, pdb_dir)
    out_dir = os.path.join(entropy_root, str(target_dir))
    print("path_name = " + path_name)

    # Fail once up front instead of silently failing for every single file.
    for required in (path_name, out_dir):
        if not os.path.isdir(required):
            raise IOError("directory does not exist: " + required)

    if glob_list:
        # Same selection as glob('*.pdb') (hidden files skipped), without chdir.
        pdb_list = sorted(entry for entry in os.listdir(path_name)
                          if entry.endswith('.pdb') and not entry.startswith('.'))
    else:
        pdb_list = [str(entry) for entry in list_of_pdb]
    print('pdb list contains %d entries' % len(pdb_list))

    # Fixed strings that were previously handed to grep.
    wanted = (b'Si-predict:', b'S_PC = sum Si =', b'Reliability-number lambda =')

    failures = 0
    for protein in pdb_list:
        name_protein = os.path.splitext(protein)[0]
        source = os.path.join(path_name, protein)
        destination = os.path.join(out_dir, name_protein + '.entropy')

        try:
            pdb_file = open(source, 'rb')
        except IOError:
            # A listed file may be missing on disk: skip it and keep going.
            failures += 1
            continue

        # No shell is involved, so the space in 'PDB Files' (or any other
        # special character in a path) cannot break the command.
        with pdb_file:
            process = subprocess.Popen([client], stdin=pdb_file,
                                       stdout=subprocess.PIPE)
            output = process.communicate()[0]
        if process.returncode != 0:
            # Possible cause: the Popcoen server is not running.
            failures += 1

        with open(destination, 'wb') as entropy_file:
            for line in output.split(b'\n'):
                if any(tag in line for tag in wanted):
                    entropy_file.write(line + b'\n')

    if failures:
        print('%d of %d files could not be read or made the client exit with an error'
              % (failures, len(pdb_list)))

In [3]:
import pandas as pd
# Read the topology-mining CSV from its hard-coded location (edit the path per machine).
topmin_df = pd.read_csv('/home/jupyter/tacc-work/Jan/topology_mining_untested_keys.csv')
# Turn the 'name_y' column into a plain list; presumably these are PDB file names -- confirm against the CSV.
pdb_names = topmin_df['name_y'].tolist()


In [None]:
# glob_list=False: process only the entries in pdb_names rather than every *.pdb in the directory.
pdb_to_text_v2('topology_mining_pdb', 'Topology_mining.entropy', glob_list=False,list_of_pdb=pdb_names)

path_name = /home/jupyter/tacc-work/Jan/PDB Files/topology_mining_pdb
('pdb list contains', 16720, 'entries')


### Ran the code on a test directory

In [12]:
# glob_list defaults to True, so every *.pdb file in the 'Example' directory is processed.
pdb_to_text_v2('Example','Example.entropy')

path_name = /home/jupyter/tacc-work/Jan/PDB Files/Example
('pdb list contains', 5, 'entries')


### Ran the code on `Eva1` dataset

In [None]:
# Reads *.pdb from pdb_files/Eva_data1; writes <name>.entropy files into proteins.entropy/Eva_data1.entropy.
pdb_to_text('Eva_data1','Eva_data1.entropy')

path_name = /home/jupyter/Jan/pdb_files/Eva_data1


### Ran the code on `Eva2` dataset

In [None]:
# Reads *.pdb from pdb_files/Eva_data2; writes <name>.entropy files into proteins.entropy/Eva_data2.entropy.
pdb_to_text('Eva_data2','Eva_data2.entropy')

path_name = /home/jupyter/tacc-work/Jan/pdb_files/Eva_data2


### Ran the code on `Inna` dataset

In [None]:
# Reads *.pdb from pdb_files/Inna_data; writes <name>.entropy files into proteins.entropy/Inna_data.entropy.
pdb_to_text('Inna_data','Inna_data.entropy')

path_name = /home/jupyter/tacc-work/Jan/pdb_files/Inna_data


### Ran the code on `Longxing` dataset


In [None]:
# Reads *.pdb from pdb_files/Longxing_data; writes <name>.entropy files into proteins.entropy/Longxing_data.entropy.
pdb_to_text('Longxing_data','Longxing_data.entropy')

path_name = /home/jupyter/tacc-work/Jan/pdb_files/Longxing_data


### Ran the code on `Longxing_untested` dataset

In [None]:
# Reads *.pdb from pdb_files/Longxing_data_untested; writes <name>.entropy files into proteins.entropy/Longxing_data_untested.entropy.
pdb_to_text('Longxing_data_untested','Longxing_data_untested.entropy')

path_name = /home/jupyter/tacc-work/Jan/pdb_files/Longxing_data_untested
('pdb list contains', 63106, 'entries')


### Ran the code on `Rocklin` dataset

In [None]:
# Reads *.pdb from pdb_files/Rocklin_data; writes <name>.entropy files into proteins.entropy/Rocklin_data.entropy.
pdb_to_text('Rocklin_data','Rocklin_data.entropy')