In [3]:
# Standard library imports
from argparse import ArgumentParser
from datetime import datetime
from functools import reduce
import logging as log
import os
import shutil
import sys


# Third party imports
from Bio.PDB import PDBParser
from Bio.PDB.PDBList import PDBList 
from Bio.PDB.DSSP import DSSP
import pandas as pd

In [4]:
# Local imports.
# The PRISM script directories are added to sys.path (at position 1) so that the
# PrismData and FillVariants modules can be imported from them.
# NOTE(review): import_path_base is a machine-specific absolute path; point it at
# the directory that contains the local PRISM checkout before running.
#import_path_base = '/storage1/hezscha/src/'
import_path_base = '/home/henrike/Documents/PD_AS/src/'
sys.path.insert(1, import_path_base + 'PRISM/prism/scripts/')
from PrismData import PrismParser, VariantData
sys.path.insert(1, import_path_base + 'PRISM/software/domain_protein_features/scripts/')
from FillVariants import copy_wt_variants
# write_prism is defined in this notebook instead of being imported:
#from prism_parser_helper import write_prism



In [5]:
def write_prism(metadata, dataframe, prism_file, comment=''):
    """Write a variant table and its metadata to a PRISM file.

    Args:
        metadata: Header information handed to ``VariantData``.
        dataframe: Variant table handed to ``VariantData``.
        prism_file: Destination passed to ``PrismParser.write``.
        comment: Forwarded to the parser as ``comment_lines``; defaults to
            an empty string.
    """
    dataset = VariantData(metadata, dataframe)
    PrismParser().write(prism_file, dataset, comment_lines=comment)

def read_from_prism(primsfile):
    """Load a PRISM file and return its header and its variant table.

    The file is handed to the parser twice: once to read the data and once
    to read the header.

    Args:
        primsfile: PRISM file passed to ``PrismParser.read`` and
            ``PrismParser.read_header``.

    Returns:
        A ``(meta_data, dataframe)`` tuple: the result of ``read_header`` and
        the ``dataframe`` attribute of the object returned by ``read``.
    """
    prism_parser = PrismParser()
    variants_df = prism_parser.read(primsfile).dataframe
    header = prism_parser.read_header(primsfile)
    return header, variants_df

def download_pdb(pdb_id, output_dir='.'):
    """Download a PDB entry and store it as ``<output_dir>/<pdb_id>.pdb``.

    The file is fetched with Biopython's ``PDBList`` in 'pdb' format. The
    download is expected to arrive as ``pdb<lower-case id>.ent`` inside
    ``output_dir`` and is then renamed to ``<pdb_id>.pdb``.

    Args:
        pdb_id: PDB identifier, e.g. '4m6j'.
        output_dir: Directory the file is downloaded to. Defaults to the
            current directory.

    Returns:
        Path of the renamed PDB file.

    Raises:
        FileNotFoundError: If the expected downloaded file is missing after
            the retrieval call, i.e. the download did not succeed.
    """
    pdbl = PDBList()
    pdbl.retrieve_pdb_file(pdb_id, file_format='pdb', pdir=output_dir)

    downloaded_path = os.path.join(output_dir, f'pdb{pdb_id.lower()}.ent')
    # NOTE(review): depending on the Biopython version a failed retrieval may
    # only print a message instead of raising, so check for the file here and
    # fail with an explanation rather than inside shutil.move.
    if not os.path.isfile(downloaded_path):
        raise FileNotFoundError(
            f"Download of PDB entry '{pdb_id}' failed: expected file "
            f"'{downloaded_path}' does not exist. Check the PDB ID, the network "
            f"connection and whether the entry is available in PDB format."
        )

    pdb_path = os.path.join(output_dir, f'{pdb_id}.pdb')
    shutil.move(downloaded_path, pdb_path)
    return pdb_path

In [6]:
# PDB entry to analyse. Known options:
#   '1u72' - dhfr with MTX bound
#   '4m6j' - dhfr with NAD but not MTX bound
pdbID = '4m6j'

# Directory the downloaded PDB file is stored in (machine-specific path).
output_dir = '/home/henrike/Documents/PD_AS/projects/marks_disease_genes/data/pdbs'

In [7]:
# Start by downloading the PDB file for the selected entry into output_dir;
# pdb_file holds the path returned by download_pdb.
pdb_file = download_pdb(pdbID, output_dir=output_dir)

Downloading PDB structure '4m6j'...


In [8]:
# Prepare the structure: parse the downloaded PDB file and take the model at
# index 0, which is the one handed to DSSP.
pdb_p = PDBParser()
structure = pdb_p.get_structure(pdbID, pdb_file)
model = structure[0]

In [9]:
# Run DSSP on the selected model, using the PDB file on disk as input.
# NOTE(review): Biopython's DSSP wrapper presumably needs an external DSSP
# executable to be installed — confirm for the target environment.
dssp = DSSP(model, pdb_file)

In [None]:
# Use all chains by default. To restrict the output to specific chains, set
# `chain` to a comma-separated list of chain IDs instead (see the example below).
#chain = 'A,B,C'
chain = 'all'

In [None]:
# Build one row per residue from the DSSP results. The first entry of
# variant_list holds the column names, the remaining entries are the rows.
variant_list = [['variant', 'SS', 'ASA', 'chain']]

# Each DSSP key is indexed below as (chain id, residue id).
dssp_keys = list(dssp.keys())
# dict.fromkeys removes duplicates but keeps first-seen order, so the chain
# order (and with it the row order) is the same on every run; a set would give
# a hash-dependent order.
available_chains = list(dict.fromkeys(key[0] for key in dssp_keys))

if chain == 'all':
    chains = available_chains  # use all chains
else:
    # Use only the asked-for chains. Whitespace around the IDs is dropped so
    # that 'A, B' also selects chain B; an ID consisting only of whitespace is
    # kept as typed.
    chains = [chain_id.strip() or chain_id for chain_id in chain.split(',')]
    missing_chains = [chain_id for chain_id in chains if chain_id not in available_chains]
    if missing_chains:
        log.warning('Chain(s) %s not found in the DSSP results (available: %s)',
                    missing_chains, available_chains)

# The loop variable is deliberately not called `chain`: reusing that name would
# overwrite the chain selection and make this cell give a different result when
# it is run a second time.
for chain_id in chains:
    for key in dssp_keys:
        if key[0] != chain_id:
            continue
        residue = dssp[key]
        # Use this to convert NA to None so you still have a numeric column for the RSA.
        # NOTE(review): index 3 of the DSSP tuple is the *relative* accessible
        # surface area according to the Biopython documentation — confirm that
        # the 'ASA' column name/description is what downstream code expects.
        surface_area = None if residue[3] == 'NA' else residue[3]
        variant_list.append([
            # amino acid + residue number + '='.
            # NOTE(review): key[1][1] is presumably the residue sequence
            # number; insertion codes are not part of the variant name.
            f'{residue[1]}{key[1][1]}=',
            residue[2],    # secondary structure
            surface_area,  # accessible surface area
            chain_id,
        ])
dssp_df = pd.DataFrame(data=variant_list[1:], columns=variant_list[0])

# Column descriptions. DSSP's code for a pi helix is the upper-case 'I'.
variant_dic = {
    'SS': 'Secondary structure with H=alpha helix (4-12), B=isolated beta-bridge residue, E=Strand, G=3-10 helix, I=Pi helix, T=Turn, S=Bend, -=None',
    'ASA': 'Accessible surface area',
}

In [None]:
# Look at the DataFrame created from the DSSP results (one row per residue with
# the columns variant, SS, ASA and chain).
dssp_df

In [None]:
# Save the DSSP table as a space-separated file named <pdbID>_dssp.csv, without
# the index column (change the path to what's applicable).
dssp_df.to_csv(
    '/home/henrike/Documents/PD_AS/projects/Sofie_Mave/data/' + pdbID + '_dssp.csv',
    sep=' ',
    index=False,
)