In [None]:
# Notebook parameters. All are None placeholders here, yet later cells use them
# as real values, so they are presumably overwritten by the pipeline runner
# before execution -- TODO confirm which tool injects them.
#
# upstream          : not read by any cell visible in this notebook.
# product           : indexed as product['data'] to get the output CSV path.
# input             : iterated in the final loop; each element is indexed with
#                     'path', 'chain-reference' and 'dataset_status'.
#                     NOTE: this name shadows the builtin input().
# clade_assignments : passed to pd.read_csv; the table is indexed by its 'name'
#                     column and its 'clade' column is read.
upstream = None 
product = None
input = None
clade_assignments = None

# 01 - Uniform files

We will follow these steps to guarantee that all the structure files share a uniform naming scheme and metadata:

1. Copy all files from different components into different folders.
2. Name them according to the following convention.

   
    {dataset}.{stoichiometry}.{id}.pdb
    

    Where:
    - {dataset} : either gold or silver.
    - {stoichiometry} : chains. For now, we shall consider DDKK, DDKKGG, and HH.
    - {id}: we will just use an internal code.


3. Store the original name, the new name, the scientific name, the taxon (if known),
the stoichiometry, the average pLDDT, and the previous steps.
4. Rename files.

In [None]:
import pandas as pd
import glob
import os
import prody as pdy
import numpy as np
import requests
import os
import tqdm
import shutil
# Running count of renamed structures; it is reset to 0 again in the cell
# that drives the main loop.
COUNTER = 0

In [None]:
# Show the `input` parameter (the folder descriptions iterated by the main
# loop) so the run log records what was processed.
print(input)

In [None]:
def analyze_structure(file, chain_reference):
    """Summarise the chain composition and mean CA B-factor of a PDB file.

    Parameters
    ----------
    file : str
        Path of the PDB file, handed to ``pdy.parsePDB``.
    chain_reference : mapping
        Maps each chain identifier found in the file to a subunit label.

    Returns
    -------
    tuple
        ``(stoichiometry, average_pLDDT)``: the subunit labels of all chains,
        sorted and concatenated into one string, and the mean of the B-factor
        column over the atoms selected by ``'name CA'``. The B-factor column
        presumably stores pLDDT for these models -- TODO confirm.

    Raises
    ------
    KeyError
        If a chain identifier is missing from ``chain_reference``.
    """
    structure = pdy.parsePDB(file)

    # One label per chain, in file order; sorted afterwards so the resulting
    # string does not depend on chain order.
    subunits = [
        chain_reference[chain.getChid()]
        for chain in structure.getHierView().iterChains()
    ]
    subunits.sort()

    ca_bfactors = structure.select('name CA').getBetas()

    return ''.join(subunits), np.mean(ca_bfactors)

In [None]:
def remove_tails(x, terms):
    """Return ``x`` with every occurrence of each string in ``terms`` removed.

    Despite the name, matches are removed wherever they occur in ``x``, not
    only at the end. Terms are applied in the order given.
    """
    cleaned = x
    for fragment in terms:
        cleaned = cleaned.replace(fragment, '')
    return cleaned

def extract_scientific_name(x):
    """Split an underscore-delimited model name into its components.

    The last ``_``-separated token is always dropped. If the first token is
    exactly ``'Anc'`` the remaining tokens (including ``'Anc'``) are re-joined
    with ``_``; otherwise the first token is treated as a type prefix and the
    tokens between it and the last one are joined with spaces.

    Returns
    -------
    tuple
        ``(name, first_token, is_ancestral)``.
    """
    tokens = x.split('_')
    prefix = tokens[0]
    without_last = tokens[:-1]

    if prefix != 'Anc':
        return ' '.join(without_last[1:]), prefix, False

    return '_'.join(without_last), prefix, True

def retrieve_taxon(x, taxon_data):
    """Look up taxonomy information for a scientific name, with caching.

    Parameters
    ----------
    x : str
        Scientific name to look up.
    taxon_data : dict
        Cache mapping names to previously retrieved records. It is updated in
        place when a lookup succeeds; failed lookups are not cached.

    Returns
    -------
    object
        The cached record if ``x`` is already in ``taxon_data``; otherwise the
        first element of the JSON list returned by the ENA taxonomy REST
        endpoint. An empty list is returned when the service gives no usable
        answer (body is not JSON, is not a list, or is an empty list).

    Raises
    ------
    requests.RequestException
        On connection problems or when the request times out.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from urllib.parse import quote

    if x in taxon_data:
        return taxon_data[x]

    # Percent-encode the whole name, not just spaces, so that any reserved
    # character in a strain designation cannot corrupt the URL path.
    url = 'https://www.ebi.ac.uk/ena/taxonomy/rest/scientific-name/{0}'.format(
        quote(x, safe='')
    )
    # A timeout prevents one stalled connection from hanging the whole run.
    r = requests.get(url, timeout=30)

    try:
        payload = r.json()
    except ValueError:
        # Body was not JSON (e.g. a plain-text "not found" message).
        return []

    if not isinstance(payload, list) or not payload:
        return []

    result = payload[0]
    taxon_data[x] = result
    return result

# Quick manual checks of the helpers defined above; the output is meant to be
# inspected by eye.
taxon_data = {}
print(remove_tails('Anc_1206_001.rechained.pdb', ['.rechained', '.pdb']))
print(remove_tails('Nif_Acetobacterium_woodii_005.rechained.pdb', ['.rechained', '.pdb']))
print(extract_scientific_name('Nif_Acetobacterium_woodii_005'))
print(extract_scientific_name('Anc_1206_001'))
# NOTE: the first call below performs a live HTTP request. If it succeeds the
# record is stored in `taxon_data`, so the second call is answered from cache.
print(retrieve_taxon('Acetobacterium woodii', taxon_data))
print(retrieve_taxon('Acetobacterium woodii', taxon_data))

In [None]:
def extract_variant_type(x):
    """Classify a model name by the variant marker it contains.

    Markers are checked in this order:

    * ``_map``    -> ``'MAP'``
    * ``_altall`` -> ``'ALTALL'``
    * ``_alt`` (and no ``all`` anywhere in the name) -> ``'ALT'`` followed by
      the full run of digits after the marker, e.g. ``_alt12`` -> ``'ALT12'``.
      If no digit follows, the single next character is appended instead, or
      nothing when the name ends right after ``_alt``.
    * anything else -> ``'WT'``

    Parameters
    ----------
    x : str
        File or model name (without extension).

    Returns
    -------
    str
        The variant label.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import re

    if '_map' in x:
        return 'MAP'

    if '_altall' in x:
        return 'ALTALL'

    marker_at = x.find('_alt')
    if marker_at != -1 and 'all' not in x:
        suffix = x[marker_at + len('_alt'):]
        digits = re.match(r'\d+', suffix)
        if digits:
            # Take every digit, so two-digit variants are not truncated.
            return 'ALT' + digits.group(0)
        # Slicing (not indexing) avoids IndexError on a trailing '_alt'.
        return 'ALT' + suffix[:1]

    return 'WT'

# Manual checks of extract_variant_type; expected output is noted per line.
print(extract_variant_type('Anc1207_map'))  # MAP
print(extract_variant_type('Anc1205_altall'))  # ALTALL
print(extract_variant_type('Anc1205_alt5'))  # ALT5
print(extract_variant_type('Anc1205_alt3'))  # ALT3
print(extract_variant_type('Nif_Thermoanaerobacterium_thermosaccharolyticum_003'))  # WT
print(extract_variant_type('Anc_1206_001'))  # WT

In [None]:
# Turn the `clade_assignments` parameter (a CSV path) into a {name: clade}
# dictionary. The name is rebound in place because retrieve_clade reads it as a
# global, so the guard makes the cell safe to re-run: on a second execution the
# value is already a dict and must not be handed to pd.read_csv again.
if not isinstance(clade_assignments, dict):
    clade_assignments = (
        pd.read_csv(clade_assignments)
        .set_index('name')['clade']
        .to_dict()
    )

def retrieve_clade(scientific_name, nitrogenase_type):
    """Return the clade recorded for a model in the global ``clade_assignments``.

    The lookup key is built as follows:

    * for ``nitrogenase_type == 'Anc'``: ``scientific_name`` with any ``_map``
      removed;
    * otherwise: ``nitrogenase_type``, an underscore, and ``scientific_name``
      with spaces replaced by underscores.

    Raises
    ------
    KeyError
        If the key is not present in ``clade_assignments``.
    """
    if nitrogenase_type == 'Anc':
        key = scientific_name.replace('_map', '')
    else:
        key = '_'.join((nitrogenase_type, scientific_name.replace(' ', '_')))

    return clade_assignments[key]

# Spot checks of retrieve_clade against the loaded table. They depend on the
# contents of the clade CSV and fail (AssertionError or KeyError) if these
# entries are absent or assigned differently.
assert(retrieve_clade('Azotobacter vinelandii', 'Nif') == 'nif-i')
assert(retrieve_clade('Azotobacter vinelandii', 'Anf') == 'an')
assert(retrieve_clade('Anc_1485_map', 'Anc') == 'nif-ii')
assert(retrieve_clade('Elusimicrobia bacterium RIFOXYA2 FULL 50 26', 'Nif') == 'nif-ii')

    

In [None]:
def add_remarks(file, information):
    """Prepend one ``REMARK`` line per metadata entry to a file, in place.

    Each entry is rendered as ``key:value``, cut to at most 70 characters and
    space-padded to exactly 70, then written after the ``REMARK `` prefix. The
    original file contents follow unchanged.

    Parameters
    ----------
    file : str
        Path of the file to rewrite.
    information : mapping
        Metadata to record; keys and values are converted with ``str``.
    """
    with open(file) as source:
        body = source.read()

    with open(file, 'w') as target:
        target.writelines(
            'REMARK {:70s}\n'.format(':'.join((str(key), str(value)))[:70])
            for key, value in information.items()
        )
        target.write(body)

In [None]:
def uniform_pdb_filenames(folder_info, reference_index, COUNTER, TAXON_DATA):
    """Copy the models of one folder into the working directory under uniform ids.

    Every ``*.rechained.pdb`` file in ``folder_info['path']`` is analysed,
    given the next sequential id ``nsdb-NNNNNN`` and copied, together with its
    ``.clean.pdb`` sibling, into the directory that was current when the
    function was called. Both copies get the collected metadata prepended as
    ``REMARK`` lines, and the metadata record is appended to
    ``reference_index``.

    Parameters
    ----------
    folder_info : mapping
        Must provide ``'path'`` (folder to scan), ``'chain-reference'``
        (chain id -> subunit label) and ``'dataset_status'``.
    reference_index : list
        Receives one metadata dict per processed file (mutated in place).
    COUNTER : int
        Number of ids already handed out; the first new id is ``COUNTER + 1``.
    TAXON_DATA : dict
        Taxonomy cache shared across calls, passed on to ``retrieve_taxon``.

    Returns
    -------
    int
        The updated counter.

    Notes
    -----
    Files are processed in sorted name order so that ids are assigned
    reproducibly; plain ``glob`` order is arbitrary. The working directory is
    restored even when processing fails. The record keys ``'stochiometry'``
    and ``'taxond_id'`` are misspelled but kept as they are, because they end
    up as column names that later steps may rely on.
    """
    output_dir = os.getcwd()
    chain_reference = folder_info['chain-reference']

    os.chdir(folder_info['path'])
    try:
        pdb_files = sorted(glob.glob('*.rechained.pdb'))

        for file in tqdm.tqdm(pdb_files):

            stoichiometry, average_pLDDT = analyze_structure(file, chain_reference)
            clean_file = remove_tails(file, ['.rechained', '.pdb'])
            scientific_name, nitrogenase_type, is_ancestral = extract_scientific_name(clean_file)
            variant = extract_variant_type(clean_file)
            try:
                clade = retrieve_clade(scientific_name, nitrogenase_type)
            except KeyError:
                # Names missing from the clade table are grouped together.
                clade = 'other'

            if is_ancestral is False:
                taxon_information = retrieve_taxon(scientific_name, taxon_data=TAXON_DATA)
                try:
                    taxid = taxon_information['taxId']
                    lineage = taxon_information['lineage']
                except (KeyError, TypeError):
                    # KeyError: record lacks a field. TypeError: lookup failed
                    # and returned a list instead of a record.
                    taxid = None
                    lineage = None
            else:
                # Ancestral reconstructions have no taxon; reuse their name.
                taxid = scientific_name
                lineage = scientific_name

            COUNTER += 1

            new_name = 'nsdb-{:06d}'.format(COUNTER)
            info = dict(
                original_name=file,
                id=new_name,
                stochiometry=stoichiometry,
                average_pLDDT=average_pLDDT,
                taxond_id=taxid,
                lineage=lineage,
                scientific_name=scientific_name,
                is_ancestral=is_ancestral,
                nitrogenase_type=nitrogenase_type,
                status=folder_info['dataset_status'],
                chain_reference=chain_reference,
                variant=variant,
                clade=clade
            )

            rechained_copy = os.path.join(output_dir, new_name + '.pdb')
            shutil.copy(file, rechained_copy)
            add_remarks(rechained_copy, info)
            reference_index.append(info)

            clean_copy = os.path.join(output_dir, new_name + '.clean.pdb')
            shutil.copy(file.replace('.rechained.pdb', '.clean.pdb'), clean_copy)
            add_remarks(clean_copy, info)
    finally:
        # Always return to the original directory, so a failure on one file
        # does not leave the kernel stranded inside the input folder.
        os.chdir(output_dir)

    return COUNTER



In [None]:
# Process every input folder in turn. `reference` collects one metadata record
# per renamed structure, `taxon_cache` is shared so each scientific name is
# looked up at most once on success, and COUNTER is threaded through the calls
# so ids keep increasing across folders.
reference = []
taxon_cache = {}
COUNTER = 0

for f in input:
    COUNTER = uniform_pdb_filenames(
        f, 
        reference_index=reference,  
        COUNTER=COUNTER,
        TAXON_DATA=taxon_cache
    )
    

In [None]:
# Write one row per renamed structure to the CSV declared as product['data'].
# `index=False` is the documented way to omit the row index (the parameter is a
# boolean); the output is the same as with the previous `index=None`.
pd.DataFrame.from_records(reference).to_csv(product['data'], index=False)