# README

Notebook to test the Complex class as well as parsing code from cobrame/ecolime

## From COBRAme/ECOLIme...

### Flat files / ProcessData

In [1]:
import ecolime
import ecolime.flat_files

#### Protein complexes - ComplexData and ComplexFormation (the reactions needed to assemble the complexes in ComplexData)

In [2]:
# First load the list of complexes which tells you complexes + subunit stoichiometry
# Converts the protein_complexes.txt file into a dictionary for ME model construction
complexes = ecolime.flat_files.get_complex_subunit_stoichiometry('protein_complexes.txt')

# Then load the modifications which tells you the modificiations (ie. cofactors) that are needed for a complex
# Converts protein_modification.txt 
complex_modification_dict = ecolime.flat_files.get_complex_modifications('protein_modification.txt', 'protein_complexes.txt')

In [3]:
complexes

{'YmfC_mono': {'protein_b1135': 1.0},
 'CARNRACE-MONOMER': {'protein_b0036': 1.0},
 'CPLX0-1601': {'protein_b1587': 1.0,
  'protein_b1588': 1.0,
  'protein_b1589': 1.0,
  'protein_b1590': 1.0},
 'CPLX0-1762': {'protein_b1388': 1.0,
  'protein_b1389': 1.0,
  'protein_b1390': 1.0,
  'protein_b1392': 1.0},
 'NANE-MONOMER': {'protein_b3223': 1.0},
 'DTDPGLUCDEHYDRAT2-MONOMER': {'protein_b3788': 1.0},
 'NARU-MONOMER': {'protein_b1469': 1.0},
 'YDGEF-CPLX': {'protein_b1599': 1.0, 'protein_b1600': 1.0},
 'PHOSMANMUT-MONOMER': {'protein_b2048': 1.0},
 'CPLX0-3929': {'protein_b2708': 4.0},
 'CPLX0-7652': {'protein_b3903': 4.0},
 'ANSB-CPLX': {'protein_b2957': 4.0},
 'EG11983-MONOMER': {'protein_b2034': 1.0},
 'NARW-MONOMER': {'protein_b1466': 1.0},
 'EG11009-MONOMER': {'protein_b3035': 1.0},
 'GABATRANSAM-CPLX': {'protein_b2662': 2.0},
 'DIHYDROOROTOX-MONOMER': {'protein_b0945': 1.0},
 'NAPD-MONOMER': {'protein_b2207': 1.0},
 'DSBD-MONOMER': {'protein_b4136': 1.0},
 'NITRATREDUCTZ-CPLX': {'prot

In [4]:
complexes['CPLX0-7']

{'protein_b2415': 1.0,
 'protein_b2416': 1.0,
 'protein_b2417': 1.0,
 'protein_b2429': 1.0}

In [5]:
complexes['CPLX0-1601']

{'protein_b1587': 1.0,
 'protein_b1588': 1.0,
 'protein_b1589': 1.0,
 'protein_b1590': 1.0}

#### Reaction to complex information

In [6]:
from collections import defaultdict
import pandas
from os.path import dirname, join, abspath

ecoli_files_dir = join('/home/nathan/projects_unsynced/ecolime/ecolime/', 'building_data/')

from ecolime import corrections

def fixpath(filename):
    return join(ecoli_files_dir, filename)

# From: ecolime.flat_files.get_reaction_to_complex, modified to just parse the file
def get_reaction_to_complex(modifications=True):
    """anything not in this dict is assumed to be an orphan"""

    rxn_to_complex_dict = defaultdict(set)

    # Load enzyme reaction association dataframe
    df = pandas.read_csv(fixpath('enzyme_reaction_association.txt'),
                         delimiter='\t', names=['Reaction', 'Complexes'])
    # Fix legacy naming
    df = df.applymap(lambda x: x.replace('DASH', ''))
    df = df.set_index('Reaction')

    df = corrections.correct_enzyme_reaction_association_frame(df)

    for reaction, complexes in df.itertuples():
        for cplx in complexes.split(' OR '):
            if modifications:
                rxn_to_complex_dict[reaction].add(cplx)
            else:
                rxn_to_complex_dict[reaction].add(cplx.split('_mod_')[0])

    return rxn_to_complex_dict

reaction_to_complex = get_reaction_to_complex()

In [11]:
for reaction,cplxs in reaction_to_complex.items():
    for c in cplxs:
        if 'NADH-DHI-CPLX' in c:
            print(reaction, cplxs)

NADH17pp {'NADH-DHI-CPLX_mod_2fe2s_mod_4fe4s_mod_fmn'}
NADH18pp {'NADH-DHI-CPLX_mod_2fe2s_mod_4fe4s_mod_fmn'}
NADH16pp {'NADH-DHI-CPLX_mod_2fe2s_mod_4fe4s_mod_fmn'}


#### Summary

In [6]:
from collections import OrderedDict

In [13]:
biglist = []

for reaction,cplxs in reaction_to_complex.items():
    print('Reaction:', reaction)
    print('Reaction rule:', cplxs)
    print()
    
    for cplx in cplxs:
        
        smalldict = OrderedDict()
        smalldict['Reaction'] = reaction
#         smalldict['Reaction_rule'] = ';'.join(cplxs)
        
        if cplx not in complex_modification_dict:
            subunits = {k.split('protein_')[1]:v for k,v in complexes[cplx].items()}
            print('\tComplex ID:', cplx)
            print('\tComplex subunits:', subunits)
            smalldict['Complex_ID'] = cplx
            smalldict['Complex_ID_mod'] = None
            smalldict['Complex_subunits'] = [(k, v) for k,v in subunits.items()]
            smalldict['Complex_modifications'] = None
        else:
            subunits = {k.split('protein_')[1]:v for k,v in complexes[complex_modification_dict[cplx]['core_enzyme']].items()}
            mods = complex_modification_dict[cplx]['modifications']
            print('\tComplex ID (modification):', cplx)
            print('\tComplex ID (original):', complex_modification_dict[cplx]['core_enzyme'])
            print('\tComplex subunits:', subunits)
            print('\tComplex modification:', mods)
            smalldict['Complex_ID'] = complex_modification_dict[cplx]['core_enzyme']
            smalldict['Complex_ID_mod'] = cplx
            smalldict['Complex_subunits'] = ((k, v) for k,v in subunits.items())
            smalldict['Complex_modifications'] = ((k, v) for k,v in mods.items())
        print()
        
        biglist.append(smalldict)

Reaction: CSND
Reaction rule: {'CYTDEAM-MONOMER_mod_fe2_mod_zn2', 'CYTDEAM-MONOMER_mod_cobalt2_mod_zn2', 'CYTDEAM-MONOMER_mod_mn2_mod_zn2'}

	Complex ID (modification): CYTDEAM-MONOMER_mod_fe2_mod_zn2
	Complex ID (original): CYTDEAM-MONOMER
	Complex subunits: {'b0337': 1.0}
	Complex modification: {'fe2_c': -1.0, 'zn2_c': -1.0}

	Complex ID (modification): CYTDEAM-MONOMER_mod_cobalt2_mod_zn2
	Complex ID (original): CYTDEAM-MONOMER
	Complex subunits: {'b0337': 1.0}
	Complex modification: {'cobalt2_c': -1.0, 'zn2_c': -1.0}

	Complex ID (modification): CYTDEAM-MONOMER_mod_mn2_mod_zn2
	Complex ID (original): CYTDEAM-MONOMER
	Complex subunits: {'b0337': 1.0}
	Complex modification: {'mn2_c': -1.0, 'zn2_c': -1.0}

Reaction: ADNt2pp
Reaction rule: {'NUPG-MONOMER', 'NUPC-MONOMER'}

	Complex ID: NUPG-MONOMER
	Complex subunits: {'b2964': 1.0}

	Complex ID: NUPC-MONOMER
	Complex subunits: {'b2393': 1.0}

Reaction: GTPDPDP
Reaction rule: {'PPPGPPHYDRO-CPLX_mod_nh4_mod_mg2'}

	Complex ID (modificatio

	Complex ID (original): PGPPHOSPHAA-MONOMER
	Complex subunits: {'b0418': 1.0}
	Complex modification: {'mg2_c': -1.0}

Reaction: GALTptspp
Reaction rule: {'CPLX0-231'}

	Complex ID: CPLX0-231
	Complex subunits: {'b2415': 1.0, 'b2416': 2.0, 'b2093': 1.0, 'b2092': 2.0, 'b2094': 1.0}

Reaction: PHEt2rpp
Reaction rule: {'AROP-MONOMER', 'EG12713-MONOMER', 'PHEP-MONOMER'}

	Complex ID: AROP-MONOMER
	Complex subunits: {'b0112': 1.0}

	Complex ID: EG12713-MONOMER
	Complex subunits: {'b1473': 1.0}

	Complex ID: PHEP-MONOMER
	Complex subunits: {'b0576': 1.0}

Reaction: CPGNtonex
Reaction rule: {'CPLX0-1923_EG10306-MONOMER'}

	Complex ID: CPLX0-1923_EG10306-MONOMER
	Complex subunits: {'b1102': 1.0, 'b3005': 1.0, 'b1252': 1.0, 'b3006': 1.0}

Reaction: FE3tex
Reaction rule: {'CPLX0-7534', 'G6700-MONOMER', 'CPLX0-7530', 'CPLX0-7533'}

	Complex ID: CPLX0-7534
	Complex subunits: {'b0929': 3.0}

	Complex ID: G6700-MONOMER
	Complex subunits: {'b1377': 1.0}

	Complex ID: CPLX0-7530
	Complex subunits: {'b0

	Complex subunits: {'b1377': 1.0}

	Complex ID: CPLX0-7530
	Complex subunits: {'b0241': 3.0}

	Complex ID: CPLX0-7533
	Complex subunits: {'b2215': 3.0}

Reaction: CYANtex
Reaction rule: {'CPLX0-7534', 'G6700-MONOMER', 'CPLX0-7530', 'CPLX0-7533'}

	Complex ID: CPLX0-7534
	Complex subunits: {'b0929': 3.0}

	Complex ID: G6700-MONOMER
	Complex subunits: {'b1377': 1.0}

	Complex ID: CPLX0-7530
	Complex subunits: {'b0241': 3.0}

	Complex ID: CPLX0-7533
	Complex subunits: {'b2215': 3.0}

Reaction: EAR80x1
Reaction rule: {'ENOYL-ACP-REDUCT-NADH-MONOMER'}

	Complex ID: ENOYL-ACP-REDUCT-NADH-MONOMER
	Complex subunits: {'b1288': 1.0}

Reaction: APG3PAT161
Reaction rule: {'EG11674-MONOMER'}

	Complex ID: EG11674-MONOMER
	Complex subunits: {'b3059': 1.0}

Reaction: APG3PAT160
Reaction rule: {'EG11674-MONOMER'}

	Complex ID: EG11674-MONOMER
	Complex subunits: {'b3059': 1.0}

Reaction: CYTDt2pp
Reaction rule: {'NUPG-MONOMER', 'NUPC-MONOMER'}

	Complex ID: NUPG-MONOMER
	Complex subunits: {'b2964': 1.0

IndexError: list index out of range

In [14]:
import pandas as pd
pd.DataFrame(biglist)

Unnamed: 0,Reaction,Complex_ID,Complex_ID_mod,Complex_subunits,Complex_modifications
0,CSND,CYTDEAM-MONOMER,CYTDEAM-MONOMER_mod_fe2_mod_zn2,<generator object <genexpr> at 0x7f9d90937eb8>,<generator object <genexpr> at 0x7f9d908713b8>
1,CSND,CYTDEAM-MONOMER,CYTDEAM-MONOMER_mod_cobalt2_mod_zn2,<generator object <genexpr> at 0x7f9d90871990>,<generator object <genexpr> at 0x7f9d90911990>
2,CSND,CYTDEAM-MONOMER,CYTDEAM-MONOMER_mod_mn2_mod_zn2,<generator object <genexpr> at 0x7f9d90911ca8>,<generator object <genexpr> at 0x7f9d90911620>
3,ADNt2pp,NUPG-MONOMER,,"[(b2964, 1.0)]",
4,ADNt2pp,NUPC-MONOMER,,"[(b2393, 1.0)]",
5,GTPDPDP,PPPGPPHYDRO-CPLX,PPPGPPHYDRO-CPLX_mod_nh4_mod_mg2,<generator object <genexpr> at 0x7f9d908854c0>,<generator object <genexpr> at 0x7f9d908855c8>
6,RNDR1b3,RIBONUCLEOSIDE-DIP-REDUCTII-CPLX,,"[(b2676, 2.0), (b2675, 2.0)]",
7,RNDR1b1,RIBONUCLEOSIDE-DIP-REDUCTII-CPLX,,"[(b2676, 2.0), (b2675, 2.0)]",
8,RNDR1b4,RIBONUCLEOSIDE-DIP-REDUCTII-CPLX,,"[(b2676, 2.0), (b2675, 2.0)]",
9,NTD10,EG11817-MONOMER,EG11817-MONOMER_mod_cobalt2,<generator object <genexpr> at 0x7f9d908856d0>,<generator object <genexpr> at 0x7f9d90885938>


## Todo for ssbio

1. Take as input the complex subunits dictionary
1. Reformat to remove the "protein\_" part
1. If monomer (only one subunit of protein)...
    
    1. Run thru GEM-PRO pipeline
    1. If PDBs...

        1. (A*) For each PDB

            1. Check quality

                1. If high quality...

                    1. Select and clean best PDB - **END**

                1. If not high quality, go to next PDB (A\*)

        1. If no good PDB - go to homology modeling (B\*)

    1. If no PDB...

        1. (B\*) For each homology model

            1. Check quality

                1. If quality good - use model - **END**
                1. If quality bad - go to next homology model (B\*)

        1. If all models bad - run I-TASSER manually - use PDB - **END**

1. If homomer (multiple subunits S of same protein)

    1. Run thru GEM-PRO pipeline
    1. If PDBs...

        1. (C\*) For each PDB...

            1. If homomeric

                1. (D\*) For each bioassembly

                    1. Select and download bioassembly

                    1. Compare complex stoichiometry 

                        1. If same stoichiometry - use bioassembly 1 - **END**
                        1. If not same...

                            1. Rerun (D\*)
                            1. (E\*) OR - **MANUALLY inspect EcoCyc entry** and **REANNOTATE ME-model complex** - use PDB - **END** (TODO priority 1)

                1. If no bioassemblies appropriate, go to next PDB (C\*) (will not run if we do (E\*))

            1. If heteromeric

                1. Not appropriate, go to next PDB (C\*)

        1. If no PDB appropriate, go to (F\*)

    1. (F\*) If no PDBs...
    
        1. (G\*) For SWISS-MODEL models

            1. Run/retreive quaternary structure predictor

                1. (H\*) For each quatstruct predictor (ranked by probability)

                    1. Compare complex stoichiometry

                        1. If same stoichiometry - use quatstruct - **END**
                        1. If not same, go to next prediction (H\*)

                1. If no quatstruct predictions right, go to next SWISS-MODEL model (G\*)
            
        1. If no SWISS-MODEL models have correct quatstruct, resort to (B\*) and...
        
            1. Select best model
            1. Assemble "dummy" PDB with S times the subunit
            1. Save as final complex - **END**
                
1. If heteromer (multiple subunits of different proteins)...

    1. SEE LUCIDCHART!