In [None]:
# Placeholder parameters: the visible cells never assign real values to them, so they are
# presumably injected by the pipeline runner before execution -- TODO confirm.
product = None  # later indexed as product['contacts_per_subunit'] to get the CSV output path
upstream = None  # not referenced by any other visible cell
md_reference_data = None  # later iterated; each entry is read via 'code', 'gas', 'concentration', 'dry_pdb', 'dry_dcd'

# Contact analysis for Rubisco Extant protein complexes

In [1]:
import MDAnalysis as mda
#from MDAnalysis.tests.datafiles import TPR, XTC
from MDAnalysis.analysis import density
from MDAnalysis.analysis import rdf
from MDAnalysis.analysis import contacts
import prody as pdy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import seaborn as sns
import math



In [2]:
def contacts_within_cutoff(u, group_a, group_b, radius=6.0):
    """Count contacts between two atom groups for every frame of a trajectory.

    Parameters
    ----------
    u : object exposing ``trajectory``
        Iterating ``u.trajectory`` must advance the positions of both groups.
    group_a, group_b : objects exposing ``positions``
        The two sets of coordinates whose pairwise distances are evaluated.
    radius : float, optional
        Cutoff handed to ``contacts.contact_matrix`` (presumably in Angstrom,
        the MDAnalysis length unit -- confirm).

    Returns
    -------
    numpy.ndarray
        One ``[frame, n_contacts]`` row per trajectory frame.
    """
    def count_current_frame():
        # Positions are read here, i.e. once per frame, so they reflect the
        # frame the trajectory iterator is currently on.
        pairwise = contacts.distance_array(group_a.positions, group_b.positions)
        return contacts.contact_matrix(pairwise, radius).sum()

    rows = [[frame.frame, count_current_frame()] for frame in u.trajectory]
    return np.array(rows)

In [3]:
# Segment ids of the large-subunit (LSU) and small-subunit (SSU) chains,
# one single-letter id per chain.
lsu_chains = list('AEIMCGKO')
ssu_chains = list('BFJNDHLP')

In [4]:
def average_gas_per_subunit(pdb_file, dcd_file, gas, atom, conc,
                            first_frame=249, last_frame=749, radius=6.0):
    """Gas-to-C-alpha contacts per residue, pooled over all LSU and all SSU chains.

    Despite the name, each value is the contact count *summed* over the frame
    window ``[first_frame, last_frame)`` divided by the number of CA atoms in
    the selection; it is not divided by the number of frames.

    Parameters
    ----------
    pdb_file, dcd_file : str
        Structure and trajectory files passed to ``mda.Universe``.
    gas : str
        Residue name of the gas molecules.
    atom : str
        Atom name (within ``gas``) whose position represents the molecule.
    conc
        Unused; kept so existing callers do not break.
    first_frame, last_frame : int, optional
        Positional slice bounds of the frames that are summed.
    radius : float, optional
        Contact cutoff forwarded to ``contacts_within_cutoff``.

    Returns
    -------
    tuple
        ``(lsu_value, ssu_value)``. ``ssu_value`` is ``0`` when the structure
        has no SSU C-alpha atoms, matching ``average_gas_per_individual_subunit``.

    Raises
    ------
    ZeroDivisionError
        If the LSU selection is empty.
    """
    env = mda.Universe(pdb_file, dcd_file, frames="all")
    protein = env.select_atoms('protein')
    # This is the annotation for the 8RUC reference structure we have
    prot_lsu_atoms = protein.select_atoms('name CA and (segid A or segid E or segid I or segid M or segid C or segid G or segid K or segid O)')
    prot_ssu_atoms = protein.select_atoms('name CA and (segid B or segid F or segid J or segid N or segid D or segid H or segid L or segid P)')
    gas_atoms = env.select_atoms('resname ' + gas).select_atoms('name ' + atom)

    def contacts_per_ca(prot_atoms):
        # Column 1 of the series holds the per-frame contact count.
        series = contacts_within_cutoff(env, gas_atoms, prot_atoms, radius=radius)
        # tolist() yields Python scalars, so an empty selection still raises
        # ZeroDivisionError instead of silently producing NaN.
        total = sum(series[first_frame:last_frame, 1].tolist())
        return total / len(prot_atoms)

    avg_val_lsu = contacts_per_ca(prot_lsu_atoms)
    # Complexes without small subunits have nothing to scan; report 0 rather
    # than crash on the division.
    avg_val_ssu = contacts_per_ca(prot_ssu_atoms) if len(prot_ssu_atoms) > 0 else 0

    return (avg_val_lsu, avg_val_ssu)

In [5]:
def average_gas_per_individual_subunit(pdb_file, dcd_file, gas, atom, conc,
                                       first_frame=249, last_frame=749, radius=6.0):
    """Gas-to-C-alpha contacts per residue, computed separately for every chain.

    For each segment id in ``lsu_chains`` and ``ssu_chains`` the per-frame
    contact counts are summed over the frame window ``[first_frame, last_frame)``
    and divided by the number of CA atoms of that chain (the sum is not
    divided by the number of frames).

    Parameters
    ----------
    pdb_file, dcd_file : str
        Structure and trajectory files passed to ``mda.Universe``.
    gas : str
        Residue name of the gas molecules.
    atom : str
        Atom name (within ``gas``) whose position represents the molecule.
    conc
        Unused; kept so existing callers do not break.
    first_frame, last_frame : int, optional
        Positional slice bounds of the frames that are summed.
    radius : float, optional
        Contact cutoff forwarded to ``contacts_within_cutoff``.

    Returns
    -------
    tuple of list
        ``(li_ca_lsu, li_ca_ssu)``, ordered like ``lsu_chains`` / ``ssu_chains``.
        An SSU chain with no CA atoms contributes ``0``.

    Raises
    ------
    ZeroDivisionError
        If an LSU chain has no CA atoms.
    """
    env = mda.Universe(pdb_file, dcd_file, frames="all")
    gas_atoms = env.select_atoms('resname ' + gas).select_atoms('name ' + atom)
    protein = env.select_atoms('protein')

    def contacts_per_ca(segid, allow_empty):
        chain_atoms = protein.select_atoms('name CA and segid ' + segid)
        if allow_empty and len(chain_atoms) == 0:
            # Missing chain: skip the trajectory scan and report 0.
            return 0
        # Column 1 of the series holds the per-frame contact count.
        series = contacts_within_cutoff(env, gas_atoms, chain_atoms, radius=radius)
        # tolist() yields Python scalars, so dividing by an empty selection
        # raises ZeroDivisionError instead of silently producing NaN.
        total = sum(series[first_frame:last_frame, 1].tolist())
        return total / len(chain_atoms)

    # NOTE: every chain triggers its own full pass over the trajectory.
    li_ca_lsu = [contacts_per_ca(segid, allow_empty=False) for segid in lsu_chains]
    li_ca_ssu = [contacts_per_ca(segid, allow_empty=True) for segid in ssu_chains]

    return (li_ca_lsu, li_ca_ssu)

In [8]:
# Record accumulator: filled with one dict per chain below, then replaced by
# a DataFrame built from those records.
avergage_gas_per_indvidual_subunit_contacts = list()
# Prefix prepended to every 'dry_pdb' / 'dry_dcd' entry of the reference data.
path = '../../../simulations/'

def central_atom(gas):
    """Return the name of the atom used to represent one molecule of ``gas``.

    Parameters
    ----------
    gas : str
        Residue name of the gas; ``'CO2'`` and ``'MO2'`` are supported.

    Returns
    -------
    str
        ``'C1'`` for ``'CO2'`` and ``'O1'`` for ``'MO2'``.

    Raises
    ------
    ValueError
        If ``gas`` is not a supported residue name. Previously this returned
        ``None``, which only failed later when the selection string was built.
    """
    central_atoms = {'CO2': 'C1', 'MO2': 'O1'}
    if gas not in central_atoms:
        raise ValueError(
            'Unknown gas {!r}; expected one of {}'.format(gas, sorted(central_atoms))
        )
    return central_atoms[gas]

# One record per chain: run the per-chain contact analysis for every reference
# simulation and tag each value with the simulation metadata and subunit type.
for entry in md_reference_data:
    print(entry['code'])

    contacts_lsu, contacts_ssu = average_gas_per_individual_subunit(
        path + entry['dry_pdb'],
        path + entry['dry_dcd'],
        entry['gas'],
        central_atom(entry['gas']),
        entry['concentration'],
    )

    # LSU records are emitted before SSU records for every simulation.
    for subunit_label, chain_values in (('lsu', contacts_lsu), ('ssu', contacts_ssu)):
        avergage_gas_per_indvidual_subunit_contacts.extend(
            {
                'code': entry['code'],
                'gas': entry['gas'],
                'concentration': entry['concentration'],
                'contacts': chain_value,
                'subunit': subunit_label,
            }
            for chain_value in chain_values
        )

# The list of records is replaced by a DataFrame under the same name.
avergage_gas_per_indvidual_subunit_contacts = pd.DataFrame.from_records(
    avergage_gas_per_indvidual_subunit_contacts
)



In [None]:
# Bar plot of the per-chain contact values, one facet per subunit type and
# one bar colour per gas.
col_set = {"CO2" : "#ff5a5f",  "MO2" : "#86bbd8"}
g = sns.catplot(
    data=avergage_gas_per_indvidual_subunit_contacts, 
    x='code', y='contacts', hue='gas', col='subunit',
    palette=col_set, kind='bar'
)
g.set_ylabels('Gas contact per residue', fontname='Arial')
# Apply the same y-range to every facet instead of indexing g.axes[0, 0] and
# g.axes[0, 1], which raises IndexError when only one subunit type is present.
# The trailing semicolon suppresses the return-value repr in the notebook.
g.set(ylim=(0, 250));

In [16]:
# Write the per-chain contact table to the declared output path.
# index=False is the documented boolean way to omit the row index
# (index=None only worked because None is falsy).
avergage_gas_per_indvidual_subunit_contacts.to_csv(
    product['contacts_per_subunit'], index=False
)

# Calculating statistics for comparing the gas contacts
## Student's t-test

In [11]:
from scipy.stats import ttest_ind

In [4]:
# Collect the per-chain contact values of every (code, subunit, gas) group into
# a list, then spread the gases into columns so that each (code, subunit) row
# holds the CO2 sample next to the MO2 sample.
u = (
    avergage_gas_per_indvidual_subunit_contacts
    .groupby(['code', 'subunit', 'gas'])['contacts']
    .apply(list)
    .reset_index()
    .pivot(index=['code', 'subunit'], columns='gas', values='contacts')
)

# Run each two-sample t-test once and reuse the result for both columns.
t_test_results = [ttest_ind(co2, mo2) for co2, mo2 in zip(u['CO2'], u['MO2'])]
u['t_test_pvalue'] = [result.pvalue for result in t_test_results]
u['t_test_statistic'] = [result.statistic for result in t_test_results]

u['CO2_mean'] = u['CO2'].apply(np.mean)
u['MO2_mean'] = u['MO2'].apply(np.mean)
# NOTE(review): np.std defaults to ddof=0 (population std); confirm whether the
# sample std (ddof=1) is wanted next to the t-test.
u['CO2_std'] = u['CO2'].apply(np.std)
u['MO2_std'] = u['MO2'].apply(np.std)

u.reset_index()[['code', 'subunit', 't_test_pvalue', 't_test_statistic', 'CO2_mean', 'MO2_mean', 'CO2_std', 'MO2_std']]