In [1]:
cd

/Users/keithwalcott


In [2]:
import pandas as pd
import csv
import numpy as np

In [3]:
#pymol stuff
import pymol
from pymol import cmd, stored

# Initialize PyMOL in headless mode
pymol.pymol_argv = ['pymol', '-qc']
pymol.finish_launching()

# Load the structure 6xm4
cmd.fetch("6xm4", "molecule")

 ExecutiveLoad-Detail: Detected mmCIF


'molecule'

In [4]:
sample_data = pd.read_csv('kw_dms/results/func_effects/averages/4u8c_B1T1-vs-B1T1_+_4u8c_B2T1-vs-B2T1_func_effects.csv')

  site wildtype mutant   effect  effect_std  times_seen  n_selections
0    1        M      M  0.00000         NaN         NaN             1
1    1        M      T -0.41420         NaN         2.0             1
2   10        L      I -0.05005     0.06388        40.5             2
3   10        L      L  0.00000     0.00000         NaN             2
4   10        L      M  0.05568     0.04362         7.0             2


In [5]:
filtered_data = sample_data[
    (sample_data['n_selections'] != 1) &
    (sample_data['times_seen'] >= 3) &
    (sample_data['effect_std'] <= 1.6)
]

In [6]:
sum_site_effect = filtered_data.groupby('site').agg({
    'effect': 'mean',    # averaging up the 'effect' column
    'wildtype': 'first'
}).reset_index()

In [7]:
def get_charge(amino_acid):
    if amino_acid in ['R', 'K', 'H']:
        return 'Positive'
    elif amino_acid in ['D', 'E']:
        return 'Negative'
    else:
        return 'Neutral'

def get_polarity(amino_acid):
    if amino_acid in ['R', 'K', 'H', 'D', 'E', 'S', 'T', 'N', 'Q', 'Y']:
        return 'Polar'
    else:
        return 'Nonpolar'

In [8]:
sum_site_effect['charge_status'] = sum_site_effect['wildtype'].apply(get_charge)
sum_site_effect['polarity'] = sum_site_effect['wildtype'].apply(get_polarity)

In [9]:
#store secondary structure information for each residue

# Initialize a list in the stored namespace
stored.secondary_structures = []

# Iterate over each residue and store its secondary structure in the stored list
cmd.iterate("molecule and name CA", "stored.secondary_structures.append((resi, ss))")

# Print secondary structure for each residue

3826

In [10]:
#formatting
sum_site_effect['site'] = sum_site_effect['site'].str.extract(r'(\d+)').astype(int)

In [11]:
# Assuming 'stored.secondary_structures' contains your PyMOL data
# Convert to DataFrame
ss_df = pd.DataFrame(stored.secondary_structures, columns=['site', 'secondary_structure'])

# Replace the letters with actual secondary structure names
ss_df['secondary_structure'] = ss_df['secondary_structure'].map({
    'H': 'Alpha Helix',
    'S': 'Beta Sheet',
    'L': 'Loop'
})

# Convert 'site' to int in ss_df and remove duplicates
ss_df['site'] = ss_df['site'].astype(int)
ss_df.drop_duplicates(subset=['site'], inplace=True)

# Merge the DataFrames
merged_df = pd.merge(sum_site_effect, ss_df, on='site')

print(merged_df)

      site    effect wildtype charge_status  polarity secondary_structure
0      100 -0.027504        I       Neutral  Nonpolar                 NaN
1     1002 -0.047275        Q       Neutral     Polar         Alpha Helix
2     1003 -0.078230        S       Neutral     Polar         Alpha Helix
3     1005 -0.212417        Q       Neutral     Polar         Alpha Helix
4     1006 -0.040065        T       Neutral     Polar         Alpha Helix
...    ...       ...      ...           ...       ...                 ...
1048   993  0.025138        I       Neutral  Nonpolar         Alpha Helix
1049   994  0.010592        D      Negative     Polar         Alpha Helix
1050   995  0.010540        R      Positive     Polar         Alpha Helix
1051   998 -0.035740        T       Neutral     Polar         Alpha Helix
1052   999 -0.055803        G       Neutral  Nonpolar         Alpha Helix

[1053 rows x 6 columns]


In [42]:
entry_lookup = merged_df.loc[merged_df['site'] == 297]
print(entry_lookup)

     site  effect wildtype charge_status polarity secondary_structure
429   297 -0.8246        S       Neutral    Polar         Alpha Helix


In [22]:
# Initialize a list in the stored namespace
stored.atom_data = []

# Iterate over each atom and store its chain identifier, residue number, residue name, and atom name
cmd.iterate("molecule", "stored.atom_data.append((chain, resi, resn, name))")

# Create a DataFrame from the stored data
atom_df = pd.DataFrame(stored.atom_data, columns=['chain', 'resi', 'resn', 'name'])

#convert resi to int if its not already
atom_df['resi'] = atom_df['resi'].astype(int)

In [27]:
# Merge atom_df with only the 'effect' column of merged_df
atom_effect = pd.merge(atom_df, merged_df[['site', 'effect']], left_on='resi', right_on='site', how='left')

# Drop the 'site' column after merging
atom_effect.drop('site', axis=1, inplace=True)

# Replace NaN in 'effect' with 0
atom_effect['effect'].fillna(0, inplace=True)

# Display the first few rows of the updated df1
print(atom_effect)

      chain  resi resn name    effect
0         A    14  GLN   CA -0.036986
1         A    15  CYS   CA -0.044967
2         A    16  VAL   CA -0.070498
3         A    17  ASN   CA -0.170608
4         A    18  LEU   CA -0.080739
...     ...   ...  ...  ...       ...
26174     A  1305  NAG   C6  0.000000
26175     A  1305  NAG   O6  0.000000
26176     A  1305  NAG   C7  0.000000
26177     A  1305  NAG   O7  0.000000
26178     A  1305  NAG   C8  0.000000

[26179 rows x 5 columns]


In [29]:
#Optionally, save the updated DataFrame to a file
#atom_effect.to_csv('kw_dms/atom_effect', sep=' ', index=False, header=False)