## **SETD2 with SAM, analysis @Dreamworks**

## **Import modules**

In [1]:
# Standard-library modules used to locate the project data and source code.
import os
import sys
    
data_path=os.path.abspath('/media/dataHog/hca/SETD2') #Where your group data is
base_path=os.path.abspath('/media/dataHog/hca/SimFound_v2/source/') #Where your source code is (SFv2)

# Both directories are appended to sys.path before the project imports below run.
# NOTE(review): the non-stdlib modules imported below presumably resolve through these entries — confirm.
sys.path.append(base_path)
sys.path.append(data_path)

import importlib

import warnings
# Suppresses every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
import Protocols as P

import main
import tools
import Trajectory
import MSM
import Featurize as F
import Discretize


# Wildcard import; `picoseconds`, used when the project timestep is defined, presumably comes from here.
from simtk.unit import *



## **Define Project**

In [2]:
# Re-import the project modules so that on-disk edits are picked up without
# restarting the kernel.
importlib.reload(Trajectory)
importlib.reload(main)

# Directory layout, all relative to the group data folder.
workdir=data_path
results=workdir+'/results'
inputs=workdir+'/inputs/structures/'

# System definition: one protein, one ligand, one parameter set.
protein=['setd2_complexed_noSub']
ligand=['SAM']
parameters=['310K']
timestep=20*picoseconds

project=main.Project(title='SETD2-SAM',
                     hierarchy=('protein', 'ligand', 'parameter'),
                     workdir=workdir,
                     parameter=parameters,
                     replicas=40,
                     protein=protein,
                     ligand=ligand,
                     topology='SETD2_complexed_noSub_full.pdb',
                     timestep=timestep,
                     initial_replica=1)

# Keep the return value: the Featurize cell further down reads `project_systems`,
# which was otherwise never assigned and raised a NameError on a fresh kernel.
# NOTE(review): assumes setSystems() returns the project's systems — confirm against main.Project.
project_systems=project.setSystems()

Converted parameter "temperature" (in K) into scalar: 310.0 K


## **Markov State Models**


In [11]:
# Pick up on-disk edits to the project modules without restarting the kernel.
importlib.reload(MSM)
importlib.reload(Trajectory)
import tools_plots
importlib.reload(tools_plots)

# Selection string for each named protein region.
# Colour tags from the original notes: loopIN red, postSET blue, SETcd green, control_core yellow.
protein_regions = dict(
    loopIN='resid 241 to 255 and backbone',
    postSET='resid 220 to 240 and backbone',
    SETcd='resid 136 to 165 and backbone',
    control_core='resid 190 to 210 and backbone',
    combined='resid 220 to 255 and backbone',
    full='protein and name CA',
    control_ZF='resid 47 to 85 and backbone',
)

# Alternative distance-based setup, kept disabled:
#protein_regions = {'R1670-SAM' : ('resid 221 and name CZ', 'resname SAM and name SD')}
#'lid' : ('resid 241 to 255', 'resid 136 to 165')}
#features_to_extract=['distances']
features_to_extract = ['torsions', 'positions']

kinetic_models = {}
kinetics = MSM.MSM(
    project,
    protein_regions,
    stride=1,
    timestep=timestep,
    warnings=True,
    def_traj=['production_NPT-1.dcd'],
)

Results will be stored under:  /media/dataHog/hca/SETD2/results/setd2_complexed_noSub-SAM-310K/MSM
PyEMMA calculations will be stored under:  /media/dataHog/hca/SETD2/results/setd2_complexed_noSub-SAM-310K/MSM/MSM_storage
Using pre-defined trajectory ['production_NPT-1.dcd'] and/or topology None


In [12]:
# Grids shared by every (region, feature) combination.
states = [5, 10, 15, 20, 30, 50, 100]
lags = [10, 100, 250, 500, 1000, 2000]
tica_lags = [10, 20, 50, 100, 200, 500, 1000, 2000]

# One (region, feature, states, lags) tuple per combination; regions vary slowest.
test_models = [
    (region_name, feature_name, states, lags)
    for region_name in protein_regions
    for feature_name in features_to_extract
]

kinetics.analysis(
    inputs=test_models,
    method='generate',
    tica_lag=tica_lags,
    eval_vamps=True,
    tica_weights='empirical',
)

Executing load_models 
	Models calculated:  1906
	Models to discard:  2797
	Models to calculate:  0
	Failed models:  2799
	Loaded models:  1906
	Total number of models : 4705


In [None]:


# Columns kept as identifiers when the comparison table is reshaped to long format.
indexes = ['Discretized feature', 'name', 'feature', 'model', 'Test', 'Filters',
           'Processes', 'States', 'Lag', 'Dimensions', 're-weighting', 'tICA lag',
           'Cluster method']

# Long-format table: every remaining column becomes a ("VAMP2", "Score") pair.
# Built by chaining so the frame returned by model_comparison() is not modified
# in place.
models = (
    kinetics.model_comparison()
    .reset_index()
    .melt(id_vars=indexes, var_name="VAMP2", value_name="Score")
)





In [None]:
# Box plot of the scores per region name, with one panel selectable per feature.
# NOTE(review): relies on the `hvplot` accessor being registered by some earlier import — confirm.
models.hvplot.box(
    y='Score',
    by=['name'],
    groupby='feature',
    grid=True,
)

In [None]:
# Display the long-format score table.
models

## Evaluate *features* and *dimensions* with VAMP2 scores

In [None]:
# Lags scanned in the VAMP evaluation of features and dimensions.
vamp_lags = [1, 2, 10, 20, 50, 100, 200, 500, 1000, 2000]

kinetics.calculate(
    method='VAMP',
    evaluate=['features', 'dimensions'],
    features=features_to_extract,
    VAMP_lags=vamp_lags,
    dim=0.95,
)

## TICA

In [None]:
# TICA lags to compute for every region and feature.
tica_lags = [10, 20, 50, 100, 200, 500, 1000, 2000]

kinetics.calculate(
    inputs=protein_regions,
    method='TICA',
    features=features_to_extract,
    TICA_lag=tica_lags,
    tica_weights='empirical',
)


## Clustering in TICA space

In [None]:
# Clustering with a single TICA lag (10) and a single cluster lag (500).
kinetics.calculate(
    method='Clustering',
    features=features_to_extract,
    TICA_lag=10,
    cluster_lags=[500],
    def_traj=['production_NPT-1.dcd'],
)

In [None]:
# Run the 'inspect' analysis once per TICA lag, over the full test-model grid.
tica_lags = [10, 20, 50, 100, 200, 500, 1000, 2000]

for lag in tica_lags:
    kinetics.analysis(
        inputs=test_models,
        method='inspect',
        disc_lag=lag,
        eval_vamps=True,
    )

## Global

In [None]:
# Earlier shortlist, kept for reference only. It used to be bound to `top_models`
# and was overwritten by the very next statement, so it never reached any
# analysis; it now has its own name instead of being a silent dead store.
shortlisted_models = [('loop_IN', 'torsions', 30, 100, (3,5)),
                      ('loop_IN', 'positions', 30, 100, (3,5)),
                      ('post_SET_domain', 'positions', 30, 100, (3,5)),
                      ('SETcd', 'positions', 20, 100, (3,5)),
                      ('SETcd', 'torsions', 20, 100, (3,5)),
                      ('SETcd', 'positions', 30, 100, (3,5)),
                      ('combined', 'positions', 30, 100, (3,5))]

# Models used by the analyses below. Each key is passed on as `disc_lag`; each
# value is the list of model tuples analysed at that lag.
# NOTE(review): 'full_Ca' here (and 'loop_IN' / 'post_SET_domain' above) do not
# match the keys of `protein_regions` ('full', 'loopIN', 'postSET') — confirm
# which naming the stored results use.
top_models = {2000 : [('full_Ca', 'positions', 20, 100, 3)],
                20 : [('combined', 'torsions', 30, 100, 3)],
                10 : [('SETcd', 'positions', 20, 100, 3)]}

## Chapman-Kolmogorov (CK) test

In [None]:
# CK test for each group of selected models, at the lag the group is keyed by.
for lag, model_group in top_models.items():
    kinetics.analysis(
        inputs=model_group,
        disc_lag=lag,
        method='CKtest',
    )

## Hidden MSM

In [None]:
# Hidden MSM for each group of selected models, tried at three HMSM lags.
for lag, model_group in top_models.items():
    kinetics.analysis(
        inputs=model_group,
        disc_lag=lag,
        method='HMSM',
        hmsm_lag=[1,2,3],
    )

## PCCA

In [None]:
# PCCA for each group of selected models, at the lag the group is keyed by.
for lag, model_group in top_models.items():
    kinetics.analysis(
        inputs=model_group,
        disc_lag=lag,
        method='PCCA',
    )

## Implied Timescales

In [None]:
# Every listed region maps to the same value, 30 (presumably a number of states — confirm).
# Colour tags from the original notes: loop_IN red, post_SET_domain blue,
# SETcd green, control yellow, combined cyan.
# NOTE(review): 'loop_IN', 'post_SET_domain' and 'control' do not match the keys
# of `protein_regions` ('loopIN', 'postSET', 'control_core') — confirm.
region_states = dict.fromkeys(
    ['loop_IN', 'post_SET_domain', 'SETcd', 'control', 'combined'],
    30,
)

kinetics.calculate(
    inputs=region_states,
    method='ITS',
    features=features_to_extract,
    TICA_lag=10,
    def_traj=['production_NPT-1.dcd'],
)

## Spectral Analysis

In [None]:
# Run the 'Spectral' analysis once per TICA lag, over the full test-model grid.
tica_lags = [10, 20, 50, 100, 200, 500, 1000, 2000]

for lag in tica_lags:
    kinetics.analysis(
        inputs=test_models,
        method='Spectral',
        disc_lag=lag,
        eval_vamps=True,
    )

## MFPT

In [None]:
# MFPT for each group of selected models; `t` ends up holding the result of the
# last group only.
for lag, model_group in top_models.items():
    t = kinetics.analysis(
        inputs=model_group,
        disc_lag=lag,
        method='MFPT',
        hmsm_lag=[1],
    )

## Visualization

In [None]:
# One 'Visual' result per entry of `top_models`, in the dict's iteration order.
view = [
    kinetics.analysis(inputs=model_group, disc_lag=lag, hmsm_lag=[1], method='Visual')
    for lag, model_group in top_models.items()
]

In [None]:
# Display the list of results collected by the 'Visual' analysis.
view

In [None]:
# First 'Visual' result (first entry of `top_models`), looked up by a hard-coded result key.
view[0]['full_Ca_positions@40000ps_s1_20@2000ps_HMSM-3']

In [None]:
# Second 'Visual' result (second entry of `top_models`), looked up by a hard-coded result key.
view[1]['combined_torsions@400ps_s1_30@2000ps_HMSM-3']

In [None]:
# Third 'Visual' result (third entry of `top_models`), looked up by a hard-coded result key.
view[2]['SETcd_positions@200ps_s1_20@2000ps_HMSM-3']

## RMSD

In [None]:
# Reference structure(s) handed to the RMSD analysis.
pdb = ['SETD2_complexed_noSub_full']

# Structure names that are currently left out; this list is not passed to any call here.
pdb_discard = [
    '5lsu', '5lsx', '5lt6', '5lt8', 'SETD2_open',
    '4fmu', '4h12', '5jjy', '5jlb', '5jle',
    '5lss', '5lsy', '5lsz',
    '5lt7', '5v21', '5v22',
    '6j9j', '6vdb', '7lzb', '7lzd', '7lzf',
]

for lag, model_group in top_models.items():
    kinetics.analysis(
        inputs=model_group,
        disc_lag=lag,
        hmsm_lag=[1],
        method='RMSD',
        compare_pdbs=pdb,
    )

## Transition Path Theory (TPT)

In [None]:
# 'flux' analysis for each group of selected models, at the lag the group is keyed by.
for lag, model_group in top_models.items():
    kinetics.analysis(
        inputs=model_group,
        disc_lag=lag,
        method='flux',
    )

In [None]:
# Reload the featurization module so on-disk edits are picked up.
import Featurize as F
importlib.reload(F)

# NOTE(review): `project_systems` must already exist in the kernel (presumably the
# return value of `project.setSystems()`) — confirm.
featurize = F.Featurize(
    project_systems,
    results=results,
    timestep=timestep,
    warnings=True,
    heavy_and_fast=False,
)

# RMSD for the 'backbone' selection, computed on the superposed production trajectory.
featurize.calculate(
    'backbone',
    method='RMSD',
    n_cores=4,
    feature_name='backbone',
    def_traj=['production_NPT-1_superposed.dcd'],
)

In [None]:
# RMSF for the C-alpha selection, computed on the superposed production trajectory.
featurize.calculate(
    'protein and name CA',
    method='RMSF',
    n_cores=6,
    feature_name='C-alpha',
    def_traj=['production_NPT-1_superposed.dcd'],
)

In [None]:
# Print the name of every stored feature and plot its data.
for feature_key, feature_df in featurize.features.items():
    print(feature_key)
    featurize.plot(
        input_df=feature_df,
        level=2,
        feature_name=feature_key,
        subplots_=False,
    )

In [None]:
import mdtraj as md

In [None]:
# Load the first frame of the project topology PDB.
# `inputs` already ends with a path separator, so the path is built with
# os.path.join instead of an f-string that produced a doubled '/'.
s=md.load_frame(os.path.join(inputs, 'SETD2_complexed_noSub_full.pdb'), index=0)

In [None]:
# Display the loaded frame.
s

In [None]:
# Atom indices matching the selection 'residue 1690' in the loaded topology.
s.topology.select('residue 1690')

In [None]:
# Residue at position 3 of the topology (index-based lookup; presumably 0-based — confirm).
s.topology.residue(3)

# 