In [31]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to imported packages (e.g. katlas) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
from katlas.motif import SequenceMotif

In [34]:
# TODO: use a decorator to validate the amino-acid (AA) sequence.

In [33]:
# Represent a kinase recognition motif from a string.
#
# The triple-quoted string below is a bare expression (it is not assigned to
# anything); as the last expression in the cell it is echoed as the cell output.
# It records the expected input format for a site sequence.
# NOTE(review): the references to a check box and "advanced options" read like
# web-form instructions; presumably they correspond to the `phospho_priming`
# argument passed to SequenceMotif in later cells — confirm.

"""
Expected input format:
The phosphorylation site should be denoted with an asterisk (*) after the phospho-acceptor. The acceptor must be a serine, threonine, or tyrosine.
The site sequence can include any of the 20 amino acids, 'X' for masking a position, and '_' for truncation.
Use lower-case letters (s/t/y) and check the corresponding box in the advanced options to including phosphorylated residues (pS/pT/pY).

Examples:

PSVEPPLs*QETFSDL

PSVEPPLS*QETFSDL

PSVEXPLs*QXTF___

PPLs*

PSVEPPLs*QEtFSDL (with phospho-priming option checked)

"""



"\nExpected input format:\nThe phosphorylation site should be denoted with an asterisk (*) after the phospho-acceptor. The acceptor must be a serine, threonine, or tyrosine.\nThe site sequence can include any of the 20 amino acids, 'X' for masking a position, and '_' for truncation.\nUse lower-case letters (s/t/y) and check the corresponding box in the advanced options to including phosphorylated residues (pS/pT/pY).\n\nExamples:\n\nPSVEPPLs*QETFSDL\n\nPSVEPPLS*QETFSDL\n\nPSVEXPLs*QXTF___\n\nPPLs*\n\nPSVEPPLs*QEtFSDL (with phospho-priming option checked)\n\n"

In [35]:
import pandas as pd

import numpy as np

In [36]:
# Build a motif from a site string that contains truncation ('_'), a lower-case
# 'x' and lower-case residues, with phospho-priming disabled. In the saved
# output every residue is displayed upper-case and the window spans -5..6.
# NOTE(review): this saved output lists '*' as the residue at position 1, while
# the saved output of SequenceMotif("PSVEPPLs*QETFSDL") further down omits it.
# The two outputs appear to come from different revisions of the class;
# re-run the notebook from a fresh kernel to confirm the current behaviour.
s = SequenceMotif('____spxLs*QExyDL', phospho_priming=False)
s

        -5 -4 -3 -2 -1  0  1  2  3  4  5  6
Residue  _  S  P  X  L  S  *  Q  E  X  Y  D

In [46]:
# Same constructor with phospho-priming enabled: in the saved output the
# lower-case 's' that lands on position 6 is displayed as 'pS'.
s = SequenceMotif('PSVEPPLS*QETFsDL', phospho_priming=True)
s

        -5 -4 -3 -2 -1  0  1  2  3  4  5   6
Residue  V  E  P  P  L  S  *  Q  E  T  F  pS

In [47]:
# Inspect the underlying position -> residue mapping. In the saved output it is
# a dict keyed by integer position, and the primed residue is stored as a
# lower-case 's' (the 'pS' form appears only in the displayed table).
s.motif

{0: 'S',
 1: '*',
 2: 'Q',
 3: 'E',
 4: 'T',
 5: 'F',
 6: 's',
 -1: 'L',
 -2: 'P',
 -3: 'P',
 -4: 'E',
 -5: 'V'}

In [54]:
from katlas.motif.PSSM import PSSM

# Keep a handle on the PSSM table. Later cells read its `.columns` and select
# columns from it, i.e. they treat it as a pandas DataFrame.
df = PSSM.pssm

In [55]:
# Motif used for the scoring walkthrough in the following cells;
# `phospho_priming` is left at its default here.
sm = SequenceMotif("PSVEPPLs*QETFSDL")
sm

        -5 -4 -3 -2 -1  0  1  2  3  4  5  6
Residue  V  E  P  P  L  S  Q  E  T  F  S  D

In [56]:
import re

# PSSM column labels have the form "{position}{amino acid}" (e.g. "-5P").
# Strip the sign and the digits in a single pass so that only the residue
# letter remains. The pattern is a raw string so that `\d` reaches the regex
# engine as written instead of being parsed as an (invalid) string escape.
cols = [re.sub(r"[-\d]", "", col) for col in df.columns]

# Collapse to the unique residue letters.
cols = list(set(cols))

# Sorted, concatenated alphabet of the residues the PSSM has columns for.
allowed_chars = "".join(sorted(cols))
allowed_chars

'ACDEFGHIKLMNPQRSTVWYsty'

In [57]:
# Cross-check: the `allowed_chars` attribute exposed by PSSM should equal the
# string derived from the column names in the previous cell (the saved outputs
# of the two cells are identical).
PSSM.allowed_chars

'ACDEFGHIKLMNPQRSTVWYsty'

In [75]:
# Select, for every position of the motif, the PSSM column that matches the
# residue observed at that position.
#
# The PSSM dataframe's row index is the kinase name and its columns are named
# "{position}{amino acid}"; e.g. position -5 with residue 'P' maps to the
# "-5P" column.
print(sm.sequence)
motif = sm.motif
df = PSSM.pssm

# Positions listed for scoring. The selection below does not read this list:
# it keeps whichever "{position}{amino acid}" labels exist in the PSSM.
# NOTE(review): the name is kept because later cells may reference it; the
# saved log2 output has columns for exactly these nine positions — confirm
# that the PSSM only carries these positions.
positions = [
    -5, -4, -3, -2, -1, 1, 2, 3, 4
]

# One candidate label per (position, residue) pair, in ascending position
# order. Labels with no matching PSSM column are dropped.
candidate_cols = [f"{pos}{aa}" for pos, aa in sorted(motif.items())]
cols = [label for label in candidate_cols if label in df.columns]

# Rebinds `df` to the filtered view: one column per scored motif position.
df = df[cols]




PSVEPPLS*QETFSDL


In [76]:
# Sum the scores for each kinase at each position
df = df.sum(axis=1)
df

AAK1       8.9115
ACVR2A     9.0905
ACVR2B     8.8369
AKT1       6.3152
AKT2       6.2113
           ...   
YANK2     10.1362
YANK3      9.0560
YSK1       6.7054
YSK4       7.5064
ZAK        7.2170
Length: 303, dtype: float64

In [74]:
# log2 transform the scores
df = df.apply(lambda x: np.log2(x)) 
df

Unnamed: 0,-5V,-4E,-3P,-2P,-1L,1Q,2E,3T,4F
AAK1,0.693230,-0.072179,0.882369,-1.377819,0.152638,-2.091725,-0.168123,-0.015374,-0.168771
ACVR2A,0.023468,0.562865,-0.228030,-1.639355,0.349818,0.297720,0.037312,0.003458,-0.168285
ACVR2B,-0.114660,0.572405,-0.350155,-1.577336,0.464773,0.026588,0.133827,-0.011588,-0.288417
AKT1,-0.460568,-0.508198,-0.980796,-1.722610,-0.032241,0.110497,-1.344831,-0.054692,-0.595751
AKT2,-0.443203,-0.629055,-1.555121,-1.375946,-0.372580,0.247563,-0.798366,-0.170881,-0.554909
...,...,...,...,...,...,...,...,...,...
YANK2,-0.442026,0.671203,0.039981,0.294547,-0.177392,0.012497,0.858936,0.139600,-0.425075
YANK3,-0.228199,0.313594,0.061431,0.126973,-0.306502,0.237808,0.171975,-0.046623,-0.431676
YSK1,-0.238204,-0.208561,-0.190833,-0.529697,-0.149921,-0.550043,-2.355498,-0.111228,-0.478145
YSK4,-0.186722,0.133037,-0.069603,-0.541828,0.178747,-1.043943,-0.976816,-0.025618,-0.351074


{0: 'S',
 1: 'Q',
 2: 'E',
 3: 'T',
 4: 'F',
 5: 'S',
 6: 'D',
 -1: 'L',
 -2: 'P',
 -3: 'P',
 -4: 'E',
 -5: 'V'}