In [39]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules before
# each cell executes — presumably so edits to the local `katlas` package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
# TODO: use a decorator to validate the amino-acid (AA) sequence.

In [41]:
# Represent a kinase recognition motif from a string. 

"""
Expected input format:
The phosphorylation site should be denoted with an asterisk (*) after the phospho-acceptor. The acceptor must be a serine, threonine, or tyrosine.
The site sequence can include any of the 20 amino acids, 'X' for masking a position, and '_' for truncation.
Use lower-case letters (s/t/y) and check the corresponding box in the advanced options to include phosphorylated residues (pS/pT/pY).

Examples:

PSVEPPLs*QETFSDL

PSVEPPLS*QETFSDL

PSVEXPLs*QXTF___

PPLs*

PSVEPPLs*QEtFSDL (with phospho-priming option checked)

"""



"\nExpected input format:\nThe phosphorylation site should be denoted with an asterisk (*) after the phospho-acceptor. The acceptor must be a serine, threonine, or tyrosine.\nThe site sequence can include any of the 20 amino acids, 'X' for masking a position, and '_' for truncation.\nUse lower-case letters (s/t/y) and check the corresponding box in the advanced options to include phosphorylated residues (pS/pT/pY).\n\nExamples:\n\nPSVEPPLs*QETFSDL\n\nPSVEPPLS*QETFSDL\n\nPSVEXPLs*QXTF___\n\nPPLs*\n\nPSVEPPLs*QEtFSDL (with phospho-priming option checked)\n\n"

In [42]:
import pandas as pd

import numpy as np

from katlas.motif import SequenceMotif
from katlas import katlas

In [43]:
# Truncated ('_') and masked input with phospho_priming=False: the repr below shows
# the window -5..+6 around the acceptor (the residue before '*') and renders the
# lower-case letters upper-cased.
SequenceMotif('____spxLs*QExyDL', phospho_priming=False)

        -5 -4 -3 -2 -1  0  1  2  3  4  5  6
Residue  _  S  P  X  L  S  Q  E  X  Y  D  L

In [44]:
# With phospho_priming=True the lower-case 's' at position +5 is rendered as 'pS'
# in the repr below.
SequenceMotif('PSVEPPLS*QETFsDL', phospho_priming=True)

        -5 -4 -3 -2 -1  0  1  2  3  4   5  6
Residue  V  E  P  P  L  S  Q  E  T  F  pS  D

In [45]:
# Same sequence with phospho_priming=False: the lower-case 's' at position +5 is
# rendered as a plain 'S' in the repr below.
s = SequenceMotif('PSVEPPLS*QETFsDL', phospho_priming=False)
s

        -5 -4 -3 -2 -1  0  1  2  3  4  5  6
Residue  V  E  P  P  L  S  Q  E  T  F  S  D

In [50]:
# Inspect the parsed motif. The output recorded here is a 12-character string matching
# the -5..+6 window shown above.
# NOTE(review): a later cell calls `.items()` on `sm.motif`, i.e. expects a mapping —
# this recorded output was presumably produced by a different version of the class;
# re-run to confirm.
s.motif

'VEPPLSQETFSD'

In [17]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell
# at the top so a fresh "Run All" shows every dependency up front.
from katlas.motif.PSSM import PSSM

# Per the notes in the scoring cell further down, `PSSM.pssm` is a table whose row
# index is the kinase name and whose columns are named "{position}{amino acid}".
# NOTE: `df` is rebound by several later cells (column subset, then a Series of
# scores), so cells that read `df` depend on execution order.
df = PSSM.pssm

In [18]:
# Lower-case phospho-acceptor ('s*') with the default phospho_priming setting:
# the repr below shows it as 'S' at position 0.
sm1 = SequenceMotif("PSVEPPLs*QETFSDL")
sm1

        -5 -4 -3 -2 -1  0  1  2  3  4  5  6
Residue  V  E  P  P  L  S  Q  E  T  F  S  D

In [23]:
# Motif used by the scoring cells below; with phospho_priming=True the repr shows
# the primed residue at +5 as 'pS'.
sm = SequenceMotif('PSVEPPLS*QETFsDL', phospho_priming=True)
sm

        -5 -4 -3 -2 -1  0  1  2  3  4   5  6
Residue  V  E  P  P  L  S  Q  E  T  F  pS  D

In [24]:
import re

# Derive the residue alphabet from the PSSM column labels, which have the form
# "{position}{amino acid}" (e.g. "-5P").
# Read the labels from `PSSM.pssm` instead of the notebook-global `df`: later cells
# rebind `df` to a Series, which made this cell fail with
# "AttributeError: 'Series' object has no attribute 'columns'" when run out of order.
cols = list(PSSM.pssm.columns)

# Strip the position part (digits and the minus sign) in a single pass.
# The raw string avoids the invalid "\d" escape of the non-raw original.
cols = [re.sub(r'[\d-]', '', col) for col in cols]

# De-duplicate, then join in sorted order so the result is deterministic.
cols = list(set(cols))

allowed_chars = "".join(sorted(cols))
allowed_chars

AttributeError: 'Series' object has no attribute 'columns'

In [25]:
# The PSSM class exposes the allowed residue alphabet directly; the recorded value is
# the 20 upper-case amino-acid letters followed by lower-case s/t/y.
PSSM.allowed_chars

'ACDEFGHIKLMNPQRSTVWYsty'

In [29]:
# Goal: for every position of the motif, keep only the PSSM column that belongs to
# the residue observed at that position; the following cell sums those columns per
# kinase.
print(sm.sequence)
print(sm)
motif = sm.motif
df = PSSM.pssm

# `motif` maps position -> residue, e.g. -5: 'P' selects the "-5P" column.
# The PSSM rows are indexed by kinase name; its columns are named
# "{position}{amino acid}".

positions = [-5, -4, -3, -2, -1, 1, 2, 3, 4]

# Earlier draft, kept for reference only (filtered by `allowed_chars` over the fixed
# `positions` list instead of checking the PSSM columns directly):
#
#   cols = [
#       f"{str(pos)}{motif[pos]}"
#       for pos in positions
#       if motif[pos] in allowed_chars
#   ]

# Build the "{position}{residue}" label for each motif entry in ascending position
# order and keep only labels that actually exist as PSSM columns.
cols = [
    label
    for label in (f"{pos!s}{aa}" for pos, aa in sorted(motif.items()))
    if label in df.columns
]

# Restrict the PSSM to the selected columns (all kinases, matching columns only).
df = df.loc[:, cols]




PSVEPPLS*QETFsDL
-5  -4  -3  -2  -1  0   1   2   3   4   5   6   
V   E   P   P   L   S   Q   E   T   F   s   D   


In [30]:
# Collapse the per-position PSSM columns selected in the previous cell into a single
# score per kinase by summing across columns (row-wise).
# NOTE: this rebinds `df` from a DataFrame to a Series, so this cell cannot be re-run
# on its own; re-run the column-selection cell first.
df = df.sum(axis=1)

# log2-transform the summed scores (vectorised; no per-element lambda needed).
# NOTE(review): this is log2 of the *sum* across positions. If a product-of-positions
# score was intended, the equivalent would be the sum of per-position log2 values —
# confirm which one is wanted.
df = np.log2(df)

# Rank kinases from highest to lowest score.
df = df.sort_values(ascending=False)

# Label the result: the index holds the kinase names, the values the log2 scores.
# (Assigning a list to `df.names` only attached an unused ad-hoc attribute to the
# Series; it did not name the index or the values.)
df = df.rename('Score (log2)').rename_axis('Kinase')


In [42]:
class KinaseAtlas():
    """Namespace for the kinase-scoring helper used in this notebook."""

    @staticmethod
    def kinase_scores(motif):
        """Print the kinase score table currently held in the notebook-global ``df``.

        Declared as a static method because it takes no ``self``: without the
        decorator, calling it on an instance (``KinaseAtlas().kinase_scores(m)``)
        raised ``TypeError`` since the instance was passed as an extra argument.
        Calling it on the class, as the notebook does, behaves as before.

        Parameters
        ----------
        motif
            Accepted for interface compatibility but not used yet; the scores
            printed are whatever the scoring cell last stored in ``df``.

        Returns
        -------
        None
            The header line and the scores are written to stdout.
        """
        print("Kinase\t  Score (log2)")
        print(df)

In [43]:
# Print the ranked kinase scores. The method prints the notebook-global `df`;
# `motif` is passed but not used by the current implementation.
KinaseAtlas.kinase_scores(motif)

Kinase	  Score (log2)
FAM20C    4.393787
SMG1      4.212663
ATM       4.203538
DNAPK     4.058282
MAK       3.996759
            ...   
P70S6K    2.544015
PKCE      2.531893
NEK3      2.519668
PKCI      2.459615
AKT3      2.444402
Length: 303, dtype: float64


{0: 'S',
 1: 'Q',
 2: 'E',
 3: 'T',
 4: 'F',
 5: 'S',
 6: 'D',
 -1: 'L',
 -2: 'P',
 -3: 'P',
 -4: 'E',
 -5: 'V'}

In [84]:
# Ad-hoc list of integer scores, entered by hand.
# NOTE(review): provenance is not recorded in this notebook — document where these come from.
scores = [
    3, 4, 3, 2, 4,
    2, 5, 5, 4, 4,
    1, 5, 3, 3, 3,
    3, 3, 5, 5,
]

In [86]:
# Total of the hand-entered scores above.
sum(scores)



67