In [1]:
# Autoreload 
%load_ext autoreload
%autoreload 2

from pathlib import Path 
import pandas as pd 
import numpy as np
import os 
import re 
import gzip 
import shutil
import Bio.PDB.MMCIF2Dict
from typing import Union, List, Tuple, Dict, Optional
from pathlib import Path
from tqdm import tqdm

pd.options.mode.chained_assignment = None  # default='warn'

from phosphosite.utils import aa1to3, aa3to1

In [3]:
# Function definitions 

# Local cache directory for downloaded UniProtKB flat files, relative to the
# notebook's working directory. It is created up front (including any missing
# parents) so later cells can write into it unconditionally.
UNIPROT_DATA_DIR = Path("./uniprot/")
UNIPROT_DATA_DIR.mkdir(parents=True, exist_ok=True)

import requests 
import time 

def download_uniprot_data(
    protein_ids: List[str],
    data_dir: Path = UNIPROT_DATA_DIR,
    url: str = "https://rest.uniprot.org/uniprotkb/{protein_id}.txt",
    delay: float = 0.5,
    timeout: float = 30.0,
) -> None:
    """Download UniProtKB entries to ``data_dir``, one ``<protein_id>.txt`` per entry.

    Files that already exist are treated as cached and are neither re-fetched
    nor rate-limited, so re-running the cell after a partial download is cheap.

    Parameters
    ----------
    protein_ids : list of str
        Identifiers substituted into ``url``. Empty / whitespace-only entries
        are skipped.
    data_dir : Path
        Target directory; created if it does not exist.
    url : str
        URL template containing a ``{protein_id}`` placeholder.
    delay : float
        Seconds to sleep after each real HTTP request (not after cache hits).
    timeout : float
        Per-request timeout in seconds, passed to ``requests.get``.

    Notes
    -----
    A non-successful HTTP status produces a warning and no file is written;
    otherwise the error body would be stored and later mistaken for a valid
    cached entry. Connection-level errors (including timeouts) propagate to
    the caller.
    """
    import warnings  # local import: only needed to report failed downloads

    data_dir = Path(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    for protein in protein_ids:
        if not protein or not protein.strip():
            continue  # a blank ID would request ".../uniprotkb/.txt"

        filepath = data_dir / f"{protein}.txt"
        if filepath.exists():
            continue  # cached: no request was made, so no need to throttle

        download_url = url.format(protein_id=protein)
        r = requests.get(download_url, timeout=timeout)

        if r.ok:
            # Write to a sibling temp file and rename, so an interrupted run
            # cannot leave a truncated file that would count as cached.
            tmp_path = filepath.with_name(filepath.name + ".part")
            tmp_path.write_bytes(r.content)
            tmp_path.replace(filepath)
        else:
            warnings.warn(
                f"Could not download {protein} from {download_url} "
                f"(HTTP {r.status_code}); skipping."
            )

        # Sleep to avoid overloading server
        time.sleep(delay)
            



In [2]:
from phosphosite import GAMMA_OXYGEN_CODES

def _select_motif_rows(
    df: pd.DataFrame,
    triplet: str,
    nn: str,
    ref_atom: Optional[str] = None,
    phosphosite: Optional[bool] = None,
) -> pd.DataFrame:
    """Return the rows of ``df`` matching a (prev, res, next) triplet and neighbour ``nn``.

    Shared filter behind ``get_proteins_from_motif`` and ``get_sites_from_motif``.
    The input frame is never modified; the returned frame is a new object whose
    ``prev``, ``res``, ``next`` and ``nn_res`` columns hold only the first
    character of the original values (that first character is what is matched).
    """
    # Filter by ref_atom
    if ref_atom is not None:
        if isinstance(ref_atom, str):
            if ref_atom == "CA":
                atom_codes = ["CA"]
            elif ref_atom == "oxygen":
                atom_codes = GAMMA_OXYGEN_CODES
            else:
                # Any other single atom name: wrap it, since Series.isin()
                # rejects a bare string.
                atom_codes = [ref_atom]
        else:
            # Already a collection of atom names; use as-is.
            atom_codes = ref_atom
        df = df[df["ref_atom"].isin(atom_codes)]

    # Filter by phosphosite
    if phosphosite is not None:
        df = df[df["phos"] == phosphosite]

    # Reduce the residue columns to their first character on a *new* frame,
    # so the caller's DataFrame is left untouched.
    residue_cols = ["prev", "res", "next", "nn_res"]
    df = df.assign(**{col: df[col].str[0] for col in residue_cols})

    mask = (
        (df["prev"] == triplet[0])
        & (df["res"] == triplet[1])
        & (df["nn_res"] == nn)
    )
    # A two-character triplet leaves "next" unconstrained.
    if len(triplet) > 2:
        mask &= (df["next"] == triplet[2])

    return df[mask]


def get_proteins_from_motif(
    df: pd.DataFrame,
    triplet: str,
    nn: str,
    ref_atom: Optional[str] = None,
    phosphosite: Optional[bool] = None,
) -> List[str]:
    """Return the list of proteins that contain a given motif.

    Parameters
    ----------
    df : pd.DataFrame
        Must provide the columns ``protein_id``, ``prev``, ``res``, ``next``
        and ``nn_res``; ``ref_atom`` / ``phos`` are needed only when the
        corresponding filter is used. ``df`` is not modified.
    triplet : str
        Characters matched against the first character of ``prev``, ``res``
        and ``next`` respectively. If only two characters are given, ``next``
        is left unconstrained.
    nn : str
        Character matched against the first character of ``nn_res``.
    ref_atom : str, optional
        ``"CA"`` keeps rows whose ``ref_atom`` is ``"CA"``; ``"oxygen"`` keeps
        rows whose ``ref_atom`` is in ``GAMMA_OXYGEN_CODES``; any other string
        is matched literally. ``None`` disables the filter.
    phosphosite : bool, optional
        If given, keep only rows where ``phos`` equals this value.

    Returns
    -------
    list of str
        Unique ``protein_id`` values, in order of first appearance.
    """
    hits = _select_motif_rows(
        df, triplet, nn, ref_atom=ref_atom, phosphosite=phosphosite
    )
    return hits["protein_id"].unique().tolist()


def get_sites_from_motif(
    df: pd.DataFrame,
    triplet: str,
    nn: str,
    ref_atom: Optional[str] = None,
    phosphosite: Optional[bool] = None,
) -> List[Tuple]:
    """Return the list of sites (protein id, site) that contain a given motif.

    Takes the same arguments and applies the same filtering as
    ``get_proteins_from_motif`` (``df`` is not modified), and additionally
    requires a ``pos`` column.

    Returns
    -------
    list of tuple
        One ``(protein_id, res, pos)`` tuple per matching row, where ``res``
        is the first character of the original ``res`` value. Rows are not
        de-duplicated.
    """
    hits = _select_motif_rows(
        df, triplet, nn, ref_atom=ref_atom, phosphosite=phosphosite
    )
    sites = hits[["protein_id", "res", "pos"]]

    # Return list of tuples (protein_id, res, pos)
    return list(sites.itertuples(index=False, name=None))



In [5]:
# Read the UniProtKB identifiers to fetch, one per line. Surrounding whitespace
# is stripped and blank lines are dropped so they are not sent to the server
# as (invalid) identifiers.
with open("uniprot_kb.txt", "r") as f:
    uniprot_kb = [line.strip() for line in f.read().splitlines() if line.strip()]

# Download all uniprotkb ids
download_uniprot_data(uniprot_kb)