In [30]:
# %%
######################################################################################
# Libraries
######################################################################################
# print("Import Libraries & Set up directory")
import numpy as np
import pandas as pd
from Bio import SeqIO
import re
import os
from collections import defaultdict
from collections import Counter
import matplotlib.pyplot as plt

folder_path = "/Users/nguyjust/Library/CloudStorage/OneDrive-OregonHealth&ScienceUniversity/ubsite/"

### UniProt Reference File

Downloaded 8/29/2023

In [2]:
# %%
######################################################################################
# Read in UniProt Reference File
######################################################################################
# Data read in
fasta_seqs = list(SeqIO.parse(open(folder_path + 'raw_data/idmapping_2023_08_29.fasta'), 'fasta'))

#print("Read in fasta file")

## Clean fasta ID names for quicker query
for ii in range(len(fasta_seqs)):
    fasta_seqs[ii].id = fasta_seqs[ii].id.split("|")[1] 
    
fasta_seq_df = pd.DataFrame(columns = ['id', 'seq'])

for ii in range(len(fasta_seqs)):
    fasta_seq_df.loc[len(fasta_seq_df.index)] = [fasta_seqs[ii].id, fasta_seqs[ii].seq]
    #fasta_seq_df.append({'id': fasta_seqs[0].id, 'seq': fasta_seqs[0].seq}, ignore_index=True)

fasta_seq_df['seq'] = fasta_seq_df['seq'].astype(str)



73726

In [8]:
print(f"number of sequences: {len(fasta_seq_df)}")

print(f"FASTA original colummns: {fasta_seq_df.columns}")

number of sequences: 73726
FASTA original colummns: Index(['id', 'seq'], dtype='object')


### PhosphoSitePlus 

Website: 
https://www.phosphosite.org/staticDownloadshttps://www.phosphosite.org/staticDownloads

Downloaded: 
Ubiquitination_site_dataset.gz (2MB)

Last Modified: 
Fri May 17 09:42:44 EDT 2024

Date of Download: 
05/21/2024 09:58 PST

In [14]:
# %%
######################################################################################
# PhosphoSitePlus
######################################################################################

## Read in the PhosphoSite Dataset
data_psp = pd.read_csv(folder_path + '/raw_data/PSP/Ubiquitination_site_dataset',
                       sep="\t", low_memory=False, skiprows=[0, 1, 2])

## Look at the columns to check file integrity
print(f"PSP original colummns: {data_psp.columns}")

## First row
print(data_psp.iloc[0])

PSP original colummns: Index(['GENE', 'PROTEIN', 'ACC_ID', 'HU_CHR_LOC', 'MOD_RSD', 'SITE_GRP_ID',
       'ORGANISM', 'MW_kD', 'DOMAIN', 'SITE_+/-7_AA', 'LT_LIT', 'MS_LIT',
       'MS_CST', 'CST_CAT#', 'Ambiguous_Site'],
      dtype='object')
GENE                        YWHAB
PROTEIN               14-3-3 beta
ACC_ID                     P31946
HU_CHR_LOC               20q13.12
MOD_RSD                     K5-ub
SITE_GRP_ID              41481400
ORGANISM                    human
MW_kD                       28.08
DOMAIN                        NaN
SITE_+/-7_AA      ___MtMDksELVQkA
LT_LIT                        NaN
MS_LIT                        4.0
MS_CST                        NaN
CST_CAT#                      NaN
Ambiguous_Site                  0
Name: 0, dtype: object


In [15]:
## Data with selected data columns
## PROTEIN, ACC_ID, ORGANISM, MOD_RSD, SITE_+/-7_AA
data_sel_psp = data_psp[['PROTEIN', 'ACC_ID',
                         'ORGANISM', 'MOD_RSD', 'SITE_+/-7_AA']].copy()

## Check that correct columns is copied
data_sel_psp.head()

Unnamed: 0,PROTEIN,ACC_ID,ORGANISM,MOD_RSD,SITE_+/-7_AA
0,14-3-3 beta,P31946,human,K5-ub,___MtMDksELVQkA
1,14-3-3 beta,P31946,human,K11-ub,DksELVQkAkLAEQA
2,14-3-3 beta,Q9CQV8,mouse,K11-ub,DksELVQkAkLAEQA
3,14-3-3 beta,P35213,rat,K11-ub,DkSELVQkAkLAEQA
4,14-3-3 beta,P31946,human,K13-ub,sELVQkAkLAEQAER


In [18]:
## Clean the Ub location column
## ie: originally Kxx-ub, but just need to capture the location number for the modified site
data_sel_psp['ub_mod_loc'] = data_sel_psp.MOD_RSD.str.extract('(\d+)')
## Drop the old modified site column
data_sel_psp = data_sel_psp.drop(['MOD_RSD'], axis=1)

## Check that only numbers were captured in the location
data_sel_psp.head()

Unnamed: 0,PROTEIN,ACC_ID,ORGANISM,SITE_+/-7_AA,ub_mod_loc
0,14-3-3 beta,P31946,human,___MtMDksELVQkA,5
1,14-3-3 beta,P31946,human,DksELVQkAkLAEQA,11
2,14-3-3 beta,Q9CQV8,mouse,DksELVQkAkLAEQA,11
3,14-3-3 beta,P35213,rat,DkSELVQkAkLAEQA,11
4,14-3-3 beta,P31946,human,sELVQkAkLAEQAER,13


In [29]:
## Renaming column names
data_sel_psp = data_sel_psp.rename(columns={"PROTEIN": "protein",
                           "ACC_ID": "uniprot_id", 
                           "ORGANISM": "organism",
                           "SITE_+/-7_AA" : "sequence"})

data_sel_psp.head()

Unnamed: 0,protein,uniprot_id,organism,sequence,ub_mod_loc
0,14-3-3 beta,P31946,human,___MtMDksELVQkA,5
1,14-3-3 beta,P31946,human,DksELVQkAkLAEQA,11
2,14-3-3 beta,Q9CQV8,mouse,DksELVQkAkLAEQA,11
3,14-3-3 beta,P35213,rat,DkSELVQkAkLAEQA,11
4,14-3-3 beta,P31946,human,sELVQkAkLAEQAER,13


In [21]:
## Look at the unique species
print(set(data_sel_psp['organism']))

## Number of unique species
print(f"Number of unique species in PSP: {len(set(data_sel_psp['organism']))}")

{'mouse', 'chicken', 'SARSCoV2', 'cow', 'SARSCoV1', 'duck', 'pig', 'human', 'rat'}
Number of unique species in PSP: 9


** trying to clean up species name, but can do later
mouse - Mus musculus

chicken - Gallus gallus

SARSCoV2 - 

cow - Bos taurus

SARSCoV1 - 

duck - Anas platyrhynchos

pig - Sus scrofa

human - Homo sapiens

rat - Rattus norvegicus

In [48]:
len_psp = len(data_sel_psp['uniprot_id'])
psp_uniprot_unique = list(set(data_sel_psp["uniprot_id"]))

print(f'Total PSP observations: {len_psp}')
print(f"Total unique PSP proteins: {len(psp_uniprot_unique)}")

Total PSP observations: 126329
Total unique PSP proteins: 19917


In [52]:
## Save UniProt IDs to file to download full information from UniProt
np.savetxt(folder_path + "data/psp_uniprot_ids.tsv",
           psp_uniprot_unique,
           delimiter="\t",
           fmt='% s')

In [31]:


for ii in range(len(data_sel_psp)):
    if data_sel_psp['SITE_+/-7_AA'][ii][7] == "k":
        # data_sel_psp_k = pd.concat([data_sel_psp_k, data_sel_psp.iloc[ii]])
        pass
    else:
        print(ii)
        data_sel_psp_k = data_sel_psp_k.drop(index=[ii])
# %%
data_sel_psp_k['SITE_+/-7_AA'] = data_sel_psp_k['SITE_+/-7_AA'].str.upper()

data_psp_valid_entry = data_sel_psp_k

for ii in range(len(data_sel_psp_k)):
    try:
        data = fasta_seq_df.loc[fasta_seq_df['id']
                                == data_sel_psp_k['ACC_ID'][ii]]
    # if str(data_dbptm_sel['seq'][ii]) in str(data.iloc[0]['seq']):
        s = re.sub('[^a-zA-Z]+', '', str(data_sel_psp_k['SITE_+/-7_AA'][ii]))
        if s in str(data.iloc[0]['seq']):
            pass
        else:
            data_psp_valid_entry = data_psp_valid_entry.drop(index=[ii])
            print(ii)
    except:
        pass
# %%

# Clean up headers and write out
data_psp_valid_entry.columns = [x.lower()
                                for x in data_psp_valid_entry.columns]
data_psp_valid_entry = data_psp_valid_entry.rename(
    columns={"site_+/-7_aa": "seq"})
# %%
# Write out master info
# data_psp_valid_entry.to_csv(
#    folder_path + '/data/psp_info.tsv', sep='\t', index=False)

# Master information
# data_sel_psp.to_csv(folder_path + '/data/psp_info.txt', sep = '\t', index=False)

# UniProt IDs to search for full length sequences
# data_sel_psp['acc_id'].to_csv(folder_path + '/data/psp_ids.txt', sep = '\t', index=False)

# Quick EDA
# Number of sites
len(data_psp_valid_entry.index)
data_psp_valid_entry.groupby(['acc_id']).ngroups

psp_id = pd.DataFrame(list(set(data_psp_valid_entry['acc_id'])))
psp_id.to_csv(folder_path + '/data/psp_id.txt', sep='\t', index=False)

KeyError: 'SITE_+/-7_AA'