In [1]:
import json
import math
import os
import pickle
import re

import pandas as pd
import requests

In [2]:
# URL templates for the ProteomicsDB OData endpoints used by call(). Every "{}"
# slot is filled positionally, in order of appearance.
_TISSUE_LIST_URL = "https://www.proteomicsdb.org/proteomicsdb/logic/api/tissuelist.xsodata/CA_AVAILABLEBIOLOGICALSOURCES_API?$select=TISSUE_ID,TISSUE_NAME,TISSUE_GROUP_NAME,TISSUE_CATEGORY,SCOPE_ID,SCOPE_NAME,QUANTIFICATION_METHOD_ID,QUANTIFICATION_METHOD_NAME,MS_LEVEL&$format=json"
_PROTEINS_PER_TISSUE_URL = "https://www.proteomicsdb.org/proteomicsdb/logic/api/proteinspertissue.xsodata/InputParams(TISSUE_ID='{}',CALCULATION_METHOD={},SWISSPROT_ONLY={},NO_ISOFORM={},TAXCODE={})/Results?$select=ENTRY_NAME,UNIQUE_IDENTIFIER,DATABASE,PROTEIN_DESCRIPTION,PEPTIDES,TISSUE_ID,SAMPLE_NAME,SAMPLE_DESCRIPTION,UNNORMALIZED_EXPRESSION,NORMALIZED_EXPRESSION&$format=json"
_PROTEIN_EXPRESSION_URL = "https://www.proteomicsdb.org/proteomicsdb/logic/api/proteinexpression.xsodata/InputParams(PROTEINFILTER='{}',MS_LEVEL={},TISSUE_CATEGORY_SELECTION='',TISSUE_ID_SELECTION='',SCOPE_SELECTION={},CALCULATION_METHOD={},GROUP_BY_TISSUE={},EXP_ID={})/Results?$select=UNIQUE_IDENTIFIER,TISSUE_ID,TISSUE_NAME,TISSUE_SAP_SYNONYM,SAMPLE_ID,SAMPLE_NAME,AFFINITY_PURIFICATION,EXPERIMENT_ID,EXPERIMENT_NAME,EXPERIMENT_SCOPE,EXPERIMENT_SCOPE_NAME,PROJECT_ID,PROJECT_NAME,PROJECT_STATUS,UNNORMALIZED_INTENSITY,NORMALIZED_INTENSITY,MIN_NORMALIZED_INTENSITY,MAX_NORMALIZED_INTENSITY,SAMPLES&$format=json"

# mode -> (URL template, number of leading `additional_info` items consumed,
#          fixed values appended after them).
_API_REQUESTS = {
    "tissuelist": (_TISSUE_LIST_URL, 0, ()),
    "proteinlist": (_PROTEINS_PER_TISSUE_URL, 5, ()),
    "proteinexp": (_PROTEIN_EXPRESSION_URL, 6, ()),
    # Same endpoint as "proteinlist", with CALCULATION_METHOD=0, SWISSPROT_ONLY=1,
    # NO_ISOFORM=1 and TAXCODE=9606 fixed.
    "proteinspertissue": (_PROTEINS_PER_TISSUE_URL, 1, (0, 1, 1, 9606)),
}


def call(mode, additional_info=None, timeout=300):
    """Query one ProteomicsDB API endpoint and return its result rows.

    Parameters
    ----------
    mode : str
        Which endpoint to query: "tissuelist", "proteinlist", "proteinexp"
        or "proteinspertissue".
    additional_info : sequence, optional
        Values substituted positionally into the endpoint URL:
        - "tissuelist": not used.
        - "proteinlist": TISSUE_ID, CALCULATION_METHOD, SWISSPROT_ONLY,
          NO_ISOFORM, TAXCODE.
        - "proteinexp": PROTEINFILTER, MS_LEVEL, SCOPE_SELECTION,
          CALCULATION_METHOD, GROUP_BY_TISSUE, EXP_ID.
        - "proteinspertissue": TISSUE_ID.
        Items beyond those listed are ignored.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the notebook indefinitely.

    Returns
    -------
    The value stored under ``["d"]["results"]`` in the JSON response body.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported names.
    IndexError, TypeError
        If ``additional_info`` does not provide the values the mode needs.
    requests.HTTPError
        If the server answers with an error status code.
    """
    if mode not in _API_REQUESTS:
        raise ValueError(
            "Unknown mode {!r}; expected one of {}".format(mode, sorted(_API_REQUESTS))
        )
    url_template, n_values, fixed_values = _API_REQUESTS[mode]

    # Index explicitly so that too-short input raises IndexError and a missing
    # `additional_info` raises TypeError.
    values = [additional_info[i] for i in range(n_values)]
    url = url_template.format(*(values + list(fixed_values)))

    response = requests.get(url=url, timeout=timeout)
    # Surface HTTP errors directly instead of failing later while decoding an
    # error page as JSON.
    response.raise_for_status()
    return json.loads(response.text)["d"]["results"]
    

In [3]:
# Fetch every available biological source, then keep the IDs of the rows whose
# category is exactly "cell line".
tissuelist = pd.DataFrame.from_dict(call("tissuelist"))
is_cell_line = tissuelist["TISSUE_CATEGORY"] == "cell line"
cell_line_tissue_id = tissuelist.loc[is_cell_line, "TISSUE_ID"].to_list()

In [4]:
# Lookup table TISSUE_ID -> TISSUE_NAME; for a repeated ID the last row wins.
tissue_id_to_name_dict = {
    tissue_id: tissue_name
    for tissue_id, tissue_name in zip(tissuelist["TISSUE_ID"], tissuelist["TISSUE_NAME"])
}

In [5]:
# Header-less CSV: the first column supplies the keys and the second column the
# values of the lookup table (keys are later looked up by TISSUE_ID).
tissue_id_to_description = pd.read_csv(
    "BTO_Organ_dict.csv",
    header=None,
    encoding="ISO-8859-1",
)
tissue_id_to_description_dict = {
    key: description
    for key, description in zip(
        tissue_id_to_description.iloc[:, 0],
        tissue_id_to_description.iloc[:, 1],
    )
}

# Download Protein list

In [6]:
# Collect the protein identifiers reported for every cell-line tissue.
protein_list = []
tissue_frames = []

# Sort the de-duplicated IDs so the download order (and therefore the row order
# of protein_df) is the same on every run.
for tissue in sorted(set(cell_line_tissue_id), key=str):
    try:
        tmp_proteinlist_df = pd.DataFrame.from_dict(call("proteinlist", [tissue, 0, 1, 1, 9606]))
    except (OSError, ValueError, KeyError, TypeError) as error:
        # Best effort: report the tissue and the reason, then carry on.
        # requests' exceptions derive from OSError; JSON decoding errors derive
        # from ValueError; a payload without "d"/"results" raises KeyError or
        # TypeError.
        print(tissue, "skipped:", repr(error))
        continue

    # An empty result list produces a frame without any columns, so there is
    # nothing to collect for this tissue.
    if "UNIQUE_IDENTIFIER" not in tmp_proteinlist_df.columns:
        print(tissue, "skipped: no proteins returned")
        continue

    protein_list.extend(tmp_proteinlist_df["UNIQUE_IDENTIFIER"].unique().tolist())
    tissue_frames.append(tmp_proteinlist_df)

# Concatenate once instead of growing the frame inside the loop.
protein_df = pd.concat(tissue_frames) if tissue_frames else pd.DataFrame()

unique_uniprot_protein_list = sorted(set(protein_list), key=str)

BTO:0004136
PDB:200014
BTO:0002418
PDB:200028
BTO:0004479
BTO:0001932
PDB:200022
BTO:0002026
PDB:200011
PDB:200029
PDB:200009
BTO:0003981
BTO:0001370
BTO:0002181
PDB:200012
BTO:0001948
BTO:0004440
PO:0000009
PDB:200013
PDB:200021
BTO:0003722
BTO:0000225
PDB:200007
PDB:200015
PDB:200027
PO:0000008
PDB:200020
PDB:200026
PDB:200017
BTO:0000793
PDB:200024
PDB:200025
BTO:0003775
PDB:200019
BTO:0003076
PDB:200010
BTO:0005102
PDB:200023
BTO:0000568
PDB:200016
PDB:200018
BTO:0002025
PDB:200008
PDB:200030
BTO:0003774
BTO:0000941


# Download proteins per tissue 

In [7]:
# Reset the identifier accumulator; nothing in this cell appends to it.
protein_list = []

# call("proteinspertissue", [tissue]) builds the same URL as
# call("proteinlist", [tissue, 0, 1, 1, 9606]). Unlike the protein-list
# download, errors are not caught here, so a failed request stops the cell.
tissue_frames = []
for tissue in sorted(set(cell_line_tissue_id), key=str):  # sorted: reproducible row order
    tmp_proteinlist_df = pd.DataFrame.from_dict(call("proteinspertissue", [tissue]))
    if not tmp_proteinlist_df.empty:
        tissue_frames.append(tmp_proteinlist_df)

# Concatenate once instead of growing the frame inside the loop.
protein_df = pd.concat(tissue_frames) if tissue_frames else pd.DataFrame()


# Preprocess

In [8]:
# Convert the expression values to numbers (this overwrites the column in
# protein_df), then build an ENTRY_NAME x SAMPLE_NAME matrix rounded to 10
# decimal places.
protein_df["NORMALIZED_EXPRESSION"] = pd.to_numeric(protein_df["NORMALIZED_EXPRESSION"])
protein_df_pivoted = (
    protein_df
    .pivot_table(index="ENTRY_NAME", columns="SAMPLE_NAME", values="NORMALIZED_EXPRESSION")
    .round(10)
)

# Drop sample columns whose values duplicate an earlier column: transpose so
# samples become rows, de-duplicate the rows, transpose back.
samples_as_rows = protein_df_pivoted.transpose()
protein_df_pivoted_drop_duplicates = samples_as_rows.drop_duplicates().transpose()

# For Enrichr 

In [9]:
# Force the expression column to float64 before it is pivoted.
normalized_expression = protein_df["NORMALIZED_EXPRESSION"]
protein_df["NORMALIZED_EXPRESSION"] = normalized_expression.astype("float64")

In [10]:
# Name of the composite label column that is added to protein_df and later used
# as the index of the pivoted expression table.
new_column_name = "TISSUE_TYPE;TISSUE_NAME;TISSUE_ID;SAMPLE_NAME"

In [11]:
# Compose one label per row: mapped description, mapped tissue name, tissue ID
# and sample description, joined by single spaces. A tissue ID missing from
# either lookup table makes the whole label NaN.
# NOTE(review): the column name lists SAMPLE_NAME with ";" separators, but the
# value uses SAMPLE_DESCRIPTION with spaces - confirm which is intended.
tissue_id_column = protein_df["TISSUE_ID"]
protein_df[new_column_name] = (
    tissue_id_column.map(tissue_id_to_description_dict)
    + " " + tissue_id_column.map(tissue_id_to_name_dict)
    + " " + tissue_id_column
    + " " + protein_df["SAMPLE_DESCRIPTION"]
)

## SHORTEN NAME

In [12]:
# Matches one of the listed special characters immediately followed by ";:"
# (the "]" closes the character class before ";:").
# NOTE(review): ";" and ":" were possibly meant to be inside the class. Every
# label embeds a tissue ID such as "BTO:0000975", so adding ":" to the class
# would flag all labels - confirm the intent before changing the pattern.
_SPECIAL_CHARACTER_PATTERN = re.compile('[)(*&^%$#@!}{?><|];:')

# A parenthesised remark, together with one optional leading space.
_PARENTHESISED_PATTERN = re.compile(r" ?\([^)]+\)")

# (old, new) substitutions, applied in exactly this order. The order matters:
# e.g. "DetectionWindow_" has to be removed before the bare "tec" is stripped.
_LABEL_REPLACEMENTS = (
    (" cell", ""),
    (" line", ""),
    ("TechRep", "Rep"),
    ("GradientTime_", ""),
    ("ColumnSize_", ""),
    ("DetectionWindow_", ""),
    ("ProteinExtractionMethods_", ""),
    ("_biorep", "_rep"),
    ("Rep.of.", ""),
    ("tecRep.of.", ""),
    ("tec", ""),
    ("X131.126_HM33.GM12005.131.126_", ""),
    ("X131.126_HM34.GM12005.131.126_", ""),
    ("_Trypsin_Proteome", ""),
    ("Lymphoblastoid lymphoblastoid", "Lymphoblastoid"),
)


def _shorten_label(label):
    """Return a shortened version of one composite sample label.

    Parenthesised remarks are removed and the substitutions listed in
    ``_LABEL_REPLACEMENTS`` are applied in order. A label that is not a string
    (e.g. NaN, when a tissue ID is missing from a lookup table) is printed and
    returned unchanged instead of raising ``TypeError``.
    """
    if not isinstance(label, str):
        print(label)
        return label

    label = _PARENTHESISED_PATTERN.sub("", label)
    if _SPECIAL_CHARACTER_PATTERN.search(label) is not None:
        print(label, "String is not accepted.")

    for old, new in _LABEL_REPLACEMENTS:
        label = label.replace(old, new)
    return label


# One shortened label per row of protein_df, in row order.
new_labels = [_shorten_label(label) for label in protein_df[new_column_name].tolist()]


In [13]:
# Overwrite the composite label column with the shortened labels. new_labels is
# a plain list built in row order from this same column, so the assignment is
# positional (no index alignment is involved).
protein_df[new_column_name] = new_labels

# PIVOT TABLE

In [14]:
# One row per shortened label, one column per protein entry name. pivot_table
# averages NORMALIZED_EXPRESSION when a (label, protein) pair occurs more than
# once.
protein_expression_df = protein_df.pivot_table(
    values="NORMALIZED_EXPRESSION",
    index=new_column_name,
    columns="ENTRY_NAME",
)

# SAVE

In [15]:
# Folder that receives every exported file. The paths below are built by plain
# string concatenation, so the trailing "/" is required.
output_folder_path = "./data/"

# Create the folder up front; writing into a missing directory would otherwise
# fail in the export cells.
os.makedirs(output_folder_path, exist_ok=True)

In [16]:
# Export the label x protein expression matrix.
pivoted_csv_path = output_folder_path + "ProteomicsDB_proteinspertissue_pivoted.csv"
protein_expression_df.to_csv(pivoted_csv_path)

In [17]:
# Export the long-format table, including the composite label column.
raw_csv_path = output_folder_path + "ProteomicsDB_raw.csv"
protein_df.to_csv(raw_csv_path)

In [18]:
# Export the distinct (ENTRY_NAME, UNIQUE_IDENTIFIER) pairs found in protein_df.
id_columns = ["ENTRY_NAME", "UNIQUE_IDENTIFIER"]
protein_id_df = protein_df[id_columns].drop_duplicates()
protein_id_csv_path = output_folder_path + "ProteomicsDB_protein_id.csv"
protein_id_df.to_csv(protein_id_csv_path)

In [19]:
# Persist the de-duplicated protein identifiers collected during the protein
# list download. `pickle` is imported in the notebook's import cell.
with open(output_folder_path + "protein_list.pkl", "wb") as pickle_file:
    pickle.dump(unique_uniprot_protein_list, pickle_file)