In [39]:
import pandas as pd
import numpy as np
import requests
import sys
import regex as re

In [6]:
# Load the combined cofactor annotation table from disk.
cofactors = pd.read_csv("data/cofactors_mapped_combined.csv")

# Collapse the table to one row per value of the "Bgee" column (Ensembl-style
# gene IDs in the displayed output); every other selected column becomes a
# list holding all values seen for that ID.
cofactors_unique = (
    cofactors[
        [
            "Gene Name",
            "Complex",
            "Subcomplex or Module",
            "Own-complex paralog",
            "Other-complex Paralogues",
            "UniprotID",
            "Gene Names",
            "Bgee",
            "GeneID",
        ]
    ]
    .groupby("Bgee")
    .agg(list)
)
cofactors_unique.head()

Unnamed: 0_level_0,Gene Name,Complex,Subcomplex or Module,Own-complex paralog,Other-complex Paralogues,UniprotID,Gene Names,GeneID
Bgee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ENSG00000005075,[POLR2J],[RNA Pol II],[nan],[nan],[nan],[P52435],[POLR2J POLR2J1],[5439.0]
ENSG00000005339,[CREBBP],[P300-CBP],[CBP],[nan],[nan],[Q92793],[CREBBP CBP],[1387.0]
ENSG00000006712,[PAF1],[Elongation],[nan],[nan],[nan],[Q8N7H5],[PAF1 PD2],[54623.0]
ENSG00000008838,[MED24],[Mediator],[[Tail]],[nan],[nan],[O75448],[MED24 ARC100 CRSP4 DRIP100 KIAA0130 THRAP4 TR...,[9862.0]
ENSG00000011007,[ELOA],[Elongation],[nan],[nan],[nan],[Q14241],[ELOA TCEB3 MSTP059],[6924.0]


In [20]:
def get_proteinatlas_specificity(ensg, timeout=60):
    """Download the RNA specificity columns for one gene from the Human Protein Atlas.

    The request goes to the ``search_download.php`` endpoint with a fixed set
    of column codes (``g``, ``eg``, ``rnats``, ..., ``Brain_sn_RNA__tau``) and
    asks for uncompressed JSON.

    Parameters
    ----------
    ensg : str
        Search term substituted into the ``search=`` query parameter; this
        notebook passes Ensembl gene IDs.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.
        Defaults to 60.

    Returns
    -------
    object
        The decoded JSON body on success. On *any* failure (connection error,
        timeout, HTTP error status, invalid JSON) the exception and ``ensg``
        are printed and ``np.nan`` is returned instead of raising.
    """
    try:
        server = "https://www.proteinatlas.org/api/search_download.php?search={0}&format=json&columns=g,eg,rnats,rnatd,rnatss,rnatsm,rnascs,rnascd,rnascss,rnascsm,rnasnbs,rnasnbd,rnasnbss,rnasnbsm,t_RNA__tau,blood_RNA__tau,brain_RNA__tau,sc_RNA__tau,Brain_sn_RNA__tau&compress=no".format(ensg)

        # Without a timeout a stalled connection would block the notebook forever.
        r = requests.get(server, headers={ "Content-Type" : "application/json"}, timeout=timeout)
        # Raises for 4xx/5xx responses and is a no-op otherwise, so no separate
        # ``r.ok`` check (or process exit) is needed.
        r.raise_for_status()

        return r.json()

    except Exception as e:
        print(e, ensg)
        # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works on every version.
        return np.nan


In [47]:
# Column table for the single-cell RNA data: each line holds the
# human-readable column title followed by the identifier the download API
# expects (the part starting with "sc_").
labels = """
Single Cell Type RNA - Adipocytes [nTPM]	sc_RNA_Adipocytes
Single Cell Type RNA - Alveolar cells type 1 [nTPM]	sc_RNA_Alveolar_cells_type_1
Single Cell Type RNA - Alveolar cells type 2 [nTPM]	sc_RNA_Alveolar_cells_type_2
Single Cell Type RNA - Astrocytes [nTPM]	sc_RNA_Astrocytes
Single Cell Type RNA - B-cells [nTPM]	sc_RNA_B-cells
Single Cell Type RNA - Basal keratinocytes [nTPM]	sc_RNA_Basal_keratinocytes
Single Cell Type RNA - Basal prostatic cells [nTPM]	sc_RNA_Basal_prostatic_cells
Single Cell Type RNA - Basal respiratory cells [nTPM]	sc_RNA_Basal_respiratory_cells
Single Cell Type RNA - Basal squamous epithelial cells [nTPM]	sc_RNA_Basal_squamous_epithelial_cells
Single Cell Type RNA - Bipolar cells [nTPM]	sc_RNA_Bipolar_cells
Single Cell Type RNA - Breast glandular cells [nTPM]	sc_RNA_Breast_glandular_cells
Single Cell Type RNA - Breast myoepithelial cells [nTPM]	sc_RNA_Breast_myoepithelial_cells
Single Cell Type RNA - Cardiomyocytes [nTPM]	sc_RNA_Cardiomyocytes
Single Cell Type RNA - Cholangiocytes [nTPM]	sc_RNA_Cholangiocytes
Single Cell Type RNA - Ciliated cells [nTPM]	sc_RNA_Ciliated_cells
Single Cell Type RNA - Club cells [nTPM]	sc_RNA_Club_cells
Single Cell Type RNA - Collecting duct cells [nTPM]	sc_RNA_Collecting_duct_cells
Single Cell Type RNA - Cone photoreceptor cells [nTPM]	sc_RNA_Cone_photoreceptor_cells
Single Cell Type RNA - Cytotrophoblasts [nTPM]	sc_RNA_Cytotrophoblasts
Single Cell Type RNA - dendritic cells [nTPM]	sc_RNA_dendritic_cells
Single Cell Type RNA - Distal enterocytes [nTPM]	sc_RNA_Distal_enterocytes
Single Cell Type RNA - Distal tubular cells [nTPM]	sc_RNA_Distal_tubular_cells
Single Cell Type RNA - Ductal cells [nTPM]	sc_RNA_Ductal_cells
Single Cell Type RNA - Early spermatids [nTPM]	sc_RNA_Early_spermatids
Single Cell Type RNA - Endometrial stromal cells [nTPM]	sc_RNA_Endometrial_stromal_cells
Single Cell Type RNA - Endothelial cells [nTPM]	sc_RNA_Endothelial_cells
Single Cell Type RNA - Enteroendocrine cells [nTPM]	sc_RNA_Enteroendocrine_cells
Single Cell Type RNA - Erythroid cells [nTPM]	sc_RNA_Erythroid_cells
Single Cell Type RNA - Excitatory neurons [nTPM]	sc_RNA_Excitatory_neurons
Single Cell Type RNA - Exocrine glandular cells [nTPM]	sc_RNA_Exocrine_glandular_cells
Single Cell Type RNA - Extravillous trophoblasts [nTPM]	sc_RNA_Extravillous_trophoblasts
Single Cell Type RNA - Fibroblasts [nTPM]	sc_RNA_Fibroblasts
Single Cell Type RNA - Gastric mucus-secreting cells [nTPM]	sc_RNA_Gastric_mucus-secreting_cells
Single Cell Type RNA - Glandular and luminal cells [nTPM]	sc_RNA_Glandular_and_luminal_cells
Single Cell Type RNA - granulocytes [nTPM]	sc_RNA_granulocytes
Single Cell Type RNA - Granulosa cells [nTPM]	sc_RNA_Granulosa_cells
Single Cell Type RNA - Hepatocytes [nTPM]	sc_RNA_Hepatocytes
Single Cell Type RNA - Hofbauer cells [nTPM]	sc_RNA_Hofbauer_cells
Single Cell Type RNA - Horizontal cells [nTPM]	sc_RNA_Horizontal_cells
Single Cell Type RNA - Inhibitory neurons [nTPM]	sc_RNA_Inhibitory_neurons
Single Cell Type RNA - Intestinal goblet cells [nTPM]	sc_RNA_Intestinal_goblet_cells
Single Cell Type RNA - Ionocytes [nTPM]	sc_RNA_Ionocytes
Single Cell Type RNA - Kupffer cells [nTPM]	sc_RNA_Kupffer_cells
Single Cell Type RNA - Langerhans cells [nTPM]	sc_RNA_Langerhans_cells
Single Cell Type RNA - Late spermatids [nTPM]	sc_RNA_Late_spermatids
Single Cell Type RNA - Leydig cells [nTPM]	sc_RNA_Leydig_cells
Single Cell Type RNA - Lymphatic endothelial cells [nTPM]	sc_RNA_Lymphatic_endothelial_cells
Single Cell Type RNA - Macrophages [nTPM]	sc_RNA_Macrophages
Single Cell Type RNA - Melanocytes [nTPM]	sc_RNA_Melanocytes
Single Cell Type RNA - Mesothelial cells [nTPM]	sc_RNA_Mesothelial_cells
Single Cell Type RNA - Microglial cells [nTPM]	sc_RNA_Microglial_cells
Single Cell Type RNA - monocytes [nTPM]	sc_RNA_monocytes
Single Cell Type RNA - Mucus glandular cells [nTPM]	sc_RNA_Mucus_glandular_cells
Single Cell Type RNA - Muller glia cells [nTPM]	sc_RNA_Muller_glia_cells
Single Cell Type RNA - NK-cells [nTPM]	sc_RNA_NK-cells
Single Cell Type RNA - Oligodendrocyte precursor cells [nTPM]	sc_RNA_Oligodendrocyte_precursor_cells
Single Cell Type RNA - Oligodendrocytes [nTPM]	sc_RNA_Oligodendrocytes
Single Cell Type RNA - Oocytes [nTPM]	sc_RNA_Oocytes
Single Cell Type RNA - Ovarian stromal cells [nTPM]	sc_RNA_Ovarian_stromal_cells
Single Cell Type RNA - Pancreatic endocrine cells [nTPM]	sc_RNA_Pancreatic_endocrine_cells
Single Cell Type RNA - Paneth cells [nTPM]	sc_RNA_Paneth_cells
Single Cell Type RNA - Peritubular cells [nTPM]	sc_RNA_Peritubular_cells
Single Cell Type RNA - Plasma cells [nTPM]	sc_RNA_Plasma_cells
Single Cell Type RNA - Prostatic glandular cells [nTPM]	sc_RNA_Prostatic_glandular_cells
Single Cell Type RNA - Proximal enterocytes [nTPM]	sc_RNA_Proximal_enterocytes
Single Cell Type RNA - Proximal tubular cells [nTPM]	sc_RNA_Proximal_tubular_cells
Single Cell Type RNA - Rod photoreceptor cells [nTPM]	sc_RNA_Rod_photoreceptor_cells
Single Cell Type RNA - Salivary duct cells [nTPM]	sc_RNA_Salivary_duct_cells
Single Cell Type RNA - Schwann cells [nTPM]	sc_RNA_Schwann_cells
Single Cell Type RNA - Secretory cells [nTPM]	sc_RNA_Secretory_cells
Single Cell Type RNA - Serous glandular cells [nTPM]	sc_RNA_Serous_glandular_cells
Single Cell Type RNA - Sertoli cells [nTPM]	sc_RNA_Sertoli_cells
Single Cell Type RNA - Skeletal myocytes [nTPM]	sc_RNA_Skeletal_myocytes
Single Cell Type RNA - Smooth muscle cells [nTPM]	sc_RNA_Smooth_muscle_cells
Single Cell Type RNA - Spermatocytes [nTPM]	sc_RNA_Spermatocytes
Single Cell Type RNA - Spermatogonia [nTPM]	sc_RNA_Spermatogonia
Single Cell Type RNA - Squamous epithelial cells [nTPM]	sc_RNA_Squamous_epithelial_cells
Single Cell Type RNA - Suprabasal keratinocytes [nTPM]	sc_RNA_Suprabasal_keratinocytes
Single Cell Type RNA - Syncytiotrophoblasts [nTPM]	sc_RNA_Syncytiotrophoblasts
Single Cell Type RNA - T-cells [nTPM]	sc_RNA_T-cells
Single Cell Type RNA - Undifferentiated cells [nTPM]	sc_RNA_Undifferentiated_cells
"""

# Keep only the API identifiers (everything from "sc_" to the end of each
# line) and join them into the comma-separated form used in the `columns=`
# query parameter.
sclabels = ",".join(hit.group(1) for hit in re.finditer("(sc_.*)", labels))

def get_proteinatlas_scRNA(ensg, timeout=60):
    """Download the single-cell-type RNA columns for one gene from the Human Protein Atlas.

    The request goes to the ``search_download.php`` endpoint and asks for the
    ``g`` and ``eg`` columns plus every column named in the module-level
    ``sclabels`` string, as uncompressed JSON.

    Parameters
    ----------
    ensg : str
        Search term substituted into the ``search=`` query parameter; this
        notebook passes Ensembl gene IDs.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned.
        Defaults to 60.

    Returns
    -------
    object
        The decoded JSON body on success. On *any* failure (connection error,
        timeout, HTTP error status, invalid JSON) the exception and ``ensg``
        are printed and ``np.nan`` is returned instead of raising.
    """
    try:
        server = "https://www.proteinatlas.org/api/search_download.php?search={0}&format=json&columns=g,eg,{1}&compress=no".format(ensg, sclabels)

        # Without a timeout a stalled connection would block the notebook forever.
        r = requests.get(server, headers={ "Content-Type" : "application/json"}, timeout=timeout)
        # Raises for 4xx/5xx responses and is a no-op otherwise, so no separate
        # ``r.ok`` check (or process exit) is needed.
        r.raise_for_status()

        return r.json()

    except Exception as e:
        print(e, ensg)
        # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works on every version.
        return np.nan

In [43]:
def generate_proteinatlaslist(lst, apifunc, filename):
    """Query ``apifunc`` for every entry of ``lst`` and save the combined table as CSV.

    Parameters
    ----------
    lst : iterable
        Identifiers handed to ``apifunc`` one at a time.
    apifunc : callable
        Called as ``apifunc(identifier)``. It should return something
        ``pd.DataFrame`` accepts, or ``None`` / a scalar NaN to signal that
        the lookup failed (the fetch helpers in this notebook return NaN).
    filename : str or path-like or file object
        Destination passed to ``DataFrame.to_csv``. When it is a path, missing
        parent directories are created first.

    Returns
    -------
    pandas.DataFrame
        All per-identifier frames stacked in input order, each keeping its
        own row index. Empty when nothing could be retrieved.
    """
    import os

    frames = []
    failed = []
    for ensg in lst:
        result = apifunc(ensg)
        # A failed lookup is a bare None/NaN, which pd.DataFrame cannot wrap;
        # skip it so one bad identifier does not abort the whole run.
        if result is None or (isinstance(result, float) and result != result):
            failed.append(ensg)
            continue
        frames.append(pd.DataFrame(result))

    # Concatenate once at the end: growing the frame inside the loop copies
    # all accumulated rows on every iteration.
    maindf = pd.concat(frames) if frames else pd.DataFrame({})

    if failed:
        print("No data retrieved for {0} identifier(s): {1}".format(len(failed), failed))

    # Make sure the target directory exists so the downloaded data is not
    # lost to a missing folder at the very last step.
    if isinstance(filename, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filename))
        if parent:
            os.makedirs(parent, exist_ok=True)

    maindf.to_csv(filename)
    return maindf

In [50]:
# Download Protein Atlas data for every cofactor: the index of
# cofactors_unique supplies the identifiers, and one API call is made per entry.
lst = cofactors_unique.index.tolist()
# The specificity download is currently switched off:
# cofactorspecificity = generate_proteinatlaslist(lst, get_proteinatlas_specificity, "helperdata/cofactor_specificity.csv")
cofactor_scRNA = generate_proteinatlaslist(
    lst,
    get_proteinatlas_scRNA,
    "helperdata/cofactor_scRNA.csv",
)
