<a href="https://colab.research.google.com/github/innacohen/innacohen/blob/main/part_2c_protein.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import json
import re
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests

In [14]:
def View(df, rows=None, cols=None, width=None):
    """Display the head of ``df`` with pandas display options overridden.

    The overrides apply only for the duration of this call; the previous
    option values are restored when the context manager exits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to show.
    rows : int or None
        Used both as ``display.max_rows`` and as the argument to ``df.head``.
    cols : int or None
        Value for ``display.max_columns``.
    width : int or None
        Value for ``display.max_colwidth``.

    Notes
    -----
    ``display`` is not imported here; it is assumed to be the builtin that
    IPython/Jupyter injects into notebook sessions.
    """
    # Flat (option, value, option, value, ...) sequence, as option_context expects.
    overrides = (
        "display.max_rows", rows,
        "display.max_columns", cols,
        "display.max_colwidth", width,
        "display.expand_frame_repr", False,
    )
    with pd.option_context(*overrides):
        preview = df.head(rows)
        display(preview)

In [41]:
#ref: https://www.uniprot.org/api-documentation/uniprotkb
def get_protein(gene, max_results=1, timeout=30):
    """Look up reviewed UniProtKB entries for a gene symbol (organism_id 9606, human).

    Sends one GET request to the UniProtKB search endpoint, asking for the
    accession, protein name, FUNCTION comments and BINDING features, sorted by
    annotation score (descending).

    Parameters
    ----------
    gene : str
        Gene symbol to search for. It is percent-encoded before being placed
        in the query string.
    max_results : int, default 1
        Page size requested from the API.
    timeout : float, default 30
        Seconds to wait for the HTTP response before ``requests`` raises.

    Returns
    -------
    dict, list of dict, or None
        * ``max_results > 1``: a list of records (possibly empty).
        * otherwise: the first record, or ``None`` when the search returned
          no entries.
        * ``None`` whenever the HTTP status is not 200.

        Each record has the keys ``gene``, ``accession``, ``protein_name``,
        ``functions_by_isoform`` (molecule name -> list of function texts),
        ``bindings`` (descriptions of BINDING features) and ``ref`` (PubMed IDs
        cited in the function texts, de-duplicated in first-seen order).

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts).
    """
    # Encode the symbol so characters such as spaces or '&' cannot break the query.
    encoded_gene = quote(str(gene), safe="")
    url = (
        "https://rest.uniprot.org/uniprotkb/search"
        f"?query=gene%3A{encoded_gene}%20AND%20reviewed%3Atrue%20AND%20organism_id%3A9606"
        "&fields=accession%2Cprotein_name%2Ccc_function%2Cft_binding"
        "&sort=annotation_score%20desc"
        f"&size={max_results}"
    )

    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        print(f"Lookup of {gene} failed; status: {r.status_code}")
        return None

    results = r.json().get("results", [])
    parsed_results = []

    for entry in results:
        accession = entry.get("primaryAccession", "N/A")
        protein_name = (
            entry.get("proteinDescription", {})
            .get("recommendedName", {})
            .get("fullName", {})
            .get("value", "N/A")
        )

        functions_by_isoform = {}
        pmids = []

        for comment in entry.get("comments", []):
            if comment.get("commentType") != "FUNCTION":
                continue

            # Comments without a "molecule" field are grouped under one label.
            isoform = comment.get("molecule", "unspecified isoform")
            texts = [
                t.get("value")
                for t in comment.get("texts", [])
                if "value" in t
            ]
            for text in texts:
                pmids.extend(re.findall(r'PubMed:(\d+)', text))

            functions_by_isoform.setdefault(isoform, []).extend(texts)

        bindings = [
            f.get("description")
            for f in entry.get("features", [])
            if f.get("type") == "BINDING" and f.get("description")
        ]

        parsed_results.append({
            "gene": gene,
            "accession": accession,
            "protein_name": protein_name,
            "functions_by_isoform": functions_by_isoform,
            "bindings": bindings,
            # dict.fromkeys de-duplicates while keeping citation order, so the
            # output is reproducible (a set of strings has no stable order).
            "ref": list(dict.fromkeys(pmids)),
        })

    if max_results > 1:
        return parsed_results

    if not parsed_results:
        # An empty hit list used to raise IndexError here; report it like a
        # failed lookup instead.
        print(f"Lookup of {gene} returned no results")
        return None

    return parsed_results[0]


In [15]:
# Read the gene list CSV from GitHub and discard its "Unnamed: 0" column
# (presumably a saved row index -- confirm against the file).
gene_df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/ZhaochenYe999/CBB752_FinalProject/refs/heads/main/genelist/genelist_NonSynoymousVariants.csv"
    )
    .drop(columns=["Unnamed: 0"])
)

In [17]:
# Order genes by Mutation_count, highest first, and keep the first 11 rows;
# 11 was chosen because of a tie in the counts.
gene_df_sorted = (
    gene_df
    .sort_values("Mutation_count", ascending=False)
    .iloc[:11]
)

In [42]:
# Spot-check the lookup on a single gene; the returned record is the cell output.
get_protein(gene="MYO18B")

{'gene': 'MYO18B',
 'accession': 'Q8IUG5',
 'protein_name': 'Unconventional myosin-XVIIIb',
 'functions_by_isoform': {'unspecified isoform': ['May be involved in intracellular trafficking of the muscle cell when in the cytoplasm, whereas entering the nucleus, may be involved in the regulation of muscle specific genes. May play a role in the control of tumor development and progression; restored MYO18B expression in lung cancer cells suppresses anchorage-independent growth']},
 'bindings': [],
 'ref': []}

In [18]:
# Show the selected genes (bare expression, rendered as the cell output).
gene_df_sorted

Unnamed: 0,Mutation_count,Gene_symbol,Chromosome
0,32,HPS4,chr22
1,21,APOL1,chr22
2,20,TRIOBP,chr22
3,19,SFI1,chr22
4,18,EFCAB6,chr22
5,16,MYO18B,chr22
6,15,ARSA,chr22
7,13,CELSR1,chr22
8,12,SUN2,chr22
9,12,SEC14L3,chr22


In [43]:
genes = gene_df_sorted["Gene_symbol"].tolist()

# Fields copied from each get_protein() record, in output column order.
PROTEIN_FIELDS = ["accession", "protein_name", "functions_by_isoform", "bindings", "ref"]

rows = []

for gene in genes:
    try:
        protein_data = get_protein(gene)
        if protein_data is None:
            # get_protein() returns None when the lookup fails; say so plainly
            # instead of tripping over a "'NoneType' is not subscriptable" error.
            print(f"Skipping {gene}: no protein record returned")
        else:
            rows.append({
                "gene": gene,
                **{field: protein_data[field] for field in PROTEIN_FIELDS},
            })
    except Exception as e:
        # Best-effort collection: report the failure and move on to the next gene.
        print(f"Error processing {gene}: {e}")
    # Pause between requests so the remote API is not hit in a tight loop.
    time.sleep(0.4)

# Explicit columns keep the frame's schema stable even if every lookup failed.
df = pd.DataFrame(rows, columns=["gene", *PROTEIN_FIELDS])

In [44]:
# Write the collected protein table to the working directory, omitting the index.
df.to_csv(path_or_buf="protein_data.csv", index=False)

In [30]:
# Render the protein table through the View helper with its default arguments.
View(df=df)

Unnamed: 0,gene,accession,protein_name,functions_by_isoform,bindings,ref
0,HPS4,Q9NQG7,BLOC-3 complex member HPS4,"{'unspecified isoform': ['Component of the BLOC-3 complex, a complex that acts as a guanine exchange factor (GEF) for RAB32 and RAB38, promotes the exchange of GDP to GTP, converting them from an inactive GDP-bound form into an active GTP-bound form. The BLOC-3 complex plays an important role in the control of melanin production and melanosome biogenesis and promotes the membrane localization of RAB32 and RAB38 (PubMed:23084991)']}",[],[23084991]
1,APOL1,O14791,Apolipoprotein L1,{'unspecified isoform': ['May play a role in lipid exchange and transport throughout the body. May participate in reverse cholesterol transport from peripheral cells to the liver']},[],[]
2,TRIOBP,Q9H2D6,TRIO and F-actin-binding protein,"{'Isoform 1': ['Regulates actin cytoskeletal organization, cell spreading and cell contraction by directly binding and stabilizing filamentous F-actin and prevents its depolymerization (PubMed:18194665, PubMed:28438837). May also serve as a linker protein to recruit proteins required for F-actin formation and turnover (PubMed:18194665). Essential for correct mitotic progression (PubMed:22820163, PubMed:24692559)'], 'Isoform 5': ['Plays a pivotal role in the formation of stereocilia rootlets'], 'Isoform 4': ['Plays a pivotal role in the formation of stereocilia rootlets']}",[],"[18194665, 28438837, 22820163, 24692559]"
3,SFI1,A8K8P3,Protein SFI1 homolog,{'unspecified isoform': ['Plays a role in the dynamic structure of centrosome-associated contractile fibers via its interaction with CETN2']},[],[]
4,EFCAB6,Q5THR3,EF-hand calcium-binding domain-containing protein 6,"{'unspecified isoform': ['Negatively regulates the androgen receptor by recruiting histone deacetylase complex, and protein DJ-1 antagonizes this inhibition by abrogation of this complex (PubMed:12612053). Microtubule inner protein (MIP) part of the dynein-decorated doublet microtubules (DMTs) in cilia axoneme, which is required for motile cilia beating (By similarity)']}",[],[12612053]
5,MYO18B,Q8IUG5,Unconventional myosin-XVIIIb,"{'unspecified isoform': ['May be involved in intracellular trafficking of the muscle cell when in the cytoplasm, whereas entering the nucleus, may be involved in the regulation of muscle specific genes. May play a role in the control of tumor development and progression; restored MYO18B expression in lung cancer cells suppresses anchorage-independent growth']}",[],[]
6,ARSA,O43681,ATPase GET3,"{'unspecified isoform': ['ATPase required for the post-translational delivery of tail-anchored (TA) proteins to the endoplasmic reticulum (PubMed:17382883). Recognizes and selectively binds the transmembrane domain of TA proteins in the cytosol. This complex then targets to the endoplasmic reticulum by membrane-bound receptors GET1/WRB and CAMLG/GET2, where the tail-anchored protein is released for insertion. This process is regulated by ATP binding and hydrolysis. ATP binding drives the homodimer towards the closed dimer state, facilitating recognition of newly synthesized TA membrane proteins. ATP hydrolysis is required for insertion. Subsequently, the homodimer reverts towards the open dimer state, lowering its affinity for the GET1-CAMLG receptor, and returning it to the cytosol to initiate a new round of targeting. May be involved in insulin signaling']}",[],[17382883]
7,CELSR1,Q9NYQ6,Cadherin EGF LAG seven-pass G-type receptor 1,{'unspecified isoform': ['Receptor that may have an important role in cell/cell signaling during nervous system formation']},[],[]
8,SUN2,Q9UH99,SUN domain-containing protein 2,"{'unspecified isoform': ['As a component of the LINC (LInker of Nucleoskeleton and Cytoskeleton) complex, involved in the connection between the nuclear lamina and the cytoskeleton. The nucleocytoplasmic interactions established by the LINC complex play an important role in the transmission of mechanical forces across the nuclear envelope and in nuclear movement and positioning. Specifically, SYNE2 and SUN2 assemble in arrays of transmembrane actin-associated nuclear (TAN) lines which are bound to F-actin cables and couple the nucleus to retrograde actin flow during actin-dependent nuclear movement. Required for interkinetic nuclear migration (INM) and essential for nucleokinesis and centrosome-nucleus coupling during radial neuronal migration in the cerebral cortex and during glial migration. Required for nuclear migration in retinal photoreceptor progenitors implicating association with cytoplasmic dynein-dynactin and kinesin motor complexes, and probably B-type lamins; SUN1 and SUN2 seem to act redundantly. The SUN1/2:KASH5 LINC complex couples telomeres to microtubules during meiosis; SUN1 and SUN2 seem to act at least partial redundantly. Anchors chromosome movement in the prophase of meiosis and is involved in selective gene expression of coding and non-coding RNAs needed for gametogenesis. Required for telomere attachment to nuclear envelope and gametogenesis. May also function on endocytic vesicles as a receptor for RAB5-GDP and participate in the activation of RAB5']}",[],[]
9,SEC14L3,Q9UDX4,SEC14-like protein 3,"{'unspecified isoform': ['Probable hydrophobic ligand-binding protein; may play a role in the transport of hydrophobic ligands like tocopherol, squalene and phospholipids']}",[],[]
