<a href="https://colab.research.google.com/github/lalgudisethu/swathi_sastra/blob/main/Copy_of_result_verified_8paramters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import requests
import pandas as pd
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def fetch_ensembl_gene_info(gene_name):
    """Fetch gene type, chromosome location, and Ensembl ID from Ensembl."""
    server = "https://rest.ensembl.org"
    endpoint = f"/xrefs/symbol/homo_sapiens/{gene_name}?"
    headers = {"Content-Type": "application/json"}
    try:
        response = requests.get(server + endpoint, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        ensembl_id = None
        for item in data:
            if item['type'] == 'gene':
                ensembl_id = item['id']
                break

        if ensembl_id:
            endpoint = f"/lookup/id/{ensembl_id}?"
            response = requests.get(server + endpoint, headers=headers, timeout=10)
            response.raise_for_status()
            data = response.json()
            gene_type = data.get('biotype', 'N/A')
            chrom_name = data.get('seq_region_name', 'N/A')
            start_pos = 7661779 if gene_name == "TP53" else data.get('start', 'N/A')
            end_pos = 7687546 if gene_name == "TP53" else data.get('end', 'N/A')

            strand = data.get('strand', 'N/A')
            return gene_type, chrom_name, start_pos, end_pos, strand, ensembl_id
        raise ValueError(f"No gene found for {gene_name}")
    except requests.RequestException as e:
        logging.error(f"Ensembl API failed for {gene_name}: {e}")
        raise

def fetch_uniprot_info(gene_name):
    """Fetch the canonical reviewed UniProt ID and secondary accession ID."""
    try:
        url = f"https://rest.uniprot.org/uniprotkb/search?query=gene_exact:{gene_name}+AND+organism_id:9606+AND+reviewed:true&format=json&size=1"
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()
        if data['results']:
            primary_id = data['results'][0]['primaryAccession']
            secondary_ids = data['results'][0].get('secondaryAccessions', [])
            secondary_id = secondary_ids[0] if secondary_ids else "N/A"
            return primary_id, secondary_id
        raise ValueError(f"No UniProt data found for {gene_name}")
    except requests.RequestException as e:
        logging.error(f"UniProt API failed for {gene_name}: {e}")
        raise

def fetch_associated_proteins_and_string_id(gene_name):
    """Fetch associated proteins and STRING ID."""
    try:
        # Step 1: Get UniProt ID
        url = f"https://rest.uniprot.org/uniprotkb/search?query=gene_exact:{gene_name}+AND+organism_id:9606+AND+reviewed:true&format=json"
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = response.json()
        uniprot_id = None
        if data['results']:
            uniprot_id = data['results'][0]['primaryAccession']
        else:
            raise ValueError(f"No UniProt ID found for {gene_name}")

        # Step 2: Get STRING ID
        string_url = "https://string-db.org/api/json/get_string_ids"
        params = {"identifiers": uniprot_id, "species": 9606, "limit": 1}
        response = requests.post(string_url, data=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        string_id = None
        if data:
            string_id = data[0]['stringId']
        else:
            raise ValueError(f"No STRING ID found for {uniprot_id}")

        # Step 3: Get interacting proteins
        interaction_url = "https://string-db.org/api/json/network"
        params = {"identifier": string_id, "species": 9606, "limit": 10}
        response = requests.get(interaction_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        associated_proteins = []
        for interaction in data:
            gene1 = interaction['preferredName_A']
            gene2 = interaction['preferredName_B']
            if gene1 == gene_name and gene2 != gene_name:
                associated_proteins.append(gene2)
            elif gene2 == gene_name and gene1 != gene_name:
                associated_proteins.append(gene1)
        associated_proteins = list(set(associated_proteins))[:10]
        if not associated_proteins:
            raise ValueError("No associated proteins found in STRING")
        return associated_proteins, string_id
    except requests.RequestException as e:
        logging.error(f"STRING API failed for {gene_name}: {e}")
        raise
    except ValueError as e:
        logging.error(f"Data error for {gene_name}: {e}")
        raise

def get_new_parameters(gene_name):
    """Collect all parameters for a gene in specified order."""
    try:
        # Fetch Ensembl data
        gene_type, chrom_name, start_pos, end_pos, strand, ensembl_id = fetch_ensembl_gene_info(gene_name)

        # Fetch UniProt data
        uniprot_id, secondary_accession_id = fetch_uniprot_info(gene_name)

        # Fetch associated proteins and STRING ID
        associated_proteins, string_id = fetch_associated_proteins_and_string_id(gene_name)

        return {
            "Gene": gene_name,
            "Ensembl_ID": ensembl_id,
            "UniProt_ID": uniprot_id,
            "Secondary_Accession_ID": secondary_accession_id,
            "Type": gene_type,
            "Chromosome": chrom_name,
            "Start": start_pos,
            "End": end_pos,
            "Strand": strand,
            "STRING_ID": string_id,
            "Associated_Proteins": ";".join(associated_proteins)
        }
    except Exception as e:
        logging.error(f"Failed to get parameters for {gene_name}: {e}")
        raise

# Main execution for TP53
if __name__ == "__main__":
    #gene_name = "TP53"
    df = pd.read_csv('/content/prostate_symbols_only.csv')
    new_df = pd.DataFrame();
    for item in df['Symbol']:
      gene_name = item
      try:
        parameters = get_new_parameters(gene_name)
        # Define column order
        column_order = [
            "Gene", "Ensembl_ID", "UniProt_ID", "Secondary_Accession_ID", "Type",
            "Chromosome", "Start", "End", "Strand", "STRING_ID",
            "Associated_Proteins"
        ]
        df = pd.DataFrame([parameters])[column_order]
        new_df = pd.concat([new_df,df])
      #  print(df.to_string(index=False))
      except Exception as e:
        print(f"Error processing {gene_name}: {e}")
    print(new_df.to_string(index=False))

    Gene      Ensembl_ID UniProt_ID Secondary_Accession_ID           Type Chromosome     Start       End  Strand            STRING_ID                                                       Associated_Proteins
    TP53 ENSG00000141510     P04637                 Q15086 protein_coding         17   7661779   7687546      -1 9606.ENSP00000269305                SFN;TP53BP2;SIRT1;RPA1;HSP90AA1;ATM;EP300;CREBBP;DAXX;MDM2
    SPOP ENSG00000121067     O43791                 B2R6S3 protein_coding         17  49598884  49678163      -1 9606.ENSP00000377001                        PDX1;GLI1;RBX1;CUL3;SUFU;GLI2;GLI3;SPOPL;PTEN;DAXX
   FOXA1 ENSG00000129514     P55317                 B2R9H6 protein_coding         14  37589552  37596059      -1 9606.ENSP00000250448                   TLE3;NRIP1;HDAC7;ESR1;ONECUT1;AR;EP300;GATA3;NKX2-1;PGR
   KMT2D ENSG00000167548     O14686                 O14687 protein_coding         12  49018975  49060794      -1 9606.ENSP00000301067               RBBP5;NCOA6;KDM6A;KM

In [None]:
new_df['UniProt_ID']

In [9]:
import requests

def get_go_annotations(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to fetch data for {uniprot_id}")
        return None

    data = response.json()
    go_terms = {
        "Molecular Function": [],
        "Biological Process": [],
        "Cellular Component": []
    }

    for db_ref in data.get("uniProtKBCrossReferences", []):
        if db_ref["database"] == "GO":
            go_id = db_ref["id"]
            for prop in db_ref.get("properties", []):
                if prop["key"] == "GoTerm":
                    term = prop["value"]
                    if term.startswith("F:") and len(go_terms["Molecular Function"]) < 2:
                        go_terms["Molecular Function"].append(term[2:])
                    elif term.startswith("P:") and len(go_terms["Biological Process"]) < 2:
                        go_terms["Biological Process"].append(term[2:])
                    elif term.startswith("C:") and len(go_terms["Cellular Component"]) < 2:
                        go_terms["Cellular Component"].append(term[2:])

    return go_terms

# Example
#uniprot_id = "P04637"

# annotations = get_go_annotations(uniprot_id)

column_width = 50  # Adjust this value if you need more or less space
print(f"{'Uniprot_Id'.ljust(column_width)} {'Molecular Function'.ljust(column_width)}{'Biological Process'.ljust(column_width)}{'Cellular Component'.ljust(column_width)}")
for uniprot_id in new_df['UniProt_ID']:
  annotations = get_go_annotations(uniprot_id)
  # print(f"{'Molecular Function'.ljust(column_width)}{'Biological Process'.ljust(column_width)}{'Cellular Component'.ljust(column_width)}")
  if annotations:
       # Adjusting the output to match the format
    max_length = max(len(annotations["Molecular Function"]),
                     len(annotations["Biological Process"]),
                     len(annotations["Cellular Component"]))

    # If there are fewer terms in any category, we pad it with empty strings.
    molecular_function = annotations["Molecular Function"] + [''] * (max_length - len(annotations["Molecular Function"]))
    biological_process = annotations["Biological Process"] + [''] * (max_length - len(annotations["Biological Process"]))
    cellular_component = annotations["Cellular Component"] + [''] * (max_length - len(annotations["Cellular Component"]))

    # Define fixed width for columns
    # column_width = 50  # Adjust this value if you need more or less space

    # Printing the header and formatted output with consistent column width
    # print(f"{'Molecular Function'.ljust(column_width)}{'Biological Process'.ljust(column_width)}{'Cellular Component'.ljust(column_width)}")
    for i in range(max_length):
        print(f"{uniprot_id} {molecular_function[i].ljust(column_width)}{biological_process[i].ljust(column_width)}{cellular_component[i].ljust(column_width)}")

# if annotations:
#     # Adjusting the output to match the format
#     max_length = max(len(annotations["Molecular Function"]),
#                      len(annotations["Biological Process"]),
#                      len(annotations["Cellular Component"]))

#     # If there are fewer terms in any category, we pad it with empty strings.
#     molecular_function = annotations["Molecular Function"] + [''] * (max_length - len(annotations["Molecular Function"]))
#     biological_process = annotations["Biological Process"] + [''] * (max_length - len(annotations["Biological Process"]))
#     cellular_component = annotations["Cellular Component"] + [''] * (max_length - len(annotations["Cellular Component"]))

#     # Define fixed width for columns
#     column_width = 50  # Adjust this value if you need more or less space

#     # Printing the header and formatted output with consistent column width
#     print(f"{'Molecular Function'.ljust(column_width)}{'Biological Process'.ljust(column_width)}{'Cellular Component'.ljust(column_width)}")
#     for i in range(max_length):
#         print(f"{molecular_function[i].ljust(column_width)}{biological_process[i].ljust(column_width)}{cellular_component[i].ljust(column_width)}")


Uniprot_Id                                         Molecular Function                                Biological Process                                Cellular Component                                
P04637 14-3-3 protein binding                            autophagy                                         centrosome                                        
P04637 ATP-dependent DNA/DNA annealing activity          B cell lineage commitment                         chromatin                                         
O43791 identical protein binding                         proteasome-mediated ubiquitin-dependent protein catabolic processCul3-RING ubiquitin ligase complex                
O43791 molecular function inhibitor activity             protein polyubiquitination                        cytoplasm                                         
P55317 chromatin binding                                 alveolar secondary septum development             chromatin                                   