In [1]:
import pandas as pd

In [2]:
# Build a one-column frame of gene symbols from the OMIM workbook.
# The sheet is read without a header row; only column 1 is used.
# NOTE(review): astype(str) turns empty cells into the literal string 'nan' — confirm none survive.
# NOTE(review): tail(122) hard-codes how many trailing rows hold data — verify against the workbook.
omim_df = (
    pd.read_excel('../../data/prot/BAT-OMIM.xlsx', header=None)[1]
    .astype(str)
    .tail(122)
    .str.split(';')
    .str[-1]       # keep the text after the last ';'
    .str.strip()
    .to_frame(name='Protein')
)
omim_df

Unnamed: 0,Protein
5,UCP1
6,PPARGC1A
7,HDAC3
8,PPARGC1B
9,SGSH
...,...
122,PRDM16
123,ANGPTL4
124,BSCL2
125,SIRT6


In [3]:
# Load the GeneCards export and keep only the gene symbol and UniProt accession columns.
GC_df = pd.read_csv('../../data/prot/BAT-GeneCards.csv').loc[:, ['Gene Symbol', 'Uniprot ID']]
GC_df

Unnamed: 0,Gene Symbol,Uniprot ID
0,UCP1,P25874
1,ACOT11,Q8WXI4
2,ZNF516,Q92618
3,SMARCA4,P51532
4,LETMD1,Q6P1Q0
...,...,...
405,TM4SF5,O14894
406,LPIN3,Q9BQK8
407,ZC3H10,Q96K80
408,PRLH,P81277


In [4]:
# Union of the OMIM and GeneCards symbols: OMIM entries first, duplicates
# dropped (first occurrence kept), index renumbered from 0.
prot_df = (
    pd.concat([omim_df['Protein'], GC_df['Gene Symbol']], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame(name='Gene Symbol')
)
prot_df

Unnamed: 0,Gene Symbol
0,UCP1
1,PPARGC1A
2,HDAC3
3,PPARGC1B
4,SGSH
...,...
468,TM4SF5
469,LPIN3
470,ZC3H10
471,PRLH


In [5]:
import requests
from tqdm.auto import tqdm

# Look up a gene symbol in UniProt and return details of the first hit.
def get_uniprot_details(gene_symbol, timeout=30):
    """Query the UniProtKB search endpoint for ``gene_symbol``.

    Only the first search result is used. The query is ``gene:<symbol>`` with
    no organism or review-status filter, so the first hit is whatever UniProt
    ranks highest.
    NOTE(review): consider restricting the query (e.g. by organism) if only
    human proteins are wanted — confirm with the data owner.

    Parameters
    ----------
    gene_symbol : str
        Gene symbol to search for.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).
        Without a timeout a single stalled connection would block forever.

    Returns
    -------
    tuple
        ``(uniprot_id, protein_name, sequence)`` taken from the first result,
        or ``(None, None, None)`` when the request fails, the status is not
        200, the body is not valid JSON, there are no results, or any of the
        three fields is missing from the first result.
    """
    not_found = (None, None, None)

    try:
        # Passing the query through `params` lets requests URL-encode the
        # gene symbol instead of splicing raw text into the URL.
        response = requests.get(
            "https://rest.uniprot.org/uniprotkb/search",
            params={
                "query": f"gene:{gene_symbol}",
                "fields": "accession,gene_primary,protein_name,sequence",
                "format": "json",
            },
            timeout=timeout,
        )
    except requests.RequestException:
        # Connection errors / timeouts: report "not found" rather than
        # aborting a long batch of lookups.
        return not_found

    if response.status_code != 200:
        return not_found

    try:
        results = response.json().get('results', [])
    except ValueError:
        # Body was not valid JSON.
        return not_found

    if not results:
        return not_found

    top_hit = results[0]
    try:
        uniprot_id = top_hit['primaryAccession']
        protein_name = top_hit['proteinDescription']['recommendedName']['fullName']['value']
        sequence = top_hit['sequence']['value']
    except KeyError:
        # One of the expected fields is absent (e.g. no recommended name).
        return not_found

    return uniprot_id, protein_name, sequence

# Annotate a DataFrame of gene symbols with UniProt details.
def add_uniprot_details_to_df(df, gene_symbol_column):
    """Add UniProt ID, protein name and sequence columns to ``df``.

    For each row, the value in ``gene_symbol_column`` is looked up with
    ``get_uniprot_details`` and the result is written to the columns
    ``'UniProt ID'``, ``'Protein Name'`` and ``'FASTA Sequence'``.

    ``df`` is modified in place and the same object is returned, so the
    caller's frame and the return value are aliases of one another.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the gene symbols; receives the three new columns.
    gene_symbol_column : str
        Name of the column in ``df`` that holds the gene symbols.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns filled. Rows whose
        symbol is missing/blank, or whose lookup fails, keep ``None``.
    """
    # Start from empty result columns so a re-run never shows stale values.
    df['UniProt ID'] = None
    df['Protein Name'] = None
    df['FASTA Sequence'] = None

    # Remember results per symbol so a repeated symbol costs one request only.
    lookup_cache = {}

    # Iterate over the single column (index label, symbol) instead of building
    # a full row Series per iteration; tqdm gives feedback on the slow loop.
    symbols = df[gene_symbol_column].items()
    for index, gene_symbol in tqdm(symbols, total=len(df), desc='UniProt lookup'):
        # A missing or blank symbol would otherwise be sent as a bogus query
        # such as "gene:None"; leave those rows as None.
        if pd.isna(gene_symbol) or (isinstance(gene_symbol, str) and not gene_symbol.strip()):
            continue

        if gene_symbol not in lookup_cache:
            lookup_cache[gene_symbol] = get_uniprot_details(gene_symbol)
        uniprot_id, protein_name, fasta_sequence = lookup_cache[gene_symbol]

        df.at[index, 'UniProt ID'] = uniprot_id
        df.at[index, 'Protein Name'] = protein_name
        df.at[index, 'FASTA Sequence'] = fasta_sequence

    return df

# Add the UniProt details to the combined symbol table.
# The helper fills the columns on `prot_df` itself and returns that same
# object, so `prot_df_2` and `prot_df` refer to one DataFrame.
prot_df_2 = add_uniprot_details_to_df(
    df=prot_df,
    gene_symbol_column='Gene Symbol',
)
prot_df_2

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Gene Symbol,UniProt ID,Protein Name,FASTA Sequence
0,UCP1,P25874,Mitochondrial brown fat uncoupling protein 1,MGGLTASDVHPTLGVQLFSAGIAACLADVITFPLDTAKVRLQVQGE...
1,PPARGC1A,Q9UBK2,Peroxisome proliferator-activated receptor gam...,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
2,HDAC3,O15379,Histone deacetylase 3,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
3,PPARGC1B,Q86YN6,Peroxisome proliferator-activated receptor gam...,MAGNDCGALLDEELSSFFLNYLADTQGGGSGEEQLYADFPELDLSQ...
4,SGSH,P51688,N-sulphoglucosamine sulphohydrolase,MSCPVPACCALLLVLGLCRARPRNALLLLADDGGFESGAYNNSAIA...
...,...,...,...,...
468,TM4SF5,O14894,Transmembrane 4 L6 family member 5,MCTGKCARCVGLSLITLCLVCIVANALLLVPNGETSWTNTNHLSLQ...
469,LPIN3,Q9BQK8,Phosphatidate phosphatase LPIN3,MNYVGQLAETVFGTVKELYRGLNPATLSGGIDVLVVKQVDGSFRCS...
470,ZC3H10,Q96K80,Zinc finger CCCH domain-containing protein 10,MPDRDSYANGTGSSGGGPGGGGSEEASGAGVGSGGASSDAICRDFL...
471,PRLH,P81277,Prolactin-releasing peptide,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...


In [7]:
# Write the annotated table to CSV, leaving out the row index.
prot_df_2.to_csv(
    '../../data/prot/ProteinComplete.csv',
    index=False,
)