In [1]:
import pandas as pd

In [9]:
prot_df = pd.read_csv('../../data/prot/ProteinComplete.csv')
prot_df = prot_df.drop_duplicates(subset='UniProt ID')
prot_df = prot_df.dropna(subset='UniProt ID')
prot_df = prot_df.reset_index(drop=True)
prot_df 

Unnamed: 0,Gene Symbol,UniProt ID,Protein Name,FASTA Sequence
0,UCP1,P25874,Mitochondrial brown fat uncoupling protein 1,MGGLTASDVHPTLGVQLFSAGIAACLADVITFPLDTAKVRLQVQGE...
1,PPARGC1A,Q9UBK2,Peroxisome proliferator-activated receptor gam...,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
2,HDAC3,O15379,Histone deacetylase 3,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
3,PPARGC1B,Q86YN6,Peroxisome proliferator-activated receptor gam...,MAGNDCGALLDEELSSFFLNYLADTQGGGSGEEQLYADFPELDLSQ...
4,SGSH,P51688,N-sulphoglucosamine sulphohydrolase,MSCPVPACCALLLVLGLCRARPRNALLLLADDGGFESGAYNNSAIA...
...,...,...,...,...
463,TM4SF5,O14894,Transmembrane 4 L6 family member 5,MCTGKCARCVGLSLITLCLVCIVANALLLVPNGETSWTNTNHLSLQ...
464,LPIN3,Q9BQK8,Phosphatidate phosphatase LPIN3,MNYVGQLAETVFGTVKELYRGLNPATLSGGIDVLVVKQVDGSFRCS...
465,ZC3H10,Q96K80,Zinc finger CCCH domain-containing protein 10,MPDRDSYANGTGSSGGGPGGGGSEEASGAGVGSGGASSDAICRDFL...
466,PRLH,P81277,Prolactin-releasing peptide,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...


In [3]:
# prot_df = prot_df.head(3)
# prot_df 

In [None]:
import requests
import pandas as pd

# Fungsi untuk mendapatkan obat dan SMILES untuk UniProt ID
def get_drugs_and_smiles_for_uniprot(uniprot_id):
    url = f"https://www.ebi.ac.uk/chembl/api/data/target?target_components__accession={uniprot_id}&format=json"
    response = requests.get(url)
    
    if response.status_code == 200:
        results = response.json().get('targets', [])
        if results:
            drugs_with_smiles = []
            for target in results:
                target_chembl_id = target['target_chembl_id']
                
                # Cari aktivitas terkait target ini
                drug_url = f"https://www.ebi.ac.uk/chembl/api/data/activity?target_chembl_id={target_chembl_id}&format=json"
                drug_response = requests.get(drug_url)
                
                if drug_response.status_code == 200:
                    activities = drug_response.json().get('activities', [])
                    for activity in activities:
                        molecule_chembl_id = activity.get('molecule_chembl_id')
                        drug_name = activity.get('molecule_pref_name', 'Unknown Drug')
                        
                        # Ambil informasi molekul untuk mendapatkan SMILES
                        molecule_url = f"https://www.ebi.ac.uk/chembl/api/data/molecule/{molecule_chembl_id}?format=json"
                        molecule_response = requests.get(molecule_url)
                        
                        if molecule_response.status_code == 200:
                            molecule_data = molecule_response.json()
                            smiles = molecule_data.get('molecule_structures', {}).get('canonical_smiles', 'No SMILES')
                            drugs_with_smiles.append((drug_name, smiles))
            return drugs_with_smiles
        else:
            return []
    else:
        return []

# Fungsi untuk menambahkan informasi obat dan SMILES ke DataFrame
def add_drugs_and_smiles_to_dataframe(df, uniprot_column):
    df['Drugs'] = None
    df['SMILES'] = None
    
    for index, row in df.iterrows():
        uniprot_id = row[uniprot_column]
        drugs_with_smiles = get_drugs_and_smiles_for_uniprot(uniprot_id)
        
        if drugs_with_smiles:
            # Filter elemen None
            drug_names = [drug[0] for drug in drugs_with_smiles if drug[0] is not None]
            smiles_list = [drug[1] for drug in drugs_with_smiles if drug[1] is not None]
            
            df.at[index, 'Drugs'] = ', '.join(drug_names) if drug_names else 'No drugs found'
            df.at[index, 'SMILES'] = ', '.join(smiles_list) if smiles_list else 'No SMILES found'
        else:
            df.at[index, 'Drugs'] = 'No drugs found'
            df.at[index, 'SMILES'] = 'No SMILES found'
    
    return df

# Tambahkan informasi obat dan SMILES ke DataFrame
interaction_df = add_drugs_and_smiles_to_dataframe(prot_df, 'UniProt ID')
interaction_df

In [10]:
import requests

uniprot_df = prot_df[['UniProt ID']]
# Fungsi untuk mendapatkan nama obat untuk UniProt ID
def get_drugs_for_uniprot(uniprot_id):
    try:
        url = f"https://www.ebi.ac.uk/chembl/api/data/target?target_components__accession={uniprot_id}&format=json"
        response = requests.get(url)
        response.raise_for_status()
        results = response.json().get('targets', [])
        
        drugs = []
        for target in results:
            target_chembl_id = target['target_chembl_id']
            
            # Cari aktivitas terkait target ini
            drug_url = f"https://www.ebi.ac.uk/chembl/api/data/activity?target_chembl_id={target_chembl_id}&format=json"
            drug_response = requests.get(drug_url)
            drug_response.raise_for_status()
            activities = drug_response.json().get('activities', [])
            
            for activity in activities:
                drug_name = activity.get('molecule_pref_name')
                if drug_name and drug_name not in drugs:  # Hindari duplikasi dan None
                    drugs.append(drug_name)
        
        return drugs
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for UniProt ID {uniprot_id}: {e}")
        return []

# Fungsi untuk menambahkan informasi obat ke DataFrame
def add_drugs_to_dataframe(df, uniprot_column):
    drugs_list = []
    
    for index, uniprot_id in enumerate(df[uniprot_column]):
        drugs = get_drugs_for_uniprot(uniprot_id)
        
        # Filter elemen None dan gabungkan menjadi string
        clean_drugs = [drug for drug in drugs if drug is not None]
        drugs_list.append(', '.join(clean_drugs) if clean_drugs else 'No drugs found')
        
        # Tambahkan jeda untuk menghindari rate limit
        # sleep(0.5)
    
    df['Drugs'] = drugs_list
    return df

# Contoh DataFrame (ganti dengan prot_df)
#prot_df = pd.DataFrame({'UniProt ID': ['P12345', 'Q67890']})
interaction_df = add_drugs_to_dataframe(uniprot_df, 'UniProt ID')
interaction_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Drugs'] = drugs_list


Unnamed: 0,UniProt ID,Drugs
0,P25874,No drugs found
1,Q9UBK2,"COLFORSIN, DEXAMETHASONE"
2,O15379,"VORINOSTAT, TACEDINALINE, DACINOSTAT, SPLITOMI..."
3,Q86YN6,No drugs found
4,P51688,No drugs found
...,...,...
463,O14894,"4,4'-DIHYDROXYCHALCONE"
464,Q9BQK8,No drugs found
465,Q96K80,No drugs found
466,P81277,"KISSPEPTIN-10, DAMGO, NALOXONE, NOCICEPTIN"


In [11]:
interaction_df = interaction_df[~prot_df['UniProt ID'].isin(['Q92813', 'Q9NNW7', 'P55073'])]
interaction_df

Unnamed: 0,UniProt ID,Drugs
0,P25874,No drugs found
1,Q9UBK2,"COLFORSIN, DEXAMETHASONE"
2,O15379,"VORINOSTAT, TACEDINALINE, DACINOSTAT, SPLITOMI..."
3,Q86YN6,No drugs found
4,P51688,No drugs found
...,...,...
463,O14894,"4,4'-DIHYDROXYCHALCONE"
464,Q9BQK8,No drugs found
465,Q96K80,No drugs found
466,P81277,"KISSPEPTIN-10, DAMGO, NALOXONE, NOCICEPTIN"


In [12]:
import pandas as pd

# Fungsi untuk menambahkan informasi obat dan SMILES ke DataFrame
def normalize_drug_dataframe(df, uniprot_column):
    # Buat DataFrame kosong untuk hasil normalisasi
    normalized_data = []
    
    for index, row in df.iterrows():
        uniprot_id = row[uniprot_column]
        drugs = row['Drugs'].split(', ') if isinstance(row['Drugs'], str) else []
        #smiles_list = row['SMILES'].split(', ') if isinstance(row['SMILES'], str) else []
        
        # Pastikan panjang daftar drug dan SMILES sama
        for drug in drugs:
            normalized_data.append({'UniProt ID': uniprot_id, 'Drug': drug})
    
    # Buat DataFrame dari data yang dinormalisasi
    normalized_df = pd.DataFrame(normalized_data)
    return normalized_df

# Normalisasi DataFrame
normalized_df = normalize_drug_dataframe(interaction_df, 'UniProt ID')

# Lihat hasilnya
normalized_df

Unnamed: 0,UniProt ID,Drug
0,P25874,No drugs found
1,Q9UBK2,COLFORSIN
2,Q9UBK2,DEXAMETHASONE
3,O15379,VORINOSTAT
4,O15379,TACEDINALINE
...,...,...
1015,P81277,KISSPEPTIN-10
1016,P81277,DAMGO
1017,P81277,NALOXONE
1018,P81277,NOCICEPTIN


In [14]:
no_drug = normalized_df[normalized_df['Drug'] != 'No drugs found']
no_drug = no_drug.reset_index(drop=True)
no_drug

Unnamed: 0,UniProt ID,Drug
0,Q9UBK2,COLFORSIN
1,Q9UBK2,DEXAMETHASONE
2,O15379,VORINOSTAT
3,O15379,TACEDINALINE
4,O15379,DACINOSTAT
...,...,...
697,O14894,"4,4'-DIHYDROXYCHALCONE"
698,P81277,KISSPEPTIN-10
699,P81277,DAMGO
700,P81277,NALOXONE


In [16]:
print("number of unique BAT protein that has interactions data", no_drug["UniProt ID"].nunique())
print("number of unique drug that has interactions with BAT protein", no_drug["Drug"].nunique())

number of unique BAT protein that has interactions data 147
number of unique drug that has interactions with BAT protein 479


In [17]:
import pandas as pd
import requests

# Fungsi untuk mendapatkan SMILES dari PubChem
def get_smiles(drug_name):
    url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{drug_name}/property/CanonicalSMILES/JSON'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        try:
            smiles = data['PropertyTable']['Properties'][0]['CanonicalSMILES']
            return smiles
        except (KeyError, IndexError):
            return None
    else:
        return None

# # Daftar nama obat yang ingin dicari SMILES-nya
# drug_names = ['COLFORSIN', 'DEXAMETHASONE']  # Anda bisa menambah obat lain di sini

# # Membuat DataFrame dengan nama obat
# df = pd.DataFrame(drug_names, columns=['Drug_Name'])

# Mendapatkan SMILES untuk setiap obat dan menambahkannya ke DataFrame
no_drug['SMILES'] = no_drug['Drug'].apply(get_smiles)

# Menampilkan DataFrame dengan SMILES
no_drug

Unnamed: 0,UniProt ID,Drug,SMILES
0,Q9UBK2,COLFORSIN,CC(=O)OC1C(C2C(CCC(C2(C3(C1(OC(CC3=O)(C)C=C)C)...
1,Q9UBK2,DEXAMETHASONE,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...
2,O15379,VORINOSTAT,C1=CC=C(C=C1)NC(=O)CCCCCCC(=O)NO
3,O15379,TACEDINALINE,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N
4,O15379,DACINOSTAT,C1=CC=C2C(=C1)C(=CN2)CCN(CCO)CC3=CC=C(C=C3)C=C...
...,...,...,...
697,O14894,"4,4'-DIHYDROXYCHALCONE",C1=CC(=CC=C1C=CC(=O)C2=CC=C(C=C2)O)O
698,P81277,KISSPEPTIN-10,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)NC(CC1=CC=CC=C...
699,P81277,DAMGO,CC(C(=O)NCC(=O)N(C)C(CC1=CC=CC=C1)C(=O)NCCO)NC...
700,P81277,NALOXONE,C=CCN1CCC23C4C(=O)CCC2(C1CC5=C3C(=C(C=C5)O)O4)O


In [22]:
nan_rows = no_drug[no_drug['SMILES'].isna()]
nan_rows

Unnamed: 0,UniProt ID,Drug,SMILES
116,P55055,T091317,
147,Q13131,TETRABROMOBENZOTRIAZOLE,
153,Q13133,T091317,
191,O60674,CRYTOSPORIOPSIN,
224,Q9Y478,TETRABROMOBENZOTRIAZOLE,
264,P54619,TETRABROMOBENZOTRIAZOLE,
280,P46089,BENZONAPHTHYRIDINE,
355,P21817,BASTADIN 5,
380,Q92523,PALMITOYLAMINOCARNITINE,
428,O00206,POLYMYXIN B,


In [24]:
no_drug = no_drug.dropna(subset='SMILES')
no_drug = no_drug.reset_index(drop=True)
no_drug

Unnamed: 0,UniProt ID,Drug,SMILES
0,Q9UBK2,COLFORSIN,CC(=O)OC1C(C2C(CCC(C2(C3(C1(OC(CC3=O)(C)C=C)C)...
1,Q9UBK2,DEXAMETHASONE,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...
2,O15379,VORINOSTAT,C1=CC=C(C=C1)NC(=O)CCCCCCC(=O)NO
3,O15379,TACEDINALINE,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N
4,O15379,DACINOSTAT,C1=CC=C2C(=C1)C(=CN2)CCN(CCO)CC3=CC=C(C=C3)C=C...
...,...,...,...
674,O14894,"4,4'-DIHYDROXYCHALCONE",C1=CC(=CC=C1C=CC(=O)C2=CC=C(C=C2)O)O
675,P81277,KISSPEPTIN-10,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)NC(CC1=CC=CC=C...
676,P81277,DAMGO,CC(C(=O)NCC(=O)N(C)C(CC1=CC=CC=C1)C(=O)NCCO)NC...
677,P81277,NALOXONE,C=CCN1CCC23C4C(=O)CCC2(C1CC5=C3C(=C(C=C5)O)O4)O


In [25]:
print("number of unique BAT protein that has interactions data", no_drug["UniProt ID"].nunique())
print("number of unique drug that has interactions with BAT protein", no_drug["Drug"].nunique())

number of unique BAT protein that has interactions data 143
number of unique drug that has interactions with BAT protein 461


In [29]:
fasta_df = prot_df.drop(columns=['Protein Name', 'Gene Symbol'])
fasta_df

Unnamed: 0,UniProt ID,FASTA Sequence
0,P25874,MGGLTASDVHPTLGVQLFSAGIAACLADVITFPLDTAKVRLQVQGE...
1,Q9UBK2,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
2,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
3,Q86YN6,MAGNDCGALLDEELSSFFLNYLADTQGGGSGEEQLYADFPELDLSQ...
4,P51688,MSCPVPACCALLLVLGLCRARPRNALLLLADDGGFESGAYNNSAIA...
...,...,...
463,O14894,MCTGKCARCVGLSLITLCLVCIVANALLLVPNGETSWTNTNHLSLQ...
464,Q9BQK8,MNYVGQLAETVFGTVKELYRGLNPATLSGGIDVLVVKQVDGSFRCS...
465,Q96K80,MPDRDSYANGTGSSGGGPGGGGSEEASGAGVGSGGASSDAICRDFL...
466,P81277,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...


In [30]:
inter_df = pd.merge(no_drug, fasta_df, on='UniProt ID', how='inner')
inter_df

Unnamed: 0,UniProt ID,Drug,SMILES,FASTA Sequence
0,Q9UBK2,COLFORSIN,CC(=O)OC1C(C2C(CCC(C2(C3(C1(OC(CC3=O)(C)C=C)C)...,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
1,Q9UBK2,DEXAMETHASONE,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
2,O15379,VORINOSTAT,C1=CC=C(C=C1)NC(=O)CCCCCCC(=O)NO,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
3,O15379,TACEDINALINE,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
4,O15379,DACINOSTAT,C1=CC=C2C(=C1)C(=CN2)CCN(CCO)CC3=CC=C(C=C3)C=C...,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
...,...,...,...,...
674,O14894,"4,4'-DIHYDROXYCHALCONE",C1=CC(=CC=C1C=CC(=O)C2=CC=C(C=C2)O)O,MCTGKCARCVGLSLITLCLVCIVANALLLVPNGETSWTNTNHLSLQ...
675,P81277,KISSPEPTIN-10,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)NC(CC1=CC=CC=C...,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...
676,P81277,DAMGO,CC(C(=O)NCC(=O)N(C)C(CC1=CC=CC=C1)C(=O)NCCO)NC...,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...
677,P81277,NALOXONE,C=CCN1CCC23C4C(=O)CCC2(C1CC5=C3C(=C(C=C5)O)O4)O,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...


In [31]:
inter_df.to_csv("../../data/interaction/interaction_chembl.csv", index=False)