In [1]:
import pandas as pd
import numpy as np
import glob
from tqdm.auto import tqdm
import itertools
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

## compound package
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
import pubchempy as pc
from padelpy import from_smiles, padeldescriptor

## protein package
from protlearn.preprocessing import remove_unnatural
from protlearn.features import aac
from protlearn.features import paac
from protlearn.features import aaindex1

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
inter_bindingdb_df = pd.read_csv("../../data/1-preparation/interaction/interaction_bindingdb.csv")
inter_bindingdb_df

Unnamed: 0,uniprot_id,canonical_smiles,isomeric_smiles,affinity_type,affinity_score
0,O15379,O=C(O)C=Cc1ccccc1,OC(=O)\C=C\c1ccccc1,Ki,8.2
1,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,CC(C=CC(=O)NO)C=C(C)C(=O)c1ccc(cc1)N(C)C,IC50,0.6
2,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,CC(C=CC(=O)NO)C=C(C)C(=O)c1ccc(cc1)N(C)C,IC50,9
3,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,CC(C=CC(=O)NO)C=C(C)C(=O)c1ccc(cc1)N(C)C,Ki,0.26
4,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,CC(C=CC(=O)NO)C=C(C)C(=O)c1ccc(cc1)N(C)C,Ki,0.58
...,...,...,...,...,...
46143,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2CC3(Cc2c1)C(=O)Nc1...,IC50,7.9
46144,Q7Z4H4,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...,CNCc1ccccc1CN(CC(=O)Nc1ccc2CC3(Cc2c1)C(=O)Nc1n...,IC50,0.630957
46145,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2CC3(Cc2c1)C(=O)Nc1...,IC50,1.000000
46146,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2CC3(Cc2c1)C(=O)Nc1...,IC50,1.6


In [3]:
print("number of unique protein that has interactions data", inter_bindingdb_df["uniprot_id"].nunique())
print("number of unique compound isomeric_smiles that has interactions with BAT proteins", inter_bindingdb_df["isomeric_smiles"].nunique())
print("number of unique compound canonical_smiles that has interactions with BAT proteins", inter_bindingdb_df["canonical_smiles"].nunique())

number of unique protein that has interactions data 136
number of unique compound isomeric_smiles that has interactions with BAT proteins 41229
number of unique compound canonical_smiles that has interactions with BAT proteins 38718


### Pengecekan interaksi senyawa-protein yang redundansi

In [4]:
duplikat = inter_bindingdb_df.duplicated(subset=['uniprot_id', 'canonical_smiles'], keep=False)
duplikat

0        False
1         True
2         True
3         True
4         True
         ...  
46143    False
46144    False
46145    False
46146    False
46147    False
Length: 46148, dtype: bool

In [7]:
df = pd.DataFrame(inter_bindingdb_df)

In [8]:
# Menampilkan baris yang merupakan duplikat
df_duplikat = df[duplikat]
df_duplikat

Unnamed: 0,uniprot_id,canonical_smiles,isomeric_smiles,affinity_type,affinity_score
1,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,CC(C=CC(=O)NO)C=C(C)C(=O)c1ccc(cc1)N(C)C,IC50,0.6
2,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,CC(C=CC(=O)NO)C=C(C)C(=O)c1ccc(cc1)N(C)C,IC50,9
3,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,CC(C=CC(=O)NO)C=C(C)C(=O)c1ccc(cc1)N(C)C,Ki,0.26
4,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,CC(C=CC(=O)NO)C=C(C)C(=O)c1ccc(cc1)N(C)C,Ki,0.58
5,O15379,O=C(CCCCCCC(=O)Nc1ccccc1)NO,ONC(=O)CCCCCCC(=O)Nc1ccccc1,IC50,1.4
...,...,...,...,...,...
46088,P49763,CCC(C)C1NC(=O)C(Cc2ccc(O)cc2)NC(=O)C(Cc2cnc[nH...,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=...,IC50,0.4
46089,P49763,CCC(C)C1NC(=O)C(Cc2ccc(O)cc2)NC(=O)C(Cc2cnc[nH...,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=...,Kd,0.1
46135,Q9H093,CNC1CC2OC(C)(C1OC)n1c3ccccc3c3c4c(c5c6ccccc6n2...,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,IC50,1.7
46136,Q9H093,CNC1CC2OC(C)(C1OC)n1c3ccccc3c3c4c(c5c6ccccc6n2...,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,Kd,0.086


### Penanganan Redundasi interaksi yang berasal dari nilai affinity score berbeda

In [4]:
inter_df = inter_bindingdb_df.drop(columns=['isomeric_smiles', 'affinity_type', 'affinity_score'])
inter_df

Unnamed: 0,uniprot_id,canonical_smiles
0,O15379,O=C(O)C=Cc1ccccc1
1,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1
2,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1
3,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1
4,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1
...,...,...
46143,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...
46144,Q7Z4H4,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...
46145,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...
46146,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...


In [5]:
print("number of unique compound canonical_smiles that has interactions with BAT proteins", inter_df["canonical_smiles"].nunique())

number of unique compound canonical_smiles that has interactions with BAT proteins 38718


In [6]:
inter_df = inter_df.drop_duplicates()
inter_df.reset_index(drop=True, inplace=True)
inter_df

Unnamed: 0,uniprot_id,canonical_smiles
0,O15379,O=C(O)C=Cc1ccccc1
1,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1
2,O15379,O=C(CCCCCCC(=O)Nc1ccccc1)NO
3,O15379,CC=C1NC(=O)C2CSSCCC=CC(CC(=O)NC(C(C)C)C(=O)N2)...
4,O15379,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1
...,...,...
40669,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...
40670,Q7Z4H4,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...
40671,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...
40672,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...


In [7]:
print("number of unique compound canonical_smiles that has interactions with BAT proteins", inter_df["canonical_smiles"].nunique())

number of unique compound canonical_smiles that has interactions with BAT proteins 38718


In [8]:
# Buat dictionary mapping untuk nilai unik canonical_smiles
unique_smiles = inter_df['canonical_smiles'].unique()
mol_id_mapping = {smiles: f"d{i+1}" for i, smiles in enumerate(unique_smiles)}

In [9]:
# Tambahkan inisialisasi senyawa kedalam dataframe
inter_df['mol_id'] = inter_df['canonical_smiles'].map(mol_id_mapping)
inter_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inter_df['mol_id'] = inter_df['canonical_smiles'].map(mol_id_mapping)


Unnamed: 0,uniprot_id,canonical_smiles,mol_id
0,O15379,O=C(O)C=Cc1ccccc1,d1
1,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,d2
2,O15379,O=C(CCCCCCC(=O)Nc1ccccc1)NO,d3
3,O15379,CC=C1NC(=O)C2CSSCCC=CC(CC(=O)NC(C(C)C)C(=O)N2)...,d4
4,O15379,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,d5
...,...,...,...
40669,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38714
40670,Q7Z4H4,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...,d38715
40671,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38716
40672,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38717


In [14]:
inter_df.rename(columns={'uniprot_id': 'UniProt ID'}, inplace=True)
inter_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inter_df.rename(columns={'uniprot_id': 'UniProt ID'}, inplace=True)


Unnamed: 0,UniProt ID,canonical_smiles,mol_id
0,O15379,O=C(O)C=Cc1ccccc1,d1
1,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,d2
2,O15379,O=C(CCCCCCC(=O)Nc1ccccc1)NO,d3
3,O15379,CC=C1NC(=O)C2CSSCCC=CC(CC(=O)NC(C(C)C)C(=O)N2)...,d4
4,O15379,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,d5
...,...,...,...
40669,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38714
40670,Q7Z4H4,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...,d38715
40671,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38716
40672,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38717


In [11]:
protein_df = pd.read_csv("../../data/1-preparation/prot/protein.csv")
protein_df

Unnamed: 0,Gene Symbol,UniProt ID,Protein Name,FASTA Sequence
0,UCP1,P25874,Mitochondrial brown fat uncoupling protein 1,MGGLTASDVHPTLGVQLFSAGIAACLADVITFPLDTAKVRLQVQGE...
1,PPARGC1A,Q9UBK2,Peroxisome proliferator-activated receptor gam...,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
2,HDAC3,O15379,Histone deacetylase 3,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
3,PPARGC1B,Q86YN6,Peroxisome proliferator-activated receptor gam...,MAGNDCGALLDEELSSFFLNYLADTQGGGSGEEQLYADFPELDLSQ...
4,SGSH,P51688,N-sulphoglucosamine sulphohydrolase,MSCPVPACCALLLVLGLCRARPRNALLLLADDGGFESGAYNNSAIA...
...,...,...,...,...
460,TM4SF5,O14894,Transmembrane 4 L6 family member 5,MCTGKCARCVGLSLITLCLVCIVANALLLVPNGETSWTNTNHLSLQ...
461,LPIN3,Q9BQK8,Phosphatidate phosphatase LPIN3,MNYVGQLAETVFGTVKELYRGLNPATLSGGIDVLVVKQVDGSFRCS...
462,ZC3H10,Q96K80,Zinc finger CCCH domain-containing protein 10,MPDRDSYANGTGSSGGGPGGGGSEEASGAGVGSGGASSDAICRDFL...
463,PRLH,P81277,Prolactin-releasing peptide,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...


In [12]:
fasta_df = protein_df.drop(columns=['Protein Name', 'Gene Symbol'])
fasta_df

Unnamed: 0,UniProt ID,FASTA Sequence
0,P25874,MGGLTASDVHPTLGVQLFSAGIAACLADVITFPLDTAKVRLQVQGE...
1,Q9UBK2,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
2,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
3,Q86YN6,MAGNDCGALLDEELSSFFLNYLADTQGGGSGEEQLYADFPELDLSQ...
4,P51688,MSCPVPACCALLLVLGLCRARPRNALLLLADDGGFESGAYNNSAIA...
...,...,...
460,O14894,MCTGKCARCVGLSLITLCLVCIVANALLLVPNGETSWTNTNHLSLQ...
461,Q9BQK8,MNYVGQLAETVFGTVKELYRGLNPATLSGGIDVLVVKQVDGSFRCS...
462,Q96K80,MPDRDSYANGTGSSGGGPGGGGSEEASGAGVGSGGASSDAICRDFL...
463,P81277,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...


In [13]:
print("number of protein FASTA Sequence that has interactions with BAT proteins", fasta_df["FASTA Sequence"].nunique())

number of protein FASTA Sequence that has interactions with BAT proteins 465


In [15]:
marge_df = pd.merge(inter_df, fasta_df, on='UniProt ID', how='inner')
marge_df

Unnamed: 0,UniProt ID,canonical_smiles,mol_id,FASTA Sequence
0,O15379,O=C(O)C=Cc1ccccc1,d1,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
1,O15379,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,d2,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
2,O15379,O=C(CCCCCCC(=O)Nc1ccccc1)NO,d3,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
3,O15379,CC=C1NC(=O)C2CSSCCC=CC(CC(=O)NC(C(C)C)C(=O)N2)...,d4,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
4,O15379,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,d5,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
...,...,...,...,...
40669,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38714,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...
40670,Q7Z4H4,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...,d38715,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...
40671,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38716,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...
40672,Q7Z4H4,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38717,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...


In [16]:
# Tentukan urutan kolom yang diinginkan
atur_colom = ['UniProt ID','FASTA Sequence','canonical_smiles','mol_id']

In [17]:
marge_df = marge_df[atur_colom]
marge_df

Unnamed: 0,UniProt ID,FASTA Sequence,canonical_smiles,mol_id
0,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,O=C(O)C=Cc1ccccc1,d1
1,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,d2
2,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,O=C(CCCCCCC(=O)Nc1ccccc1)NO,d3
3,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,CC=C1NC(=O)C2CSSCCC=CC(CC(=O)NC(C(C)C)C(=O)N2)...,d4
4,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,d5
...,...,...,...,...
40669,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38714
40670,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...,d38715
40671,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38716
40672,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38717


In [29]:
marge_df.rename(columns={'canonical_smiles': 'SMILES'}, inplace=True)

In [30]:
marge_df.rename(columns={'mol_id': 'Drug'}, inplace=True)

In [31]:
marge_df.to_csv('../../data/1-preparation/interaction/interaction_bindingdb_nodup.csv', index = False)

## Ekstraksi Fasta

In [19]:
interaksi = pd.read_csv("../../data/1-preparation/interaction/interaction_bindingdb_nodup.csv")
interaksi

Unnamed: 0,UniProt ID,FASTA Sequence,canonical_smiles,mol_id
0,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,O=C(O)C=Cc1ccccc1,d1
1,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,d2
2,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,O=C(CCCCCCC(=O)Nc1ccccc1)NO,d3
3,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,CC=C1NC(=O)C2CSSCCC=CC(CC(=O)NC(C(C)C)C(=O)N2)...,d4
4,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,d5
...,...,...,...,...
40669,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38714
40670,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...,d38715
40671,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38716
40672,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38717


In [24]:
# protein_inter = pd.read_csv("../../../data/fasta/fasta_df_a.csv")
protein_inter = interaksi.drop(columns=['canonical_smiles', 'mol_id']).drop_duplicates()
protein_inter

Unnamed: 0,UniProt ID,FASTA Sequence
0,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
459,P36894,MPQLYIYIRLLGAYLFIISRVQGQNLDSMLHGTGMKSDSDQKKSEN...
469,P20393,MTTLDSNNNTGGVITYIGSSGSSPSRTSPESLYSDNSNGSFQSLTQ...
471,P06213,MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTR...
700,P35354,MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTR...
...,...,...
40618,Q01469,MATVQQLEGRWRLVDSKGFDEYMKELGVGIALRKMGAMAKPDCIIT...
40621,O43194,MASPSLPGSDCSQIIDHSHVPEFEVATWIKITLILVYLIIFVMGLL...
40653,Q5NUL3,MSPECARAAGDAPLRSLEQANRTRFPFFSDVKGDHRLVLAAVETTV...
40663,Q9H093,MESLVFARRSGPTPSAAELARPLAEGLIKSPKPLMKKQAVKRHHHK...


In [25]:
fasta_list = protein_inter["FASTA Sequence"].tolist()
seqs = remove_unnatural(fasta_list)
len(seqs)

136

In [26]:
##Feature Engineering Fasta - AAIndex1
aaind, inds = aaindex1(seqs, standardize="zscore")

aaindex_feats = pd.DataFrame(aaind, columns=inds)
aaindex_feats

Unnamed: 0,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,BIOV880101,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,1.481179,0.352386,-0.917555,-0.859283,-0.851430,0.637549,0.663336,-0.204360,0.324345,0.464680,...,0.798668,0.503254,0.699992,0.575395,1.159542,-0.351364,0.733510,-0.427558,0.588095,-0.869535
1,0.087000,0.409973,0.027134,-0.013078,-0.234903,0.362438,0.108317,-0.064774,0.518921,0.401332,...,0.260927,0.624565,0.488038,0.769947,0.534061,0.847145,0.559739,-0.680648,0.392316,-0.522968
2,1.671921,-1.886599,-0.144091,-0.436771,-2.038396,-1.307917,1.742980,0.879783,-1.706989,-0.876098,...,-1.035911,-0.780838,-0.708928,-0.654071,-1.028422,0.681530,-0.422602,0.323512,-0.068375,2.112944
3,0.669067,0.083973,-0.412078,-0.506273,-0.777474,-0.355536,0.783156,0.270328,0.217461,-0.012653,...,0.392370,0.461248,0.410749,0.541657,0.582346,0.274170,0.610168,-0.042244,0.596539,0.395726
4,0.690767,0.857941,-0.127644,-0.124409,-0.598821,0.493526,0.152147,-0.292531,0.967405,0.345587,...,1.079885,0.733900,1.027551,0.537329,1.046938,-0.227387,0.808093,-0.739256,1.049860,0.029800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,-1.161522,-1.893245,-1.066896,-1.069508,0.149970,0.246082,-0.127904,0.201894,0.016186,-0.949905,...,-1.564009,0.343043,-0.403174,0.326697,0.164051,-0.000942,0.558890,-3.248932,-0.436852,-1.375183
132,-0.099996,0.501666,1.453936,1.314199,0.639487,1.287084,-1.329829,-1.364833,0.922932,1.165799,...,0.503393,0.579186,0.371441,0.532668,0.593643,0.865040,0.559590,-0.898337,0.827878,0.934296
133,-1.998093,2.690891,2.688581,2.738906,2.059807,2.585238,-2.589222,-2.173599,1.420785,2.817378,...,0.425640,-0.572702,-0.614288,-0.975269,-0.172633,-0.148652,-0.010380,-0.143334,-0.098141,0.654487
134,-0.200674,-0.519778,-0.811321,-0.853692,-0.468208,-1.237807,0.813239,1.156230,-0.526568,-1.043800,...,-0.186901,-0.063429,-0.032355,0.121340,-0.390961,-0.881760,-0.817878,0.705140,-0.588808,0.248363


In [27]:
aaindex_feats =  pd.concat([protein_inter[['UniProt ID']].reset_index(drop=True), aaindex_feats.reset_index(drop=True)], axis=1)
aaindex_feats

Unnamed: 0,UniProt ID,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,O15379,1.481179,0.352386,-0.917555,-0.859283,-0.851430,0.637549,0.663336,-0.204360,0.324345,...,0.798668,0.503254,0.699992,0.575395,1.159542,-0.351364,0.733510,-0.427558,0.588095,-0.869535
1,P36894,0.087000,0.409973,0.027134,-0.013078,-0.234903,0.362438,0.108317,-0.064774,0.518921,...,0.260927,0.624565,0.488038,0.769947,0.534061,0.847145,0.559739,-0.680648,0.392316,-0.522968
2,P20393,1.671921,-1.886599,-0.144091,-0.436771,-2.038396,-1.307917,1.742980,0.879783,-1.706989,...,-1.035911,-0.780838,-0.708928,-0.654071,-1.028422,0.681530,-0.422602,0.323512,-0.068375,2.112944
3,P06213,0.669067,0.083973,-0.412078,-0.506273,-0.777474,-0.355536,0.783156,0.270328,0.217461,...,0.392370,0.461248,0.410749,0.541657,0.582346,0.274170,0.610168,-0.042244,0.596539,0.395726
4,P35354,0.690767,0.857941,-0.127644,-0.124409,-0.598821,0.493526,0.152147,-0.292531,0.967405,...,1.079885,0.733900,1.027551,0.537329,1.046938,-0.227387,0.808093,-0.739256,1.049860,0.029800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,Q01469,-1.161522,-1.893245,-1.066896,-1.069508,0.149970,0.246082,-0.127904,0.201894,0.016186,...,-1.564009,0.343043,-0.403174,0.326697,0.164051,-0.000942,0.558890,-3.248932,-0.436852,-1.375183
132,O43194,-0.099996,0.501666,1.453936,1.314199,0.639487,1.287084,-1.329829,-1.364833,0.922932,...,0.503393,0.579186,0.371441,0.532668,0.593643,0.865040,0.559590,-0.898337,0.827878,0.934296
133,Q5NUL3,-1.998093,2.690891,2.688581,2.738906,2.059807,2.585238,-2.589222,-2.173599,1.420785,...,0.425640,-0.572702,-0.614288,-0.975269,-0.172633,-0.148652,-0.010380,-0.143334,-0.098141,0.654487
134,Q9H093,-0.200674,-0.519778,-0.811321,-0.853692,-0.468208,-1.237807,0.813239,1.156230,-0.526568,...,-0.186901,-0.063429,-0.032355,0.121340,-0.390961,-0.881760,-0.817878,0.705140,-0.588808,0.248363


In [28]:
aaindex_feats.to_csv('../../data/2-feature/prot/aaindex_nrlmf_bindingdb.csv', index=False)

In [32]:
##Feature Engineering Fasta - AAC 
comp, aa = aac(seqs)

aac_df = pd.DataFrame(comp, columns=list(aa))
aac_df =  pd.concat([protein_inter[['UniProt ID']].reset_index(drop=True), aac_df.reset_index(drop=True)], axis=1)
aac_df

Unnamed: 0,UniProt ID,A,C,D,E,F,G,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,O15379,0.053738,0.028037,0.077103,0.063084,0.058411,0.074766,0.044393,0.051402,0.035047,...,0.018692,0.049065,0.049065,0.039720,0.042056,0.051402,0.042056,0.067757,0.004673,0.067757
1,P36894,0.058271,0.043233,0.054511,0.056391,0.031955,0.063910,0.020677,0.075188,0.060150,...,0.030075,0.039474,0.037594,0.041353,0.054511,0.071429,0.052632,0.054511,0.015038,0.043233
2,P20393,0.063518,0.024430,0.032573,0.045603,0.042345,0.065147,0.027687,0.021173,0.034202,...,0.029316,0.060261,0.089577,0.050489,0.058632,0.133550,0.057003,0.052117,0.004886,0.017915
3,P06213,0.047757,0.034009,0.050651,0.076700,0.042692,0.066570,0.026773,0.047757,0.048480,...,0.021708,0.052098,0.059334,0.027496,0.059334,0.075977,0.049928,0.062952,0.014472,0.036903
4,P35354,0.051325,0.021523,0.043046,0.059603,0.062914,0.061258,0.031457,0.056291,0.056291,...,0.024834,0.048013,0.066225,0.051325,0.044702,0.057947,0.056291,0.057947,0.009934,0.044702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,Q01469,0.044444,0.044444,0.059259,0.088889,0.029630,0.081481,0.007407,0.044444,0.103704,...,0.037037,0.029630,0.007407,0.044444,0.044444,0.029630,0.125926,0.074074,0.014815,0.014815
132,O43194,0.066225,0.026490,0.013245,0.048565,0.057395,0.033113,0.033113,0.055188,0.035320,...,0.030905,0.028698,0.046358,0.048565,0.061810,0.114790,0.064018,0.077263,0.015453,0.028698
133,Q5NUL3,0.088643,0.030471,0.027701,0.027701,0.066482,0.041551,0.011080,0.080332,0.024931,...,0.016620,0.027701,0.044321,0.027701,0.072022,0.072022,0.052632,0.099723,0.027701,0.013850
134,Q9H093,0.071656,0.014331,0.049363,0.070064,0.025478,0.074841,0.039809,0.042994,0.068471,...,0.014331,0.027070,0.076433,0.036624,0.068471,0.097134,0.033439,0.047771,0.011146,0.023885


In [33]:
aac_df.isna().sum()

UniProt ID    0
A             0
C             0
D             0
E             0
F             0
G             0
H             0
I             0
K             0
L             0
M             0
N             0
P             0
Q             0
R             0
S             0
T             0
V             0
W             0
Y             0
dtype: int64

In [34]:
aac_df.to_csv('../../data/2-feature/prot/aac_bindingdb.csv', index=False)

In [35]:
#Feature Engineering Fasta - PAAC
paac, desc = paac(seqs)

paac_df = pd.DataFrame(paac, columns=list(desc))
paac_df =  pd.concat([protein_inter[['UniProt ID']].reset_index(drop=True), paac_df.reset_index(drop=True)], axis=1)
paac_df

Unnamed: 0,UniProt ID,A,C,D,E,F,G,H,I,K,...,lambda21,lambda22,lambda23,lambda24,lambda25,lambda26,lambda27,lambda28,lambda29,lambda30
0,O15379,6.049162,3.156084,8.679232,7.101190,6.575176,8.416225,4.997134,5.786155,3.945106,...,0.024141,0.023809,0.023924,0.025473,0.025120,0.024836,0.023188,0.026421,0.023579,0.024655
1,P36894,7.904335,5.864507,7.394378,7.649357,4.334636,8.669271,2.804764,10.199143,8.159314,...,0.025455,0.026202,0.026789,0.023657,0.024494,0.024887,0.025775,0.025150,0.024265,0.026425
2,P20393,11.306579,4.348684,5.798246,8.117544,7.537719,11.596491,4.928509,3.768860,6.088158,...,0.022686,0.024973,0.023975,0.022988,0.023430,0.023180,0.023373,0.022219,0.023159,0.024141
3,P06213,16.898808,12.033999,17.922978,27.140509,15.106510,23.555914,9.473574,16.898808,17.154850,...,0.024750,0.026032,0.025201,0.024866,0.024903,0.025252,0.025008,0.024895,0.024053,0.024623
4,P35354,8.217525,3.446059,6.892117,9.542932,10.073095,9.808013,5.036547,9.012769,9.012769,...,0.025630,0.024608,0.024065,0.024727,0.024990,0.024915,0.024862,0.024369,0.023628,0.024535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,Q01469,1.533375,1.533375,2.044500,3.066751,1.022250,2.811188,0.255563,1.533375,3.577876,...,0.024124,0.022139,0.024353,0.021422,0.023904,0.025296,0.023799,0.022502,0.025380,0.022774
132,O43194,8.100353,3.240141,1.620071,5.940259,7.020306,4.050177,4.050177,6.750294,4.320188,...,0.024839,0.026534,0.024563,0.024512,0.026145,0.025950,0.026819,0.025754,0.026042,0.025244
133,Q5NUL3,8.171606,2.808989,2.553627,2.553627,6.128704,3.830440,1.021451,7.405518,2.298264,...,0.025669,0.025526,0.025306,0.025695,0.024066,0.024742,0.026580,0.025372,0.025178,0.024201
134,Q9H093,11.357704,2.271541,7.824196,11.105310,4.038295,11.862490,6.309835,6.814622,10.852917,...,0.025021,0.025921,0.024100,0.024251,0.025776,0.025914,0.024098,0.025293,0.025519,0.024684


In [36]:
paac_df.isna().sum()

UniProt ID    0
A             0
C             0
D             0
E             0
F             0
G             0
H             0
I             0
K             0
L             0
M             0
N             0
P             0
Q             0
R             0
S             0
T             0
V             0
W             0
Y             0
lambda1       0
lambda2       0
lambda3       0
lambda4       0
lambda5       0
lambda6       0
lambda7       0
lambda8       0
lambda9       0
lambda10      0
lambda11      0
lambda12      0
lambda13      0
lambda14      0
lambda15      0
lambda16      0
lambda17      0
lambda18      0
lambda19      0
lambda20      0
lambda21      0
lambda22      0
lambda23      0
lambda24      0
lambda25      0
lambda26      0
lambda27      0
lambda28      0
lambda29      0
lambda30      0
dtype: int64

In [37]:
paac_df.to_csv('../../data/2-feature/prot/paac_bindingdb.csv', index=False)

In [38]:
#Feature Engineering Fasta - ATC
from protlearn.features import atc
atoms, bonds = atc(seqs)

atc_df = pd.concat([pd.DataFrame(atoms), pd.DataFrame(bonds)], axis=1)
atc_df.columns = [int(i) for i in range(atc_df.shape[1])]
atc_df =  pd.concat([protein_inter[['UniProt ID']].reset_index(drop=True), atc_df.reset_index(drop=True)], axis=1)
atc_df

Unnamed: 0,UniProt ID,0,1,2,3,4,5,6,7
0,O15379,0.274581,0.515512,0.071929,0.135477,0.002502,7664.0,6912.0,752.0
1,P36894,0.266221,0.526452,0.072270,0.131164,0.003893,9573.0,8736.0,837.0
2,P20393,0.261121,0.522241,0.076514,0.137147,0.002978,10584.0,9661.0,923.0
3,P06213,0.267874,0.522500,0.073352,0.133300,0.002974,24776.0,22542.0,2234.0
4,P35354,0.272918,0.523070,0.071784,0.129786,0.002442,10997.0,9987.0,1010.0
...,...,...,...,...,...,...,...,...,...
131,Q01469,0.258907,0.529295,0.071655,0.135788,0.004355,2403.0,2204.0,199.0
132,O43194,0.269191,0.528510,0.072582,0.126698,0.003019,8247.0,7528.0,719.0
133,Q5NUL3,0.271637,0.535472,0.070221,0.120214,0.002456,6629.0,6067.0,562.0
134,Q9H093,0.263063,0.526383,0.076837,0.132174,0.001542,11164.0,10207.0,957.0


In [39]:
atc_df.isna().sum()

UniProt ID    0
0             0
1             0
2             0
3             0
4             0
5             0
6             0
7             0
dtype: int64

In [40]:
atc_df.to_csv('../../data/2-feature/prot/atc_bindingdb.csv', index=False)