In [1]:
import pandas as pd
import numpy as np
import glob
from tqdm.auto import tqdm
import itertools
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

## compound package
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
import pubchempy as pc
from padelpy import from_smiles, padeldescriptor

## protein package
from protlearn.preprocessing import remove_unnatural
from protlearn.features import aac
from protlearn.features import paac
from protlearn.features import aaindex1

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the ChEMBL drug-protein interaction table. The rendered output shows one row
# per (protein, drug) pair with columns: UniProt ID, Drug, SMILES, FASTA Sequence.
inter_df = pd.read_csv(
    filepath_or_buffer="../../data/preparation/interaction/interaction_chembl.csv",
)
inter_df

Unnamed: 0,UniProt ID,Drug,SMILES,FASTA Sequence
0,Q9UBK2,COLFORSIN,CC(=O)OC1C(C2C(CCC(C2(C3(C1(OC(CC3=O)(C)C=C)C)...,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
1,Q9UBK2,DEXAMETHASONE,CC1CC2C3CCC4=CC(=O)C=CC4(C3(C(CC2(C1(C(=O)CO)O...,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
2,O15379,VORINOSTAT,C1=CC=C(C=C1)NC(=O)CCCCCCC(=O)NO,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
3,O15379,TACEDINALINE,CC(=O)NC1=CC=C(C=C1)C(=O)NC2=CC=CC=C2N,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
4,O15379,DACINOSTAT,C1=CC=C2C(=C1)C(=CN2)CCN(CCO)CC3=CC=C(C=C3)C=C...,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
...,...,...,...,...
674,O14894,"4,4'-DIHYDROXYCHALCONE",C1=CC(=CC=C1C=CC(=O)C2=CC=C(C=C2)O)O,MCTGKCARCVGLSLITLCLVCIVANALLLVPNGETSWTNTNHLSLQ...
675,P81277,KISSPEPTIN-10,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)NC(CC1=CC=CC=C...,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...
676,P81277,DAMGO,CC(C(=O)NCC(=O)N(C)C(CC1=CC=CC=C1)C(=O)NCCO)NC...,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...
677,P81277,NALOXONE,C=CCN1CCC23C4C(=O)CCC2(C1CC5=C3C(=C(C=C5)O)O4)O,MKVLRAWLLCLLMLGLALRGAASRTHRHSMEIRTPDINPAWYASRG...


In [6]:
# Earlier alternative kept for reference (loads a pre-built FASTA table instead):
# protein_inter = pd.read_csv("../../../data/fasta/fasta_df_a.csv")

# Reduce the interaction table to its protein side: drop the drug-specific columns,
# then collapse repeated rows so each (UniProt ID, FASTA Sequence) pair appears once.
# The original row labels are kept, which is why the displayed index has gaps.
protein_inter = (
    inter_df
    .drop(['Drug', 'SMILES'], axis=1)
    .drop_duplicates()
)
protein_inter

Unnamed: 0,UniProt ID,FASTA Sequence
0,Q9UBK2,MAWDMCNQDSESVWSDIECAALVGEDQPLCPDLPELDLSELDVNDL...
2,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
12,P36894,MPQLYIYIRLLGAYLFIISRVQGQNLDSMLHGTGMKSDSDQKKSEN...
17,P20393,MTTLDSNNNTGGVITYIGSSGSSPSRTSPESLYSDNSNGSFQSLTQ...
20,P06213,MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTR...
...,...,...
657,Q5NUL3,MSPECARAAGDAPLRSLEQANRTRFPFFSDVKGDHRLVLAAVETTV...
658,Q9H093,MESLVFARRSGPTPSAAELARPLAEGLIKSPKPLMKKQAVKRHHHK...
673,Q96P68,MNEPLDYLANASDFPDYAAAFGNCTDENIPLKMHYLPVIYGIIFLV...
674,O14894,MCTGKCARCVGLSLITLCLVCIVANALLLVPNGETSWTNTNHLSLQ...


In [7]:
# Collect the raw FASTA strings, one per row of `protein_inter`.
fasta_list = protein_inter["FASTA Sequence"].tolist()

# protlearn's `remove_unnatural` returns a filtered list: per its documentation,
# sequences containing non-standard residues are dropped rather than repaired.
seqs = remove_unnatural(fasta_list)

# Every feature table built from `seqs` is later joined to
# `protein_inter['UniProt ID']` purely by row position. If a sequence were dropped
# here, all following feature rows would shift onto the wrong protein, so fail fast
# instead of silently producing mislabeled features.
assert len(seqs) == len(fasta_list), (
    f"remove_unnatural dropped {len(fasta_list) - len(seqs)} sequence(s); "
    "filter protein_inter to the same rows before building feature tables"
)
len(seqs)

143

In [8]:
# Feature Engineering Fasta - AAIndex1
# `aaindex1` is called with standardize="zscore" and yields two values: the feature
# matrix (`aaind`) and the matching descriptor identifiers (`inds`), which become
# the column labels (e.g. ANDN920101 ... KARS160122 in the output below).
aaind, inds = aaindex1(seqs, standardize="zscore")

aaindex_feats = pd.DataFrame(data=aaind, columns=inds)
aaindex_feats

Unnamed: 0,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,BIOV880101,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,3.066707,-2.170256,-1.977858,-2.333132,-2.488069,-2.677095,2.738514,2.638519,-0.334663,-2.453463,...,0.786476,1.497195,1.333951,1.710537,1.116361,1.790829,1.253216,0.093426,1.453492,1.391010
1,1.383463,0.391879,-0.959167,-0.895293,-0.823573,0.666852,0.722144,-0.242320,0.448794,0.459470,...,0.904216,0.622215,0.815818,0.684566,1.251251,-0.314495,0.867716,-0.476666,0.726892,-0.836357
2,0.160383,0.446220,0.006955,-0.031379,-0.234900,0.376213,0.147584,-0.092977,0.645348,0.397075,...,0.406457,0.735826,0.615034,0.866117,0.649316,0.935664,0.686485,-0.791727,0.532325,-0.506287
3,1.550796,-1.720927,-0.168155,-0.463940,-1.956911,-1.388424,1.839802,0.917593,-1.603182,-0.861150,...,-0.793959,-0.580371,-0.518858,-0.462731,-0.854349,0.762912,-0.338028,0.458311,0.074486,2.004152
4,0.671017,0.138592,-0.442222,-0.534896,-0.752958,-0.382287,0.846182,0.265544,0.340824,-0.010687,...,0.528128,0.582875,0.541818,0.653083,0.695783,0.337997,0.739079,0.002996,0.735284,0.368676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,-1.668818,2.598595,2.728783,2.778199,1.956135,2.724477,-2.644934,-2.349186,1.556375,2.776795,...,0.558923,-0.385446,-0.429205,-0.762463,-0.030775,-0.103047,0.091890,-0.122846,0.044905,0.615119
139,-0.091986,-0.431134,-0.850523,-0.889586,-0.457664,-1.314357,0.877325,1.213361,-0.410766,-1.026330,...,-0.008074,0.091502,0.122063,0.260857,-0.240885,-0.867749,-0.750275,0.933385,-0.442723,0.228327
140,0.449759,2.457485,2.295890,2.158853,0.374100,2.402465,-1.457472,-2.448792,1.659945,2.644675,...,0.783185,0.266564,0.627526,0.147723,0.799098,1.555960,1.348129,-2.907567,1.299479,0.729973
141,-1.646841,0.029187,2.121777,1.724249,-0.375711,0.849929,0.514484,-1.778334,-1.817757,2.509689,...,-2.871880,-2.333196,-2.698247,-2.041874,-2.252202,-0.356339,-1.938327,0.833921,-2.549134,-0.326071


In [9]:
# Attach each protein's UniProt ID to its AAIndex1 feature row.
#
# The feature frame is rebuilt from `aaind` / `inds` (the outputs of the aaindex1
# call) instead of from `aaindex_feats` itself. Feeding `aaindex_feats` back into its
# own definition made the cell non-idempotent: running it a second time stacked
# another 'UniProt ID' column onto the already-joined table.
#
# The join is positional: `protein_inter` keeps the gapped row labels left over from
# drop_duplicates, so its index is reset to 0..n-1 to line up with the feature rows.
aaindex_feats = pd.concat(
    [
        protein_inter[['UniProt ID']].reset_index(drop=True),
        pd.DataFrame(aaind, columns=inds),
    ],
    axis=1,
)
aaindex_feats

Unnamed: 0,UniProt ID,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,Q9UBK2,3.066707,-2.170256,-1.977858,-2.333132,-2.488069,-2.677095,2.738514,2.638519,-0.334663,...,0.786476,1.497195,1.333951,1.710537,1.116361,1.790829,1.253216,0.093426,1.453492,1.391010
1,O15379,1.383463,0.391879,-0.959167,-0.895293,-0.823573,0.666852,0.722144,-0.242320,0.448794,...,0.904216,0.622215,0.815818,0.684566,1.251251,-0.314495,0.867716,-0.476666,0.726892,-0.836357
2,P36894,0.160383,0.446220,0.006955,-0.031379,-0.234900,0.376213,0.147584,-0.092977,0.645348,...,0.406457,0.735826,0.615034,0.866117,0.649316,0.935664,0.686485,-0.791727,0.532325,-0.506287
3,P20393,1.550796,-1.720927,-0.168155,-0.463940,-1.956911,-1.388424,1.839802,0.917593,-1.603182,...,-0.793959,-0.580371,-0.518858,-0.462731,-0.854349,0.762912,-0.338028,0.458311,0.074486,2.004152
4,P06213,0.671017,0.138592,-0.442222,-0.534896,-0.752958,-0.382287,0.846182,0.265544,0.340824,...,0.528128,0.582875,0.541818,0.653083,0.695783,0.337997,0.739079,0.002996,0.735284,0.368676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,Q5NUL3,-1.668818,2.598595,2.728783,2.778199,1.956135,2.724477,-2.644934,-2.349186,1.556375,...,0.558923,-0.385446,-0.429205,-0.762463,-0.030775,-0.103047,0.091890,-0.122846,0.044905,0.615119
139,Q9H093,-0.091986,-0.431134,-0.850523,-0.889586,-0.457664,-1.314357,0.877325,1.213361,-0.410766,...,-0.008074,0.091502,0.122063,0.260857,-0.240885,-0.867749,-0.750275,0.933385,-0.442723,0.228327
140,Q96P68,0.449759,2.457485,2.295890,2.158853,0.374100,2.402465,-1.457472,-2.448792,1.659945,...,0.783185,0.266564,0.627526,0.147723,0.799098,1.555960,1.348129,-2.907567,1.299479,0.729973
141,O14894,-1.646841,0.029187,2.121777,1.724249,-0.375711,0.849929,0.514484,-1.778334,-1.817757,...,-2.871880,-2.333196,-2.698247,-2.041874,-2.252202,-0.356339,-1.938327,0.833921,-2.549134,-0.326071


In [10]:
# Per-column count of missing values; all zeros means the positional ID join
# did not leave any unmatched rows.
aaindex_feats.isnull().sum()

UniProt ID    0
ANDN920101    0
ARGP820101    0
ARGP820102    0
ARGP820103    0
             ..
KARS160118    0
KARS160119    0
KARS160120    0
KARS160121    0
KARS160122    0
Length: 554, dtype: int64

In [19]:
# Persist the AAIndex1 feature table (UniProt ID + descriptor columns).
# index=False: the positional row index is not written to the file.
aaindex_feats.to_csv(
    path_or_buf='../../data/feature/prot/aaindex_nrlmf_chembl.csv',
    index=False,
)

In [11]:
# Feature Engineering Fasta - AAC (amino acid composition)
# `aac` yields the composition matrix (`comp`) and the residue labels (`aa`);
# list(aa) turns the labels into one column name per residue (A, C, D, ... Y in
# the output below).
comp, aa = aac(seqs)

# Build the feature frame and prepend the UniProt IDs in one step. The join is
# positional, so the ID column's index is reset to 0..n-1 to match the feature rows.
aac_df = pd.concat(
    [
        protein_inter[['UniProt ID']].reset_index(drop=True),
        pd.DataFrame(comp, columns=list(aa)),
    ],
    axis=1,
)
aac_df

Unnamed: 0,UniProt ID,A,C,D,E,F,G,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,Q9UBK2,0.038847,0.025063,0.076441,0.080201,0.032581,0.036341,0.020050,0.026316,0.058897,...,0.007519,0.050125,0.066416,0.051378,0.078947,0.140351,0.058897,0.026316,0.007519,0.036341
1,O15379,0.053738,0.028037,0.077103,0.063084,0.058411,0.074766,0.044393,0.051402,0.035047,...,0.018692,0.049065,0.049065,0.039720,0.042056,0.051402,0.042056,0.067757,0.004673,0.067757
2,P36894,0.058271,0.043233,0.054511,0.056391,0.031955,0.063910,0.020677,0.075188,0.060150,...,0.030075,0.039474,0.037594,0.041353,0.054511,0.071429,0.052632,0.054511,0.015038,0.043233
3,P20393,0.063518,0.024430,0.032573,0.045603,0.042345,0.065147,0.027687,0.021173,0.034202,...,0.029316,0.060261,0.089577,0.050489,0.058632,0.133550,0.057003,0.052117,0.004886,0.017915
4,P06213,0.047757,0.034009,0.050651,0.076700,0.042692,0.066570,0.026773,0.047757,0.048480,...,0.021708,0.052098,0.059334,0.027496,0.059334,0.075977,0.049928,0.062952,0.014472,0.036903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,Q5NUL3,0.088643,0.030471,0.027701,0.027701,0.066482,0.041551,0.011080,0.080332,0.024931,...,0.016620,0.027701,0.044321,0.027701,0.072022,0.072022,0.052632,0.099723,0.027701,0.013850
139,Q9H093,0.071656,0.014331,0.049363,0.070064,0.025478,0.074841,0.039809,0.042994,0.068471,...,0.014331,0.027070,0.076433,0.036624,0.068471,0.097134,0.033439,0.047771,0.011146,0.023885
140,Q96P68,0.062315,0.047478,0.029674,0.023739,0.068249,0.026706,0.026706,0.100890,0.032641,...,0.020772,0.056380,0.038576,0.017804,0.038576,0.083086,0.074184,0.065282,0.011869,0.047478
141,O14894,0.081218,0.091371,0.020305,0.025381,0.030457,0.131980,0.015228,0.040609,0.020305,...,0.025381,0.050761,0.035533,0.015228,0.055838,0.050761,0.050761,0.086294,0.025381,0.015228


In [13]:
# Per-column count of missing values in the AAC table (expected: all zeros).
aac_df.isnull().sum()

UniProt ID    0
A             0
C             0
D             0
E             0
F             0
G             0
H             0
I             0
K             0
L             0
M             0
N             0
P             0
Q             0
R             0
S             0
T             0
V             0
W             0
Y             0
dtype: int64

In [21]:
# feature_aac = '../../../data/feature/aac_fitur.csv'
# aac_df_b.to_csv(feature_aac, index= False)

In [14]:
# Feature Engineering Fasta - PAAC (pseudo amino acid composition)
#
# The result is bound to `paac_values`, not `paac`. Assigning it to `paac` replaced
# the imported `paac` function with its own return value, so running this cell a
# second time failed with "object is not callable".
paac_values, desc = paac(seqs)

# `desc` supplies the column labels (A ... Y followed by lambda1 ... lambda30 in the
# output below).
paac_df = pd.DataFrame(paac_values, columns=list(desc))

# Positional join with the UniProt IDs; the ID column's index is reset to 0..n-1
# so it lines up with the feature rows.
paac_df = pd.concat(
    [
        protein_inter[['UniProt ID']].reset_index(drop=True),
        paac_df.reset_index(drop=True),
    ],
    axis=1,
)
paac_df

Unnamed: 0,UniProt ID,A,C,D,E,F,G,H,I,K,...,lambda21,lambda22,lambda23,lambda24,lambda25,lambda26,lambda27,lambda28,lambda29,lambda30
0,Q9UBK2,8.392463,5.414492,16.514202,17.326375,7.038840,7.851014,4.331594,5.685217,12.724057,...,0.024281,0.024291,0.025416,0.023530,0.024145,0.023755,0.024902,0.023883,0.023610,0.024707
1,O15379,6.049162,3.156084,8.679232,7.101190,6.575176,8.416225,4.997134,5.786155,3.945106,...,0.024141,0.023809,0.023924,0.025473,0.025120,0.024836,0.023188,0.026421,0.023579,0.024655
2,P36894,7.904335,5.864507,7.394378,7.649357,4.334636,8.669271,2.804764,10.199143,8.159314,...,0.025455,0.026202,0.026789,0.023657,0.024494,0.024887,0.025775,0.025150,0.024265,0.026425
3,P20393,11.306579,4.348684,5.798246,8.117544,7.537719,11.596491,4.928509,3.768860,6.088158,...,0.022686,0.024973,0.023975,0.022988,0.023430,0.023180,0.023373,0.022219,0.023159,0.024141
4,P06213,16.898808,12.033999,17.922978,27.140509,15.106510,23.555914,9.473574,16.898808,17.154850,...,0.024750,0.026032,0.025201,0.024866,0.024903,0.025252,0.025008,0.024895,0.024053,0.024623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,Q5NUL3,8.171606,2.808989,2.553627,2.553627,6.128704,3.830440,1.021451,7.405518,2.298264,...,0.025669,0.025526,0.025306,0.025695,0.024066,0.024742,0.026580,0.025372,0.025178,0.024201
139,Q9H093,11.357704,2.271541,7.824196,11.105310,4.038295,11.862490,6.309835,6.814622,10.852917,...,0.025021,0.025921,0.024100,0.024251,0.025776,0.025914,0.024098,0.025293,0.025519,0.024684
140,Q96P68,6.078460,4.631208,2.894505,2.315604,6.657361,2.605054,2.605054,9.841317,3.183955,...,0.023088,0.025417,0.023993,0.024071,0.025503,0.024242,0.024179,0.021892,0.023822,0.022426
141,O14894,4.352261,4.896293,1.088065,1.360082,1.632098,7.072424,0.816049,2.176130,1.088065,...,0.024261,0.025647,0.023637,0.026474,0.025633,0.025016,0.023712,0.025014,0.027465,0.023980


In [15]:
# Per-column count of missing values in the PAAC table (expected: all zeros).
paac_df.isnull().sum()

UniProt ID    0
A             0
C             0
D             0
E             0
F             0
G             0
H             0
I             0
K             0
L             0
M             0
N             0
P             0
Q             0
R             0
S             0
T             0
V             0
W             0
Y             0
lambda1       0
lambda2       0
lambda3       0
lambda4       0
lambda5       0
lambda6       0
lambda7       0
lambda8       0
lambda9       0
lambda10      0
lambda11      0
lambda12      0
lambda13      0
lambda14      0
lambda15      0
lambda16      0
lambda17      0
lambda18      0
lambda19      0
lambda20      0
lambda21      0
lambda22      0
lambda23      0
lambda24      0
lambda25      0
lambda26      0
lambda27      0
lambda28      0
lambda29      0
lambda30      0
dtype: int64

In [25]:
# feature_paac = '../../../data/feature/paac_fitur.csv'
# paac_df_b.to_csv(feature_paac, index= False)

In [17]:
# Feature Engineering Fasta - ATC (atomic and bond composition)
from protlearn.features import atc

# `atc` yields two arrays; in the output below they contribute 5 and 3 columns.
# NOTE(review): per the protlearn docs these look like C/H/N/O/S atom fractions and
# total/single/double bond counts - confirm before giving the columns real names.
atoms, bonds = atc(seqs)

# Put both arrays side by side; ignore_index=True relabels the combined columns
# 0..n-1 so the two blocks do not share duplicate labels.
atc_df = pd.concat(
    [pd.DataFrame(atoms), pd.DataFrame(bonds)],
    axis=1,
    ignore_index=True,
)

# Positional join with the UniProt IDs (index reset to 0..n-1 to match feature rows).
protein_ids = protein_inter[['UniProt ID']].reset_index(drop=True)
atc_df = pd.concat([protein_ids, atc_df.reset_index(drop=True)], axis=1)
atc_df

Unnamed: 0,UniProt ID,0,1,2,3,4,5,6,7
0,Q9UBK2,0.261790,0.516929,0.077455,0.142080,0.001747,14224.0,12936.0,1288.0
1,O15379,0.274581,0.515512,0.071929,0.135477,0.002502,7664.0,6912.0,752.0
2,P36894,0.266221,0.526452,0.072270,0.131164,0.003893,9573.0,8736.0,837.0
3,P20393,0.261121,0.522241,0.076514,0.137147,0.002978,10584.0,9661.0,923.0
4,P06213,0.267874,0.522500,0.073352,0.133300,0.002974,24776.0,22542.0,2234.0
...,...,...,...,...,...,...,...,...,...
138,Q5NUL3,0.271637,0.535472,0.070221,0.120214,0.002456,6629.0,6067.0,562.0
139,Q9H093,0.263063,0.526383,0.076837,0.132174,0.001542,11164.0,10207.0,957.0
140,Q96P68,0.274126,0.529915,0.067444,0.124942,0.003574,6167.0,5623.0,544.0
141,O14894,0.261156,0.530320,0.073799,0.128146,0.006579,3328.0,3045.0,283.0


In [18]:
# Per-column count of missing values in the ATC table (expected: all zeros).
atc_df.isnull().sum()

UniProt ID    0
0             0
1             0
2             0
3             0
4             0
5             0
6             0
7             0
dtype: int64

In [None]:
# feature_atc = '../../../data/feature/atc_fitur.csv'
# atc_df_b.to_csv(feature_atc, index= False)