In [1]:
import pandas as pd
import numpy as np
import glob
from tqdm.auto import tqdm
import itertools
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

## compound package
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
import pubchempy as pc
from padelpy import from_smiles, padeldescriptor

## protein package
from protlearn.preprocessing import remove_unnatural
from protlearn.features import aac
from protlearn.features import paac
from protlearn.features import aaindex1

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the protein table; the preview shows uniprot_id, protein_name and fasta_seq columns.
protein_inter = pd.read_csv(filepath_or_buffer="../../../data/fasta/fasta_df_a.csv")
protein_inter

Unnamed: 0,uniprot_id,protein_name,fasta_seq
0,P05231,IL6_HUMAN,MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGEDSKDVAAPHRQP...
1,P04637,P53_HUMAN,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...
2,Q05469,LIPS_HUMAN,MEPGSKSVSRSDWQPEPHQRPITPLEPGPEKTPIAQPESKTLQGSN...
3,P46531,NOTC1_HUMAN,MPPLLAPLLCLALLPALAARGPRCSQPGETCLNGGKCEAANGTEAC...
4,P00750,TPA_HUMAN,MDAMKRGLCCVLLLCGAVFVSPSQEIHARFRRGARSYQVICRDEKT...
...,...,...,...
1168,Q6PHR2,ULK3_HUMAN,MAGPGWGPPRLDGFILTERLGSGTYATVYKAYAKKDTREVVAIKCV...
1169,Q8NE63,HIPK4_HUMAN,MSTIQSETDCYDIIEVLGKGTFGEVAKGWRRSTGEMVAIKILKNDA...
1170,O94768,ST17B_HUMAN,MSRRRFDCRSISGLLTTTPQIPIKMENFNNFYILTSKELGRGKFAV...
1171,Q9GZN0,GPR88_HUMAN,MTNSSSTSTSSTTGGSLLLLCEEEESWAGRRIPVSLLYSGLAIGGT...


In [3]:
# Load the interaction table; the preview shows uniprot_id, canonical_smiles,
# isomeric_smiles and class columns.
inter_cytos = pd.read_csv(filepath_or_buffer="../../../data/positif/inter_cytoscape_df.csv")
inter_cytos

Unnamed: 0,uniprot_id,canonical_smiles,isomeric_smiles,class
0,P05231,CC12C=CC(=O)C=C1CCC1C2C(O)CC2(C)C1CCC2(O)C(=O)CO,[H][C@@]12CC[C@](O)(C(=O)CO)[C@@]1(C)C[C@H](O)...,1
1,P05231,O=C1CCC(N2Cc3c(OCc4ccc(CN5CCOCC5)cc4)cccc3C2=O...,O=C1N(Cc2c1cccc2OCc1ccc(CN2CCOCC2)cc1)[C@H]1CC...,1
2,P05231,Cc1cnc(NC(=O)c2ccc3c(c2)CCC2CC(O)(C(F)(F)F)CCC...,Cc1cnc(NC(=O)c2ccc3c(CC[C@@H]4C[C@](O)(CC[C@@]...,1
3,P05231,Cc1ncccc1NC(=O)c1ccc2c(c1)CCC1CC(O)(CCC(F)(F)F...,Cc1ncccc1NC(=O)c1ccc2c(CC[C@@H]3C[C@@](O)(CCC(...,1
4,P05231,Cc1ccncc1NC(=O)c1ccc2c(c1)CCC1CC(O)(C(F)(F)F)C...,Cc1ccncc1NC(=O)c1ccc2c(CC[C@@H]3C[C@](O)(CC[C@...,1
...,...,...,...,...
134114,P25098,O=C(NCc1ccccc1Cl)c1cccc(NCc2nnc(-c3ccncc3)[nH]...,Clc1ccccc1CNC(=O)c1cccc(NCc2nnc([nH]2)-c2ccncc...,1
134115,P25098,COc1ccccc1CNC(=O)c1cccc(NCc2nnc(-c3ccncc3)[nH]...,COc1ccccc1CNC(=O)c1cccc(NCc2nnc([nH]2)-c2ccncc...,1
134116,P25098,O=C(NCc1c(F)cccc1F)c1cccc(NCc2nnc3n2CCOc2cnccc...,Fc1cccc(F)c1CNC(=O)c1cccc(NCc2nnc3-c4ccncc4OCC...,1
134117,P25098,COc1cccc(Cn2cnc3ccc(-c4cn[nH]c4)cc3c2=O)c1,COc1cccc(Cn2cnc3ccc(cc3c2=O)-c2cn[nH]c2)c1,1


In [4]:
# Keep only the proteins whose UniProt ID also occurs in the interaction table.
# The original row labels of protein_inter are retained (no index reset here).
filtered_data = protein_inter.loc[
    lambda frame: frame['uniprot_id'].isin(inter_cytos['uniprot_id'])
]
filtered_data

Unnamed: 0,uniprot_id,protein_name,fasta_seq
0,P05231,IL6_HUMAN,MNSFSTSAFGPVAFSLGLLLVLPAAFPAPVPPGEDSKDVAAPHRQP...
1,P04637,P53_HUMAN,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...
3,P46531,NOTC1_HUMAN,MPPLLAPLLCLALLPALAARGPRCSQPGETCLNGGKCEAANGTEAC...
5,P01375,TNFA_HUMAN,MSTESMIRDVELAEEALPKKTGGPQGSRRCLFLSLFSFLIVAGATT...
7,P31749,AKT1_HUMAN,MSDVAIVKEGWLHKRGEYIKTWRPRYFLLKNDGTFIGYKERPQDVD...
...,...,...,...
888,Q9GZQ8,MLP3B_HUMAN,MPSEKTFKQRRTFEQRVEDVRLIREQHPTKIPVIIERYKGEKQLPV...
891,P22694,KAPCB_HUMAN,MGNAATAKKGSEVESVKEFLAKAKEDFLKKWENPTQNNAGLEDFER...
917,P41594,GRM5_HUMAN,MVLLLILSVLLLKEDVRGSAQSSERRVVAHMPGDIIIGALFSVHHQ...
960,Q9UQM7,KCC2A_HUMAN,MATITCTRFTEEYQLFEELGKGAFSVVRRCVKVLAGQEYAAKIINT...


In [5]:
# Pull the FASTA sequences out as a plain Python list and report how many there are.
fasta_list = filtered_data.loc[:, "fasta_seq"].to_list()
print(len(fasta_list))

250


In [6]:
## Remove sequences that contain non-natural amino acids
seqs = remove_unnatural(fasta_list)
print(len(seqs))

# remove_unnatural discards whole sequences instead of masking residues, so a
# shorter result no longer lines up row-for-row with filtered_data. Later cells
# attach uniprot_id purely by position, so stop here rather than mislabel rows.
if len(seqs) != len(fasta_list):
    raise ValueError(
        f"remove_unnatural dropped {len(fasta_list) - len(seqs)} sequence(s); "
        "restrict filtered_data to the surviving sequences before attaching uniprot_id."
    )

250


In [7]:
## Feature engineering on FASTA sequences: AAIndex1 descriptors
# z-score standardisation is requested from protlearn directly.
# NOTE(review): presumably the standardisation is computed over all sequences
# passed in here — confirm this is acceptable before any train/test split.
aaind, inds = aaindex1(seqs, standardize="zscore")

# Wrap the feature matrix in a DataFrame, using the returned index names as column labels.
aaindex_feats = pd.DataFrame(data=aaind, columns=inds)
aaindex_feats

Unnamed: 0,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,BIOV880101,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,0.547860,-0.806375,1.122797,0.993967,0.901913,-0.407392,-1.123467,-0.406683,0.037629,-0.392525,...,-0.018397,-0.030017,0.284803,-0.379376,-0.330449,1.309956,0.418666,0.305048,0.247866,-0.277617
1,1.452201,-0.789701,-0.810213,-0.945349,-1.839315,-1.817951,1.952321,1.247417,-1.219126,-1.291299,...,-0.068955,-0.176163,0.006610,-0.133951,-0.573911,0.300568,-0.172753,0.570096,0.782865,2.575731
2,1.917058,-2.076390,-0.450742,-1.124569,-3.455047,-1.778815,2.760809,0.467887,-3.323410,0.016633,...,-3.152039,-2.487183,-2.245594,-2.141387,-2.325624,1.343353,-1.091530,0.590342,-1.321354,1.340941
3,-1.819937,-0.021239,0.790435,0.745496,0.494711,0.197155,-1.016935,0.281353,-0.537302,0.412860,...,-0.962635,-1.340863,-1.317873,-1.481905,-1.260887,-0.895102,-1.289387,0.914975,-1.434610,0.152279
4,0.235076,0.119467,-0.758563,-0.547985,0.526117,0.149263,-0.193600,0.015412,1.085001,-0.481558,...,1.420515,1.727314,1.420169,1.613305,1.648406,-0.571471,0.799049,-0.536437,0.928923,-1.075826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,-1.172829,0.615396,-0.362487,-0.247263,0.918840,0.901192,-1.557151,0.370087,2.236261,-0.326379,...,1.955941,2.637523,2.168672,2.353255,2.061364,0.263433,2.057782,-0.068128,2.170943,0.292081
246,0.015278,1.091769,-0.618308,-0.293560,1.017486,0.498179,-0.748460,-0.441568,1.533792,-0.108949,...,1.427967,1.315828,1.403781,1.151442,1.488188,-1.018822,0.141978,-0.078139,0.241712,-2.079991
247,-0.616619,-0.178797,0.079077,0.159962,-0.297389,0.071973,-0.024606,0.189752,-1.162580,0.220042,...,-1.406762,-1.258938,-1.400364,-1.000360,-1.403534,0.157854,-1.322555,-0.279911,-1.253076,0.188224
248,-0.600086,-0.009338,-0.630617,-0.465073,0.674337,0.305443,-0.664615,-0.195164,0.238606,-0.091031,...,0.137076,0.228908,0.178580,0.292715,0.361121,-1.099671,-0.512767,-1.178603,-0.676119,-1.301813


In [8]:
# Attach uniprot_id by row position. Positional pairing is only valid when the
# feature table has exactly one row per protein in filtered_data; a mismatch
# would make pd.concat pad with NaN and pair features with the wrong IDs.
assert len(aaindex_feats) == len(filtered_data), (
    "AAIndex1 features and filtered_data have different row counts"
)

# This cell overwrites its own input. Drop any uniprot_id column left by a
# previous run so that re-executing the cell does not keep prepending
# duplicate ID columns.
aaindex_feats = pd.concat(
    [
        filtered_data[['uniprot_id']].reset_index(drop=True),
        aaindex_feats.drop(columns=['uniprot_id'], errors='ignore').reset_index(drop=True),
    ],
    axis=1,
)

In [9]:
# Display the AAIndex1 feature table, now with uniprot_id as its first column.
aaindex_feats

Unnamed: 0,uniprot_id,ANDN920101,ARGP820101,ARGP820102,ARGP820103,BEGF750101,BEGF750102,BEGF750103,BHAR880101,BIGC670101,...,KARS160113,KARS160114,KARS160115,KARS160116,KARS160117,KARS160118,KARS160119,KARS160120,KARS160121,KARS160122
0,P05231,0.547860,-0.806375,1.122797,0.993967,0.901913,-0.407392,-1.123467,-0.406683,0.037629,...,-0.018397,-0.030017,0.284803,-0.379376,-0.330449,1.309956,0.418666,0.305048,0.247866,-0.277617
1,P04637,1.452201,-0.789701,-0.810213,-0.945349,-1.839315,-1.817951,1.952321,1.247417,-1.219126,...,-0.068955,-0.176163,0.006610,-0.133951,-0.573911,0.300568,-0.172753,0.570096,0.782865,2.575731
2,P46531,1.917058,-2.076390,-0.450742,-1.124569,-3.455047,-1.778815,2.760809,0.467887,-3.323410,...,-3.152039,-2.487183,-2.245594,-2.141387,-2.325624,1.343353,-1.091530,0.590342,-1.321354,1.340941
3,P01375,-1.819937,-0.021239,0.790435,0.745496,0.494711,0.197155,-1.016935,0.281353,-0.537302,...,-0.962635,-1.340863,-1.317873,-1.481905,-1.260887,-0.895102,-1.289387,0.914975,-1.434610,0.152279
4,P31749,0.235076,0.119467,-0.758563,-0.547985,0.526117,0.149263,-0.193600,0.015412,1.085001,...,1.420515,1.727314,1.420169,1.613305,1.648406,-0.571471,0.799049,-0.536437,0.928923,-1.075826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Q9GZQ8,-1.172829,0.615396,-0.362487,-0.247263,0.918840,0.901192,-1.557151,0.370087,2.236261,...,1.955941,2.637523,2.168672,2.353255,2.061364,0.263433,2.057782,-0.068128,2.170943,0.292081
246,P22694,0.015278,1.091769,-0.618308,-0.293560,1.017486,0.498179,-0.748460,-0.441568,1.533792,...,1.427967,1.315828,1.403781,1.151442,1.488188,-1.018822,0.141978,-0.078139,0.241712,-2.079991
247,P41594,-0.616619,-0.178797,0.079077,0.159962,-0.297389,0.071973,-0.024606,0.189752,-1.162580,...,-1.406762,-1.258938,-1.400364,-1.000360,-1.403534,0.157854,-1.322555,-0.279911,-1.253076,0.188224
248,Q9UQM7,-0.600086,-0.009338,-0.630617,-0.465073,0.674337,0.305443,-0.664615,-0.195164,0.238606,...,0.137076,0.228908,0.178580,0.292715,0.361121,-1.099671,-0.512767,-1.178603,-0.676119,-1.301813


In [12]:
# Count missing values per column to confirm the feature table is complete.
aaindex_feats.isnull().sum()

uniprot_id    0
ANDN920101    0
ARGP820101    0
ARGP820102    0
ARGP820103    0
             ..
KARS160118    0
KARS160119    0
KARS160120    0
KARS160121    0
KARS160122    0
Length: 554, dtype: int64

In [13]:
# Optional export of the AAIndex1 feature table to CSV (currently disabled).
#fasta_aaindex = '../../../data/feature/aaindex_fitur.csv'
#aaindex_feats.to_csv(fasta_aaindex, index= False)

In [14]:
## Feature engineering on FASTA sequences: amino acid composition (AAC)
comp, aa = aac(seqs)

# Label each composition column with the corresponding entry of `aa`
# (single-letter amino acid codes in the displayed table).
aac_df = pd.DataFrame(data=comp, columns=[residue for residue in aa])
aac_df

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
0,0.084906,0.018868,0.037736,0.075472,0.051887,0.033019,0.009434,0.042453,0.066038,0.132075,0.028302,0.051887,0.051887,0.066038,0.042453,0.089623,0.061321,0.037736,0.004717,0.014151
1,0.061069,0.025445,0.050891,0.076336,0.027990,0.058524,0.030534,0.020356,0.050891,0.081425,0.030534,0.035623,0.114504,0.038168,0.066158,0.096692,0.055980,0.045802,0.010178,0.022901
2,0.066536,0.096282,0.060665,0.053229,0.025049,0.100587,0.032094,0.021918,0.024658,0.073190,0.011350,0.059100,0.078278,0.047358,0.040313,0.073973,0.055186,0.048532,0.007828,0.023875
3,0.081545,0.017167,0.030043,0.068670,0.042918,0.072961,0.017167,0.051502,0.034335,0.128755,0.008584,0.030043,0.064378,0.055794,0.060086,0.085837,0.042918,0.068670,0.008584,0.030043
4,0.052083,0.014583,0.058333,0.102083,0.056250,0.062500,0.027083,0.041667,0.075000,0.085417,0.033333,0.027083,0.045833,0.035417,0.062500,0.047917,0.062500,0.058333,0.014583,0.037500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,0.024000,0.000000,0.040000,0.096000,0.056000,0.032000,0.024000,0.064000,0.080000,0.088000,0.040000,0.032000,0.048000,0.056000,0.072000,0.072000,0.048000,0.096000,0.000000,0.032000
246,0.068376,0.005698,0.054131,0.076923,0.068376,0.062678,0.022792,0.056980,0.096866,0.091168,0.019943,0.039886,0.037037,0.034188,0.048433,0.045584,0.048433,0.059829,0.017094,0.045584
247,0.080033,0.031353,0.043729,0.052805,0.035479,0.075908,0.018152,0.065182,0.052805,0.082508,0.028053,0.033003,0.055281,0.032178,0.048680,0.100660,0.054455,0.062706,0.012376,0.034653
248,0.077406,0.020921,0.050209,0.081590,0.037657,0.066946,0.043933,0.064854,0.066946,0.089958,0.016736,0.029289,0.041841,0.035565,0.056485,0.056485,0.058577,0.056485,0.016736,0.031381


In [15]:
# Attach uniprot_id by row position. Positional pairing is only valid when the
# AAC table has exactly one row per protein in filtered_data; a mismatch would
# make pd.concat pad with NaN and pair compositions with the wrong IDs.
assert len(aac_df) == len(filtered_data), (
    "AAC features and filtered_data have different row counts"
)

# Both indexes are reset so the column-wise concat aligns rows 0..n-1 instead
# of the original (gappy) row labels kept by filtered_data.
aac_df_b = pd.concat(
    [
        filtered_data[['uniprot_id']].reset_index(drop=True),
        aac_df.reset_index(drop=True),
    ],
    axis=1,
)
aac_df_b