In [1]:
#Carico librerie, funzioni utili e modelli
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor

# One seed value shared by NumPy's global RNG and the estimators/splitter below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [2]:
# Load the DrugSpaceX feature table.
DSX = pd.read_csv("DSX_features_142.csv")
# Shuffle the rows and hold out 25% of them as the test set. Only the feature
# frame is split: no target vector is passed.
X_train, X_test = train_test_split(
    DSX,
    test_size=0.25,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [10]:
# Fit an EllipticEnvelope outlier detector on the training features.
# contamination is the expected proportion of outliers; support_fraction is the
# proportion of points used for the robust covariance estimate.
model_1 = EllipticEnvelope(
    random_state=RANDOM_STATE, contamination=0.1, support_fraction=0.9
).fit(X_train)

# Optional persistence of the fitted model. This used to be "commented out" with
# a bare triple-quoted string, which is still evaluated (and echoed as the cell
# result). An explicit switch keeps the default behaviour (nothing is written)
# and the context manager guarantees the file is closed when saving is enabled.
SAVE_MODEL_1 = False
if SAVE_MODEL_1:
    with open("EllipticEnvelope_142_01.sav", "wb") as model_file:
        pickle.dump(model_1, model_file)

In [14]:
# Fit a LocalOutlierFactor novelty detector on the full training set.
# novelty=True is what makes predict() available for new, unseen samples.
model_2 = LocalOutlierFactor(
    n_neighbors=20, contamination="auto", novelty=True
).fit(X_train)

# Optional persistence of the fitted model. This used to be "commented out" with
# a bare triple-quoted string, which is still evaluated (and echoed as the cell
# result). An explicit switch keeps the default behaviour (nothing is written)
# and the context manager guarantees the file is closed when saving is enabled.
SAVE_MODEL_2 = False
if SAVE_MODEL_2:
    with open("LocalOutlierFactor_142.sav", "wb") as model_file:
        pickle.dump(model_2, model_file)

In [16]:
# Build a third model: first filter the training set with model_1 (EllipticEnvelope), then fit LOF on the rows it keeps as inliers

In [23]:
# Renumber the training rows from 0, discarding the shuffled labels left by the
# split (the original author marks this step as not required).
X_train = X_train.reset_index(drop=True)

In [28]:
# Classify every training molecule with model_1; the notebook treats a
# prediction of 1 as "inlier" (the other value seen in the output is -1).
# NOTE(review): X_train later gains an "inlier" column, so re-running this cell
# after that assignment would hand predict() one column more than the model was
# fitted on.
prediction_X_train = model_1.predict(X_train)

In [29]:
# Attach the model_1 predictions to the training frame as a new "inlier" column.
# This mutates X_train in place: from here on it is no longer a pure feature matrix.
X_train["inlier"] = prediction_X_train

In [30]:
# Display the training frame, now including the "inlier" column.
X_train

Unnamed: 0,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BCUT2D_MWHI,BCUT2D_MWLOW,BalabanJ,EState_VSA11,...,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,fr_Al_COO,fr_Ar_N,fr_benzene,fr_phenol_noOrthoHbond,fr_phos_ester,inlier
0,2.570725,-2.296475,2.350283,-2.629019,7.981256,-0.141921,32.166388,9.963254,1.616942,43.150835,...,765.306285,259.326141,96.341896,-75.133098,2.0,0.0,3.0,2.0,0.0,1
1,2.257254,-2.214359,2.321778,-2.277620,7.915712,0.103375,32.233457,9.998832,1.784718,0.000000,...,16.179002,0.334241,0.968609,-0.307202,0.0,0.0,2.0,0.0,0.0,-1
2,2.582084,-2.494463,2.457842,-2.650842,7.997956,-0.156942,32.166681,9.870645,1.864470,4.390415,...,56.884832,21.910577,4.766433,-6.166719,2.0,1.0,0.0,0.0,0.0,1
3,2.114672,-2.108312,2.283061,-2.122840,6.175353,0.103298,35.495663,10.182167,2.588968,0.000000,...,11.385041,0.000000,0.948545,0.976006,0.0,0.0,1.0,0.0,0.0,1
4,2.324205,-2.318977,2.385178,-2.488328,14.115504,0.015617,126.914741,10.060823,2.710901,0.000000,...,35.716411,59.194215,0.337135,-1.507778,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525445,2.091654,-2.277972,1.917228,-2.467883,4.698224,0.230976,15.187233,10.267161,2.131282,0.000000,...,2.403935,3.305471,5.661168,0.000000,0.0,0.0,0.0,0.0,0.0,1
525446,2.325645,-2.206212,2.198629,-2.371269,5.955362,0.110249,16.724033,10.015980,4.562920,0.000000,...,59.587161,136.889491,29.514924,-8.003152,0.0,0.0,0.0,0.0,0.0,1
525447,2.239163,-2.265961,2.187031,-2.424064,5.952558,-0.869216,16.153707,10.039623,2.044148,0.000000,...,14.346208,0.000000,1.064218,0.289829,0.0,0.0,0.0,0.0,0.0,1
525448,2.962826,-2.446394,2.750388,-2.635426,5.340747,-0.407361,17.077246,9.985688,0.322911,0.000000,...,0.000000,1361.408091,0.000000,-130.800825,0.0,0.0,0.0,0.0,0.0,1


In [37]:
# Keep only the training rows that model_1 labelled as inliers (value 1).
is_inlier = X_train["inlier"] == 1
X_train_clear = X_train[is_inlier]

In [39]:
# Remove the helper "inlier" column so X_train_clear holds features only.
# X_train_clear is a boolean-mask selection of X_train, so dropping with
# inplace=True raised SettingWithCopyWarning; rebinding to the frame returned by
# drop() gives the same result without mutating a derived slice.
X_train_clear = X_train_clear.drop(columns="inlier")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_clear.drop(columns="inlier", inplace=True)


In [40]:
# Display the filtered training frame (inlier rows only, helper column removed).
X_train_clear

Unnamed: 0,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BCUT2D_MWHI,BCUT2D_MWLOW,BalabanJ,EState_VSA11,...,SlogP_VSA3,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,fr_Al_COO,fr_Ar_N,fr_benzene,fr_phenol_noOrthoHbond,fr_phos_ester
0,2.570725,-2.296475,2.350283,-2.629019,7.981256,-0.141921,32.166388,9.963254,1.616942,43.150835,...,282.904336,765.306285,259.326141,96.341896,-75.133098,2.0,0.0,3.0,2.0,0.0
2,2.582084,-2.494463,2.457842,-2.650842,7.997956,-0.156942,32.166681,9.870645,1.864470,4.390415,...,24.015737,56.884832,21.910577,4.766433,-6.166719,2.0,1.0,0.0,0.0,0.0
3,2.114672,-2.108312,2.283061,-2.122840,6.175353,0.103298,35.495663,10.182167,2.588968,0.000000,...,16.080608,11.385041,0.000000,0.948545,0.976006,0.0,0.0,1.0,0.0,0.0
4,2.324205,-2.318977,2.385178,-2.488328,14.115504,0.015617,126.914741,10.060823,2.710901,0.000000,...,4.736863,35.716411,59.194215,0.337135,-1.507778,0.0,0.0,1.0,0.0,0.0
5,2.584004,-2.337769,2.367478,-2.626298,7.981230,-0.148405,32.166386,9.967228,1.361620,52.739909,...,268.578398,753.319014,239.334604,62.099595,-72.339204,6.0,4.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525445,2.091654,-2.277972,1.917228,-2.467883,4.698224,0.230976,15.187233,10.267161,2.131282,0.000000,...,0.000000,2.403935,3.305471,5.661168,0.000000,0.0,0.0,0.0,0.0,0.0
525446,2.325645,-2.206212,2.198629,-2.371269,5.955362,0.110249,16.724033,10.015980,4.562920,0.000000,...,9.473726,59.587161,136.889491,29.514924,-8.003152,0.0,0.0,0.0,0.0,0.0
525447,2.239163,-2.265961,2.187031,-2.424064,5.952558,-0.869216,16.153707,10.039623,2.044148,0.000000,...,4.794537,14.346208,0.000000,1.064218,0.289829,0.0,0.0,0.0,0.0,0.0
525448,2.962826,-2.446394,2.750388,-2.635426,5.340747,-0.407361,17.077246,9.985688,0.322911,0.000000,...,369.475310,0.000000,1361.408091,0.000000,-130.800825,0.0,0.0,0.0,0.0,0.0


In [41]:
# Third model: a LOF novelty detector fitted only on the rows model_1 kept as inliers.
lof_params = dict(n_neighbors=20, contamination="auto", novelty=True)
model_3 = LocalOutlierFactor(**lof_params).fit(X_train_clear)

In [42]:
# Optional persistence of model_3. This used to be "commented out" with a bare
# triple-quoted string, which is still evaluated (and echoed as the cell
# result). An explicit switch keeps the default behaviour (nothing is written)
# and the context manager guarantees the file is closed when saving is enabled.
SAVE_MODEL_3 = False
if SAVE_MODEL_3:
    with open("LocalOutlierFactor_142_filtered.sav", "wb") as model_file:
        pickle.dump(model_3, model_file)

In [43]:
# Run the three fitted models, in order, on the held-out rows.
prediction_1, prediction_2, prediction_3 = (
    fitted.predict(X_test) for fitted in (model_1, model_2, model_3)
)



In [44]:
# Reference labels for the held-out DrugSpaceX rows: every molecule is assumed
# a priori to be an inlier (1).
# Built directly as a constant Series aligned with X_test. The previous version
# added an "inlier" column to X_test, read it back and dropped it in place,
# which briefly turned the label into a model feature and would leave it there
# if the cell were interrupted.
y = pd.Series(1, index=X_test.index, name="inlier")

In [45]:
# Score each model against the reference labels. Because y is all ones, each
# value is the fraction of held-out rows the model predicts as inliers.
accuracy_1, accuracy_2, accuracy_3 = (
    accuracy_score(y, predicted)
    for predicted in (prediction_1, prediction_2, prediction_3)
)

In [46]:
# Report the three scores obtained on the held-out rows.
for label, score in (
    ("Accuracy 1: ", accuracy_1),
    ("Accuracy 2: ", accuracy_2),
    ("Accuracy 3: ", accuracy_3),
):
    print(label, score)

Accuracy 1:  0.9001661423571661
Accuracy 2:  0.9781388630381785
Accuracy 3:  0.9437456822969894


In [2]:
# Reload the three fitted models from disk.
# Each file is opened in a context manager so the handle is closed right after
# loading (the bare open() calls used before were never closed).
# NOTE: pickle.load can execute arbitrary code; only load .sav files that were
# produced by this notebook or come from a trusted source.
with open("EllipticEnvelope_142_01.sav", "rb") as model_file:
    model_1 = pickle.load(model_file)
with open("LocalOutlierFactor_142.sav", "rb") as model_file:
    model_2 = pickle.load(model_file)
with open("LocalOutlierFactor_142_filtered.sav", "rb") as model_file:
    model_3 = pickle.load(model_file)

In [3]:
# Reload the DrugSpaceX feature table; it is used below as the reference for
# the feature column set.
DSX = pd.read_csv("DSX_features_142.csv")

In [4]:
# Column labels of the DrugSpaceX table, in file order; every external data set
# below is restricted to these columns.
columns = DSX.columns

In [5]:
# Load the raw feature table of the "boutique" molecules.
df_boutique = pd.read_csv("Features_boutique.csv")

In [8]:
# Keep only the DrugSpaceX feature columns, in the same order as DSX.
# Raises KeyError if any of those columns is missing from the boutique table.
df_boutique = df_boutique[columns]

In [9]:
# Display the boutique table after the column selection.
df_boutique

Unnamed: 0,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BCUT2D_MWHI,BCUT2D_MWLOW,BalabanJ,EState_VSA11,...,SlogP_VSA3,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,fr_Al_COO,fr_Ar_N,fr_benzene,fr_phenol_noOrthoHbond,fr_phos_ester
0,2.074885,-1.977526,2.282907,-1.868714,7.169529,0.111825,32.133445,10.041874,2.789130,0.0,...,6.420822,10.759754,12.089876,2.184307,0.000000,0.0,0.0,1.0,0.0,0.0
1,2.104999,-1.842214,2.307317,-1.988981,7.188269,-0.135828,35.496017,10.251911,2.233416,0.0,...,11.215359,11.255583,12.200411,0.404650,-0.386841,1.0,1.0,0.0,0.0,0.0
2,2.340180,-2.287821,2.289989,-2.425161,5.797032,-0.136628,16.566555,9.842522,1.752072,0.0,...,9.531400,25.606137,2.840168,-0.482020,0.500014,0.0,0.0,0.0,0.0,0.0
3,2.395565,-2.355345,2.350746,-2.433843,4.851384,-0.179201,16.737168,9.897638,1.685242,0.0,...,14.210589,0.000000,3.542529,0.000000,-0.231099,0.0,0.0,0.0,0.0,0.0
4,2.224010,-2.330655,2.248154,-2.442589,9.103214,0.107422,79.918731,10.137360,2.279062,0.0,...,4.736863,2.100139,9.065724,2.274289,0.000000,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,2.071751,-2.070769,2.167558,-2.210753,14.106801,0.415050,126.912704,10.204613,2.818996,0.0,...,11.475224,16.424554,0.622130,0.674477,1.485198,0.0,2.0,1.0,0.0,0.0
199996,2.162659,-2.091235,2.171774,-2.220974,6.282952,-0.141832,35.495702,10.103821,2.428765,0.0,...,9.531400,15.190404,0.368694,7.152050,-0.476887,0.0,2.0,0.0,0.0,0.0
199997,2.140229,-2.181222,2.220710,-2.224715,6.311401,0.348050,35.495691,10.092994,1.971847,0.0,...,0.000000,2.406547,4.446241,2.285157,0.000000,0.0,0.0,1.0,0.0,0.0
199998,2.060906,-2.103990,2.195369,-2.047205,5.360484,0.335592,16.476548,10.068728,2.782218,0.0,...,11.215359,13.753189,0.000000,1.074861,0.868471,0.0,0.0,1.0,0.0,0.0


In [10]:
# Discard rows with a missing value in any feature column.
# df_boutique is a column selection of the raw table, so dropna(inplace=True)
# raised SettingWithCopyWarning; rebinding to the frame returned by dropna()
# gives the same rows without mutating a derived slice.
df_boutique = df_boutique.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_boutique.dropna(inplace=True)


In [11]:
# Display the boutique table after dropping incomplete rows.
df_boutique

Unnamed: 0,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BCUT2D_MWHI,BCUT2D_MWLOW,BalabanJ,EState_VSA11,...,SlogP_VSA3,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,fr_Al_COO,fr_Ar_N,fr_benzene,fr_phenol_noOrthoHbond,fr_phos_ester
0,2.074885,-1.977526,2.282907,-1.868714,7.169529,0.111825,32.133445,10.041874,2.789130,0.0,...,6.420822,10.759754,12.089876,2.184307,0.000000,0.0,0.0,1.0,0.0,0.0
1,2.104999,-1.842214,2.307317,-1.988981,7.188269,-0.135828,35.496017,10.251911,2.233416,0.0,...,11.215359,11.255583,12.200411,0.404650,-0.386841,1.0,1.0,0.0,0.0,0.0
2,2.340180,-2.287821,2.289989,-2.425161,5.797032,-0.136628,16.566555,9.842522,1.752072,0.0,...,9.531400,25.606137,2.840168,-0.482020,0.500014,0.0,0.0,0.0,0.0,0.0
3,2.395565,-2.355345,2.350746,-2.433843,4.851384,-0.179201,16.737168,9.897638,1.685242,0.0,...,14.210589,0.000000,3.542529,0.000000,-0.231099,0.0,0.0,0.0,0.0,0.0
4,2.224010,-2.330655,2.248154,-2.442589,9.103214,0.107422,79.918731,10.137360,2.279062,0.0,...,4.736863,2.100139,9.065724,2.274289,0.000000,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,2.071751,-2.070769,2.167558,-2.210753,14.106801,0.415050,126.912704,10.204613,2.818996,0.0,...,11.475224,16.424554,0.622130,0.674477,1.485198,0.0,2.0,1.0,0.0,0.0
199996,2.162659,-2.091235,2.171774,-2.220974,6.282952,-0.141832,35.495702,10.103821,2.428765,0.0,...,9.531400,15.190404,0.368694,7.152050,-0.476887,0.0,2.0,0.0,0.0,0.0
199997,2.140229,-2.181222,2.220710,-2.224715,6.311401,0.348050,35.495691,10.092994,1.971847,0.0,...,0.000000,2.406547,4.446241,2.285157,0.000000,0.0,0.0,1.0,0.0,0.0
199998,2.060906,-2.103990,2.195369,-2.047205,5.360484,0.335592,16.476548,10.068728,2.782218,0.0,...,11.215359,13.753189,0.000000,1.074861,0.868471,0.0,0.0,1.0,0.0,0.0


In [12]:
# Optional export of the cleaned boutique features. This used to be "commented
# out" with a bare triple-quoted string, which is still evaluated (and echoed as
# the cell result). An explicit switch keeps the default behaviour: nothing is
# written unless the flag is turned on.
# NOTE(review): a later cell reads "features_boutique_142.csv", so this export
# must have been run at least once for that cell to work.
SAVE_BOUTIQUE_CSV = False
if SAVE_BOUTIQUE_CSV:
    df_boutique.to_csv("features_boutique_142.csv", index=False)

In [13]:
# Load the raw feature table of the drug ("farmaci") molecules.
df_farmaci = pd.read_csv("Farmaci_features.csv")

In [15]:
# Discard rows that contain a missing value in any column.
# NOTE(review): here rows are dropped BEFORE the column selection, so a missing
# value in a column outside the DrugSpaceX feature set also removes the row;
# df_boutique selects columns first and drops afterwards. Confirm this is intended.
df_farmaci = df_farmaci.dropna(axis=0, how="any")

In [16]:
# Keep only the DrugSpaceX feature columns, in the same order as DSX.
df_farmaci = df_farmaci[columns]

In [18]:
# Optional export of the cleaned drug features. This used to be "commented out"
# with a bare triple-quoted string, which is still evaluated (and echoed as the
# cell result). An explicit switch keeps the default behaviour: nothing is
# written unless the flag is turned on.
# NOTE(review): a later cell reads "Farmaci_features_142.csv", so this export
# must have been run at least once for that cell to work.
SAVE_FARMACI_CSV = False
if SAVE_FARMACI_CSV:
    df_farmaci.to_csv("Farmaci_features_142.csv", index=False)

In [19]:
# Load the raw feature table of the toxic molecules.
df_toxic = pd.read_csv("toxic_features.csv")

In [20]:
# Load the raw feature table of the non-toxic molecules.
df_non_toxic = pd.read_csv("non_toxic_features.csv")

In [22]:
# Keep only the DrugSpaceX feature columns, in the same order as DSX.
# NOTE(review): unlike the boutique and drug tables, no dropna() is applied here.
df_toxic = df_toxic[columns]

In [23]:
# Keep only the DrugSpaceX feature columns, in the same order as DSX.
# NOTE(review): unlike the boutique and drug tables, no dropna() is applied here.
df_non_toxic = df_non_toxic[columns]

In [26]:
# Write the reduced toxic feature table to disk, without the index column.
df_toxic.to_csv("toxic_142.csv", index=False)

In [27]:
# Write the reduced non-toxic feature table to disk, without the index column.
df_non_toxic.to_csv("non_toxic_142.csv", index=False)

In [28]:
# Load the drug features previously exported with the DrugSpaceX column set.
df_farmaci = pd.read_csv("Farmaci_features_142.csv")

In [29]:
# Run the three fitted models, in order, on the drug molecules.
prediction_farmaci_1, prediction_farmaci_2, prediction_farmaci_3 = (
    fitted.predict(df_farmaci) for fitted in (model_1, model_2, model_3)
)



In [30]:
# Reference labels for the drug molecules: all of them are treated as inliers (1).
# Built directly as a constant Series aligned with df_farmaci, instead of adding
# an "inlier" column to the feature frame, reading it back and dropping it in
# place (which briefly turned the label into a model feature).
y_farmaci = pd.Series(1, index=df_farmaci.index, name="inlier")

In [31]:
# Score each model on the drugs. The reference labels are all 1, so each value
# is the fraction of drugs predicted as inliers.
accuracy_1, accuracy_2, accuracy_3 = (
    accuracy_score(y_farmaci, predicted)
    for predicted in (
        prediction_farmaci_1,
        prediction_farmaci_2,
        prediction_farmaci_3,
    )
)

In [32]:
# Report the three scores obtained on the drug molecules.
for label, score in (
    ("Accuracy 1: ", accuracy_1),
    ("Accuracy 2: ", accuracy_2),
    ("Accuracy 3: ", accuracy_3),
):
    print(label, score)

Accuracy 1:  0.9310344827586207
Accuracy 2:  1.0
Accuracy 3:  1.0


In [33]:
# Load the boutique features previously exported with the DrugSpaceX column set.
df_boutique = pd.read_csv("features_boutique_142.csv")

In [34]:
# Display the reloaded boutique table.
df_boutique

Unnamed: 0,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BCUT2D_MWHI,BCUT2D_MWLOW,BalabanJ,EState_VSA11,...,SlogP_VSA3,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,fr_Al_COO,fr_Ar_N,fr_benzene,fr_phenol_noOrthoHbond,fr_phos_ester
0,2.074885,-1.977526,2.282907,-1.868714,7.169529,0.111825,32.133445,10.041874,2.789130,0.0,...,6.420822,10.759754,12.089876,2.184307,0.000000,0.0,0.0,1.0,0.0,0.0
1,2.104999,-1.842214,2.307317,-1.988981,7.188269,-0.135828,35.496017,10.251911,2.233416,0.0,...,11.215359,11.255583,12.200411,0.404650,-0.386841,1.0,1.0,0.0,0.0,0.0
2,2.340180,-2.287821,2.289989,-2.425161,5.797032,-0.136628,16.566555,9.842522,1.752072,0.0,...,9.531400,25.606137,2.840168,-0.482020,0.500014,0.0,0.0,0.0,0.0,0.0
3,2.395565,-2.355345,2.350746,-2.433843,4.851384,-0.179201,16.737168,9.897638,1.685242,0.0,...,14.210589,0.000000,3.542529,0.000000,-0.231099,0.0,0.0,0.0,0.0,0.0
4,2.224010,-2.330655,2.248154,-2.442589,9.103214,0.107422,79.918731,10.137360,2.279062,0.0,...,4.736863,2.100139,9.065724,2.274289,0.000000,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199982,2.071751,-2.070769,2.167558,-2.210753,14.106801,0.415050,126.912704,10.204613,2.818996,0.0,...,11.475224,16.424554,0.622130,0.674477,1.485198,0.0,2.0,1.0,0.0,0.0
199983,2.162659,-2.091235,2.171774,-2.220974,6.282952,-0.141832,35.495702,10.103821,2.428765,0.0,...,9.531400,15.190404,0.368694,7.152050,-0.476887,0.0,2.0,0.0,0.0,0.0
199984,2.140229,-2.181222,2.220710,-2.224715,6.311401,0.348050,35.495691,10.092994,1.971847,0.0,...,0.000000,2.406547,4.446241,2.285157,0.000000,0.0,0.0,1.0,0.0,0.0
199985,2.060906,-2.103990,2.195369,-2.047205,5.360484,0.335592,16.476548,10.068728,2.782218,0.0,...,11.215359,13.753189,0.000000,1.074861,0.868471,0.0,0.0,1.0,0.0,0.0


In [35]:
# Run the three fitted models, in order, on the boutique molecules.
prediction_boutique_1, prediction_boutique_2, prediction_boutique_3 = (
    fitted.predict(df_boutique) for fitted in (model_1, model_2, model_3)
)



In [36]:
# Reference labels for the boutique molecules: all of them are treated as
# outliers (-1).
# Built directly as a constant Series aligned with df_boutique, instead of
# adding an "inlier" column to the feature frame, reading it back and dropping
# it in place (which briefly turned the label into a model feature).
y_boutique = pd.Series(-1, index=df_boutique.index, name="inlier")

In [38]:
# Score each model on the boutique set. The reference labels are all -1, so each
# value is the fraction of boutique molecules predicted as outliers.
accuracy_1, accuracy_2, accuracy_3 = (
    accuracy_score(y_boutique, predicted)
    for predicted in (
        prediction_boutique_1,
        prediction_boutique_2,
        prediction_boutique_3,
    )
)

In [39]:
# Report the three scores obtained on the boutique molecules.
for label, score in (
    ("Accuracy 1: ", accuracy_1),
    ("Accuracy 2: ", accuracy_2),
    ("Accuracy 3: ", accuracy_3),
):
    print(label, score)

Accuracy 1:  0.044372884237475436
Accuracy 2:  0.006685434553245961
Accuracy 3:  0.006825443653837499


In [41]:
# Load the toxic and non-toxic feature tables exported with the DrugSpaceX column set.
df_toxic, df_non_toxic = (
    pd.read_csv(csv_name) for csv_name in ("toxic_142.csv", "non_toxic_142.csv")
)

In [42]:
# Run the three fitted models, in order, on the toxic set and then on the
# non-toxic set.
prediction_toxic_1, prediction_toxic_2, prediction_toxic_3 = (
    fitted.predict(df_toxic) for fitted in (model_1, model_2, model_3)
)
prediction_non_toxic_1, prediction_non_toxic_2, prediction_non_toxic_3 = (
    fitted.predict(df_non_toxic) for fitted in (model_1, model_2, model_3)
)



In [43]:
# Reference labels: toxic molecules are treated as outliers (-1), non-toxic
# molecules as inliers (1).
# Both vectors are built directly as constant Series aligned with their frame,
# instead of adding an "inlier" column to the feature frame, reading it back and
# dropping it in place (which briefly turned the label into a model feature).
y_toxic = pd.Series(-1, index=df_toxic.index, name="inlier")
y_non_toxic = pd.Series(1, index=df_non_toxic.index, name="inlier")

In [44]:
# Score each model on both sets. The toxic labels are all -1 and the non-toxic
# labels all 1, so the values are the fraction of toxic molecules predicted as
# outliers and the fraction of non-toxic molecules predicted as inliers.
accuracy_toxic_1, accuracy_toxic_2, accuracy_toxic_3 = (
    accuracy_score(y_toxic, predicted)
    for predicted in (prediction_toxic_1, prediction_toxic_2, prediction_toxic_3)
)
accuracy_non_toxic_1, accuracy_non_toxic_2, accuracy_non_toxic_3 = (
    accuracy_score(y_non_toxic, predicted)
    for predicted in (
        prediction_non_toxic_1,
        prediction_non_toxic_2,
        prediction_non_toxic_3,
    )
)

In [45]:
# Print the toxic scores (models 1-3) followed by the non-toxic scores (models 1-3).
for score in (
    accuracy_toxic_1,
    accuracy_toxic_2,
    accuracy_toxic_3,
    accuracy_non_toxic_1,
    accuracy_non_toxic_2,
    accuracy_non_toxic_3,
):
    print(score)

0.08161044613710555
0.05005440696409141
0.05005440696409141
0.8896856881293866
0.9302715898687824
0.9244736039060116


In [46]:
# Display the toxic feature table.
df_toxic

Unnamed: 0,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW,BCUT2D_MWHI,BCUT2D_MWLOW,BalabanJ,EState_VSA11,...,SlogP_VSA3,VSA_EState2,VSA_EState3,VSA_EState4,VSA_EState5,fr_Al_COO,fr_Ar_N,fr_benzene,fr_phenol_noOrthoHbond,fr_phos_ester
0,2.560278,-2.502017,2.629287,-2.440620,7.458601,-0.021738,35.496755,9.552057,1.481851e+00,0.0,...,15.509617,32.734843,0.000000,2.321340,2.437138,0.0,0.0,1.0,0.0,1.0
1,2.273262,-2.339969,2.308850,-2.489209,6.302582,-0.144466,35.495693,10.079901,1.023796e+00,0.0,...,30.278159,72.249225,4.147726,2.981649,-0.909304,0.0,1.0,3.0,0.0,0.0
2,2.387724,-2.236895,2.433779,-2.250591,6.309332,-0.392610,35.495692,10.081022,2.725550e+00,0.0,...,12.721055,21.471764,22.710713,-4.527194,-0.760956,0.0,0.0,2.0,0.0,0.0
3,2.226891,-2.245194,2.202223,-2.278598,5.869734,-0.147397,16.552914,9.990214,2.592783e+00,0.0,...,9.531400,11.915690,0.000000,1.597556,-0.348655,0.0,0.0,1.0,0.0,0.0
4,2.265333,-2.271394,2.423717,-2.186714,6.374304,0.474268,35.499021,9.778808,2.614907e+00,0.0,...,5.414990,0.000000,19.977383,1.044765,-0.306196,0.0,0.0,2.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,2.225062,-2.114663,2.322489,-2.075637,6.344141,-0.110325,16.150004,10.038228,1.979473e+00,0.0,...,4.794537,14.073225,2.881870,4.737896,-0.038994,0.0,0.0,2.0,0.0,0.0
915,2.070319,-1.978442,2.252643,-2.025982,5.892455,-0.133714,16.370587,10.082064,2.612500e-07,0.0,...,16.009896,22.379630,26.088284,8.942417,-2.213360,2.0,1.0,1.0,1.0,0.0
916,2.517250,-2.320295,2.511932,-2.448767,5.859994,0.214783,16.550122,9.811926,1.563430e+00,0.0,...,5.414990,16.875049,2.749975,3.259958,0.580564,0.0,0.0,2.0,0.0,0.0
917,2.378928,-2.435979,2.395859,-2.515387,5.853029,-0.095641,35.453001,9.957321,1.333333e-06,0.0,...,4.736863,2.431701,9.623061,0.978753,1.112605,0.0,0.0,1.0,1.0,0.0


In [47]:
# NOTE(review): this statement is truncated in the source: the string literal
# and the call are never closed, so the cell cannot run as written. Restore the
# original CSV path before executing.
df_drugs = pd.read_csv("fe.cs

In [48]:
# Display the loaded drug table (the recorded output shows generic_name and smiles columns).
df_drugs

Unnamed: 0.1,Unnamed: 0,generic_name,smiles
0,0,Abacavir,NC1=NC2=C(N=CN2[C@@H]2C[C@H](CO)C=C2)C(NC2CC2)=N1
1,1,Abiraterone,CC(=O)O[C@H]1CC[C@]2(C)C3CC[C@@]4(C)C(CC=C4C4=...
2,2,Acamprosate,CC(=O)NCCCS(O)(=O)=O
3,3,Acarbose,C[C@H]1O[C@H](O[C@@H]2[C@@H](CO)O[C@H](O[C@@H]...
4,4,Acebutolol,CCCC(=O)NC1=CC(C(C)=O)=C(OCC(O)CNC(C)C)C=C1
...,...,...,...
1492,1492,Zolmitriptan,CN(C)CCC1=CNC2=CC=C(C[C@H]3COC(=O)N3)C=C12
1493,1493,Zolpidem,CN(C)C(=O)CC1=C(N=C2C=CC(C)=CN12)C1=CC=C(C)C=C1
1494,1494,Zonisamide,NS(=O)(=O)CC1=NOC2=CC=CC=C12
1495,1495,Zopiclone,CN1CCN(CC1)C(=O)OC1N(C(=O)C2=NC=CN=C12)C1=NC=C...
