In [1]:
import pandas as pd

In [2]:
import deepchem as dc

# Cargo dataset

In [3]:
# Load the preprocessed acetylcholinesterase bioactivity table.
# The first CSV column has no header and appears to be the row index that was
# written out when the file was saved; index_col=0 restores it as the frame's
# index instead of leaving a redundant "Unnamed: 0" data column.
df = pd.read_csv(
    'data/acetylcholinesterase_02_bioactivity_data_preprocessed.csv',
    index_col=0,
)

In [4]:
# Display the loaded frame to inspect its columns and a sample of its rows.
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,standard_value_norm,pIC50
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,750.0,750.0,6.124939
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,100.0,100.0,7.000000
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,50000.0,50000.0,4.301030
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,300.0,300.0,6.522879
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,800.0,800.0,6.096910
...,...,...,...,...,...
5038,CHEMBL4554172,Cc1ccc(-c2nc(NC(=O)C3CCN(Cc4cccc(F)c4)CC3)sc2-...,10000.0,10000.0,5.000000
5039,CHEMBL4533844,Cc1ccc(-c2nc(NC(=O)C3CCN(Cc4ccccc4C(F)(F)F)CC3...,7570.0,7570.0,5.120904
5040,CHEMBL4570655,Cc1ccc(-c2nc(NC(=O)C3CCN(Cc4ccccc4C)CC3)sc2-c2...,10000.0,10000.0,5.000000
5041,CHEMBL4571704,COc1cc(O)c2c(c1)C1=C(c3c(O)c(O)cc(C)c3-c3cc(OC...,15500.0,15500.0,4.809668


# Pongo el dataset en el formato que DeepChem necesita
Quizás acá se podía bajar directamente en ese formato.

In [5]:
# Wrap the frame in a DeepChem NumpyDataset: the raw SMILES strings are used
# both as the features and as the sample ids, and pIC50 is the target value.
dataset = dc.data.NumpyDataset(
    X=df['canonical_smiles'].values,
    y=df['pIC50'].values,
    ids=df['canonical_smiles'].values,
)

# Divido en train y test
Estrictamente, faltaría también un conjunto de validación.

In [6]:
# Instantiate DeepChem's RandomSplitter; it is used below (train_test_split)
# to partition `dataset` into the train and test subsets.
splitter = dc.splits.RandomSplitter()

In [7]:
# Pin the splitter's seed so the random train/test partition is the same after
# every kernel restart; without it each run evaluates on a different test set
# and the reported scores are not comparable between runs.
# NOTE(review): only train and test subsets are produced here; no separate
# validation subset exists.
train, test = splitter.train_test_split(dataset, seed=42)

# Genero el diccionario de SMILES que necesita el modelo para construir el embedding

In [8]:
# Build the character vocabulary from `dataset`. The first return value maps
# SMILES tokens to integer indices; the second is handed to the TextCNNModel
# constructor (presumably the padded sequence length; confirm against the
# DeepChem documentation).
# NOTE(review): `lenght` is a misspelling of `length`, but the model cell
# refers to it by this name, so a rename must be applied in both places at once.
smiles_dict, lenght = dc.models.TextCNNModel.build_char_dict(dataset)

In [9]:
# Inspect the token -> index mapping returned by build_char_dict.
smiles_dict

{'#': 1,
 '(': 2,
 ')': 3,
 '+': 4,
 '-': 5,
 '/': 6,
 '1': 7,
 '2': 8,
 '3': 9,
 '4': 10,
 '5': 11,
 '6': 12,
 '7': 13,
 '8': 14,
 '=': 15,
 'C': 16,
 'F': 17,
 'H': 18,
 'I': 19,
 'N': 20,
 'O': 21,
 'P': 22,
 'S': 23,
 '[': 24,
 '\\': 25,
 ']': 26,
 '_': 27,
 'c': 28,
 'Cl': 29,
 'Br': 30,
 'n': 31,
 'o': 32,
 's': 33,
 '@': 34,
 '.': 35,
 'a': 36,
 'B': 37,
 'e': 38,
 'i': 39}

# Instancio el modelo

In [10]:
# Text-CNN regressor over SMILES strings with a single output task.
# Three parallel convolution widths (3, 4, 5) with 128 filters each sit on top
# of a 128-dimensional character embedding; dropout is disabled (0.0).
model = dc.models.TextCNNModel(
    n_tasks=1,
    char_dict=smiles_dict,
    seq_length=lenght,
    n_embedding=128,
    kernel_sizes=[3, 4, 5],
    num_filters=[128, 128, 128],
    dropout=0.0,
    mode='regression',
    batch_size=128,
    log_frequency=5,
    optimizer=dc.models.optimizers.Adam(),
)

In [11]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the
# network wrapped by the DeepChem model object.
model.model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 248)]        0                                            
__________________________________________________________________________________________________
dtnn_embedding (DTNNEmbedding)  (None, 248, 128)     5120        input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 246, 128)     49280       dtnn_embedding[0][0]             
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 245, 128)     65664       dtnn_embedding[0][0]             
______________________________________________________________________________________________

In [12]:
# Needed to see results while the model is training: this callback scores the
# model on `test` with R^2 every 20 training steps.
vc = dc.models.ValidationCallback(
    dataset=test,
    interval=20,
    metrics=dc.metrics.r2_score,
)

# Entreno

In [183]:
# Train on `train` for 100 epochs; `vc` prints the periodic validation score.
model.fit(train, nb_epoch=100, callbacks=[vc])

Step 20 validation: metric-1=-0.106052
Step 40 validation: metric-1=0.213509
Step 60 validation: metric-1=0.270936
Step 80 validation: metric-1=0.302676
Step 100 validation: metric-1=0.332069
Step 120 validation: metric-1=0.348674
Step 140 validation: metric-1=0.39992
Step 160 validation: metric-1=0.433392
Step 180 validation: metric-1=0.457629
Step 200 validation: metric-1=0.468889
Step 220 validation: metric-1=0.507943
Step 240 validation: metric-1=0.526728
Step 260 validation: metric-1=0.519358
Step 280 validation: metric-1=0.53856
Step 300 validation: metric-1=0.568068
Step 320 validation: metric-1=0.483881
Step 340 validation: metric-1=0.535976
Step 360 validation: metric-1=0.587307
Step 380 validation: metric-1=0.604389
Step 400 validation: metric-1=0.583606
Step 420 validation: metric-1=0.577483
Step 440 validation: metric-1=0.550915
Step 460 validation: metric-1=0.603084
Step 480 validation: metric-1=0.583663
Step 500 validation: metric-1=0.609059
Step 520 validation: metric-1=

0.17212021350860596

# Evalúo

In [184]:
# R^2 of the fitted model on the training subset.
model.evaluate(train, metrics=dc.metrics.r2_score)

{'metric-1': 0.9373599628309585}

In [185]:
# R^2 of the fitted model on the held-out test subset.
model.evaluate(test, metrics=dc.metrics.r2_score)

{'metric-1': 0.6170177792298378}