In [1]:
import sys
sys.path.append('./Scripts')
import aqdnet
import pandas as pd 
from structure import ElementwiseDNN

# Make fg_input.dat file

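The `FeatureGenerator` class takes as input a text file that lists the paths of PDB files, one path per line, and extracts features from each listed structure. First, we create these text files for the training and validation sets (`fg_input_trainset.dat` and `fg_input_validset.dat`) from the `file_name` column of the corresponding label CSV files.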

In [6]:
# Build the path-list files consumed by the feature generator.
def _dump_pdb_paths(label_csv, list_file):
    """Copy the 'file_name' column of a label CSV into a path-list file.

    The entries are joined with newlines (no trailing newline) and written
    to ``list_file``, replacing any previous content.

    Returns the DataFrame read from ``label_csv``.
    """
    labels = pd.read_csv(label_csv)
    with open(list_file, mode='w') as handle:
        handle.write('\n'.join(labels['file_name'].tolist()))
    return labels


# Training set
fg_input_file_train = './Features/fg_input_trainset.dat'
label_train_df = _dump_pdb_paths('./Features/label_trainset.csv', fg_input_file_train)

# Validation set
fg_input_file_valid = './Features/fg_input_validset.dat'
label_valid_df = _dump_pdb_paths('./Features/label_validset.csv', fg_input_file_valid)

In [7]:
# Compact description of the feature-generator settings. The helper below
# expands the step sizes and n_theta into the explicit Rs_list_radial,
# Rs_list_angular and theta_list entries shown in the printed result.
fg_mother_param = {
    'distance_threshold_radial': 12,
    'distance_threshold_angular': 6,
    'target_elements': ["H", "C", "N", "O", "P", "S", "Cl", "Zn", "DU"],
    'Rs_radial_step': 0.5,
    'Rs_angular_step': 2.0,
    'n_theta': 8,
}

# Keyword arguments later unpacked into aqdnet.FeatureGenerator(**fg_params).
fg_params = aqdnet.mother_params_to_fg_params(fg_mother_param)
print(fg_params)

{'distance_threshold_radial': 12, 'distance_threshold_angular': 6, 'target_elements': ['H', 'C', 'N', 'O', 'P', 'S', 'Cl', 'Zn', 'DU'], 'Rs_list_radial': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5, 10.0, 10.5, 11.0, 11.5], 'Rs_list_angular': [0.5, 2.5, 4.5], 'theta_list': [0.0, 0.7853981852531433, 1.5707963705062866, 2.356194496154785, 3.1415927410125732, 3.9269907474517822, 4.71238899230957, 5.497786998748779]}


In [11]:
# Extract features for every structure listed in the training path file.
# This is slow: the recorded run took about an hour for 1502 structures.
fg = aqdnet.FeatureGenerator(**fg_params)
num_cpu = 10  # forwarded to generate(); presumably the number of worker processes
train_dataset = fg.generate(fg_input_file_train, num_cpu=num_cpu)


100%|██████████| 1502/1502 [59:04<00:00,  2.36s/it] 
MainProcess: Feature generation completed ...... 


In [12]:
# Persist the training features in two formats: one pickle file ...
output_train_pkl_file = './Features/feature_trainset.pkl'
train_dataset.to_pickle(output_train_pkl_file)

# ... and a set of CSV files written with the 'threading' scheduler.
# NOTE(review): the '*' in the pattern is presumably replaced per output
# file by to_csv_parallelized -- confirm against aqdnet.
output_train_csv_files = './Features/feature_trainset_*.csv'
train_dataset.to_csv_parallelized(output_train_csv_files, scheduler='threading')

Saving the dataset in csv files
[########################################] | 100% Completed |  1min 12.4s
./Features/feature_trainset_*.csv was saved...


In [13]:
# Convert the pickled training features plus their labels into a TFRecords file.
# NOTE(review): 11583 appears to be the feature-vector length for the
# fg_params used in this notebook (9*9*23 radial + 9*45*8*3 angular terms
# = 1863 + 9720) -- confirm against aqdnet before changing those parameters.
tfrecord_args_train = {
    'feature_file': './Features/feature_trainset.pkl',
    'label_file': './Features/label_trainset.csv',
    'tfr_filename': './Features/feature_trainset.tfrecords',
    'feature_dimension': 11583,
    'label_colnames': ['pKa_energy'],
}
ElementwiseDNN.write_tfrecords(**tfrecord_args_train)

100%|██████████| 1502/1502 [00:50<00:00, 29.65it/s]

./Features/feature_trainset.tfrecords was saved...





In [13]:
# Sanity check: display the first rows of the generated training features.
train_dataset.head()

Unnamed: 0,H_H_0,H_H_1,H_H_2,H_H_3,H_H_4,H_H_5,H_H_6,H_H_7,H_H_8,H_H_9,...,DU_DU_DU_4_2,DU_DU_DU_5_0,DU_DU_DU_5_1,DU_DU_DU_5_2,DU_DU_DU_6_0,DU_DU_DU_6_1,DU_DU_DU_6_2,DU_DU_DU_7_0,DU_DU_DU_7_1,DU_DU_DU_7_2
./SampleStructures/184l/smina_184l_docking_1247.pdb,0.000346,0.066896,2.082037,12.66343,26.431372,47.278744,75.936883,96.554963,115.838086,132.449398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
./SampleStructures/184l/smina_184l_docking_1869.pdb,0.000284,0.061736,2.090402,13.121278,26.104995,45.904875,77.555062,98.030261,114.037981,133.574724,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
./SampleStructures/184l/smina_184l_docking_0285.pdb,0.000665,0.093888,2.324606,12.463945,26.094088,48.658723,76.655379,94.948475,114.848632,132.433054,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
./SampleStructures/184l/smina_184l_docking_0500.pdb,0.000506,0.082843,2.258289,12.933138,26.894942,47.035903,76.257962,96.418172,115.241602,134.532175,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
./SampleStructures/184l/smina_184l_docking_1379.pdb,0.000642,0.092579,2.313696,12.440111,26.116268,48.698368,76.51039,94.969166,115.038057,132.231465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Extract features for every structure listed in the validation path file.
# The recorded run took about ten minutes for 303 structures.
fg = aqdnet.FeatureGenerator(**fg_params)
num_cpu = 10  # forwarded to generate(); presumably the number of worker processes
valid_dataset = fg.generate(fg_input_file_valid, num_cpu=num_cpu)


100%|██████████| 303/303 [10:26<00:00,  2.07s/it] 
MainProcess: Feature generation completed ...... 


In [9]:
# Persist the validation features in two formats: one pickle file ...
output_valid_pkl_file = './Features/feature_validset.pkl'
valid_dataset.to_pickle(output_valid_pkl_file)

# ... and a set of CSV files written with the 'threading' scheduler.
# NOTE(review): the '*' in the pattern is presumably replaced per output
# file by to_csv_parallelized -- confirm against aqdnet.
output_valid_csv_files = './Features/feature_validset_*.csv'
valid_dataset.to_csv_parallelized(output_valid_csv_files, scheduler='threading')

Saving the dataset in csv files
[########################################] | 100% Completed | 14.8s
./Features/feature_validset_*.csv was saved...


In [10]:
# Convert the pickled validation features plus their labels into a TFRecords file.
# NOTE(review): 11583 appears to be the feature-vector length for the
# fg_params used in this notebook (9*9*23 radial + 9*45*8*3 angular terms
# = 1863 + 9720) -- confirm against aqdnet before changing those parameters.
tfrecord_args_valid = {
    'feature_file': './Features/feature_validset.pkl',
    'label_file': './Features/label_validset.csv',
    'tfr_filename': './Features/feature_validset.tfrecords',
    'feature_dimension': 11583,
    'label_colnames': ['pKa_energy'],
}
ElementwiseDNN.write_tfrecords(**tfrecord_args_valid)

100%|██████████| 303/303 [00:09<00:00, 30.56it/s]

./Features/feature_validset.tfrecords was saved...





In [None]:
# Sanity check: display the first rows of the generated validation features.
valid_dataset.head()