# Input/Output split

In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch

from torch import nn
from torch.utils.data.dataset import random_split

In [3]:
data = pd.read_csv('molml_data.csv')
data = data.drop('0', axis=1)

# Set threshold
below_thr, above_thr = np.quantile(data['442'], [0.05, 0.95])
tmp = []
for i in range(1520):
    energy = data.iloc[i]['442']
    if energy < below_thr or energy > above_thr:
        tmp.append(i)
data = data.drop(tmp)

In [4]:
input_data = data.drop('442', axis=1)
selected_col = []
for i in range(21):
    for j in range(i, 21):
        selected_col.append(str(21*i+(j+1)))
input_data = input_data[selected_col]
output_data = data['442']

In [6]:
dataset=[]
for idx in list(input_data.index):
    input_tmp = torch.tensor(list(input_data.loc[idx]))
    output_tmp = torch.tensor(float(output_data.loc[idx]))
    dataset.append([input_tmp, output_tmp])

In [13]:
len(dataset)

1368

# Make train/valid/test dataset

In [9]:
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True

# gpu_option
gpu_use = 1
which_gpu = 0

In [10]:
batch_size = 100

In [14]:
def make_loader(dataset):
    torch.manual_seed(0)
    train, valid, test = random_split(dataset, [968, 200, 200])
    
    # Data Loader (Input Pipeline)
    train_loader = torch.utils.data.DataLoader(dataset=train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(dataset=valid, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(dataset=test, batch_size=batch_size, shuffle=False)
    
    return train_loader, valid_loader, test_loader

In [15]:
train_loader, valid_loader, test_loader = make_loader(dataset)

# Model

In [16]:
import time

In [17]:
class pred_net(nn.Module):
    def __init__(self):
        super(pred_net, self).__init__()
        self.hidden1 = nn.Linear(231, 128)
        self.hidden2 = nn.Linear(128, 64)
        self.hidden3 = nn.Linear(64, 32)
        self.hidden4 = nn.Linear(32, 16)
        self.out = nn.Linear(16, 1)

    def forward(self, x):
        x = torch.tanh(self.hidden1(x))
        x = torch.tanh(self.hidden2(x))
        x = torch.tanh(self.hidden3(x))
        x = torch.tanh(self.hidden4(x))
        out = self.out(x)
        out = out.view(-1)
        return out

In [18]:
num_epochs = 50000+1
learning_rate = 1e-5
criterion = nn.MSELoss()

In [19]:
def fit(model,train_loader,valid_loader,criterion,learning_rate,num_epochs):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), weight_decay=1e-6)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)
    
    for epoch in range(num_epochs):
        model.train()
        acc = []
        train_error = 0
        
        for i, data in enumerate(train_loader):
            atom2vec = data[0].type(torch.FloatTensor).cuda(which_gpu)
            real_energy = data[1].type(torch.FloatTensor).cuda(which_gpu)
            
            optimizer.zero_grad()
            pred_energy = model(atom2vec)
            
            loss = criterion(pred_energy, real_energy)
            loss.backward()
            optimizer.step()

            train_error += loss.item()
       
        if epoch % 1000 == 0:
            avg_train_error = train_error/len(train_loader)
            print ("epoch : [%d/%d], The average loss of train error %.4f" % (epoch+1, num_epochs, avg_train_error))
            eval_loss, output_all, label_all = eval(model, valid_loader, criterion)
            scheduler.step(eval_loss)
        
def eval(model,valid_loader,criterion):

    eval_loss = 0.0
    output_all = []
    label_all = []

    model.eval()
    for i, data in enumerate(valid_loader):
        atom2vec = data[0].type(torch.FloatTensor).cuda(which_gpu)
        real_energy = data[1].type(torch.FloatTensor).cuda(which_gpu)

        pred_energy = model(atom2vec)
        loss = criterion(pred_energy, real_energy)
        eval_loss += loss.item()

        output_all.append(pred_energy.data.cpu().numpy())
        label_all.append(real_energy.data.cpu().numpy())

    avg_loss = eval_loss/len(valid_loader)
    print ('The average loss of valid error: {:.4f} \n'. format(avg_loss))

    return avg_loss, output_all, label_all

In [20]:
# load model
model = pred_net().cuda(which_gpu)
# run
start_time = time.time()
fit(model,train_loader,valid_loader,criterion,learning_rate,num_epochs)
print("--- %s seconds ---" % (time.time() - start_time))

# evaluation
avg_loss, pred_all, real_all = eval(model,test_loader,criterion)

epoch : [1/50001], The average loss of train error 1.6185
The average loss of valid error: 1.5759 

epoch : [1001/50001], The average loss of train error 0.3205
The average loss of valid error: 0.3764 

epoch : [2001/50001], The average loss of train error 0.2760
The average loss of valid error: 0.3884 

epoch : [3001/50001], The average loss of train error 0.2231
The average loss of valid error: 0.4091 

epoch : [4001/50001], The average loss of train error 0.2052
The average loss of valid error: 0.3692 

epoch : [5001/50001], The average loss of train error 0.1875
The average loss of valid error: 0.3859 

epoch : [6001/50001], The average loss of train error 0.1924
The average loss of valid error: 0.3342 

epoch : [7001/50001], The average loss of train error 0.2055
The average loss of valid error: 0.3494 

epoch : [8001/50001], The average loss of train error 0.1893
The average loss of valid error: 0.3493 

epoch : [9001/50001], The average loss of train error 0.1807
The average los

In [31]:
idx = 70
print (real_all[0][idx], pred_all[0][idx])

0.857231 1.05948
