In [10]:
import os
import numpy as np
import torch
import pickle
import json
import pandas as pd
import torch.nn as nn
from torch.nn import DataParallel
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from itertools import cycle
import torch.autograd as autograd
from torch.nn.utils import parameters_to_vector, vector_to_parameters
from sklearn.decomposition import PCA
import torch.optim as optim
from torch.nn import DataParallel

In [11]:
seed = 42
ratio_test = 0.2
np.random.seed(seed)
torch.manual_seed(seed)
exp_name = 'exp_12e'
curdir = '' ## '/curdir/' ## ''
device_ids = [1, 3] ## [0, 1, 2, 3]

In [12]:
class SiameseNetwork(torch.nn.Module):
    def __init__(self, len_embedding, abstract_len_embedding):
        super(SiameseNetwork, self).__init__()
        self.loss = nn.L1Loss(reduction="mean") 
        self.len_embedding = len_embedding
        self.abstract_len_embedding = abstract_len_embedding  
        self.nn_reg = nn.Sequential(
            ## 1024 to 2048
            nn.Linear(self.len_embedding, int(self.len_embedding*2)),
            nn.ReLU(),
            nn.BatchNorm1d(int(self.len_embedding*2)), 
            ## 2048 to 1536
            nn.Linear(int(self.len_embedding*2), int(self.len_embedding*1.5)),
            nn.ReLU(),
            nn.BatchNorm1d(int(self.len_embedding*1.5)),
            ## 1526 to 1024
            nn.Linear(int(self.len_embedding*1.5), self.abstract_len_embedding),
        )
        self.nn_final_reg = nn.Sequential(
            ##  (1024+1024) to 1
            nn.Linear(self.abstract_len_embedding * 2, 1),
        )

    def forward_reg(self, x):
        output = self.nn_reg(x)
        return output

    def forward_final_reg(self, x):
        output = self.nn_final_reg(x)
        return output

    def forward(self, fp1, fp2):
        a = self.forward_reg(fp1)
        b = self.forward_reg(fp2)
        x = torch.cat([a, b], dim=1)  # hstack
        output = self.forward_final_reg(x)
        return output

In [13]:
def get_secondary_env(env1):
    x, y = env1[0], env1[1]
    print(x.shape, y.shape)
    list_secondary_feature, list_secondary_target = [], []
    for i in range(x.shape[0]):
        for j in range(x.shape[0]):
            if i != j:
                sf = np.hstack((x[i], x[j]))
                st = y[i] - y[j]
                list_secondary_feature.append(sf)
                list_secondary_target.append(st)
    array_secondary_feature = np.array(list_secondary_feature, dtype='float32')
    array_secondary_target = np.array(list_secondary_target, dtype='float32').reshape((-1, 1))
    senv = torch.from_numpy(array_secondary_feature), torch.from_numpy(array_secondary_target)
    print(senv[0].shape, senv[1].shape)
    return senv

def get_model_siamese(senvironments, len_embedding, abstract_len_embedding, batch_size=512, num_gpus=4):
    print(f'len_embedding: {len_embedding}, abstract_len_embedding: {abstract_len_embedding}')

    _lr, num_iterations = 1e-3, 1000
    device = torch.device(f"cuda:{device_ids[0]}" if torch.cuda.is_available() else "cpu") ## torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_siamese = SiameseNetwork(len_embedding, abstract_len_embedding)
    if torch.cuda.is_available():    
        model_siamese = DataParallel(model_siamese, device_ids=device_ids)
    model_siamese.to(device)
    optimizer_siamese = torch.optim.Adam(model_siamese.parameters(), lr=_lr)    

    for epoch in range(num_iterations):
        total_loss = 0.0

        for x, y in senvironments:
            p = torch.randperm(len(x))
            x, y = x[p], y[p]
            
            for i in range(0, len(x), batch_size):
                batch_x = x[i:i + batch_size].to(device)
                batch_y = y[i:i + batch_size].to(device)
                fp1 = batch_x[:, list(range(0, len_embedding, 1))]
                fp2 = batch_x[:, list(range(len_embedding, 2 * len_embedding, 1))]
                y_pred_siamese = model_siamese(fp1, fp2)
                batch_loss = model_siamese.module.loss(y_pred_siamese, batch_y) if isinstance(model_siamese, DataParallel) else model_siamese.loss(y_pred_siamese, batch_y)
                
                optimizer_siamese.zero_grad()
                batch_loss.backward()
                optimizer_siamese.step()
                
                total_loss += batch_loss.item()
                

        if epoch % 1 == 0:
            with open(f'{curdir}v2_script_10/logger_' + exp_name + '.log', 'a+') as file1:
                file1.writelines(f'epoch: {epoch}, total_loss: {total_loss:.6f}\n\n')

    return model_siamese

In [21]:
# list_pid = [
#     'AraComputational2022', ## PBE-D3
#     'BajdichWO32018', ## PBE+U
#     'BoesAdsorption2018', ## RPBE
#     'ComerUnraveling2022', ## PBE+U
#     'HossainInvestigation2022', ## PBE+U    
#     # 'MamunHighT2019',    
# ]

# senvironments = []
# for pid in list_pid:
#     with open(f'{curdir}v2_script_10/exp12/df_{pid}.pickle', 'rb') as f:
#         df = pickle.load(f)  
#         print(df.shape)
#         X = df.iloc[:, :-1].values
#         y = df['nre'].values
#         env = (X, y)
#         senv = get_secondary_env(env)
#         print(env[0].shape, env[1].shape, senv[0].shape, senv[1].shape)
#         print()
#         senvironments.append(senv)
# print()
# print(len(senvironments))
# len_embedding, abstract_len_embedding = 1024, 1024    
# model_siamese = get_model_siamese(
#     senvironments, 
#     len_embedding, 
#     abstract_len_embedding,    
# )
# ## save the model
# model_path = f'{curdir}v2_script_10/model_siamese_{exp_name}.pt'
# torch.save(model_siamese.state_dict(), model_path)

In [22]:
## test on cathub

In [24]:
df = pd.read_pickle(f'datasets/df_cathub_dpp_combined.pickle')
tno = 0
X = df.iloc[:, :-1].values
y = df['nre'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio_test, random_state=seed+tno)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## evaluation original
model_original = xgb.XGBRegressor(learning_rate=0.2, max_depth=8, n_estimators=500)
model_original.fit(X_train, y_train)
y_pred_original = model_original.predict(X_test)
mae_original = mean_absolute_error(y_test, y_pred_original)
r2score_original = r2_score(y_test, y_pred_original)    
print(mae_original, r2score_original)

## evaluation invariant
len_embedding, abstract_len_embedding = 1024, 1024    
list_en = ['exp_12e', 'exp_12e2', 'exp_12e3', 'exp_12e4', 'exp_12e5', 'exp_12e6']
for exp_name in list_en[:]:
    print()
    print(exp_name)
    model_path = 'v2_script_10/model_siamese_' + exp_name + '.pt'
    model_siamese = DataParallel(SiameseNetwork(len_embedding, abstract_len_embedding))
    model_siamese.load_state_dict(torch.load(model_path))
    model_siamese = model_siamese.module
    model_siamese.train()
    X_train_siamese = model_siamese.forward_reg(torch.from_numpy(X_train).float()).detach().numpy() 
    X_test_siamese = model_siamese.forward_reg(torch.from_numpy(X_test).float()).detach().numpy()

    model_xgb = xgb.XGBRegressor(learning_rate=0.2, max_depth=8, n_estimators=500)
    model_xgb.fit(X_train_siamese, y_train)
    y_pred_siamese = model_xgb.predict(X_test_siamese)
    mae_siamese = mean_absolute_error(y_test, y_pred_siamese)
    r2score_siamese = r2_score(y_test, y_pred_siamese)
    print(mae_siamese, r2score_siamese)

(8750, 1024) (2188, 1024) (8750,) (2188,)
0.2892252702905325 0.9516118684044564

exp_12e
0.8567992178217364 0.6884155062712407

exp_12e2
0.8441069290428107 0.6889985704153496

exp_12e3
0.8955289714688585 0.6646255699238051

exp_12e4
1.2350543815518094 0.4572872109456434

exp_12e5
0.7950296517331757 0.7190282695365164

exp_12e6
0.7514127500682865 0.7487928403556561


In [25]:
df = pd.read_pickle(f'datasets/df_ocp_dpp_combined.pickle') ## change
tno = 0
X = df.iloc[:, :-1].values
y = df['energy'].values ## change
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio_test, random_state=seed+tno)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## evaluation original
model_original = xgb.XGBRegressor(learning_rate=0.2, max_depth=8, n_estimators=500)
model_original.fit(X_train, y_train)
y_pred_original = model_original.predict(X_test)
mae_original = mean_absolute_error(y_test, y_pred_original)
r2score_original = r2_score(y_test, y_pred_original)    
print(mae_original, r2score_original)

## evaluation invariant
len_embedding, abstract_len_embedding = 1024, 1024    
list_en = ['exp_12e', 'exp_12e2', 'exp_12e3', 'exp_12e4', 'exp_12e5', 'exp_12e6']
for exp_name in list_en[:]:
    print()
    print(exp_name)
    model_path = 'v2_script_10/model_siamese_' + exp_name + '.pt'
    model_siamese = DataParallel(SiameseNetwork(len_embedding, abstract_len_embedding))
    model_siamese.load_state_dict(torch.load(model_path))
    model_siamese = model_siamese.module
    model_siamese.train()
    X_train_siamese = model_siamese.forward_reg(torch.from_numpy(X_train).float()).detach().numpy() 
    X_test_siamese = model_siamese.forward_reg(torch.from_numpy(X_test).float()).detach().numpy()

    model_xgb = xgb.XGBRegressor(learning_rate=0.2, max_depth=8, n_estimators=500)
    model_xgb.fit(X_train_siamese, y_train)
    y_pred_siamese = model_xgb.predict(X_test_siamese)
    mae_siamese = mean_absolute_error(y_test, y_pred_siamese)
    r2score_siamese = r2_score(y_test, y_pred_siamese)
    print(mae_siamese, r2score_siamese)

(263890, 1024) (65973, 1024) (263890,) (65973,)
0.7228307372561132 0.7566608012206804

exp_12e
2.566007748730553 -1.4917821492109402

exp_12e2
2.4034239016813617 -1.146375027637025

exp_12e3
2.806785725803505 -1.9018269448314213

exp_12e4
2.1967216322728356 -0.7979404523285805

exp_12e5
2.0845035126030345 -0.6330229872320614

exp_12e6
1.7753961865545342 -0.2196586416210322
