In [21]:
from os.path import join

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import seaborn as sns

# Stack two matplotlib style sheets; entries later in the list override
# settings from earlier ones.
# NOTE(review): "thesis" is not a built-in matplotlib style -- presumably a custom
# style sheet from the local matplotlib config directory; confirm it is installed.
# NOTE(review): the "seaborn" style name was renamed to "seaborn-v0_8" in
# matplotlib 3.6 -- verify against the installed matplotlib version.
plt.style.use(["seaborn", "thesis"])

# Fetch Dataset 

In [14]:
from SCFInitialGuess.utilities.dataset import extract_triu_batch, AbstractDataset
from sklearn.model_selection import train_test_split

# Dataset selection. Every array is read from "<data_path>/<name><postfix>.npy",
# e.g. "S" + postfix + ".npy".
data_path = "../../dataset/TSmall_sto3g"
postfix = "TSmall_sto3g"
# Edge length of the square matrices: used to build the block masks and to
# reshape the P batch to (-1, dim, dim).
dim = 26
# Alternative dataset (disabled):
#data_path = "../butadien/data/"
#postfix = ""
#dim = 26


def split(x, y, ind):
    """Cut two sequences at the same position.

    Args:
        x: first sliceable sequence.
        y: second sliceable sequence.
        ind: position of the cut; elements before it form the leading part.

    Returns:
        Tuple ``(x_head, y_head, x_tail, y_tail)`` where the heads are
        ``[:ind]`` and the tails are ``[ind:]`` of the respective input.
    """
    x_head, x_tail = x[:ind], x[ind:]
    y_head, y_tail = y[:ind], y[ind:]
    return x_head, y_head, x_tail, y_tail

# Load the raw arrays; every file is named "<name><postfix>.npy" inside data_path.
# NOTE(review): S, P and F are presumably the overlap, density and Fock
# matrices -- confirm against the dataset generation script.
S = np.load(join(data_path, "S" + postfix + ".npy"))
P = np.load(join(data_path, "P" + postfix + ".npy"))
F = np.load(join(data_path, "F" + postfix + ".npy"))

index = np.load(join(data_path, "index" + postfix + ".npy"))

# The molecules file stores Python objects (their `.species` and `.basis`
# attributes are read when the masks are built), so NumPy >= 1.16.3 refuses to
# load it unless pickling is explicitly allowed.
# Unpickling can execute arbitrary code: only load files from a trusted source.
molecules_all = np.load(
    join(data_path, "molecules" + postfix + ".npy"),
    allow_pickle=True
)

# Position of the train/test cut: the first 80 % of the samples are used for
# training, the remainder for testing.
ind = int(0.8 * len(index))

# (train, test) tuple, cut at the same position as the matrices below.
molecules = (molecules_all[:ind], molecules_all[ind:])

# NOTE(review): despite the "triu" in the name, S is normalized as loaded here;
# no upper-triangle extraction happens in this cell.
s_triu_norm, mu, std = AbstractDataset.normalize(S)


s_train, p_train, s_test, p_test = split(s_triu_norm, P, ind)


## Calculate Descriptors and extract center blocks 

In [43]:
from SCFInitialGuess.utilities.constants import number_of_basis_functions as N_BASIS
from SCFInitialGuess.descriptors.coordinate_descriptors import NonWeighted, Gaussians
from SCFInitialGuess.descriptors.coordinate_descriptors import \
    Gaussians, RADIAL_GAUSSIAN_MODELS, AZIMUTHAL_GAUSSIAN_MODELS, POLAR_GAUSSIAN_MODELS


def make_mask(mol, species):
    """Build one boolean block mask per atom of the requested species.

    The atoms in ``mol.species`` are visited in order while a running offset
    is advanced by ``N_BASIS[mol.basis][atom]``, giving every atom a
    half-open index range ``[start, stop)``. For each atom equal to
    ``species`` a ``(dim, dim)`` boolean matrix is produced that is True
    exactly where both the row and the column index lie in that range.

    Relies on the module-level ``dim`` for the matrix size.

    Args:
        mol: object exposing ``species`` (iterable of atom labels) and
            ``basis`` (key into ``N_BASIS``).
        species: atom label to select.

    Returns:
        List of boolean masks, one per matching atom, in atom order.
    """
    block_masks = []
    offset = 0
    for atom in mol.species:
        # Index range [start, stop) covered by this atom.
        start = offset
        offset += N_BASIS[mol.basis][atom]
        stop = offset

        if atom != species:
            continue

        # 1-D membership vector, expanded to 2-D through the outer "and".
        positions = np.arange(dim)
        in_block = np.logical_and(start <= positions, positions < stop)
        block_masks.append(np.logical_and.outer(in_block, in_block))

    return block_masks




def extract_dataset(molecules, p_batch, species):
    """Collect atom descriptors and the matching matrix blocks for one species.

    Every atom equal to ``species`` contributes one entry to each returned
    list, in atom order within its molecule, so entry ``k`` of both lists
    refers to the same atom.

    The block masks are built for each molecule individually. Building them
    only from the first molecule pairs blocks with the wrong atoms as soon as
    the atom ordering differs between molecules, and fails on empty input.

    Args:
        molecules: sequence of molecule objects exposing ``species`` (and
            whatever ``make_mask`` and the descriptor need).
        p_batch: sequence of square matrices, one per molecule, in the same
            order as ``molecules``; must support boolean-mask indexing.
        species: atom label whose blocks and descriptors are extracted.

    Returns:
        Tuple ``(descriptor_values, blocks)``: two lists holding the values
        returned by ``descriptor.calculate_atom_descriptor`` and the flattened
        matrix entries selected by the atom's mask, respectively.
    """
    descriptor = NonWeighted(
        Gaussians(*RADIAL_GAUSSIAN_MODELS["Equidistant-Broadening_1"]),
        Gaussians(*AZIMUTHAL_GAUSSIAN_MODELS["Equisitant_1"]),
        Gaussians(*POLAR_GAUSSIAN_MODELS["Equisitant_1"])
    )

    descriptor_values, blocks = [], []
    for p, mol in zip(p_batch, molecules):
        # Masks follow this molecule's own atom order, keeping blocks aligned
        # with the descriptors collected below.
        for mask in make_mask(mol, species):
            # Boolean-mask indexing already returns a copy; no explicit copy needed.
            blocks.append(p[mask])

        for i, atom in enumerate(mol.species):
            if atom == species:
                descriptor_values.append(
                    descriptor.calculate_atom_descriptor(
                        i,
                        mol,
                        descriptor.number_of_descriptors
                    )
                )

    return descriptor_values, blocks

In [44]:
# Test set: second part of the molecule split together with the test matrices.
inputs_test, outputs_test = extract_dataset(
    molecules[1], 
    p_test.reshape(-1, dim, dim),
    "C"
)

# Training set: first part of the molecule split together with the training
# matrices (previously this reused the test split by mistake).
inputs_train, outputs_train = extract_dataset(
    molecules[0], 
    p_train.reshape(-1, dim, dim),
    "C"
)

(804, 18)
804
