In [69]:
import re
from typing import List
import numpy as np


def get_Mayer_bond(log_path: str,num_atoms: int) -> np.ndarray:
    """
    Extracts Mayer bond orders from a log file.

    Parameters:
        log_path (str): Path to the log file.

    Returns:
        np.ndarray: A 2D array containing Mayer bond orders.
    
    Raises:
        ValueError: If the log file does not contain enough lines or 
                    if the number of atoms cannot be parsed.
    """
    with open(log_path, 'r') as file:
        text = file.read()

    # Clean the text
    text = re.sub(r'\s\)', ')', text)
    text = re.sub(r'[ \t]+', ' ', text)

    lines = text.split('\n')
    if len(lines) < 4:
        raise ValueError("Log file does not contain enough lines to extract Mayer bond orders.")

    bond_order_lines = lines[2:-1]

    N_atom = num_atoms

    bond = np.zeros((N_atom, 0))  # Initialize with zero columns
    n = 0
    total_lines = len(bond_order_lines)

    while n + N_atom + 1 <= total_lines:
        # Assuming the first line in each block is a header or separator, skip it
        block = bond_order_lines[n + 1:n + N_atom + 1]
        bond_tmp = np.array([list(map(float, line.split()[1:])) for line in block])
        bond = np.hstack((bond, bond_tmp.reshape(N_atom, -1)))
        n += N_atom + 1

    return np.array(bond)


def get_CDFT_Atom_descriptor(log_path: str) -> np.ndarray:
    """
    Extracts CDFT atom descriptors from a log file.

    Parameters:
        log_path (str): Path to the log file.

    Returns:
        np.ndarray: A 2D array containing CDFT atom descriptors.
    
    Raises:
        ValueError: If any of the required patterns are not found in the log file 
                    or if the descriptor values cannot be parsed.
    """
    with open(log_path, 'r') as file:
        text = file.read()

    # Clean the text
    text = re.sub(r'\s\)', ')', text)
    text = re.sub(r'[ \t]+', ' ', text)

    patterns = {
        'q_N': r'Atom q\(N\)(.*?)Condensed local electrophilicity',
        'electrophilicity': r'Atom\s+Electrophilicity(.*?)Condensed local softness',
        's_minus': r'Atom\s+s\-(.*?)E\(N\)',
    }

    matched_texts = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)
        if not match:
            raise ValueError(f"Pattern '{pattern}' not found in the log file.")
        # Split into lines and exclude the header and last two lines
        lines = match.group(1).strip().split('\n')[1:]
        matched_texts[key] = lines

    q_N = np.array([list(map(float, line.split()[1:])) for line in matched_texts['q_N']])
    electrophilicity = np.array([list(map(float, line.split()[1:])) for line in matched_texts['electrophilicity']])
    s_minus = np.array([list(map(float, line.split()[1:])) for line in matched_texts['s_minus']])

    # Concatenate all descriptors horizontally
    CDFT_descriptor = np.hstack((q_N, electrophilicity, s_minus))

    return np.array(CDFT_descriptor)


def get_CDFT_Mol_descriptor(CDFT_path: str) -> List[float]:
    """
    Extracts CDFT molecular descriptors from a log file.

    Parameters:
        CDFT_path (str): Path to the CDFT log file.

    Returns:
        List[float]: A list of molecular descriptor values in eV.
    
    Raises:
        ValueError: If no molecular descriptors are found in the log file 
                    or if the descriptor values cannot be converted to float.
    """
    with open(CDFT_path, 'r') as file:
        text = file.read()

    pattern = (
        r"(E_HOMO\(N\)|E_HOMO\(N\+1\)|First vertical IP|First vertical EA|"
        r"Mulliken electronegativity|Chemical potential|Hardness \(=fundamental gap\)|"
        r"Electrophilicity index|Nucleophilicity index):\s*[-\d.]+ Hartree,\s*([-]?\d+\.\d+) eV"
    )

    matches = re.findall(pattern, text)

    if not matches:
        raise ValueError("No molecular descriptors found in the log file.")

    values = [float(match[1]) for match in matches]

    return values



In [2]:

def read_xyz(filename):
    with open(filename, 'r',encoding="utf-8") as file:
        num_atoms = int(file.readline().strip())
        comment = file.readline().strip()
        elements = []
        coordinates = []

        for line in file:
            parts = line.split()
            elements.append(parts[0])
            coordinates.append([float(parts[1]), float(parts[2]), float(parts[3])])

    return num_atoms, comment, np.array(elements), np.array(coordinates)

In [3]:
num_atoms, comment, elements, coordinates = read_xyz("test.xyz")

In [4]:
x = X

edge_index = E

edge_attr = EF, 

y = y_tensor, 

global_features = global_features, 

x3d = x3d, 

x2d = x2d, 

pos = pos, 

path = XYZ_path.replace("xyz_files\\","").replace(".xyz",""), 

fragment = [Side_chain1,Side_chain2,Frame_Part]

36

In [4]:
Mol_CDFT = get_CDFT_Mol_descriptor("CDFT.txt")
Atom_CDFT = get_CDFT_Atom_descriptor("CDFT.txt")
Mayer_bond = get_Mayer_bond("bndmat.txt",num_atoms)


In [13]:
mol = Chem.Mol(Chem.MolFromXYZFile("test.xyz"))
rdDetermineBonds.DetermineBonds(mol,charge=0)
mg = MolGraph()
mg.read_xyz("test.xyz")
G = to_networkx_graph(mg)
n_nodes = len(G.nodes)
n_edges = 2*len(G.edges)

In [27]:
n_nodes

36

In [29]:
get_CDFT_Atom_descriptor("CDFT.txt").shape

(34, 15)

In [25]:
num_atoms, comment, elements, coordinates = read_xyz("test.xyz")

Atom_CDFT = get_CDFT_Atom_descriptor("CDFT.txt")
  

X = np.zeros((n_nodes, n_node_features))
for atom in mol.GetAtoms():
    X[atom.GetIdx(), :] = get_atom_features(atom)

distance_feature = np.zeros((coordinates.shape[0], 8))

for i in range(coordinates.shape[0]):
    distance_feature[i, 0] = distance_index([0, 0, 0], coordinates[i])
    distance_feature[i, 1] = distance_index(coordinates[0], coordinates[i])
    distance_feature[i, 2] = distance_index(coordinates[1], coordinates[i])
    distance_feature[i, 3] = distance_index(coordinates[2], coordinates[i])
    distance_feature[i, 4] = distance_index(coordinates[3], coordinates[i])
    distance_feature[i, 5] = distance_index(coordinates[4], coordinates[i])
    distance_feature[i, 6] = distance_index(coordinates[5], coordinates[i])
    distance_feature[i, 7] = distance_index(coordinates[6], coordinates[i])

X = np.hstack((Atom_CDFT,distance_feature, X))

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 34 and the array at index 1 has size 36

In [7]:

import os

def get_G_Side_chain(G):
    """
    返回EDY两端的侧链

    :param G: 标准化后的图
    :return: 两个侧链的index以及骨架的index
    """
    ring_type = False
    G_0 = G.copy()
    G_0.remove_node(0)
    G_0.remove_node(1)
    G_0.remove_node(4)
    G_0.remove_node(5)
    connected_components = list(nx.connected_components(G_0))
    if len(connected_components) == 3:
        ring_type = True
        for i in connected_components:
            if 2 not in i: Side_chain1, Side_chain2 = i, i
            if 2 in i: Frame_Part = i
    
    elif len(connected_components) == 4:
        for i in connected_components:
            if 7 in i: Side_chain1 = i
            if 6 in i: Side_chain2 = i
            if 2 in i: Frame_Part = i
    else:
        return False
    return list(Side_chain1), list(Side_chain2), list(Frame_Part), ring_type


def get_G_Side_chain(G):
    """
    返回EDY两端的侧链

    :param G: 标准化后的图
    :return: 两个侧链的index以及骨架的index
    """
    ring_type = False
    G_0 = G.copy()
    G_0.remove_node(0)
    G_0.remove_node(1)
    G_0.remove_node(4)
    G_0.remove_node(5)
    connected_components = list(nx.connected_components(G_0))
    if len(connected_components) == 3:
        ring_type = True
        for i in connected_components:
            if 2 not in i: Side_chain1, Side_chain2 = i, i
            if 2 in i: Frame_Part = i
    
    elif len(connected_components) == 4:
        for i in connected_components:
            if 7 in i: Side_chain1 = i
            if 6 in i: Side_chain2 = i
            if 2 in i: Frame_Part = i
    else:
        return False
    return list(Side_chain1), list(Side_chain2), list(Frame_Part), ring_type




def one_hot_encoding(x, permitted_list):
    """
    Maps input elements x which are not in the permitted list to the last element
    of the permitted list.
    """

    if x not in permitted_list:
        x = permitted_list[-1]

    binary_encoding = [int(boolean_value) for boolean_value in list(map(lambda s: x == s, permitted_list))]

    return binary_encoding



def get_atom_features(atom):
    """
    Takes an RDKit atom object as input and gives a 1d-numpy array of atom features as output.
    """
    # define list of permitted atoms
    permitted_list_of_atoms =  ['C','N','O','S','F','Cl','H']
    # compute atom features
    atom_type_enc = one_hot_encoding(str(atom.GetSymbol()), permitted_list_of_atoms)
    n_heavy_neighbors_enc = one_hot_encoding(int(atom.GetDegree()), [0, 1, 2, 3, 4, "MoreThanFour"])
    hybridisation_type_enc = one_hot_encoding(str(atom.GetHybridization()), ["S", "SP", "SP2", "SP3", "SP3D", "SP3D2", "OTHER"])
    is_in_a_ring_enc = [int(atom.IsInRing())]
    is_aromatic_enc = [int(atom.GetIsAromatic())]
    atomic_mass_scaled = [float((atom.GetMass()))]
    vdw_radius_scaled = [float(Chem.GetPeriodicTable().GetRvdw(atom.GetAtomicNum()))]
    covalent_radius_scaled = [float((Chem.GetPeriodicTable().GetRcovalent(atom.GetAtomicNum())))]
    atom_feature_vector = atom_type_enc + n_heavy_neighbors_enc + hybridisation_type_enc + is_in_a_ring_enc + is_aromatic_enc + atomic_mass_scaled + vdw_radius_scaled + covalent_radius_scaled
    return np.array(atom_feature_vector)


def get_bond_features(bond,
                      use_stereochemistry = False):
    """
    Takes an RDKit bond object as input and gives a 1d-numpy array of bond features as output.
    """

    permitted_list_of_bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    bond_type_enc = one_hot_encoding(bond.GetBondType(), permitted_list_of_bond_types)
    bond_is_conj_enc = [int(bond.GetIsConjugated())]
    bond_is_in_ring_enc = [int(bond.IsInRing())]
    bond_feature_vector = bond_type_enc + bond_is_conj_enc + bond_is_in_ring_enc
    if use_stereochemistry == True:
        stereo_type_enc = one_hot_encoding(str(bond.GetStereo()), ["STEREOZ", "STEREOE", "STEREOANY", "STEREONONE"])
        bond_feature_vector += stereo_type_enc
    return np.array(bond_feature_vector)



def distance_index(arr1, arr2):
    arr1 = np.array(arr1)
    arr2 = np.array(arr2)
    diff_squared = np.square(arr1 - arr2)
    distance = np.sqrt(np.sum(diff_squared))
    return distance
def calculate_angle(A, B, C):
    # Convert points to numpy arrays
    A = np.array(A)
    B = np.array(B)
    C = np.array(C)
    # Calculate vectors
    BA = B - A
    BC = B - C
    TheNorm = np.linalg.norm(BA) * np.linalg.norm(BC)

    rho = np.rad2deg(np.arcsin(np.cross(BA, BC) / TheNorm))

    theta = np.rad2deg(np.arccos(np.dot(BA, BC) / TheNorm))

    if rho[-1] < 0:
        theta = 360-theta
    return theta

def read_xyz(filename):
    with open(filename, 'r',encoding="utf-8") as file:
        num_atoms = int(file.readline().strip())
        comment = file.readline().strip()
        elements = []
        coordinates = []

        for line in file:
            parts = line.split()
            elements.append(parts[0])
            coordinates.append([float(parts[1]), float(parts[2]), float(parts[3])])

    return num_atoms, comment, np.array(elements), np.array(coordinates)
    
def format_xyz(num_atoms, comment, elements, coordinates):
    width=15
    precision=6
    lines = []
    lines.append(str(num_atoms))
    lines.append(comment)
    line_format = f"{{}} {{:>{width}.{precision}f}} {{:>{width}.{precision}f}} {{:>{width}.{precision}f}}"

    for element, coord in zip(elements, coordinates):
        line = line_format.format(element, coord[0], coord[1], coord[2])
        lines.append(line)

    return '\n'.join(lines)
def delete_tmp_xyz():
    filename = 'tmp.xyz'
    if os.path.exists(filename):
        os.remove(filename)
        
def calculate_volume(elements,coordinates,Side_chain):
    
    tmp = format_xyz(len(Side_chain),"xyz",elements[Side_chain],coordinates[Side_chain])
    delete_tmp_xyz()
    with open('./tmp.xyz', 'w') as file:
        file.write(tmp)
    
    mol = db.dbstep("tmp.xyz",commandline=True,verbose=False,volume=True,quiet=True,measure='classic')  
    delete_tmp_xyz()
    return mol.bur_vol,mol.occ_vol

In [8]:
    unrelated_smiles = "O=O"
    unrelated_mol = Chem.MolFromSmiles(unrelated_smiles)
    n_node_features = len(get_atom_features(unrelated_mol.GetAtomWithIdx(0)))
    n_edge_features = len(get_bond_features(unrelated_mol.GetBondBetweenAtoms(0,1)))

In [74]:
import numpy as np
import torch
import re
import os
import networkx as nx
from rdkit import Chem
from rdkit.Chem import rdDetermineBonds, AllChem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from torch_geometric.data import Data
from torch.utils.data import DataLoader
import math
from xyz2graph import MolGraph, to_networkx_graph
import dbstep.Dbstep as db
from tqdm import tqdm

In [9]:
n_node_features

25

In [10]:
n_edge_features

6

In [30]:
log_path="CDFT.txt"

In [46]:
    with open(log_path, 'r') as file:
        text = file.read()

    # Clean the text
    text = re.sub(r'\s\)', ')', text)
    text = re.sub(r'[ \t]+', ' ', text)

    patterns = {
        'q_N': r'Atom q\(N\)(.*?)Condensed local electrophilicity',
        'electrophilicity': r'Atom\s+Electrophilicity(.*?)Condensed local softness',
        's_minus': r'Atom\s+s\-(.*?)E\(N\)',
    }

    matched_texts = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)
        if not match:
            raise ValueError(f"Pattern '{pattern}' not found in the log file.")
        # Split into lines and exclude the header and last two lines
        lines = match.group(1).strip().split('\n')[1:]
        matched_texts[key] = lines

    q_N = np.array([list(map(float, line.split()[1:])) for line in matched_texts['q_N']])
    electrophilicity = np.array([list(map(float, line.split()[1:])) for line in matched_texts['electrophilicity']])
    s_minus = np.array([list(map(float, line.split()[1:])) for line in matched_texts['s_minus']])

    # Concatenate all descriptors horizontally
    CDFT_descriptor = np.hstack((q_N, electrophilicity, s_minus))

In [47]:
CDFT_descriptor.shape

(36, 15)

In [40]:
    text = re.sub(r'\s\)', ')', text)
    text = re.sub(r'[ \t]+', ' ', text)

In [42]:
    patterns = {
        'q_N': r'Atom q\(N\)(.*?)Condensed local electrophilicity',
        'electrophilicity': r'Atom\s+Electrophilicity(.*?)Condensed local softness',
        's_minus': r'Atom\s+s\-(.*?)E\(N\)',
    }

    matched_texts = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)

In [45]:
match.group(1).strip().split('\n')

['s+ s0 s+/s- s-/s+ s(2)',
 ' 1(C) 0.2957 0.3807 0.3382 1.2874 0.7768 0.3004',
 ' 2(C) 0.3571 0.1115 0.2343 0.3123 3.2017 -0.8681',
 ' 3(C) 0.1977 0.4574 0.3276 2.3137 0.4322 0.9181',
 ' 4(C) 0.3522 0.4109 0.3815 1.1665 0.8573 0.2073',
 ' 5(C) 0.1248 0.1573 0.1410 1.2606 0.7933 0.1149',
 ' 6(C) 0.3484 0.3992 0.3738 1.1458 0.8728 0.1795',
 ' 7(N) 0.3219 0.1157 0.2188 0.3596 2.7812 -0.7287',
 ' 8(H) 0.1228 0.1912 0.1570 1.5568 0.6424 0.2418',
 ' 9(H) 0.1477 0.1798 0.1637 1.2169 0.8218 0.1132',
 ' 10(C) 0.0377 0.0558 0.0467 1.4815 0.6750 0.0641',
 ' 11(C) 0.0641 0.0479 0.0560 0.7472 1.3384 -0.0573',
 ' 12(C) 0.0318 0.0148 0.0233 0.4637 2.1565 -0.0604',
 ' 13(C) 0.0229 0.0318 0.0273 1.3904 0.7192 0.0316',
 ' 14(H) 0.0840 0.1040 0.0940 1.2368 0.8085 0.0704',
 ' 15(C) 0.0500 0.0617 0.0558 1.2336 0.8106 0.0413',
 ' 16(C) 0.0351 0.0264 0.0308 0.7523 1.3293 -0.0308',
 ' 17(H) 0.1010 0.0652 0.0831 0.6453 1.5496 -0.1266',
 ' 18(H) 0.0688 0.0516 0.0602 0.7490 1.3351 -0.0611',
 ' 19(C) 0.0359 0.028

In [63]:
XYZ_path = "test.xyz"
CDFT_path = "CDFT.txt"
unrelated_smiles = "O=O"
unrelated_mol = Chem.MolFromSmiles(unrelated_smiles)
n_node_features = len(get_atom_features(unrelated_mol.GetAtomWithIdx(0)))
n_edge_features = len(get_bond_features(unrelated_mol.GetBondBetweenAtoms(0,1)))

mg = MolGraph()
mg.read_xyz(XYZ_path)
G = to_networkx_graph(mg)

num_atoms, comment, elements, coordinates = read_xyz(XYZ_path)


Side_chain1,Side_chain2,Frame_Part,ring_type = get_G_Side_chain(G)
bur_vol_1,occ_vol_1 = calculate_volume(elements,coordinates,Side_chain1)
bur_vol_2,occ_vol_2 = calculate_volume(elements,coordinates,Side_chain2)
global_features = [bur_vol_1, occ_vol_1 ,bur_vol_2 ,occ_vol_2]+ get_CDFT_Mol_descriptor(CDFT_path) 


raw_mol = Chem.MolFromXYZFile(XYZ_path)
num_atoms, comment, elements, coordinates = read_xyz(XYZ_path)
mol = Chem.Mol(raw_mol)
rdDetermineBonds.DetermineBonds(mol,charge=0)
n_nodes = len(G.nodes)
n_edges = 2*len(G.edges)
  
Atom_CDFT = get_CDFT_Atom_descriptor("CDFT.txt")

X = np.zeros((n_nodes, n_node_features))
for atom in mol.GetAtoms():
    X[atom.GetIdx(), :] = get_atom_features(atom)


distance_feature = np.zeros((coordinates.shape[0], 8))

for i in range(coordinates.shape[0]):
    distance_feature[i, 0] = distance_index([0, 0, 0], coordinates[i])
    distance_feature[i, 1] = distance_index(coordinates[0], coordinates[i])
    distance_feature[i, 2] = distance_index(coordinates[1], coordinates[i])
    distance_feature[i, 3] = distance_index(coordinates[2], coordinates[i])
    distance_feature[i, 4] = distance_index(coordinates[3], coordinates[i])
    distance_feature[i, 5] = distance_index(coordinates[4], coordinates[i])
    distance_feature[i, 6] = distance_index(coordinates[5], coordinates[i])
    distance_feature[i, 7] = distance_index(coordinates[6], coordinates[i])

X = np.hstack((CDFT_descriptor,distance_feature, X))

(rows, cols) = np.nonzero(GetAdjacencyMatrix(mol))
torch_rows = torch.from_numpy(rows.astype(np.int64)).to(torch.long)
torch_cols = torch.from_numpy(cols.astype(np.int64)).to(torch.long)


# construct edge feature array EF of shape (n_edges, n_edge_features)
EF = np.zeros((n_edges, n_edge_features+1))

bond_order = get_Mayer_bond("bndmat.txt",num_atoms)
if bond_order.shape != (n_nodes, n_nodes):print("Error in bond")
for (k, (i,j)) in enumerate(zip(rows, cols)):
    EF[k] = np.append(get_bond_features(mol.GetBondBetweenAtoms(int(i),int(j))), bond_order[i,j])


# y_val = float(get_energy(XYZ_path,-1)) 
y_val=000
global_features = torch.tensor(global_features, dtype = torch.float)
X = torch.tensor(X, dtype = torch.float)
E = torch.stack([torch_rows, torch_cols], dim = 0)
EF = torch.tensor(EF, dtype = torch.float)
y_tensor = torch.tensor(np.array([y_val]), dtype = torch.float)

pos = torch.tensor(coordinates, dtype = torch.float)

In [None]:
Mol_CDFT = get_CDFT_Mol_descriptor("CDFT.txt")
Atom_CDFT = get_CDFT_Atom_descriptor("CDFT.txt")
Mayer_bond = get_Mayer_bond("bndmat.txt",num_atoms)


In [54]:
bur_vol_1, occ_vol_1 ,bur_vol_2 ,occ_vol_2

(30.529294380920952,
 54.808250000000015,
 30.529294380920952,
 54.808250000000015)

In [58]:
[bur_vol_1, occ_vol_1 ,bur_vol_2 ,occ_vol_2]+ get_CDFT_Mol_descriptor(CDFT_path) 

[30.529294380920952,
 54.808250000000015,
 30.529294380920952,
 54.808250000000015,
 -4.8595,
 2.872,
 6.5644,
 -1.1335,
 2.7155,
 -2.7155,
 7.6979,
 0.4789,
 4.2617]

In [73]:
import numpy as np
import torch
from rdkit import Chem
from rdkit.Chem import rdDetermineBonds
import sys

# Define file paths
XYZ_PATH="test.xyz"
# XYZ_PATH =  sys.argv[1]
CDFT_PATH = "CDFT.txt"
BNDMAT_PATH = "bndmat.txt"


# Set up constants
UNRELATED_SMILES = "O=O"
unrelated_mol = Chem.MolFromSmiles(UNRELATED_SMILES)
n_node_features = len(get_atom_features(unrelated_mol.GetAtomWithIdx(0)))
n_edge_features = len(get_bond_features(unrelated_mol.GetBondBetweenAtoms(0, 1)))

# Read XYZ file and generate molecular graph
mg = MolGraph()
mg.read_xyz(XYZ_PATH)
G = to_networkx_graph(mg)

num_atoms, comment, elements, coordinates = read_xyz(XYZ_PATH)

# Get Side Chains and Frame Part
side_chain_1, side_chain_2, frame_part, ring_type = get_G_Side_chain(G)
bur_vol_1, occ_vol_1 = calculate_volume(elements, coordinates, side_chain_1)
bur_vol_2, occ_vol_2 = calculate_volume(elements, coordinates, side_chain_2)

# Obtain global molecular descriptors
global_features = [bur_vol_1, occ_vol_1, bur_vol_2, occ_vol_2] + get_CDFT_Mol_descriptor(CDFT_PATH)

# Load the molecule from the XYZ file and determine bonds
raw_mol = Chem.MolFromXYZFile(XYZ_PATH)
mol = Chem.Mol(raw_mol)
rdDetermineBonds.DetermineBonds(mol, charge=0)

n_nodes = len(G.nodes)
n_edges = 2 * len(G.edges)

# Get atom-level CDFT descriptors
atom_CDFT = get_CDFT_Atom_descriptor(CDFT_PATH)

# Construct the node feature matrix X
X = np.zeros((n_nodes, n_node_features))
for atom in mol.GetAtoms():
    X[atom.GetIdx(), :] = get_atom_features(atom)

# Construct distance feature matrix
distance_feature = np.zeros((coordinates.shape[0], 8))
reference_points = [
    [0, 0, 0],
    coordinates[0],
    coordinates[1],
    coordinates[2],
    coordinates[3],
    coordinates[4],
    coordinates[5],
    coordinates[6],
]

for i in range(coordinates.shape[0]):
    for j, ref_point in enumerate(reference_points):
        distance_feature[i, j] = distance_index(ref_point, coordinates[i])

# Concatenate CDFT descriptors and distance features to node features
X = np.hstack((atom_CDFT, distance_feature, X))

# Get adjacency information for edges
(rows, cols) = np.nonzero(GetAdjacencyMatrix(mol))
torch_rows = torch.from_numpy(rows.astype(np.int64)).to(torch.long)
torch_cols = torch.from_numpy(cols.astype(np.int64)).to(torch.long)

# Construct edge feature array EF of shape (n_edges, n_edge_features)
EF = np.zeros((n_edges, n_edge_features + 1))

# Get bond order from Mayer bond matrix
bond_order = get_Mayer_bond(BNDMAT_PATH, num_atoms)
if bond_order.shape != (n_nodes, n_nodes):
    print("Error in bond order matrix dimensions")

for k, (i, j) in enumerate(zip(rows, cols)):
    bond_features = get_bond_features(mol.GetBondBetweenAtoms(int(i), int(j)))
    EF[k] = np.append(bond_features, bond_order[i, j])

# Create tensors for model input
y_val = 0.0  # Placeholder value for target variable
global_features_tensor = torch.tensor(global_features, dtype=torch.float)
X_tensor = torch.tensor(X, dtype=torch.float)
E_tensor = torch.stack([torch_rows, torch_cols], dim=0)
EF_tensor = torch.tensor(EF, dtype=torch.float)
y_tensor = torch.tensor(np.array([y_val]), dtype=torch.float)
position_tensor = torch.tensor(coordinates, dtype=torch.float)

# Output the tensors for further processing
print("Global Features Tensor:", global_features_tensor)
print("Node Feature Tensor X:", X_tensor)
print("Edge Index Tensor E:", E_tensor)
print("Edge Feature Tensor EF:", EF_tensor)
print("Target Tensor y:", y_tensor)
print("Position Tensor:", position_tensor)

Global Features Tensor: tensor([30.5293, 54.8083, 30.5293, 54.8083, -4.8595,  2.8720,  6.5644, -1.1335,
         2.7155, -2.7155,  7.6979,  0.4789,  4.2617])
Node Feature Tensor X: tensor([[-0.0142, -0.1219,  0.0694,  ..., 12.0110,  1.7000,  0.7600],
        [-0.0882, -0.1198,  0.0128,  ..., 12.0110,  1.7000,  0.7600],
        [-0.0293, -0.1587,  0.0267,  ..., 12.0110,  1.7000,  0.7600],
        ...,
        [ 0.0354,  0.0391,  0.0365,  ...,  1.0080,  1.2000,  0.3100],
        [ 0.0337,  0.0278,  0.0425,  ...,  1.0080,  1.2000,  0.3100],
        [ 0.0351,  0.0204,  0.0531,  ...,  1.0080,  1.2000,  0.3100]])
Edge Index Tensor E: tensor([[ 0,  0,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  5,  5,  6,  6,  6,  7,
          8,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13,
         14, 14, 14, 14, 15, 15, 15, 15, 16, 17, 18, 18, 18, 18, 19, 19, 19, 19,
         20, 21, 22, 23, 24, 25, 26, 27, 27, 28, 28, 28, 28, 29, 30, 31, 32, 33,
         34, 35],
        [ 1,  6,  0,  

In [66]:
atom_CDFT.shape

(34, 15)

In [68]:
get_CDFT_Atom_descriptor(CDFT_PATH).shape

(34, 15)