In [2]:
import h5py
import numpy as np
import os
np.random.seed(42)

In [3]:
def remove_zeros(positions, transfer_integrals):
    '''Drop every sample whose transfer integral is exactly zero.

    The same boolean mask is applied to both arrays, so entries that were
    paired by index before the call are still paired afterwards. Zeros are
    removed so that a logarithm can be taken during later processing.

    Returns the filtered ``(positions, transfer_integrals)`` pair.
    '''
    keep = transfer_integrals != 0.
    return positions[keep], transfer_integrals[keep]

def split(positions, transfer_integrals, len_train=0, len_test=0):
    '''Shuffle the dataset and cut out disjoint train and test parts.

    A single random permutation of the sample indices is drawn from numpy's
    global random state. The first ``len_train`` shuffled samples form the
    train part and the following ``len_test`` samples form the test part, so
    the two parts returned by one call never share a sample.

    Parameters
    ----------
    positions, transfer_integrals : arrays indexed by sample along axis 0;
        both must contain the same number of samples.
    len_train, len_test : non-negative number of samples for each part.

    Returns
    -------
    (positions_train, positions_test,
     transfer_integrals_train, transfer_integrals_test)

    Raises
    ------
    ValueError
        If the two arrays differ in length, a requested size is negative, or
        more samples are requested than are available (previously the result
        was silently truncated).
    '''
    n_samples = len(transfer_integrals)
    if len(positions) != n_samples:
        raise ValueError(
            f'positions ({len(positions)}) and transfer_integrals '
            f'({n_samples}) must contain the same number of samples'
        )
    if len_train < 0 or len_test < 0:
        raise ValueError(
            f'len_train and len_test must be non-negative, '
            f'got {len_train} and {len_test}'
        )
    if len_train + len_test > n_samples:
        raise ValueError(
            f'requested {len_train} train + {len_test} test samples, '
            f'but only {n_samples} are available'
        )

    permutation = np.random.permutation(n_samples)
    # Slice the index array first: only the requested samples are copied,
    # and the result equals shuffling everything and slicing afterwards.
    train_idx = permutation[:len_train]
    test_idx = permutation[len_train:(len_train + len_test)]
    return (positions[train_idx], positions[test_idx],
            transfer_integrals[train_idx], transfer_integrals[test_idx])


'''shuffles and splits a dataset into train and test parts (so that they do not share any samples)'''
def read_file(name):
    '''Load the original .h5 file of one molecule.

    ``name`` is inserted into the file name
    ``data/data_original/raw/john_<name>.h5``.

    Returns the contents of the "atoms", "positions" and
    "transfer_integrals" datasets, in that order.
    '''
    source = 'data/data_original/raw/john_' + name + '.h5'
    with h5py.File(source, 'r') as file:
        # [()] pulls each dataset out of the file while it is still open.
        atoms, positions, transfer_integrals = (
            file.get(key)[()]
            for key in ("atoms", "positions", "transfer_integrals")
        )
    return atoms, positions, transfer_integrals

### Debug files: penta 100, tetra 100, DNTT 50 (train); DNTT 50 (test)

In [56]:
# Recipe used for every dataset below: for each molecule, read the raw file,
# remove zero transfer integrals, shuffle/split, then append to the output file.
# pentacene
name, len_train, len_test = 'pentacene', 100, 0

# read, clean, split
atoms, positions, transfer_integrals = read_file(name)
positions, transfer_integrals = remove_zeros(positions, transfer_integrals)
positions_train, positions_test, transfer_integrals_train, transfer_integrals_test = split(
    positions, transfer_integrals, len_train=len_train, len_test=len_test
)



In [58]:
# write: start the combined debug training file with the pentacene group
write_name = 'debug_penta_100_tetra_100_DNTT_50_train.h5'
path = os.path.join('data', 'raw')

# The output file name decides which half of the split gets stored.
if "train" in write_name:
    positions, transfer_integrals = positions_train, transfer_integrals_train
elif "test" in write_name:
    positions, transfer_integrals = positions_test, transfer_integrals_test

# Mode 'w' creates the file (or truncates an existing one).
with h5py.File(os.path.join(path, write_name), 'w') as file:
    grp = file.create_group(name)
    for dset_name, dset_data in (('atoms', atoms),
                                 ('positions', positions),
                                 ('transfer_integrals', transfer_integrals)):
        grp.create_dataset(dset_name, data=dset_data)



In [59]:
# tetracene: 100 training samples, no test samples
name, len_train, len_test = 'tetracene', 100, 0

# read, clean, split
atoms, positions, transfer_integrals = read_file(name)
positions, transfer_integrals = remove_zeros(positions, transfer_integrals)
positions_train, positions_test, transfer_integrals_train, transfer_integrals_test = split(
    positions, transfer_integrals, len_train=len_train, len_test=len_test
)

In [60]:
# write: append the current molecule as a new group to the combined debug training file
write_name = 'debug_penta_100_tetra_100_DNTT_50_train.h5'
# Defined here as well so the cell does not depend on `path` left over from an earlier cell.
path = os.path.join('data', 'raw')

# The output file name decides which half of the split gets stored.
if "train" in write_name:
    positions = positions_train
    transfer_integrals = transfer_integrals_train
elif "test" in write_name:
    positions = positions_test
    transfer_integrals = transfer_integrals_test
else:
    # Without this, the full unsplit arrays would be written silently.
    raise ValueError(f"cannot tell from {write_name!r} whether to write the train or the test split")

# Mode 'a' keeps the groups that are already in the file.
with h5py.File(os.path.join(path, write_name), 'a') as file:
    # create_group raises if the group already exists, which made re-running
    # this cell fail; drop the stale group so the cell is idempotent.
    if name in file:
        del file[name]
    grp = file.create_group(name)
    grp.create_dataset('atoms', data=atoms)
    grp.create_dataset('positions', data=positions)
    grp.create_dataset('transfer_integrals', data=transfer_integrals)

In [61]:
# DNTT: 50 training and 50 test samples, taken from one shuffle so they are disjoint
name, len_train, len_test = 'DNTT', 50, 50

# read, clean, split
atoms, positions, transfer_integrals = read_file(name)
positions, transfer_integrals = remove_zeros(positions, transfer_integrals)
positions_train, positions_test, transfer_integrals_train, transfer_integrals_test = split(
    positions, transfer_integrals, len_train=len_train, len_test=len_test
)

In [62]:
# write: append the current molecule's training part to the combined debug training file
write_name = 'debug_penta_100_tetra_100_DNTT_50_train.h5'
# Defined here as well so the cell does not depend on `path` left over from an earlier cell.
path = os.path.join('data', 'raw')

# The output file name decides which half of the split gets stored.
if "train" in write_name:
    positions = positions_train
    transfer_integrals = transfer_integrals_train
elif "test" in write_name:
    positions = positions_test
    transfer_integrals = transfer_integrals_test
else:
    # Without this, the full unsplit arrays would be written silently.
    raise ValueError(f"cannot tell from {write_name!r} whether to write the train or the test split")

# Mode 'a' keeps the groups that are already in the file.
with h5py.File(os.path.join(path, write_name), 'a') as file:
    # create_group raises if the group already exists, which made re-running
    # this cell fail; drop the stale group so the cell is idempotent.
    if name in file:
        del file[name]
    grp = file.create_group(name)
    grp.create_dataset('atoms', data=atoms)
    grp.create_dataset('positions', data=positions)
    grp.create_dataset('transfer_integrals', data=transfer_integrals)

In [64]:
# write: store the test part of the current split in its own file
# (`path` and the split arrays are reused from the cells above)
write_name = 'debug_DNTT_50_test.h5'

# The output file name decides which half of the split gets stored.
if "train" in write_name:
    positions, transfer_integrals = positions_train, transfer_integrals_train
elif "test" in write_name:
    positions, transfer_integrals = positions_test, transfer_integrals_test

# Mode 'w' creates the file (or truncates an existing one).
with h5py.File(os.path.join(path, write_name), 'w') as file:
    grp = file.create_group(name)
    for dset_name, dset_data in (('atoms', atoms),
                                 ('positions', positions),
                                 ('transfer_integrals', transfer_integrals)):
        grp.create_dataset(dset_name, data=dset_data)

### debug data: penta 100 (train), tetra 50 (test)


In [3]:
# pentacene: 100 training samples, no test samples
name, len_train, len_test = 'pentacene', 100, 0

# read, clean, split
atoms, positions, transfer_integrals = read_file(name)
positions, transfer_integrals = remove_zeros(positions, transfer_integrals)
positions_train, positions_test, transfer_integrals_train, transfer_integrals_test = split(
    positions, transfer_integrals, len_train=len_train, len_test=len_test
)


In [4]:
# write: pentacene-only debug training file
write_name = 'debug_penta_100_train.h5'
path = os.path.join('data', 'raw')

# The output file name decides which half of the split gets stored.
if "train" in write_name:
    positions, transfer_integrals = positions_train, transfer_integrals_train
elif "test" in write_name:
    positions, transfer_integrals = positions_test, transfer_integrals_test

# Mode 'w' creates the file (or truncates an existing one).
with h5py.File(os.path.join(path, write_name), 'w') as file:
    grp = file.create_group(name)
    for dset_name, dset_data in (('atoms', atoms),
                                 ('positions', positions),
                                 ('transfer_integrals', transfer_integrals)):
        grp.create_dataset(dset_name, data=dset_data)



In [5]:
# tetracene: no training samples, 50 test samples
name, len_train, len_test = 'tetracene', 0, 50

# read, clean, split
atoms, positions, transfer_integrals = read_file(name)
positions, transfer_integrals = remove_zeros(positions, transfer_integrals)
positions_train, positions_test, transfer_integrals_train, transfer_integrals_test = split(
    positions, transfer_integrals, len_train=len_train, len_test=len_test
)

In [6]:
# write: tetracene-only debug test file
# (`path` is reused from an earlier write cell)
write_name = 'debug_tetra_50_test.h5'

# The output file name decides which half of the split gets stored.
if "train" in write_name:
    positions, transfer_integrals = positions_train, transfer_integrals_train
elif "test" in write_name:
    positions, transfer_integrals = positions_test, transfer_integrals_test

# Mode 'w' creates the file (or truncates an existing one).
with h5py.File(os.path.join(path, write_name), 'w') as file:
    grp = file.create_group(name)
    for dset_name, dset_data in (('atoms', atoms),
                                 ('positions', positions),
                                 ('transfer_integrals', transfer_integrals)):
        grp.create_dataset(dset_name, data=dset_data)

### debug data: penta 50 (test)


In [5]:
# pentacene: no training samples, 50 test samples
# NOTE(review): this draws a fresh shuffle, independent of the one behind
# 'debug_penta_100_train.h5', so these test samples may overlap with that
# training file — confirm this is acceptable.
name, len_train, len_test = 'pentacene', 0, 50

# read, clean, split
atoms, positions, transfer_integrals = read_file(name)
positions, transfer_integrals = remove_zeros(positions, transfer_integrals)
positions_train, positions_test, transfer_integrals_train, transfer_integrals_test = split(
    positions, transfer_integrals, len_train=len_train, len_test=len_test
)


In [6]:
# write: pentacene-only debug test file
write_name = 'debug_penta_50_test.h5'
path = os.path.join('data', 'raw')

# The output file name decides which half of the split gets stored.
if "train" in write_name:
    positions, transfer_integrals = positions_train, transfer_integrals_train
elif "test" in write_name:
    positions, transfer_integrals = positions_test, transfer_integrals_test

# Mode 'w' creates the file (or truncates an existing one).
with h5py.File(os.path.join(path, write_name), 'w') as file:
    grp = file.create_group(name)
    for dset_name, dset_data in (('atoms', atoms),
                                 ('positions', positions),
                                 ('transfer_integrals', transfer_integrals)):
        grp.create_dataset(dset_name, data=dset_data)



### data for Exp 7 (train)
#### penta 15000, tetra 5000 (train)

In [8]:
# pentacene: 15000 training samples, no test samples
name, len_train, len_test = 'pentacene', 15000, 0

# read, clean, split
atoms, positions, transfer_integrals = read_file(name)
positions, transfer_integrals = remove_zeros(positions, transfer_integrals)
positions_train, positions_test, transfer_integrals_train, transfer_integrals_test = split(
    positions, transfer_integrals, len_train=len_train, len_test=len_test
)


In [9]:
# write: start the Exp 7 training file with the pentacene group
write_name = 'penta_15k_tetra_5k_train.h5'
path = os.path.join('data', 'raw')

# The output file name decides which half of the split gets stored.
if "train" in write_name:
    positions, transfer_integrals = positions_train, transfer_integrals_train
elif "test" in write_name:
    positions, transfer_integrals = positions_test, transfer_integrals_test

# Mode 'w' creates the file (or truncates an existing one).
with h5py.File(os.path.join(path, write_name), 'w') as file:
    grp = file.create_group(name)
    for dset_name, dset_data in (('atoms', atoms),
                                 ('positions', positions),
                                 ('transfer_integrals', transfer_integrals)):
        grp.create_dataset(dset_name, data=dset_data)



In [10]:
# tetracene: 5000 training samples, no test samples
name, len_train, len_test = 'tetracene', 5000, 0

# read, clean, split
atoms, positions, transfer_integrals = read_file(name)
positions, transfer_integrals = remove_zeros(positions, transfer_integrals)
positions_train, positions_test, transfer_integrals_train, transfer_integrals_test = split(
    positions, transfer_integrals, len_train=len_train, len_test=len_test
)

In [11]:
# write: append the current molecule as a new group to the Exp 7 training file
write_name = 'penta_15k_tetra_5k_train.h5'
path = os.path.join('data', 'raw')

# The output file name decides which half of the split gets stored.
if "train" in write_name:
    positions = positions_train
    transfer_integrals = transfer_integrals_train
elif "test" in write_name:
    positions = positions_test
    transfer_integrals = transfer_integrals_test
else:
    # Without this, the full unsplit arrays would be written silently.
    raise ValueError(f"cannot tell from {write_name!r} whether to write the train or the test split")

# Mode 'a' keeps the groups that are already in the file.
with h5py.File(os.path.join(path, write_name), 'a') as file:
    # create_group raises if the group already exists, which made re-running
    # this cell fail; drop the stale group so the cell is idempotent.
    if name in file:
        del file[name]
    grp = file.create_group(name)
    grp.create_dataset('atoms', data=atoms)
    grp.create_dataset('positions', data=positions)
    grp.create_dataset('transfer_integrals', data=transfer_integrals)



### Debug


In [12]:
# read the written file back and report how many samples each group holds
write_name = 'penta_15k_tetra_5k_train.h5'
path = os.path.join('data', 'raw')

# one dictionary per quantity, keyed by group name
atoms, positions, transfer_integrals = {}, {}, {}
with h5py.File(os.path.join(path, write_name), "r") as file:
    # list top-level groups/datasets in file
    groups = list(file.keys())
    for group in groups:
        node = file[group]
        # get content and convert to numpy arrays
        atoms[group] = node.get("atoms")[()].astype(str)
        positions[group] = node.get("positions")[()]
        transfer_integrals[group] = node.get("transfer_integrals")[()]
        print(group)
        print(f'length: {len(transfer_integrals[group])}')



pentacene
length: 15000
tetracene
length: 5000
