In [1]:
import sys
sys.path.insert(0, '../')
import numpy as np
import ase.io as ase_io
from ase import Atoms
import tqdm
from pathos.multiprocessing import ProcessingPool as Pool


import pyximport; pyximport.install()
import nice_utilities
import naive, radial_basis, spherical_coefficients, spherical_harmonics, convert_rascal_coefficients
from convert_rascal_coefficients import convert_rascal_coefficients, normalize_by_ps
import ClebschGordan, test_utilities

In [2]:
def process_structures(structures, box_size=120):
    """Place every structure in a periodic cubic box, modifying it in place.

    For each structure the cell is set to a cube of edge ``box_size``, all
    positions are shifted by half the box edge along every axis, periodic
    boundary conditions are switched on and ``wrap()`` is called.

    Parameters
    ----------
    structures : iterable
        Objects exposing ``cell``, ``positions``, ``pbc`` and ``wrap()``;
        the notebook passes the list returned by ``ase.io.read``.
    box_size : float, optional
        Edge length of the cubic cell. Defaults to 120, which reproduces the
        previously hard-coded cell of 120 and shift of 60.

    Returns
    -------
    None
        Nothing is returned; the structures themselves are updated.

    Notes
    -----
    The shift is applied with ``+=``, so calling this function a second time
    on the same structures shifts them again.
    """
    half_box = box_size / 2
    shift = np.asarray((half_box, half_box, half_box))
    for structure in structures:
        structure.cell = [box_size, box_size, box_size]
        structure.positions += shift
        structure.pbc = True
        structure.wrap()

In [3]:
# Read the frames selected by the slice ':100000' (the first 100000) from the
# structures file, then box them; process_structures edits the list in place.
structures = ase_io.read('../structures.xyz', index = ':100000')
process_structures(structures)

In [4]:
# Hyperparameters handed to get_rascal_coefficients by the cells below.
R_CUT = 6.3  # becomes the 'interaction_cutoff' hyper

N_MAX = 9  # becomes the 'max_radial' hyper
L_MAX = 9  # becomes the 'max_angular' hyper; also the l_max of the power spectrum


In [5]:
import rascal
from rascal.representations import SphericalInvariants as SOAP
from rascal.representations import SphericalExpansion as SPH
from rascal.neighbourlist.structure_manager import (
        mask_center_atoms_by_species, mask_center_atoms_by_id)


    
    
def get_rascal_coefficients(structures, r_cut, n_max, l_max, n_types, normalize = True):
    """Compute spherical-expansion coefficients for a batch of structures.

    A ``SphericalExpansion`` (``SPH``) representation is built with a GTO
    radial basis, a constant gaussian sigma of 0.05 and a cutoff smoothing
    width of 0.3. The structures are transformed, and the resulting feature
    matrix is converted with ``convert_rascal_coefficients``.

    Parameters
    ----------
    structures : the batch of structures passed to ``SPH.transform``.
    r_cut : value used for the 'interaction_cutoff' hyper.
    n_max : value used for 'max_radial'; also forwarded to the converter.
    l_max : value used for 'max_angular'; also forwarded to the converter.
    n_types : forwarded to ``convert_rascal_coefficients``.
        # presumably the number of chemical species - confirm against the converter
    normalize : bool, optional
        When true, ``normalize_by_ps`` is called on the converted
        coefficients. Its return value is ignored, so it is presumably an
        in-place operation - TODO confirm.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the converted coefficients. For the runs in this
        notebook the printed shape is
        (n_environments, n_max * n_types, l_max + 1, 2 * l_max + 1).
    """
    # 'expansion_by_species_method' is not set, so the library default applies.
    hypers = dict(
        interaction_cutoff = r_cut,
        max_radial = n_max,
        max_angular = l_max,
        gaussian_sigma_constant = 0.05,
        gaussian_sigma_type = 'Constant',
        cutoff_smooth_width = 0.3,
        radial_basis = 'GTO',
    )

    expansion = SPH(**hypers)
    features = expansion.transform(structures).get_features(expansion)
    coefficients = convert_rascal_coefficients(features, n_max, n_types, l_max)

    if normalize:
        normalize_by_ps(coefficients)
    return np.array(coefficients)

In [6]:
def get_rascal_coefficients_parallelized(p, structures, r_cut, n_max, l_max, n_types,
                                         normalize = True, task_size = 100):
    """Compute spherical-expansion coefficients in chunks on a worker pool.

    ``structures`` is cut into consecutive slices of at most ``task_size``
    items. Each slice is sent through ``p.imap`` to
    ``get_rascal_coefficients`` together with the remaining arguments, and
    the per-chunk arrays are concatenated along axis 0. A tqdm bar with one
    tick per chunk reports progress.

    Parameters
    ----------
    p : pool object providing ``imap(function, iterable)``.
    structures : sequence supporting ``len()`` and slicing.
    r_cut, n_max, l_max, n_types, normalize :
        forwarded unchanged to ``get_rascal_coefficients``.
    task_size : int, optional
        Maximum number of structures per chunk (default 100).

    Returns
    -------
    numpy.ndarray
        The chunk results joined along the first axis.
    """
    def _run_chunk(arguments):
        # Unpack one task list into the positional arguments of the worker.
        return get_rascal_coefficients(*arguments)

    tasks = [
        [structures[start : start + task_size], r_cut, n_max, l_max, n_types, normalize]
        for start in range(0, len(structures), task_size)
    ]

    progress = tqdm.tqdm(p.imap(_run_chunk, tasks), total = len(tasks))
    return np.concatenate(list(progress), axis = 0)
    

In [7]:
# Pool of 45 worker processes (pathos ProcessingPool); the parallel helpers in
# this notebook all receive it as their first argument.
p = Pool(45)

Process ForkPoolWorker-26:
Process ForkPoolWorker-33:
Process ForkPoolWorker-22:
Process ForkPoolWorker-12:
Process ForkPoolWorker-9:
Process ForkPoolWorker-42:
Process ForkPoolWorker-1:
Process ForkPoolWorker-17:
Process ForkPoolWorker-29:
Process ForkPoolWorker-3:
Process ForkPoolWorker-30:
Process ForkPoolWorker-31:
Process ForkPoolWorker-24:
Process ForkPoolWorker-19:
Process ForkPoolWorker-43:
Process ForkPoolWorker-40:
Process ForkPoolWorker-14:
Process ForkPoolWorker-6:
Process ForkPoolWorker-25:
Process ForkPoolWorker-8:
Process ForkPoolWorker-44:
Process ForkPoolWorker-11:
Process ForkPoolWorker-38:
Process ForkPoolWorker-21:
Process ForkPoolWorker-36:
Process ForkPoolWorker-39:
Process ForkPoolWorker-28:
Process ForkPoolWorker-7:
Process ForkPoolWorker-23:
Process ForkPoolWorker-10:
Process ForkPoolWorker-20:
Process ForkPoolWorker-35:
Process ForkPoolWorker-16:
Process ForkPoolWorker-18:
Process ForkPoolWorker-27:
Process ForkPoolWorker-4:
Process ForkPoolWorker-5:
Process F

In [8]:
# Expansion coefficients for every environment (2 is passed as n_types), then
# regrouped so that axis 0 runs over structures and axis 1 over the
# environments of one structure; the three trailing axes are kept as they are.
coefficients = get_rascal_coefficients_parallelized(p, structures, R_CUT, N_MAX, L_MAX, 2)
print(coefficients.shape)
coefficients = coefficients.reshape([len(structures), -1, *coefficients.shape[1:4]])
print(coefficients.shape)

100%|██████████| 1000/1000 [01:20<00:00, 12.45it/s]


(500000, 18, 10, 19)
(100000, 5, 18, 10, 19)


In [9]:
def get_powerspectrums_paralellized(p, coefficients, l_max):
    """Compute the power spectrum of every environment on a worker pool.

    ``coefficients`` is iterated along its first axis by ``p.imap``, so each
    task receives one entry of that axis. Inside a task,
    ``naive.compute_powerspectrum`` is called once per row of that entry.
    The per-task arrays are concatenated along axis 0, so the grouping by
    first-axis entry is flattened in the returned array.

    Parameters
    ----------
    p : pool object providing ``imap(function, iterable)``.
    coefficients : numpy.ndarray
        Coefficients whose first axis indexes the tasks.
        # the notebook passes (n_structures, n_environments, ...) arrays
    l_max : forwarded to ``naive.compute_powerspectrum``.

    Returns
    -------
    numpy.ndarray
        One row per environment, in task order.
    """
    def _powerspectrums_of(environments):
        # One power-spectrum row per environment handed to this task.
        return np.array([naive.compute_powerspectrum(environment, l_max)
                         for environment in environments])

    per_task = list(tqdm.tqdm(p.imap(_powerspectrums_of, coefficients),
                              total = coefficients.shape[0]))
    return np.concatenate(per_task, axis = 0)

In [10]:
# Power spectrum of every environment, computed on the pool; the result is
# flat over environments (one row each).
powerspectrums = get_powerspectrums_paralellized(p, coefficients, L_MAX)
print(powerspectrums.shape)

100%|██████████| 100000/100000 [02:11<00:00, 762.13it/s]


(500000, 1710)


In [11]:
# Regroup the flat rows by structure. The literal 5 assumes five environments
# per structure, which is what the shapes printed in this notebook show.
powerspectrums = np.reshape(powerspectrums, [len(structures), 5, -1])
print(powerspectrums.shape)

(100000, 5, 1710)


In [12]:
import pickle
# np.save appends ".npy" when it is missing, so the file on disk is "ps_9_9_train.npy".
np.save("ps_9_9_train", powerspectrums)

In [8]:
# Switch to the smaller N_MAX = L_MAX = 5 basis for the cells below.
# NOTE(review): this rebinds constants that were set to 9 earlier, so results
# depend on which assignment was executed last in the running kernel.
N_MAX = 5
L_MAX = 5

In [14]:
# Expansion coefficients for the current N_MAX / L_MAX (2 is passed as
# n_types), regrouped so that axis 0 runs over structures.
coefficients = get_rascal_coefficients_parallelized(p, structures, R_CUT, N_MAX, L_MAX, 2)
print(coefficients.shape)
coefficients = coefficients.reshape([len(structures), -1, *coefficients.shape[1:4]])
print(coefficients.shape)


# Serial power-spectrum computation: one list of per-environment rows per structure.
powerspectrums = []
for i in tqdm.tqdm(range(coefficients.shape[0])):
    now = [naive.compute_powerspectrum(coefficients[i, j], L_MAX)
           for j in range(coefficients.shape[1])]
    powerspectrums.append(now)
powerspectrums = np.array(powerspectrums)
print(powerspectrums.shape)
# The literal 5 assumes five environments per structure.
powerspectrums = powerspectrums.reshape([len(structures), 5, -1])
print(powerspectrums.shape)

# Stored with pickle under a name without extension.
import pickle
with open("ps_5_5_train", "wb") as f:
    pickle.dump(powerspectrums, f)

100%|██████████| 1000/1000 [00:56<00:00, 17.75it/s]
  0%|          | 217/100000 [00:00<00:46, 2155.37it/s]

(500000, 10, 6, 11)
(100000, 5, 10, 6, 11)


100%|██████████| 100000/100000 [01:00<00:00, 1665.45it/s]


(100000, 5, 330)
(100000, 5, 330)


In [9]:
# Expansion coefficients for the current N_MAX / L_MAX (2 is passed as
# n_types), regrouped so that axis 0 runs over structures and axis 1 over the
# environments of one structure; the three trailing axes are kept as they are.
coefficients = get_rascal_coefficients_parallelized(p, structures, R_CUT, N_MAX, L_MAX, 2)
print(coefficients.shape)
coefficients = coefficients.reshape([len(structures), -1, *coefficients.shape[1:4]])
print(coefficients.shape)

100%|██████████| 1000/1000 [01:03<00:00, 15.71it/s]


(500000, 10, 6, 11)
(100000, 5, 10, 6, 11)


In [10]:
# Clebsch-Gordan object built for the current L_MAX; its `precomputed`
# attribute is what get_bispectrums hands to naive.compute_bispectrum.
clebsch = ClebschGordan.ClebschGordan(L_MAX)

In [11]:
# Sanity check: show which value is currently bound to L_MAX.
print(L_MAX)

5


In [12]:
# NOTE(review): rebuilds the Clebsch-Gordan object with the same call as an
# earlier cell; the value bound to L_MAX at execution time decides its size.
clebsch = ClebschGordan.ClebschGordan(L_MAX)
def get_bispectrums(task):
    """Compute the even bispectrum of every environment of one structure.

    Parameters
    ----------
    task : sequence
        ``[clebsch, coefficients]`` or ``[clebsch, coefficients, l_max]``.
        ``clebsch`` must expose a ``precomputed`` attribute, and
        ``coefficients`` is the array of one structure, iterated along its
        first axis (one entry per environment). ``l_max`` is optional.

    Returns
    -------
    numpy.ndarray
        One row per environment, as returned by
        ``naive.compute_bispectrum(..., only_even=True)``.

    Notes
    -----
    The angular cutoff was previously hard-coded to 5, which silently
    disagrees with the data whenever L_MAX is changed. It is now taken from
    the task when given; otherwise it is derived from the coefficient array.
    Everything the worker needs travels inside ``task`` instead of being read
    from notebook globals, which may be stale inside already-running pool
    workers.
    """
    clebsch, coefficients, *extra = task
    if extra:
        l_max = extra[0]
    else:
        # assumes axis 2 of the per-structure array has l_max + 1 entries, as
        # the shapes printed in this notebook indicate (6 for L_MAX = 5,
        # 10 for L_MAX = 9) - TODO confirm against convert_rascal_coefficients
        l_max = coefficients.shape[2] - 1

    return np.array([
        naive.compute_bispectrum(clebsch.precomputed, environment, l_max, only_even = True)
        for environment in coefficients
    ])

In [13]:
# One task per structure: the Clebsch-Gordan object plus that structure's
# coefficients. clebsch travels inside each task instead of being read as a
# global by the worker.
tasks = [[clebsch, coeff] for coeff in coefficients]

In [14]:
# Run get_bispectrums over all tasks on the pool, collecting one array per
# structure in the order the results are yielded.
bispectrums = []
for res in tqdm.tqdm(p.imap(get_bispectrums, tasks), total = len(structures)):
    bispectrums.append(res)
bispectrums = np.array(bispectrums)
print(bispectrums.shape)
# The literal 5 assumes five environments per structure; for the run recorded
# in this notebook the shape printed before and after the reshape is the same.
bispectrums = np.reshape(bispectrums, [len(structures), 5, -1])
print(bispectrums.shape)


100%|██████████| 100000/100000 [11:30<00:00, 144.86it/s]


(100000, 5, 15180)
(100000, 5, 15180)


In [23]:
# Re-check the shape of the bispectrum array.
print(bispectrums.shape)

(100000, 5, 15180)


In [16]:
def normalize_per_env(features):
    """Scale every feature vector along axis 2 to unit Euclidean norm.

    Parameters
    ----------
    features : numpy.ndarray
        Three-dimensional array; axis 2 is the feature axis being normalized.

    Returns
    -------
    numpy.ndarray
        A new array of the same shape. The input is not modified.

    Notes
    -----
    A vector made of zeros only has zero norm, and the division then
    produces NaN entries for it.
    """
    squared_norms = (features * features).sum(axis = 2)
    norms = np.sqrt(squared_norms)
    return features / norms[:, :, np.newaxis]

In [17]:
# Normalize every environment's bispectrum vector; `bispectrums` is rebound to
# the normalized copy, so the unnormalized values are no longer reachable.
bispectrums = normalize_per_env(bispectrums)

In [18]:
# NOTE(review): scratch cell - prints a constant and touches no state.
print(1)

1


In [19]:
from sklearn.decomposition import PCA

In [20]:
# Feature vectors of environment 0 of every structure.
# presumably the central carbon atom, judging by the c_ prefix - confirm
c_envs = bispectrums[:, 0, :]
print(c_envs.shape)

(100000, 15180)


In [21]:
import time

In [24]:
# Fit a 2000-component PCA on the environment-0 features of the first 30000
# structures and report the wall-clock time of the fit.
# random_state is pinned: with scikit-learn's default svd_solver='auto' a
# problem of this size is handled by the randomized solver, which would
# otherwise give a slightly different fit on every run.
begin = time.time()
c_pca = PCA(n_components = 2000, random_state = 0)
c_pca.fit(c_envs[0:30000])
print(time.time() - begin)

148.56111693382263


In [26]:
import pickle
# Persist the fitted environment-0 PCA. The file is a pickle, so it should
# only ever be loaded back from a trusted location.
with open("c_pca", "wb") as f:
    pickle.dump(c_pca, f)

In [27]:
# Environments 1.. of the first 7500 structures, stacked into one 2-D array
# (four per structure in this dataset, giving the 30000 rows printed below).
# presumably the hydrogen atoms, judging by the h_ prefix - confirm
h_envs = np.concatenate(bispectrums[:7500, 1:, :], axis = 0)
print(h_envs.shape)

(30000, 15180)


In [28]:
# Fit a 2000-component PCA on the stacked non-central environments and report
# the wall-clock time of the fit.
# random_state is pinned: with scikit-learn's default svd_solver='auto' a
# problem of this size is handled by the randomized solver, which would
# otherwise give a slightly different fit on every run.
begin = time.time()
h_pca = PCA(n_components = 2000, random_state = 0)
h_pca.fit(h_envs)
print(time.time() - begin)

145.00989151000977


In [30]:
# Persist the fitted PCA of the non-central environments. The file is a
# pickle, so it should only ever be loaded back from a trusted location.
with open("h_pca", "wb") as f:
    pickle.dump(h_pca, f)

In [33]:
def transform(bispectrums):
    """Project bispectrum features onto the fitted PCA bases.

    Environment 0 of every structure is projected with the module-level
    ``c_pca``; all remaining environments are projected with the module-level
    ``h_pca``. The two results are stitched back together so that the
    environment order of the input is preserved.

    Parameters
    ----------
    bispectrums : numpy.ndarray
        Array of shape (n_structures, n_environments, n_features).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_structures, n_environments, n_components).

    Notes
    -----
    Intermediate shapes are printed, as before. The number of non-central
    environments is taken from the input (``n_environments - 1``) instead of
    the previously hard-coded 4, so inputs with five environments per
    structure behave exactly as before.
    """
    n_structures, n_environments, n_features = bispectrums.shape

    c_envs = bispectrums[:, 0, :]
    c_envs_transformed = c_pca.transform(c_envs)

    # Flatten (structure, environment) into rows, structure-major; this is the
    # same row order as concatenating the per-structure slices one by one.
    h_envs = bispectrums[:, 1:, :].reshape(-1, n_features)
    h_envs_transformed = h_pca.transform(h_envs)
    print(c_envs_transformed.shape)
    print(h_envs_transformed.shape)

    h_envs_transformed = h_envs_transformed.reshape([n_structures, n_environments - 1, -1])
    print(h_envs_transformed.shape)

    bispectrums_transformed = np.concatenate(
        [c_envs_transformed[:, np.newaxis, :], h_envs_transformed], axis = 1)
    print(bispectrums_transformed.shape)
    return bispectrums_transformed

In [34]:
# Project the first 50000 structures and save them as part 1; the remaining
# structures are handled by the next cell (presumably split to limit memory).
bispectrums_transformed = transform(bispectrums[0:50000])
np.save("bs_5_5_train_1.npy", bispectrums_transformed)

(50000, 2000)
(200000, 2000)
(50000, 4, 2000)
(50000, 5, 2000)


In [None]:
# Project structures 50000..99999 and save them as part 2; this rebinds
# bispectrums_transformed, so part 1 is only available from its file.
bispectrums_transformed = transform(bispectrums[50000:100000])
np.save("bs_5_5_train_2.npy", bispectrums_transformed)

In [2]:
import numpy as np

In [3]:
# Reload the two saved halves, join them along the structure axis (axis 0)
# and write the combined training array to a single file.
first = np.load("bs_5_5_train_1.npy")
second = np.load("bs_5_5_train_2.npy")
result  = np.concatenate([first, second], axis = 0)
print(result.shape)
np.save('bs_5_5_train.npy', result)

(100000, 5, 2000)
