In [1]:
'''import sys
sys.path.insert(0, 'nice')
'''
#os.environ['OMP_NUM_THREADS'] = '24'

import numpy as np
import ase.io as ase_io
from ase import Atoms
import tqdm
import time
import copy
from sklearn.linear_model import BayesianRidge

import nice
from nice.blocks import *


from matplotlib import pyplot as plt


In [3]:
def process_structures(structures, box_length=120):
    """Put every structure into a cubic periodic cell and shift it to the cell centre.

    Each structure is modified IN PLACE:
      * ``cell`` is set to ``[box_length, box_length, box_length]``,
      * ``positions`` is translated by ``box_length / 2`` along every axis,
      * ``pbc`` is set to ``True``,
      * ``wrap()`` is called.

    Parameters
    ----------
    structures : iterable
        Objects exposing ``cell``, ``positions``, ``pbc`` and ``wrap()``
        (in this notebook: what ``ase_io.read`` returned).
    box_length : number, optional
        Edge length of the cubic cell. The default (120) reproduces the
        previously hard-coded cell of 120 with a shift of 60.

    Returns
    -------
    None

    Notes
    -----
    The function is not idempotent: calling it twice on the same structures
    applies the translation twice.
    """
    half_box = box_length / 2
    # Same offset for every structure, so build it once outside the loop.
    offset = np.asarray((half_box, half_box, half_box))
    for structure in structures:
        structure.cell = [box_length, box_length, box_length]
        structure.positions += offset
        structure.pbc = True
        structure.wrap()

In [4]:
# Read the structures selected by index '0:100000' from the cleaned data set,
# then re-box them in place with process_structures (defined in an earlier cell).
structures_train = ase_io.read('../../collecting_standard_features/cleaned/structures.xyz', 
                         index = '0:100000')

process_structures(structures_train)

# Disabled validation-set load. It is kept as a bare string literal, so it is
# never executed; being the last expression of the cell, its repr is what the
# cell displays as output.
'''structures_val = ase_io.read('../../collecting_standard_features/cleaned/structures.xyz', 
                         index = '100:200')'''

#process_structures(structures_val)


"structures_val = ase_io.read('../../collecting_standard_features/cleaned/structures.xyz', \n                         index = '100:200')"

In [5]:
# Load the target energies and standardise them: subtract the mean, then divide
# by the root-mean-square of the centred values. Both statistics are computed
# over the whole array, i.e. before the train/validation split below.
energies = np.load('../../collecting_standard_features/cleaned/energies.npy')
energies = energies - np.mean(energies)
energies = energies / np.sqrt(np.mean(energies * energies))
# NOTE(review): these slices cover rows 0..1999 only, while structures_train is
# read with index '0:100000' -- confirm that the sizes are meant to differ.
energies_train = energies[0:1000]
energies_val = energies[1000:2000]
print(energies_train.shape)
print(energies_val.shape)

(1000,)
(1000,)


In [6]:
# Hyper-parameters handed unchanged to get_rascal_coefficients_parallelized in a
# later cell. The meaning and units of each key are defined by that library, not
# by this notebook -- consult its documentation before changing a value.
HYPERS = {
'interaction_cutoff': 6.3,
'max_radial': 5,
'max_angular': 5,
'gaussian_sigma_type': 'Constant',
'gaussian_sigma_constant': 0.05,
'cutoff_smooth_width': 0.3,
'radial_basis': 'GTO',
}
    

In [7]:
# Shared worker pool, created with the argument 40. Later cells use it through
# the global name `p` (get_rascal_coefficients_parallelized and transform_full).
# NOTE(review): `Pool` is not imported explicitly anywhere in the visible cells;
# presumably it arrives via `from nice.blocks import *` -- TODO confirm.
# The pool is never closed in the visible cells.
p = Pool(40)

In [8]:
# Sanity check: how many structures were loaded.
print(len(structures_train))

100000


In [9]:
# Compute the expansion coefficients for all structures on the worker pool `p`.
# The trailing positional argument 2 is passed through to the library helper as
# is; its meaning is defined there (TODO confirm against the helper's docs).
coefficients = get_rascal_coefficients_parallelized(p, structures_train, HYPERS, 2)
print(coefficients.shape)
# Regroup the leading axis into (number of structures, 5, ...).
# Assumes every structure contributes exactly 5 consecutive rows -- TODO confirm;
# the reshape raises if the row count is not 5 * len(structures_train).
coefficients = coefficients.reshape([len(structures_train), 5, coefficients.shape[1], 
                                     coefficients.shape[2], coefficients.shape[3]])

print(coefficients.shape)

100%|██████████| 1000/1000 [00:32<00:00, 30.70it/s]


(500000, 10, 6, 11)
(100000, 5, 10, 6, 11)


In [10]:
# Disabled earlier draft of a parallel transform helper. The whole cell is one
# bare string literal, so nothing below is defined or executed; the cell merely
# displays the string. transform_full (defined in a later cell) is the helper
# that the notebook actually uses.
'''def concatenate_data(datas):
    covariants = [datas[i].covariants_ for i in range(len(datas))]
    actual_sizes = [datas[i].actual_sizes_ for i in range(len(datas))]
    
    return Data(covariants, actual_sizes, datas[0].importances_, datas[0].raw_importances_)

def transform_parallelized(transformer, *args, task_size = 100):
    
    tasks = []
    for i in range(0, coefficients.shape[0], task_size):
        now = []
        for j in range(len(args)):
            now.append(args[j][i : i + task_size])
        tasks.append(now)
    
    
    result = [res for res in tqdm.tqdm(p.imap(transformer.transform, tasks), total = len(tasks))]
    ans = []
    for i in range(len(result[0])):
        now = [result[j][i] for j in range(len(result))]
        ans.append(concatenate_data(now))
        
    return ans
'''

'def concatenate_data(datas):\n    covariants = [datas[i].covariants_ for i in range(len(datas))]\n    actual_sizes = [datas[i].actual_sizes_ for i in range(len(datas))]\n    \n    return Data(covariants, actual_sizes, datas[0].importances_, datas[0].raw_importances_)\n\ndef transform_parallelized(transformer, *args, task_size = 100):\n    \n    tasks = []\n    for i in range(0, coefficients.shape[0], task_size):\n        now = []\n        for j in range(len(args)):\n            now.append(args[j][i : i + task_size])\n        tasks.append(now)\n    \n    \n    result = [res for res in tqdm.tqdm(p.imap(transformer.transform, tasks), total = len(tasks))]\n    ans = []\n    for i in range(len(result[0])):\n        now = [result[j][i] for j in range(len(result))]\n        ans.append(concatenate_data(now))\n        \n    return ans\n'

In [11]:
class TSTransformer():
    """Chain of NICE blocks: initial transform, then alternating PCA and expansion steps.

    Stage order, as wired in ``fit``:

        initial_ -> pca_0_ -> expansioner_1_ -> pca_1_ -> expansioner_2_ -> pca_2_ -> expansioner_3_

    ``expansioner_2_invariants_`` branches off after ``pca_1_``. Every
    expansioner is fitted on PCA-transformed inputs, and ``transform`` feeds it
    the same kind of inputs.
    """

    def __init__(self):
        """Create all stages with the block sizes used in this notebook."""
        self.initial_ = InitialTransformer()
        self.pca_0_ = IndividualLambdaPCAsBoth()
        self.expansioner_1_ = ThresholdExpansioner()

        self.pca_1_ = IndividualLambdaPCAsBoth(1000)
        self.expansioner_2_invariants_ = ThresholdExpansioner(num_expand = 20000, mode = 'invariants')
        self.pca_2_ = IndividualLambdaPCAsBoth(1000)
        self.expansioner_2_ = ThresholdExpansioner(num_expand = 4000)
        self.expansioner_3_ = ThresholdExpansioner(num_expand = 20000, mode = 'invariants')

    def fit(self, coefficients):
        """Fit every stage in sequence on ``coefficients``.

        Parameters
        ----------
        coefficients
            Input accepted by ``InitialTransformer.transform``.

        Returns
        -------
        TSTransformer
            ``self``, so that calls can be chained. Earlier versions returned
            ``None``; callers that ignore the return value are unaffected.
        """
        data_even_0, data_odd_0 = self.initial_.transform(coefficients)
        self.pca_0_.fit(data_even_0, data_odd_0)
        data_even_0, data_odd_0 = self.pca_0_.transform(data_even_0, data_odd_0)

        self.expansioner_1_.fit(data_even_0, data_odd_0, data_even_0, data_odd_0)
        data_even_1, data_odd_1 = self.expansioner_1_.transform(data_even_0, data_odd_0,
                                                                data_even_0, data_odd_0)
        self.pca_1_.fit(data_even_1, data_odd_1)
        data_even_1, data_odd_1 = self.pca_1_.transform(data_even_1, data_odd_1)

        # Both second-level expansioners see the same PCA-transformed inputs.
        self.expansioner_2_invariants_.fit(data_even_1, data_odd_1, data_even_0, data_odd_0)
        self.expansioner_2_.fit(data_even_1, data_odd_1, data_even_0, data_odd_0)
        data_even_2, data_odd_2 = self.expansioner_2_.transform(data_even_1, data_odd_1,
                                                                data_even_0, data_odd_0)
        self.pca_2_.fit(data_even_2, data_odd_2)
        data_even_2, data_odd_2 = self.pca_2_.transform(data_even_2, data_odd_2)

        self.expansioner_3_.fit(data_even_2, data_odd_2, data_even_0, data_odd_0)
        return self

    def transform(self, coefficients):
        """Run the fitted chain and collect one feature block per level.

        Parameters
        ----------
        coefficients
            Input accepted by ``InitialTransformer.transform``.

        Returns
        -------
        list
            Four entries, in this order:
            0. ``covariants_[:, :actual_sizes_[0], 0, 0]`` of the even output of
               ``initial_`` (taken before ``pca_0_``),
            1. the same slice of the even output of ``expansioner_1_``
               (taken before ``pca_1_``),
            2. the first output of ``expansioner_2_invariants_``,
            3. the first output of ``expansioner_3_``.
        """
        data_even_0, data_odd_0 = self.initial_.transform(coefficients)
        data_even_0_t, data_odd_0_t = self.pca_0_.transform(data_even_0, data_odd_0)
        data_even_1, data_odd_1 = self.expansioner_1_.transform(data_even_0_t, data_odd_0_t,
                                                                data_even_0_t, data_odd_0_t)
        data_even_1_t, data_odd_1_t = self.pca_1_.transform(data_even_1, data_odd_1)
        invariants_even_2, _ = self.expansioner_2_invariants_.transform(data_even_1_t, data_odd_1_t,
                                                                        data_even_0_t, data_odd_0_t)

        # expansioner_2_ was fitted on the pca_1_ / pca_0_ outputs, so it must be
        # fed those here as well. Passing the un-projected data (as this method
        # used to do) applies the fitted selection to a different feature basis.
        data_even_2, data_odd_2 = self.expansioner_2_.transform(data_even_1_t, data_odd_1_t,
                                                                data_even_0_t, data_odd_0_t)
        data_even_2_t, data_odd_2_t = self.pca_2_.transform(data_even_2, data_odd_2)

        invariants_even_3, _ = self.expansioner_3_.transform(data_even_2_t, data_odd_2_t,
                                                             data_even_0_t, data_odd_0_t)

        return [data_even_0.covariants_[:, :data_even_0.actual_sizes_[0], 0, 0],
                data_even_1.covariants_[:, :data_even_1.actual_sizes_[0], 0, 0],
                invariants_even_2, invariants_even_3]
        

In [12]:
def split_coefficients(coefficients):
    """Split an array into its first slot along axis 1 and the flattened remainder.

    Parameters
    ----------
    coefficients : numpy.ndarray
        Array of shape ``(n, k, ...)`` with ``k >= 1``.

    Returns
    -------
    c_coefficients : numpy.ndarray
        ``coefficients[:, 0]``, shape ``(n, ...)``.
    h_coefficients : numpy.ndarray
        ``coefficients[:, 1:]`` with its first two axes merged, shape
        ``(n * (k - 1), ...)``. Rows are ordered by the index on axis 0 first,
        then by the slot on axis 1.

    Notes
    -----
    The number of remaining slots and the trailing dimensions are read from the
    input shape, so the function is no longer limited to exactly five slots on
    axis 1 or to 5-D input; for ``(n, 5, a, b, c)`` the result is unchanged.
    """
    c_coefficients = coefficients[:, 0]
    h_coefficients = coefficients[:, 1:]

    merged_rows = coefficients.shape[0] * (coefficients.shape[1] - 1)
    h_coefficients = np.reshape(h_coefficients,
                                (merged_rows,) + tuple(coefficients.shape[2:]))
    return c_coefficients, h_coefficients

In [13]:
# Split the coefficients into the first slot along axis 1 and the remaining
# slots (see split_coefficients), then check the shape of the first part.
c_coefficients, h_coefficients = split_coefficients(coefficients)
print(c_coefficients.shape)

(100000, 10, 6, 11)


In [14]:
# Disabled timing experiment, kept as a bare string literal (never executed).
# NOTE(review): it refers to `BSTransformer`, which is not defined in the visible
# cells of this notebook (the class defined here is TSTransformer).
'''begin = time.time()
pst = BSTransformer()
pst.fit(c_coefficients)
print(time.time() - begin)'''

'begin = time.time()\npst = BSTransformer()\npst.fit(c_coefficients)\nprint(time.time() - begin)'

In [15]:
# Disabled timing experiment, kept as a bare string literal (never executed).
# It relies on `pst`, which is only assigned inside another disabled cell.
'''begin = time.time()
res = pst.transform(c_coefficients)
print(res[2].shape)
print(time.time() - begin)'''

'begin = time.time()\nres = pst.transform(c_coefficients)\nprint(res[2].shape)\nprint(time.time() - begin)'

In [16]:
def fit(coefficients, num_to_fit):
    """Fit one TSTransformer per part returned by split_coefficients.

    The shape of the first part is printed, followed by the wall-clock seconds
    each of the two fits took.

    Parameters
    ----------
    coefficients
        Array accepted by ``split_coefficients``.
    num_to_fit : int
        Only the first ``num_to_fit`` rows of each part are used for fitting.

    Returns
    -------
    tuple
        ``(c_trans, h_trans)``: the transformer fitted on the first part and
        the one fitted on the second part, in that order.
    """
    c_part, h_part = split_coefficients(coefficients)
    print(c_part.shape)

    fitted = []
    # Same recipe for both parts; the first part is fitted (and timed) first.
    for part in (c_part, h_part):
        started = time.time()
        transformer = TSTransformer()
        transformer.fit(part[:num_to_fit])
        print(time.time() - started)
        fitted.append(transformer)

    c_trans, h_trans = fitted
    return c_trans, h_trans



In [17]:
# Repeats the split already done in an earlier cell; it rebinds the same two
# global names.
c_coefficients, h_coefficients = split_coefficients(coefficients)


In [18]:
# Fit both transformers on the first 2000 rows of their respective parts.
# The recorded output shows roughly 1700 s and 1550 s for the two fits.
c_trans, h_trans = fit(coefficients, 2000)


(100000, 10, 6, 11)
1707.313946723938
1546.420652627945


In [19]:
# Persist both fitted transformers as one pickled list [c_trans, h_trans].
# NOTE(review): this import sits mid-notebook; the first cell holds the other
# imports. The directory "nu_4_data" must already exist -- open() will not
# create it.
import pickle
with open("nu_4_data/transformers", "wb") as f:
    pickle.dump([c_trans, h_trans], f, protocol = 4)

In [20]:
def transform_full(transformer, coefficients, task_size = 100, pool = None):
    """Apply ``transformer.transform`` to ``coefficients`` in chunks on a worker pool.

    Parameters
    ----------
    transformer
        Object whose ``transform(chunk)`` returns a sequence of arrays, with the
        same number of entries for every chunk.
    coefficients : numpy.ndarray
        Input that is sliced along axis 0.
    task_size : int, optional
        Number of rows per chunk (the last chunk may be shorter).
    pool : optional
        Object providing ``imap(func, iterable)``. When omitted, the
        notebook-level pool ``p`` is used, which keeps existing calls working.

    Returns
    -------
    list
        One array per entry of the transformer's output, each being the
        per-chunk pieces concatenated along axis 0 in chunk order. An empty
        list when ``coefficients`` has no rows.
    """
    if pool is None:
        # Backward-compatible default: the pool created earlier in the notebook.
        pool = p

    tasks = [coefficients[start : start + task_size]
             for start in range(0, coefficients.shape[0], task_size)]
    print(len(tasks))

    # imap yields results in task order, so chunk order is preserved.
    result = list(tqdm.tqdm(pool.imap(transformer.transform, tasks), total = len(tasks)))

    # zip(*result) regroups "one list per chunk" into "one tuple per output
    # entry"; with no chunks it yields nothing instead of failing on result[0].
    return [np.concatenate(parts, axis = 0) for parts in zip(*result)]

In [21]:
# Re-split (same call as in earlier cells), then transform the first part with
# c_trans in parallel chunks. The recorded progress bar shows 1000 tasks taking
# about 30 minutes.
c_coefficients, h_coefficients = split_coefficients(coefficients)
#result = np.concatenate(transform_full(c_trans, c_coefficients), axis = 1)
result = transform_full(c_trans, c_coefficients)
# Disabled shape checks, kept as a bare string literal; as the last expression
# of the cell, its repr is what the cell displays.
'''print(result[0].shape)
print(result[1].shape)
print(result[2].shape)'''
#print(result.shape)

  0%|          | 0/1000 [00:00<?, ?it/s]

1000


100%|██████████| 1000/1000 [29:49<00:00,  1.79s/it]


'print(result[0].shape)\nprint(result[1].shape)\nprint(result[2].shape)'

In [22]:
# Persist the list returned by transform_full(c_trans, ...) before `result` is
# rebound by a later cell.
with open("nu_4_data/zpbsts_c", "wb") as f:
    pickle.dump(result, f, protocol = 4)

In [23]:
# Disabled reload of previously saved transformers, kept as a bare string
# literal (never executed).
# NOTE(review): it reads from "nu_2_data/transformers", whereas this notebook
# writes to "nu_4_data/transformers" -- confirm the path before re-enabling.
# pickle.load executes code embedded in the file, so only load trusted files.
'''import pickle
with open("nu_2_data/transformers", "rb") as f:
    c_trans, h_trans = pickle.load(f)'''

'import pickle\nwith open("nu_2_data/transformers", "rb") as f:\n    c_trans, h_trans = pickle.load(f)'

In [24]:
# Transform the second part with h_trans. This REBINDS `result`; the previous
# value was already pickled to "nu_4_data/zpbsts_c" by an earlier cell.
# The recorded progress bar shows 4000 tasks taking about two hours.
result = transform_full(h_trans, h_coefficients)
#print(result.shape)

  0%|          | 0/4000 [00:00<?, ?it/s]

4000


100%|██████████| 4000/4000 [2:03:01<00:00,  1.85s/it]  


In [25]:
# Persist the list returned by transform_full(h_trans, ...).
with open("nu_4_data/zpbsts_h", "wb") as f:
    pickle.dump(result, f, protocol = 4)

## 