In [7]:
"""
This part sets up the parameters.
"""

"""

This dictionary is used to map space group symbols to space group numbers, e.g. P1 -> 1

"""
import json
path = "/usr/share/decryst/wyck.json"
# Read the table inside a context manager so the file handle is closed as soon
# as parsing finishes instead of being left open until garbage collection.
with open(path) as wyck_file:
    tables = json.load(wyck_file)

# sg_dic maps the text before the first ":" of every entry's 'desc' field to an
# integer derived from that entry's key (e.g. "P1" -> 1).
sg_dic = {}
for idx in tables.keys():
    desc = tables[idx]['desc']
    desc = desc.split(":")[0]
    try:
        num = int(idx)
    except ValueError:
        # The key is not a plain integer: drop its last character and retry
        # (presumably a one-character setting suffix -- TODO confirm against
        # the wyck.json format).
        num = int(idx[:-1])
    sg_dic[desc] = num

    
# Default peak-profile terms. gsas_generator hands these to pxrd_generator,
# which merges them into the histogram's InstrumentParameters.
# NOTE(review): each value looks like GSAS-II's [initial, current, refine-flag]
# triple -- confirm against the GSAS-II instrument-parameter documentation.
peak_dic0 = {
    'U': [4.0, 4.0, 0],
    'V': [-4.0, -4.0, 0],
    'W': [12.0, 15.0, 0],
    'X': [0.0, 5, 0],
    'Y': [0.0, 5, 0],
    'Z': [0.0, 0, 0],
}

In [17]:
"""
This part defines the functions that generate PXRD spectra for a single CIF or for a whole dataset.
"""

import os,sys
sys.path.append('/home/jyb/.conda/envs/yfh/GSASII')
import GSASIIscriptable as G2sc
import numpy as np

'''
Goal:
    pxrd_generator is used to generate the PXRD spectrum of one CIF, and gsas_generator is used to collect the PXRD spectra of many (up to millions of) CIFs into CSV files.
Directory:
    In this process, we need a datadir and a workdir (named datadir + "_spec"). The datadir contains all the CIF files.
Explanation of variables:
    phase_name is the CIF file name without ".cif" (it is also used as the phase label), and inst_xry.prm is the instrument parameter file.
    If we do not supply extra parameters, such as the peak-function parameters, everything follows the default parameters.

Output: [space group number], np.array(the calculated intensities)
'''
def pxrd_generator(phase_name, peak_dic, datadir, Tmin = 3., Tmax = 30., Tstep = 0.01,
                   gpx_dir = '/home/jyb/lyt/gsas',
                   inst_prm = "/home/jyb/lyt/solution/Er/inst_xry.prm"):
    """Simulate the powder pattern of a single CIF with GSAS-II.

    Args:
        phase_name: CIF file name without the ".cif" suffix; the file is read
            from ``os.path.join(datadir, phase_name + ".cif")`` and the same
            string is used as the GSAS-II phase name.
        peak_dic: mapping merged into the simulated histogram's
            ``InstrumentParameters`` (e.g. the U/V/W/X/Y/Z profile terms).
        datadir: directory that contains the CIF file.
        Tmin, Tmax, Tstep: start, end and step of the simulated pattern,
            forwarded to ``add_simulated_powder_histogram``.
        gpx_dir: directory in which the scratch ``.gpx`` project is created.
        inst_prm: path of the instrument parameter file.

    Returns:
        ``(sg_ls, ycalc)`` where ``sg_ls`` is a one-element list holding the
        number that ``sg_dic`` maps the phase's ``SpGrp`` symbol to, and
        ``ycalc`` is the calculated pattern of histogram 0.

    Raises:
        KeyError: if the phase's space-group symbol is not a key of ``sg_dic``.
    """
    # Imported here so the function does not depend on a `randint` name that
    # happens to have leaked into the kernel from some other cell.
    import random

    # Scratch project file. Only 101 distinct names exist, so files are reused
    # between calls; NOTE(review): concurrent workers can draw the same name.
    name = random.randint(0, 100)
    gpx = G2sc.G2Project(newgpx=os.path.join(gpx_dir, f'test_2_{name}.gpx')) # create a project

    # step 1, load the structure from the CIF
    phase0 = gpx.add_phase(os.path.join(datadir, phase_name + ".cif"),
                           phasename=phase_name, fmthint='CIF')
    sg_ls = [sg_dic[phase0.data['General']['SGData']['SpGrp']]]

    # step 2, setup: add a simulated histogram and link it to the previous phase(s)
    # (the histogram label is a fixed string; the histogram is later fetched by
    # index, not by this name)
    hist1 = gpx.add_simulated_powder_histogram("icsd_024496" + "simulation",
                                               inst_prm,
                                               Tmin, Tmax, Tstep, phases=gpx.phases())
    hist1.InstrumentParameters.update(peak_dic)

    # step 3, compute: turn off parameter optimization and calculate pattern
    gpx.data['Controls']['data']['max cyc'] = 0 # refinement not needed
    gpx.do_refinements([{}])
    return sg_ls, gpx.histogram(0).getdata('ycalc')

from pandas import DataFrame
import random
def gsas_generator(cif_ls, num, datadir, serial, Tmin = 3., Tmax = 30., Tstep = 0.02,
                   outdir = "/home/data/lyt/pxrd_data"):
    """Simulate patterns for a batch of CIFs and dump them to two CSV files.

    Args:
        cif_ls: sequence of CIF file names (with the ".cif" suffix) located in
            ``datadir``.
        num: maximum number of entries of ``cif_ls`` to process; values larger
            than ``len(cif_ls)`` simply process the whole list.
        datadir: directory that contains the CIF files.
        serial: tag inserted into the output file names.
        Tmin, Tmax, Tstep: pattern range and step forwarded to
            ``pxrd_generator``.
        outdir: directory that receives ``sym_wide_<serial>.csv`` (one
            space-group number per row) and ``spec_wide_<serial>.csv`` (one
            max-normalised pattern per row, in the same order).

    Returns:
        ``(sym_list, spec_list)``: the space-group numbers and the normalised
        patterns that were written out.
    """
    sym_list = []
    spec_list = []
    # Iterate over the argument itself (not a notebook global) so that callers
    # passing a slice really get that slice processed.
    for cif_name in cif_ls[:max(num, 0)]:
        phase_name = os.path.splitext(cif_name)[0]
        # Fresh copy per pattern so nothing downstream can alter the shared defaults.
        peak_dic = {key: list(val) for key, val in peak_dic0.items()}
        try:
            sg_ls, I_observe = pxrd_generator(phase_name, peak_dic, datadir, Tmin = Tmin, Tmax = Tmax, Tstep = Tstep)
            if not len(I_observe):
                continue
            peak = np.max(I_observe)
            # An all-zero or non-finite pattern cannot be normalised; skip it
            # instead of storing a row of NaNs.
            if peak == 0 or not np.isfinite(peak):
                continue
            sym_list.extend(sg_ls)
            spec_list.append(I_observe / peak)
        except Exception as exc:
            # Best effort: one bad CIF must not abort the batch. Catching
            # Exception (not everything) keeps Ctrl-C / kernel interrupt working.
            print(f"error happen: {phase_name}: {exc!r}")
            continue

    sym_arr = np.array(sym_list)
    # One row per pattern; atleast_2d keeps a 2-D layout without the
    # deprecated np.matrix type.
    spec_arr = np.atleast_2d(np.array(spec_list))
    data_0 = DataFrame(sym_arr)
    data_1 = DataFrame(spec_arr)
    name_0 = "sym_wide_{0}".format(serial)
    name_1 = "spec_wide_{0}".format(serial)
    data_0.to_csv(os.path.join(outdir, f"{name_0}.csv"), index=False, header=False)
    data_1.to_csv(os.path.join(outdir, f"{name_1}.csv"), index=False, header=False)
    return sym_list, spec_list

In [22]:
"""
Examples of pxrd_generator:
"""
# Directory that holds the CIF used in this single-pattern example.
datadir = "/home/data/lyt/cod/Zn-C-H/"

# Profile terms for this run; pxrd_generator merges them into the simulated
# histogram's instrument parameters.
peak_dic = {
    'U': [4.0, 8.0, 0],
    'V': [-4.0, -8.0, 0],
    'W': [10.0, 10.0, 0],
    'X': [0.0, 10, 0],
    'Y': [0.0, 4, 0],
    'Z': [0.0, 0, 0],
}

# Simulate entry 7051459 from 3 to 30 with a step of 0.02.
sg_ls, I_observe = pxrd_generator(
    "7051459",
    peak_dic,
    datadir,
    Tmin = 3,
    Tmax = 30,
    Tstep = 0.02,
)

/home/data/lyt/cod/Zn-C-H/7051459.cif read by Reader CIF
gpx file saved as /home/jyb/lyt/gsas/test_2_5.gpx
Simulating 1351 points from 3 to 30 degrees 2theta
Instrument parameters read: /home/jyb/lyt/solution/Er/inst_xry.prm bank 1
gpx file saved as /home/jyb/lyt/gsas/test_2_5.gpx
 Hessian Levenberg-Marquardt SVD refinement on 1 variables:
initial chi^2 2530.7 with 1351 obs.
Read from file:/home/jyb/lyt/gsas/test_2_5.bak0.gpx
Save to file  :/home/jyb/lyt/gsas/test_2_5.gpx
GPX file save successful
 Refinement results are in file: /home/jyb/lyt/gsas/test_2_5.lst
 ***** Refinement successful *****


In [23]:
"""
One example
"""
import os
#import fileinput
processing_foo1s = False
# Directory of CIF files to convert. Note that this rebinds `path`, which the
# first cell used for the wyck.json location.
path = "/home/data/lyt/tobacco/orig_cif_0"
# List the directory once, sorted, so that the subset taken below is the same
# on every run (os.listdir order is arbitrary).
filelist = sorted(os.listdir(path))
# Match the real ".cif" extension; gsas_generator strips exactly that suffix.
ciflist = [file for file in filelist if file.endswith(".cif")]
print(len(ciflist))
# Keep only the first 10 files for this quick example run.
ciflist = ciflist[:10]
from time import time
t_0 = time()
#for i in range(0,2):
    #gsas_generator(ciflist[100000*i:], 100000, datadir, "cod_new_"+str(i))
gsas_generator(ciflist, len(ciflist), path, "tobacco_orig_3")
# Wall-clock seconds spent in gsas_generator.
total = time()-t_0

178655
/home/data/lyt/tobacco/orig_cif_0/output_asc_v1-6c_Cr_1_Ch_v2-4c_1anC_Ch_v3-3c_Cu_imadazoline_Ch_1-2B_4H_Ch_1-2B_2SH_Ch.cif read by Reader CIF
gpx file saved as /home/jyb/lyt/gsas/test_2_65.gpx
Simulating 1351 points from 3.0 to 30.0 degrees 2theta
Instrument parameters read: /home/jyb/lyt/solution/Er/inst_xry.prm bank 1
gpx file saved as /home/jyb/lyt/gsas/test_2_65.gpx
 Hessian Levenberg-Marquardt SVD refinement on 1 variables:
initial chi^2 1706.9 with 1351 obs.
Read from file:/home/jyb/lyt/gsas/test_2_65.bak0.gpx
Save to file  :/home/jyb/lyt/gsas/test_2_65.gpx
GPX file save successful
 Refinement results are in file: /home/jyb/lyt/gsas/test_2_65.lst
 ***** Refinement successful *****
/home/data/lyt/tobacco/orig_cif_0/output_cut_v1-3c_triazine_Ch_v2-8c_Tb_1_Ch_1-2B_2CH3_Ch_1-ntn_edge.cif read by Reader CIF
gpx file saved as /home/jyb/lyt/gsas/test_2_58.gpx
Simulating 1351 points from 3.0 to 30.0 degrees 2theta
Instrument parameters read: /home/jyb/lyt/solution/Er/inst_xry.prm

In [None]:
"""
How to run the scripts in parallel and generate numerous datasets (this cell has not been tested again)
"""

from multiprocessing import Pool
core_number = 5
p = Pool(core_number)

# Work split: 12 jobs of `whole_num` files each, starting at offset `count`.
count = 0
whole_num = 15000
n_jobs = 12
cif_ls_0 = [ciflist[count + i*whole_num : count + (i + 1)*whole_num] for i in range(n_jobs)]
# One output tag per job, so no two jobs write the same CSV files.
serl_ls = ["train_{0}".format(i + 1) for i in range(n_jobs)]

# NOTE(review): `datadir` must be the directory the names in `ciflist` were
# listed from, and `ciflist` must hold the full listing (an earlier cell
# truncates it to 10 entries) -- confirm both before launching a real run.
results = []
for chunk, serial in zip(cif_ls_0, serl_ls):
    if not chunk:
        # Fewer files than jobs: nothing to do for this slot.
        continue
    results.append(p.apply_async(gsas_generator, args=(chunk, len(chunk), datadir, serial)))
p.close()
p.join()
# apply_async stores worker exceptions in the result object; get() re-raises
# them here instead of letting failed jobs disappear silently.
for res in results:
    res.get()