In [6]:
import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime
from shutil import copy
import pymatgen as mg
import string_misc

datadir = os.path.join(os.environ['USERPROFILE'],'OneDrive - Colorado School of Mines/Research/MIDDMI/TCO/Data')
indir = os.path.join(datadir,'in') #libraries to be processed
outdir = os.path.join(datadir,'out') #output directory for csvs and logs
procdir = os.path.join(datadir,'processed') #destination dir for successfully processed libraries
errdir = os.path.join(datadir,'error') #destination dir for libraries with errors
config = os.path.join(datadir,'config') #config/parameter file location

############################
"read config files"
############################
os.chdir(config)

#define A and B site atoms
Asite = ['Ba']
Bsite = ['Co','Fe','Zr','Y']

#load variable definitions
#----------------------------
vardictdf = pd.read_excel('VariableDict.xlsx')

#put standard variables into dict
stdvardf = vardictdf[vardictdf.Keyword.str[0] != '!']
stdvars = list(stdvardf.VarName)
headers = list(stdvardf.Keyword + ': ' + stdvardf.DisplayName)
stdvardict = dict(zip(stdvars,headers))

#composition variables
compvardf = vardictdf[vardictdf.Keyword == '!composition']
compvars = list(compvardf.VarName)

#load PLD parameters
#----------------------------
prepstep = 'Pulsed Laser Deposition'
params = pd.read_excel('PLD_params.xlsx',skiprows=3,index_col=0)
paramdictdf = pd.read_excel('PLD_ParamDict.xlsx')
#put standard params into dict
stdparamdf = paramdictdf[paramdictdf.Keyword.str[0] != '!']
stdparams = list(stdparamdf.ParamName)
parheaders = list(stdparamdf.Keyword + ': ' + stdparamdf.DisplayName)
stdparamdict = dict(zip(stdparams,parheaders))

#get lists of targets & target info params
tgtcols = list(paramdictdf[paramdictdf.Keyword == '!target'].ParamName)
tgtinfo = paramdictdf[paramdictdf.Keyword == '!targetinfo']
tgtvars = paramdictdf[paramdictdf.Keyword=='!targetinfo'].DisplayName.unique()

#rename headers to ingestable format using paramdict
params = params.rename(index=str, columns = stdparamdict) 
params.insert(1,'PREPARATION STEP NAME',prepstep)

#load list of possible targets
targets = pd.read_excel('targetlist.xlsx')

#library prefix to strip out
libprefix = 'PDAC_COM3_'

############
"end config"
############


########################################
"loop through all libraries in indir"
########################################
os.chdir(indir)
nsucc = 0
nerr = 0
errsumname = os.path.join(errdir, 'gen_csv_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '.err')
errsumtxt = ''

for lib in next(os.walk(indir))[1]:
    os.chdir(lib)
    
    #log file
    logname = lib + '_gen_csv.log'
    log = open(logname, 'w')
    log.write('Processing library ' + lib + '\n')
    
    #error file
    errtxt = ''
    
    #csv
    csvname = lib + '_AllVar.csv'
    
    all_var = pd.DataFrame()
    
    #read the Points file to determine file length
    try:
        pts = pd.read_csv(lib + '_Points.txt',sep='\t',usecols = [0]) 
    except FileNotFoundError: #starting with 1502 points filename suffix changed to '_Point'
        pts = pd.read_csv(lib + '_Point.txt',sep='\t',usecols = [0]) 
    
    #dataframes to store composition data
    chm = pd.DataFrame()
    sitesum = pd.DataFrame(np.zeros((len(pts),2)), columns = ['A','B'])
    
    #track found variables
    foundvars = ['Point','Row','Column']
    
    ################################
    "read variables from txt files"
    ################################
    for fname in glob.glob(lib + '*.txt'):
        vname = fname[len(lib)+1:fname.find('.txt')]
        #regular variables
        if vname in stdvars and vname not in foundvars:
            if len(all_var) == 0:
                cols = None
            else:
                cols = [3]
            df = pd.read_csv(fname,sep='\t',usecols = cols)
            all_var = pd.concat([all_var,df], axis=1)
        #composition variables
        elif vname in compvars:
            dfc = pd.read_csv(fname,sep='\t',usecols = [3])
            elmnt = vname[:vname.find('_at')]
            chm[elmnt] = dfc[vname] 
            sitesum.B = sitesum.B + dfc[vname]
        
        foundvars.append(vname)
    
    #identify ignored and missing variables
    ignoredvars = np.setdiff1d(foundvars,stdvars+compvars, assume_unique=True)
    missingstd = np.setdiff1d(stdvars,foundvars, assume_unique=True)
    missingcomp = np.setdiff1d(compvars,foundvars, assume_unique=True)
    if len(ignoredvars) > 0:
        log.write('Ignored variables:\n\t' + '\n\t'.join(ignoredvars) + '\n')
    if len(missingstd) > 0:
        log.write('*Warning: missing standard variables:\n\t' + '\n\t'.join(missingstd) + '\n')
    if len(missingcomp) > 0:
        log.write('*Warning: missing composition variables:\n\t' + '\n\t'.join(missingcomp) + '\n')
        errtxt += 'Missing composition variables: ' + ', '.join(missingcomp) + '\n'
        
    #######################################
    "determine composition and add features"
    #######################################
    #determine scale - starting at 1502 scale changes to 100
    if sitesum.B.max() > 1:
        sitesum.A = 100 - sitesum.B
    else:
        sitesum.A = 1 - sitesum.B
    sitemax = sitesum.max(axis=1)
    #normalize for the higher occupancy site
    sitenorm = sitesum.divide(sitemax,axis=0) 
    chmnorm = chm.divide(sitemax,axis=0)
    chmnorm.insert(0,'Ba',list(sitenorm.A))
    chmnorm = chmnorm.round(5)
    
    
    chmfeat = pd.DataFrame(np.zeros((44,13)),
                           columns=['A_avg_X',
                                'A_avg_at_radius',
                                'A_avg_ion_radius',
                                'A_avg_mass',
                                'B_avg_X',
                                'B_avg_at_radius',
                                'B_avg_ion_radius',
                                'B_avg_mass',
                                'AB_site_ratio',
                                'AB_X_ratio',
                                'AB_at_radius_ratio',
                                'AB_ion_radius_ratio',
                                'AB_mass_ratio']
                      )
    
    for elmnt in list(chmnorm.columns):
        x = mg.Element(elmnt)
        if elmnt in Asite:
            wt = chmnorm[elmnt]/sitenorm.A #weight for wtd average
            chmfeat.A_avg_X += wt*x.X
            chmfeat.A_avg_at_radius += wt*x.atomic_radius
            chmfeat.A_avg_ion_radius += wt*x.average_ionic_radius
            chmfeat.A_avg_mass += wt*x.atomic_mass
        elif elmnt in Bsite:
            wt = chmnorm[elmnt]/sitenorm.B #weight for wtd average
            chmfeat.B_avg_X += wt*x.X
            chmfeat.B_avg_at_radius += wt*x.atomic_radius
            chmfeat.B_avg_ion_radius += wt*x.average_ionic_radius
            chmfeat.B_avg_mass += wt*x.atomic_mass
        
        chmnorm[elmnt] = elmnt + chmnorm[elmnt].map(str)
            
    chmnorm['formula'] = chmnorm.apply(lambda x: ''.join(x),axis=1)
    chmnorm['formula'] = chmnorm['formula'] + 'O3'
    
    chmfeat.AB_site_ratio = sitenorm.A/sitenorm.B
    chmfeat.AB_X_ratio = chmfeat.A_avg_X/chmfeat.B_avg_X
    chmfeat.AB_at_radius_ratio = chmfeat.A_avg_at_radius/chmfeat.B_avg_at_radius
    chmfeat.AB_ion_radius_ratio = chmfeat.A_avg_ion_radius/chmfeat.B_avg_ion_radius
    chmfeat.AB_mass_ratio = chmfeat.A_avg_mass/chmfeat.B_avg_mass
    for col in chmfeat.columns:
        chmfeat = chmfeat.rename(columns={col:'PROPERTY: ' + col})
        
    sample = lib[len(libprefix):]

    all_var = all_var.rename(columns = stdvardict) #rename headers to ingestable format using vardict
    all_var.insert(0,'IDENTIFIER: Sample number',sample) #add sample column        
    all_var.insert(0,'FORMULA',list(chmnorm['formula'])) #add formula column
    all_var = pd.concat([all_var,chmfeat],axis=1)
    
    ####################
    "add PLD parameters"
    ####################
    if sample in list(params.index) and not(pd.isnull(params.loc[sample,'IDENTIFIER: Date'])):

        sp = params.loc[sample,:]
        sp = sp.to_frame().T #convert series to df

        #add columns for all possible targets (BZY82, BZC19, ...)
        for var in tgtvars:
            for tgt in targets.Target: 
                colname = 'PREPARATION STEP DETAIL: ' + tgt + ' ' + var
                #insert 0 for all target vars
                sp.insert(len(sp.columns),colname,0)
            if var == 'Pulses/Cycle':
                sp.insert(len(sp.columns),'PREPARATION STEP DETAIL: Average Target Pulses/Cycle',0)
        
        #get avg pulses per cycle across targets
        ppc = []
        for col in tgtinfo[tgtinfo.DisplayName=='Pulses/Cycle'].ParamName:
            val = sp.loc[sample,col]
            if str(val).find('/') >= 0: #if multiple values for single target, take average (should only apply to BZY82)
                vals = [float(v) for v in val.split('/')]
                val = np.mean(vals)
            if string_misc.is_number(str(val)):
                ppc.append(val)
        sp.loc[sample,'PREPARATION STEP DETAIL: Average Target Pulses/Cycle'] = np.mean(ppc)

        #find targets actually used
        for tgtcol in tgtcols:
            tgtname = str(sp.loc[sample,tgtcol]).strip().replace('-','')

            if tgtname in list(targets.Target):
                #get target info
                tidf = tgtinfo[tgtinfo.ParamName.str[0:8]==tgtcol]
                for tic in tidf.ParamName:
                    disp = np.asscalar(tidf[tidf['ParamName']==tic].DisplayName)
                    colname = 'PREPARATION STEP DETAIL: ' + tgtname + ' ' + disp
                    val = str(sp.loc[sample,tic]).strip().replace('-','0')
                    if disp == 'Pulses/Cycle' and val.find('/') >= 0: #if multiple values for single target, take average (should only apply to BZY82)
                        vals = [float(v) for v in val.split('/')]
                        val = np.mean(vals)
                    sp.loc[sample,colname] = val
            elif len(tgtname) > 0:
                #write to err
                errtxt += 'Unknown target: ' + tgtname + '\n'
            
            sp = sp.drop(tgtcol, axis=1)
        
        for ti in list(tgtinfo.ParamName):
            sp = sp.drop(ti, axis=1)

        all_var = all_var.join(sp,on='IDENTIFIER: Sample number')
    else:
        log.write('*Warning: could not locate PLD parameters\n')
        errtxt += 'Missing PLD parameters\n'
        
    all_var.to_csv(csvname,index=False)
    log.write('Wrote ' + csvname + '\n')
    
    #############
    "write files"
    #############
    #handle errors - these files should be sent to errdir rather than procdir, and err file should be written
    if len(errtxt) > 0:
        #write err file
        errname = lib + '_gen_csv.err' #os.path.join(errdir, lib + '_gen_csv.err')
        err = open(errname,'w')
        err.write(errtxt)
        err.close()
        
        #add to errsummary
        errsumtxt += lib + ': ' + errtxt
        
        #make a note in log
        log.write('Error(s) detected, moving to ' + errdir)
        log.close()
        
        #move to errdir
        os.chdir('..')
        os.rename(os.path.join(indir,lib), os.path.join(errdir,lib))
        
        #increment nerr
        nerr += 1
    #if no errors - move to procdir and copy outputs to outdir
    else:
        #move processed library to procdir
        log.write('Finished processing library ' + lib)
        log.close()

        #copy csv and log to outdir
        copy(csvname, os.path.join(outdir,csvname))
        copy(logname, os.path.join(outdir,logname))
        os.chdir('..')
        os.rename(os.path.join(indir,lib), os.path.join(procdir,lib))
        
        #increment nsucc
        nsucc += 1
    
if nerr > 0:
    errsum = open(errsumname,'w')
    errsum.write(errsumtxt)
    errsum.close()

print('Processed ' + str(nsucc) + ' sample(s) successfully')
print('Processed ' + str(nerr) + ' sample(s) with errors')

Processed 3 sample(s) successfully
Processed 0 sample(s) with errors


In [3]:
stdvardict

{'Column': 'IDENTIFIER: Column',
 'FWHM_110': 'PROPERTY: FWHM of 110 peak ($\\degree$)',
 'FWHM_111': 'PROPERTY: FWHM of 111 peak ($\\degree$)',
 'FWHM_200': 'PROPERTY: FWHM of 200 peak ($\\degree$)',
 'MaxInt_110': 'PROPERTY: Max intensity of 110 peak (counts)',
 'MaxInt_111': 'PROPERTY: Max intensity of 111 peak (counts)',
 'MaxInt_200': 'PROPERTY: Max intensity of 200 peak (counts)',
 'PeakArea_110': 'PROPERTY: Area of 110 peak',
 'Point': 'IDENTIFIER: Point',
 'Pos_110': 'PROPERTY: Position of 110 peak ($\\degree$)',
 'Pos_111': 'PROPERTY: Position of 111 peak ($\\degree$)',
 'Pos_200': 'PROPERTY: Position of 200 peak ($\\degree$)',
 'Ratio_111_100': 'PROPERTY: Max intensity ratio 111:100 ',
 'Ratio_200_100': 'PROPERTY: Max intensity ratio 200:100 ',
 'Row': 'IDENTIFIER: Row',
 'a_Ang': 'PROPERTY: Cubic lattice constant ($\\r{A}$)',
 'd_um': 'PROPERTY: Film thickness ($\\mu$m)',
 'x_mm': 'IDENTIFIER: X position (mm)',
 'y_mm': 'IDENTIFIER: Y position (mm)'}

In [4]:
foundvars

['Point',
 'Row',
 'Column',
 'a_Ang',
 'Ba_at',
 'Column',
 'Co_at',
 'd_um',
 'Fe_at',
 'Frac_110',
 'Frac_111',
 'Frac_200',
 'FWHM_110',
 'FWHM_111',
 'FWHM_200',
 'MaxInt_110',
 'MaxInt_111',
 'MaxInt_200',
 'Point',
 'Pos_110',
 'Pos_111',
 'Pos_200',
 'Row',
 'x_mm',
 'Y_at',
 'y_mm',
 'Zr_at']