# In [115]:
import pandas as pd
import numpy as np
import os
import glob

#datadir = "C:/Users/jdhuang/OneDrive - Colorado School of Mines/Research/MIDDMI/TCO/Data"
#input and output folders, both resolved under the HOME environment variable
#(a KeyError is raised if HOME is not set)
indir, outdir = (
    os.path.join(os.environ['HOME'], subpath)
    for subpath in (
        'OneDrive - Colorado School of Mines/Research/MIDDMI/TCO/in',
        'OneDrive - Colorado School of Mines/Research/MIDDMI/TCO/out',
    )
)
#name of the log file written into outdir
logname = 'gen_csv_log.txt'
#relative file names used below are resolved against the input folder
os.chdir(indir)

#load variable definitions
#----------------------------
#VariableDict.xlsx is looked up in the current working directory
vardictdf = pd.read_excel('VariableDict.xlsx')

#standard variables: rows whose Keyword does not start with '!'
#stdvardict maps each VarName to the output header 'Keyword: DisplayName'
stdvardf = vardictdf.loc[vardictdf['Keyword'].str[0] != '!']
stdvars = list(stdvardf['VarName'])
headers = list(stdvardf['Keyword'] + ': ' + stdvardf['DisplayName'])
stdvardict = {var: hdr for var, hdr in zip(stdvars, headers)}

#composition variables: rows tagged with the '!composition' keyword
compvardf = vardictdf.loc[vardictdf['Keyword'].eq('!composition')]
compvars = list(compvardf['VarName'])

#load PLD parameters
#----------------------------
#name written into the 'PREPARATION STEP NAME' column of every parameter row
prepstep = 'Pulsed Laser Deposition'
#the first three rows of the sheet are skipped; the first column becomes the index
params = pd.read_excel('PLD_params.xlsx', skiprows=3, index_col=0)
paramdictdf = pd.read_excel('PLD_ParamDict.xlsx')

#standard params: rows whose Keyword does not start with '!'
#stdparamdict maps each ParamName to the output header 'Keyword: DisplayName'
stdparamdf = paramdictdf.loc[paramdictdf['Keyword'].str[0] != '!']
stdparams = list(stdparamdf['ParamName'])
parheaders = list(stdparamdf['Keyword'] + ': ' + stdparamdf['DisplayName'])
stdparamdict = {par: hdr for par, hdr in zip(stdparams, parheaders)}

#rename headers to ingestable format using paramdict; index labels become strings
params = params.rename(columns=stdparamdict, index=str)
#add the preparation step name as the second column
params.insert(1, 'PREPARATION STEP NAME', prepstep)


#library prefix to strip out
libprefix = 'PDAC_COM3_'

#Process every directory directly under indir as one library and write
#<lib>_AllVar.csv into outdir. For each library:
#  * <lib>_Points.txt gives the number of points (rows)
#  * each <lib>_<vname>.txt is a tab-separated file; standard variables are
#    concatenated column-wise, composition variables feed the formula string
#  * PLD parameters are joined on the sample number when available
#Progress and warnings go to outdir/logname. The log is opened in a
#with-block so it is closed even if a library fails part-way through.
with open(os.path.join(outdir, logname), 'w') as log:
    #loop through all directories in indir
    for lib in next(os.walk(indir))[1]:
        log.write('Processing library ' + lib + '\n')
        csvname = lib + '_AllVar.csv'
        #files are addressed by full path, so the working directory is never
        #changed and cannot be left in a library folder after an error
        libdir = os.path.join(indir, lib)

        all_var = pd.DataFrame([])

        #read the Points file to determine file length
        pts = pd.read_csv(os.path.join(libdir, lib + '_Points.txt'),
                          sep='\t', usecols=[0])

        #dataframes to store composition data
        chm = pd.DataFrame()
        sitesum = pd.DataFrame(np.zeros((len(pts), 2)), columns=['A', 'B'])

        #track found variables
        foundvars = ['Point', 'Row', 'Column']

        #escape the literal parts so glob metacharacters in a folder name
        #are not interpreted as wildcards
        pattern = os.path.join(glob.escape(libdir), glob.escape(lib) + '*.txt')
        for fpath in glob.glob(pattern):
            #file names look like <lib><separator><vname>.txt
            stem = os.path.splitext(os.path.basename(fpath))[0]
            vname = stem[len(lib) + 1:]
            #regular variables
            if vname in stdvars:
                #the first file contributes all of its columns; later files
                #contribute only the column at index 3
                cols = None if len(all_var) == 0 else [3]
                df = pd.read_csv(fpath, sep='\t', usecols=cols)
                all_var = pd.concat([all_var, df], axis=1)
            #composition variables
            elif vname in compvars:
                dfc = pd.read_csv(fpath, sep='\t', usecols=[3])
                #element label is the part of the variable name before '_at'
                elmnt = vname.partition('_at')[0]
                chm[elmnt] = dfc[vname]
                sitesum['A'] = sitesum['A'] + dfc[vname]

            foundvars.append(vname)

        #identify ignored and missing variables
        ignoredvars = np.setdiff1d(foundvars, stdvars + compvars, assume_unique=True)
        missingstd = np.setdiff1d(stdvars, foundvars, assume_unique=True)
        missingcomp = np.setdiff1d(compvars, foundvars, assume_unique=True)
        log.write('Ignored variables:\n\t' + '\n\t'.join(ignoredvars) + '\n')
        log.write('*Warning: missing standard variables:\n\t' + '\n\t'.join(missingstd) + '\n')
        log.write('*Warning: missing composition variables:\n\t' + '\n\t'.join(missingcomp) + '\n')

        #determine composition
        #A is the sum of all composition variables, B is the remainder 1 - A;
        #the B column is reported under the element label 'Ba'
        sitesum['B'] = 1 - sitesum['A']
        sitemax = sitesum.max(axis=1)
        sitenorm = sitesum.divide(sitemax, axis=0) #normalize for the higher occupancy site
        chmnorm = chm.divide(sitemax, axis=0)
        chmnorm.insert(0, 'Ba', list(sitenorm['B']))
        chmnorm = chmnorm.round(5)
        #turn every value into '<element><amount>' and concatenate across columns
        for elmnt in list(chmnorm.columns):
            chmnorm[elmnt] = elmnt + chmnorm[elmnt].map(str)
        chmnorm['formula'] = chmnorm.apply(lambda x: ''.join(x), axis=1)
        chmnorm['formula'] = chmnorm['formula'] + 'O3'

        #strip the library prefix only when it is actually present
        sample = lib[len(libprefix):] if lib.startswith(libprefix) else lib

        all_var = all_var.rename(index=str, columns=stdvardict) #rename headers to ingestable format using vardict
        all_var.insert(0, 'IDENTIFIER: Sample number', sample) #add sample column
        all_var.insert(0, 'FORMULA', list(chmnorm['formula'])) #add formula column

        #add PLD parameters corresponding to sample number
        if sample in params.index:
            all_var = all_var.join(params, on='IDENTIFIER: Sample number')
        else:
            log.write('*Warning: could not locate PLD parameters\n')

        all_var.to_csv(os.path.join(outdir, csvname), index=False)

        log.write('Wrote ' + csvname + '\n')
        log.write('Finished processing library ' + lib + '\n\n')