In [1]:
import pandas as pd
import numpy as np
import json

from pprint import pprint

# Raise the pandas row and column display limits to 500 each.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 500)

In [7]:
# Load the two JSON resource files.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform default that open() would otherwise pick.
with open('./rsc/fxn_grps.json', 'r', encoding='utf-8') as fp:
    # Indexed elsewhere in this notebook as fxn_grp_dict[compound][group].
    fxn_grp_dict = json.load(fp)

with open('./rsc/df_columns.json', 'r', encoding='utf-8') as fp:
    column_filters = json.load(fp)

# The top-level keys of fxn_grp_dict double as DataFrame column names.
compounds = list(fxn_grp_dict.keys())

In [13]:
# Read the recipe table and discard rows in which every column is empty
# (thresh=1 keeps any row that has at least one non-NA value).
df = pd.read_csv('./data/bfo_df_20211122.csv').dropna(thresh=1)

# Row count, then the number of distinct DOIs among those rows.
print('Number of recipes: ', df.shape[0])
print('Number of papers: ', df['doi'].nunique())

#codify impurity presence
# 0 when reported_impurities is exactly the string '[]', 1 for any other value.
# NOTE(review): a missing (NaN) entry also compares unequal to '[]' and is
# therefore coded 1 — confirm that this is intended.
impurity_codes = [
    0 if entry == '[]' else 1
    for entry in df['reported_impurities'].tolist()
]

df['impurity_code'] = impurity_codes

#codify Bi/Fe precursor
# 1 when both nitrate columns are truthy, 0 when exactly one is, -1 when neither is.
# (Author's note on the 0 case: Fe is always the other, never Bi.)
# NOTE(review): plain truthiness treats NaN as "present" — confirm that these
# two columns contain no blanks.
bi_nitrate = df['Bi(NO3)3-5H2O'].to_list()
fe_nitrate = df['Fe(NO3)3-9H2O'].to_list()

nitrate_precs = [
    1 if (has_bi and has_fe) else 0 if (has_bi or has_fe) else -1
    for has_bi, has_fe in zip(bi_nitrate, fe_nitrate)
]

df['nitrate_precs'] = nitrate_precs

#create single acetic_acids column from acetic_acid and glacial_acetic_acid
# Takes the larger of the two values in each row.
# DataFrame.max(axis=1) is vectorised and skips NaN, whereas the builtin max()
# applied row by row gives an order-dependent answer when one value is NaN
# (max([nan, 1]) is nan, but max([1, nan]) is 1).
df['acetic_acids'] = df[['acetic_acid', 'glacial_acetic_acid']].max(axis=1)

#create functional groups features

# One zero-initialised counter array (one slot per row of df) per functional group.
fxn_grp_names = (
    "methyl", "methylene", "carbonyl", "methoxyl",
    "hydroxyl", "carboxyl", "ether", "amine",
)
fxn_grp_features = {name: np.zeros(len(df)) for name in fxn_grp_names}

# For every row whose compound column is truthy, add that compound's per-group
# values from fxn_grp_dict onto the matching counters.
for c in compounds:
    usage = df[c].to_list()
    grp_values = fxn_grp_dict[c]
    for row_pos, used in enumerate(usage):
        if not used:
            continue
        for grp in grp_values:
            fxn_grp_features[grp][row_pos] += grp_values[grp]

# Remove the raw compound columns, except for the names listed here.
compounds_kept = ('ethylene_glycol', '2-methoxyethanol', 'acetic_acids', 'citric_acid')
df = df.drop(columns=[name for name in compounds if name not in compounds_kept])

# Store each group as a 0/1 presence flag: any non-zero total becomes 1.
for grp, totals in fxn_grp_features.items():
    df[grp] = totals.astype(bool).astype(int)

#TODO: create chelating agent SMILES feature (not implemented yet)
        
#codify atmosphere
# One-hot encode the atmosphere column into three 0/1 flags.
# Only the exact strings 'O2' and 'N2' get their own flag; every other value
# (a missing entry included) is counted as air.
atmospheres = df['atmosphere'].tolist()
o2_atm = [1 if atm == 'O2' else 0 for atm in atmospheres]
n2_atm = [1 if atm == 'N2' else 0 for atm in atmospheres]
# A row is air exactly when neither of the other two flags is set.
air_atm = [1 - o2_flag - n2_flag for o2_flag, n2_flag in zip(o2_atm, n2_atm)]

df['air_atm'] = air_atm
df['o2_atm'] = o2_atm
df['n2_atm'] = n2_atm

#assume blank prebake temps are 0
# NOTE(review): the output recorded for this cell ends in
# KeyError: 'prebake_degC' — confirm the column name used by the current CSV.
df['prebake_degC'] = df['prebake_degC'].fillna(0)


def _prebake_hours(rec):
    """Return 0 when the row's prebake temperature is 0, else its recorded prebake time."""
    if rec['prebake_degC'] == 0:
        return 0
    return rec['prebake_time_hr']


#assume blank prebake times with no temp are zero
df['prebake_time_hr'] = df[['prebake_time_hr', 'prebake_degC']].apply(
    _prebake_hours,
    axis=1
)

# Columns left out of the codified table; df itself is not modified.
# NOTE(review): DataFrame.drop raises KeyError if any listed column is absent
# from df — confirm every name (e.g. 'subtrate_orientation') matches the CSV header.
dropped_columns = [
    'atmosphere',
    'Bi(NO3)3-5H2O', 'Fe(NO3)3-9H2O',
    'Bi2O3', 'Bi_acetate',
    'Fe_acetylacetone', 'Fe_pentanedionate', 'Fe_citrate',
    'nitric_acid',
    'substrate', 'subtrate_orientation',
    'acetic_acid', 'glacial_acetic_acid',
    'methylethylene_glycol', 'ammonium_hydroxide',
    'DMF', 'formamide', 'water',
    'blank', 'Notes',
    'reported_impurities', 'validation',
    'precursor_viscosity',
    'dry_degC', 'dry_time_hr',
    'spincoating_time_sec', 'spincoating_rpm',
]
codified_df = df.drop(columns=dropped_columns)

# Print the medians of two columns (computed on df, not on codified_df).
for column_name in ('precursor_concentration', 'annealing_time_hr'):
    print(df[column_name].median())

# Write the codified table without the index column.
# NOTE(review): this file name is dated 20211101 while the CSV read in this
# notebook is dated 20211122 — confirm the intended output name.
codified_df.to_csv('./data/BFO_film_synth_df_codified_20211101.csv', index=False)

display(codified_df)

Number of recipes:  358
Number of papers:  180


KeyError: 'prebake_degC'