## Importations

In [5]:
import matplotlib.pyplot as plt
from matplotlib import rc
import numpy as np
import sys
sys.path.append('../')

## ML
from scripts.machine_learning_utils import *

##data
from scripts.manage_data import *

import pandas as pd



### Gather quantum descriptors

In [None]:
from autoqchem.db_functions import *

data_ONO = descriptors(["boron_LA_ONO_ar_2"], presets=['global', 'substructure'], 
            conf_option='mean', 
            solvent='None', 
            functional='M062X', 
            basis_set="6-31G(d)",
            substructure='B')
data_ONO = LA_only(data_ONO)

data_NNN = descriptors(["boron_LA_NNN_ar_2"], presets=['global', 'substructure'], 
            conf_option='mean', 
            solvent='None', 
            functional='M062X', 
            basis_set="6-31G(d)",
            substructure='B')
data_NNN = LA_only(data_NNN)

data_OCO = descriptors(["boron_LA_ohmiya_CB"], presets=['global', 'substructure'], 
            conf_option='mean', 
            solvent='None', 
            functional='M062X', 
            basis_set="6-31G(d)",
            substructure='B')
data_OCO = LA_only(data_OCO)

data_NMR_LA = descriptors(["boron_LA_NMR_data"], presets=['global', 'substructure'], 
            conf_option='mean', 
            solvent='None', 
            functional='M062X', 
            basis_set="6-31G(d)",
            substructure='B')
data_NMR_LA = LA_only(data_NMR_LA)

data_various_boron_LA = descriptors(["various_boron_LA"], presets=['global', 'substructure'], 
            conf_option='mean', 
            solvent='None', 
            functional='M062X', 
            basis_set="6-31G(d)",
            substructure='B')
data_various_boron_LA = LA_only(data_various_boron_LA)

data_triarylboranes = descriptors(["boron_LA_triarylboranes"], presets=['global', 'substructure'], 
            conf_option='mean', 
            solvent='None', 
            functional='M062X', 
            basis_set="6-31G(d)",
            substructure='B')
data_triarylboranes = LA_only(data_triarylboranes)


## Quantum descriptors for the 4 molecular structures
df_global = (data_OCO['global'].append(data_NNN['global'])).append(data_ONO['global']).append(data_NMR_LA['global']).append(data_various_boron_LA['global']).append(data_triarylboranes['global'])

df_atom1 = data_OCO['atom1'].append(data_NNN['atom1']).append(data_ONO['atom1']).append(data_NMR_LA['atom1']).append(data_various_boron_LA['atom1']).append(data_triarylboranes["atom1"])


In [None]:
df_atom1 = df_atom1.drop('labels',axis=1)
df_atom1 = df_atom1.reset_index()
df_atom1_indexed = index_data(df_atom1)
df_atom1_indexed.to_csv('../tables/descriptors/df_atom1_ext.csv')

df_global = df_global.reset_index()
df_global_indexed = index_data(df_global)
df_global_indexed.to_csv('../tables/descriptors/df_global_ext.csv')

In [13]:
## Import quantum descriptors
df_atom1 = pd.read_csv('../tables/descriptors/df_atom1_ext.csv').set_index("can")
df_global = pd.read_csv('../tables/descriptors/df_global_ext.csv').set_index("can")

data_Q_tot = {}

data_Q_tot['global']=df_global

data_Q_tot['atom1']=df_atom1

### Gather Lewis acidity energy data

#### Create df_total, a dataframe containing all available energy metrics of Lewis acidity for the 4 molecular structures datasets

In [14]:
# ONO, NNN, OCO, triarylboranes molecules

df_OCO = pd.read_csv('../tables/data/table_smiles_FIA_OCO.csv').set_index('SMILES').drop_duplicates()
df_OCO["group"] = ['OCO' for i in range(len(df_OCO)) ]
df_NNN    = pd.read_csv('../tables/data/table_smiles_FIA_NNN.csv').set_index('SMILES').drop_duplicates()
df_NNN["group"] = ['NNN' for i in range(len(df_NNN))]
df_ONO    = pd.read_csv('../tables/data/table_smiles_FIA_ONO_extended.csv').set_index('SMILES').drop_duplicates()
df_ONO["group"] = ['ONO' for i in range(len(df_ONO))]
df_triarylboranes = pd.read_csv("../tables/data/table_smiles_FIA_triarylboranes.csv").set_index('SMILES').drop_duplicates()
df_triarylboranes['group'] = ['triarylboranes' for i in range(len(df_triarylboranes))]

df_total = df_OCO
df_total = df_total.append(df_NNN)
df_total = df_total.append(df_ONO)
df_total = df_total.append(df_triarylboranes)

# index df_total with smiles written in the RDKit way
df_total = df_total.reset_index()
df_total = replace_with_rdkit_smiles(df_total)            
df_total = df_total.set_index('SMILES')


# gather data for which we computed other metrics of LA
df_OCO_HIA_reorg_nrj = create_df('../tables/data/table_smiles_FIA_HIA_reorg_nrj_OCO.csv')

df_NNN_HIA_reorg_nrj = create_df('../tables/data/table_smiles_FIA_HIA_reorg_nrj_NNN.csv')
df_ONO_HIA = create_df('../tables/data/table_smiles_FIA_HIA_ONO.csv')
df_ONO_reorg_nrj = create_df('../tables/data/table_smiles_FIA_reorg_nrj_ONO.csv')

df_total['HIA'] = [None for i in range(len(df_total))]
df_total['reorg_nrj'] = [None for i in range(len(df_total))]
 
df_list = [df_OCO_HIA_reorg_nrj, df_NNN_HIA_reorg_nrj, df_ONO_HIA]
for df in df_list :
    for smi in df.index :         
        df_total.at[smi, 'HIA']=df.at[smi, 'HIA']
    
df_list = [df_OCO_HIA_reorg_nrj, df_NNN_HIA_reorg_nrj, df_ONO_reorg_nrj]
for df in df_list :
    for smi in df.index :    
        df_total.at[smi, 'reorg_nrj']=df.at[smi, 'reorg_nrj']

## add GEI to total dataframe

df_total['GEI'] = [0 for i in range(len(df_total))]

for smi in df_total.index :
    try : 
        ki = df_global.at[smi,"electronegativity"]
        hardness = df_global.at[smi,"hardness"]
    except :
        print(f'smi {smi} not found, converting to rdkit smi')
        rdkit_smi = Chem.MolToSmiles(Chem.MolFromSmiles(smi))        
        ki = df_global.at[rdkit_smi,"electronegativity"]
        hardness = df_global.at[rdkit_smi,"hardness"]
    
    gei = (ki**2)/(2*hardness)*27.2107 #conversion to eV
    try : df_total.at[smi, 'GEI'] = gei
    except : 
        print(f"no GEI data for {smi}")
        df_total.at[smi, 'GEI'] = None


  df_total = df_total.append(df_NNN)
  df_total = df_total.append(df_ONO)
  df_total = df_total.append(df_triarylboranes)


In [17]:
df_total.to_csv('../tables/data/df_total.csv')

### Gather Gutmann-Beckett data from the litterature and computed energies for various LA

This dataframe wil contain all available data to compare Lewis acidity metrics

In [24]:
df_NMR_nrjs = pd.read_csv('../tables/data/table_NMR_LA_smiles_FIA_HIA_reorg_nrj.csv')
df_NMR_data = pd.read_excel('../tables/data/table_smiles_NMR.xlsx')
df_small_LA_nrjs = pd.read_csv('../tables/data/table_smiles_FIA_HIA_reorg_nrj_small_boron_LA.csv')

In [25]:
df_NMR_nrjs = replace_with_rdkit_smiles(df_NMR_nrjs)
df_NMR_data = replace_with_rdkit_smiles(df_NMR_data)
df_small_LA_nrjs = replace_with_rdkit_smiles(df_small_LA_nrjs)

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [26]:
df_NMR_data = df_NMR_data.dropna(subset = 'SMILES')

In [27]:
df_NMR_nrjs = df_NMR_nrjs.set_index('SMILES')
df_NMR_data = df_NMR_data.set_index('SMILES')
df_small_LA_nrjs = df_small_LA_nrjs.set_index('SMILES')
#df_NMR_FIA = df_NMR_FIA.set_index('SMILES')

In [28]:
FIA = []
HIA = []
r_nrjs = []
GEIs = []

for smi in df_NMR_data.index:
    
    try : fia = df_NMR_nrjs.at[smi, "FIA"]
    except : 
        try : fia = df_small_LA_nrjs.at[smi,"FIA"]
        except : fia = None
    FIA.append(fia)
    
    try : hia = df_NMR_nrjs.at[smi, "HIA"]
    except : 
        try : hia = df_small_LA_nrjs.at[smi,"HIA"]
        except : hia = None
    HIA.append(hia)
    
    try : reorg_nrj = df_NMR_nrjs.at[smi,'reorg_nrj']
    except :
        try : reorg_nrj = df_small_LA_nrjs.at[smi,"reorg_nrj"]
        except : reorg_nrj = None
    r_nrjs.append(reorg_nrj)
    
    ki = df_global.at[smi,"electronegativity"]
    hardness = df_global.at[smi,"hardness"]
    gei = (ki**2)/(2*hardness)*27.2107
    GEIs.append(gei)

    

In [29]:
df_NMR_data['FIA']=FIA
df_NMR_data['HIA']=HIA
df_NMR_data['reorg_nrj']=r_nrjs
df_NMR_data['GEI']=GEIs

In [30]:
df_NMR_data.to_excel('../tables/data/NMR_smiles_data_sources_nrjs.xlsx') 

### Gather all data for Lewis acidity scale studies

In [50]:
df_all_data = df_NMR_data.drop(labels= ['delta (neat)', "Ddelta (neat)", 'delta (CD2Cl2)', "Ddelta (CD2Cl2)", "source", "delta (C6D6)", "Ddelta(C6D6)", "source.1"], axis =1)

In [51]:
df_small_LA = pd.read_csv('../tables/data/df_small_LA.csv')

In [52]:
df_small_LA = replace_with_rdkit_smiles(df_small_LA)
df_small_LA = df_small_LA.set_index("SMILES")

In [56]:
for smi in df_small_LA.index :
    if smi not in df_NMR_data.index :
        df_all_data.at[smi,"FIA"] = df_small_LA.at[smi,"FIA"]
        df_all_data.at[smi,"name"] = df_small_LA.at[smi,"names"]
        df_all_data.at[smi,"reorg_nrj"] = df_small_LA.at[smi,"reorg_nrj"]
        df_all_data.at[smi,"HIA"] = df_small_LA.at[smi,"HIA"]
        try : 
            ki = df_global.at[smi,"electronegativity"]
            hardness = df_global.at[smi,"hardness"]
            gei = (ki**2)/(2*hardness)*27.2107
            df_all_data.at[smi,"GEI"] = gei
        except : 
            df_all_data.at[smi,"GEI"] = None

In [59]:
df_all_data.to_excel('../tables/data/all_available_metrics_various_boron_LA.xlsx')