## Prepare Streamlit Data: LCIA QSAR Study
**Date:** October 20, 2023

This notebook is used to pre-compute data for the Streamlit web application. The data are written to a subdirectory that will become the GitHub repository for the Streamlit app.

In [1]:
import pandas as pd
import os
import json

from config_management import UnifiedConfiguration
from data_management import DataManager
from metrics_management import MetricsManager
from results_management import ResultsManager
from results_analysis import ResultsAnalyzer

In [2]:
# Build the path from its components instead of embedding a backslash in the
# string literal: '\c' is not a valid escape sequence, and a hard-coded
# backslash separator only resolves on Windows.
config_mapping_path = os.path.join('Input', 'configuration-mapping.json')
config = UnifiedConfiguration(config_mapping_path)

# Managers shared by the cells below
data_manager = DataManager(config.data, config.path)
metrics_manager = MetricsManager(config.category_to_dict('metric'))
results_manager = ResultsManager(
    output_dir='Results',
    results_file_type=config.data.file_type
)
results_analyzer = ResultsAnalyzer(
    results_manager, 
    data_manager, 
    config.path.seem3_exposure_file
)

In [3]:
# Map effect categories to keys for the final models.
# The two keys are identical except for their first element, which repeats
# the effect category itself.
model_key_for_effect = {
    effect_category: (
        effect_category,
        'opera',
        'predicted',
        'missing',
        'true',
        'RandomForestRegressor',
    )
    for effect_category in ('general', 'repro_dev')
}

In [4]:
# Load the app configuration file
app_directory = 'streamlit_app'
app_config_file = os.path.join(
    app_directory, 
    'config.json'
)
# Decode the JSON as UTF-8 explicitly; without an encoding argument, open()
# falls back to the platform's default encoding.
with open(app_config_file, 'r', encoding='utf-8') as file:
    app_config = json.load(file)

def write_data(
        dataframe,
        file_name,
        data_directory='Data',
        effect=None
    ):
    '''
    Write a DataFrame to a Parquet file under the app directory.

    The target directory is built by joining the module-level 
    `app_directory`, `data_directory`, and, when `effect` is truthy, an 
    effect-specific subdirectory. The directory is created if it does not 
    already exist.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to write. Only its `to_parquet` method is used.
    file_name : str
        Name of the file to create inside the target directory.
    data_directory : str, optional
        Subdirectory of `app_directory` that holds the data. Default 'Data'.
    effect : str, optional
        Name of a further subdirectory. If None or empty, the file is 
        written directly into `data_directory`.

    Returns
    -------
    None
    '''
    effect_directory = effect if effect else ''
    
    directory_path = os.path.join(
        app_directory, 
        data_directory, 
        effect_directory
    )
    # exist_ok=True avoids the separate existence check and the window 
    # between checking for the directory and creating it.
    os.makedirs(directory_path, exist_ok=True)
        
    full_path = os.path.join(directory_path, file_name)
    dataframe.to_parquet(full_path)

In [5]:
# Load the exposure data through the results analyzer
exposure_df = results_analyzer.load_exposure_data()

# Write it to the app's data directory under the configured file name
write_data(
    dataframe=exposure_df,
    file_name=app_config['exposure_file_name']
)

# Last expression of the cell: display the DataFrame
exposure_df

Unnamed: 0_level_0,95th percentile (mg/kg/day),50th percentile (mg/kg/day),5th percentile (mg/kg/day)
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DTXSID9047623,5.286615,0.903291,-6.871191
DTXSID0052700,-3.973504,-6.702956,-8.376128
DTXSID00583560,-4.332382,-6.986779,-8.416910
DTXSID00859050,-3.897477,-6.345883,-8.335409
DTXSID00860464,-3.748435,-5.943255,-8.174190
...,...,...,...
DTXSID3038307,-11.725129,-14.558289,-16.664093
DTXSID8038300,-11.750743,-14.263276,-16.447085
DTXSID8074158,-11.957397,-14.431560,-16.529526
DTXSID2032180,-12.103339,-13.921710,-15.699422


Note: The estimator objects were manually copied over from their respective directories.

In [5]:
# For each effect category, write the PODs, features, and MOEs of its final
# model to the effect's subdirectory of the app data
for effect, model_key in model_key_for_effect.items():

    # Predicted PODs and the features associated with them
    y_pred, X = results_analyzer.predict_out_of_sample(model_key)

    # Add the predictions to the POD comparison data under the 'QSAR' key
    pod_data = results_analyzer.get_pod_comparison_data(model_key)
    pod_data['QSAR'] = y_pred

    # PODs -> disk
    pod_frame = pd.DataFrame(pod_data)
    write_data(pod_frame, app_config['pod_file_name'], effect=effect)

    # Features -> disk
    write_data(X, app_config['features_file_name'], effect=effect)

    # Margins of exposure with uncertainty estimates
    results_for_percentile = results_analyzer.moe_and_prediction_intervals(model_key)

    # MOEs -> disk, with the per-percentile results concatenated column-wise
    moe_frame = pd.concat(results_for_percentile, axis=1)
    write_data(moe_frame, app_config['moe_file_name'], effect=effect)

In [7]:
# For chemicals inside the applicability domain of SEEM3
# The path is assembled from its components: the original backslash-separated
# literal relied on invalid escape sequences ('\R', '\O', '\I', '\A', '\c')
# being passed through unchanged, and only resolved on Windows.
chem_id_file = os.path.join(
    'Input',
    'Raw',
    'OPERA',
    'Input',
    'Application',
    'chemical-identifiers.smi'
)

# Map DTXSID to QSAR-ready SMILES
# Each parsed row is split on the tab character into a (SMILES, DTXSID) pair.
# NOTE(review): read_csv is called with its default header handling, so the
# first line of the file is taken as the column label and is not mapped —
# confirm the file starts with a header line.
chem_ids = pd.Series(
    {dtxsid: smiles for smiles, dtxsid in 
     pd.read_csv(chem_id_file).squeeze().str.split('\t')}
)
chem_ids.name = 'qsar_ready_smiles'
 
write_data(pd.DataFrame(chem_ids), app_config['chem_ids_file_name'])