## Prepare Streamlit Data: LCIA QSAR Study
**Date:** October 20, 2023

This notebook is used to pre-compute data for the Streamlit web application. The data are written to a subdirectory that will become the GitHub repository for the Streamlit app.

In [1]:
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import matplotlib
# %matplotlib notebook
matplotlib.use('Agg')  # avoids rendering figures
import pickle
import gzip

from config_management import UnifiedConfiguration
from data_management import DataManager
from metrics_management import MetricsManager
from results_management import ResultsManager
from results_analysis import ResultsAnalyzer
import plotting.pod
import plotting.moe

In [2]:
# Build the path from its components instead of embedding a backslash in the
# literal: '\c' is not a valid escape sequence (recent Python versions warn
# about it) and a backslash only acts as a path separator on Windows.
config_mapping_path = os.path.join('Input', 'configuration-mapping.json')
config = UnifiedConfiguration(config_mapping_path)

# Manager objects shared by the cells below
data_manager = DataManager(config.data, config.path)
metrics_manager = MetricsManager(config.category_to_dict('metric'))
results_manager = ResultsManager(
    output_dir='Results',
    results_file_type=config.data.file_type
)
results_analyzer = ResultsAnalyzer(
    results_manager, 
    data_manager, 
    config.path.seem3_exposure_file
)

In [3]:
# Map effect categories to keys for the final models.
# The key tuples differ only in their first element (the effect category
# itself), so they are generated from the list of categories.
model_key_for_effect = {
    effect: (
        effect,
        'opera',
        'predicted',
        'missing',
        'true',
        'RandomForestRegressor'
    )
    for effect in ('general', 'repro_dev')
}

In [4]:
# Load the app configuration file
app_dir = 'streamlit_app'
app_config_file = os.path.join(app_dir, 'config.json')

# Decode explicitly as UTF-8 so that parsing does not depend on the
# platform's default text encoding.
with open(app_config_file, 'r', encoding='utf-8') as file:
    app_config = json.load(file)

def write_parquet(
        dataframe,
        app_dir,
        data_dir,
        file_name,
        effect_dir=''
    ):
    '''
    Write a DataFrame to a Parquet file under the app's data directory.

    The destination path is resolved by build_file_path(), which also
    creates the target directory when it does not exist yet.

    Parameters
    ----------
    dataframe : object exposing ``to_parquet(path)``
        Data to be written.
    app_dir : str
        Root directory of the app.
    data_dir : str
        Data subdirectory inside ``app_dir``.
    file_name : str
        Name of the file to write.
    effect_dir : str, optional
        Additional subdirectory inside ``data_dir``. Defaults to '' (none).
    '''
    destination = build_file_path(
        app_dir, data_dir, file_name, effect_dir=effect_dir)
    dataframe.to_parquet(destination)
    
def pickle_dump(
        obj, 
        app_dir,
        data_dir,
        effect_dir,
        file_name
    ):
    '''
    Pickle an object into a gzip-compressed file under the app's data
    directory.

    The destination path is resolved by build_file_path(), which also
    creates the target directory when it does not exist yet.

    Parameters
    ----------
    obj : object
        Any picklable object.
    app_dir : str
        Root directory of the app.
    data_dir : str
        Data subdirectory inside ``app_dir``.
    effect_dir : str
        Additional subdirectory inside ``data_dir``.
    file_name : str
        Name of the file to write.
    '''
    destination = build_file_path(
        app_dir, data_dir, file_name, effect_dir=effect_dir)

    with gzip.open(destination, 'wb') as compressed_file:
        pickle.dump(obj, compressed_file)
        
def build_file_path(
        app_dir,
        data_dir,
        file_name,
        effect_dir=''
    ):
    '''
    Return the path ``app_dir/data_dir/effect_dir/file_name``, creating the
    containing directory first if it does not exist.

    Parameters
    ----------
    app_dir : str
        Root directory of the app.
    data_dir : str
        Data subdirectory inside ``app_dir``.
    file_name : str
        Name of the file; the file itself is not created.
    effect_dir : str, optional
        Additional subdirectory inside ``data_dir``. Defaults to '' (none).

    Returns
    -------
    str
        Path to ``file_name`` inside the (now existing) directory.
    '''
    directory_path = os.path.join(
        app_dir, 
        data_dir, 
        effect_dir
    )
    # exist_ok=True replaces a separate os.path.exists() check, which left a
    # window between the check and the creation of the directory.
    os.makedirs(directory_path, exist_ok=True)

    return os.path.join(directory_path, file_name)

In [5]:
# TODO: Reduce figure file size. Try downsampling data with gzip compression

In [6]:
# For each effect category, compute the data and figures of its final model
# and write them to <app_dir>/<data_dir>/<effect>/.
for effect, model_key in model_key_for_effect.items():
    
    # Get predicted PODs and associated features
    y_pred, X = results_analyzer.predict_out_of_sample(model_key)
    pod_data = results_analyzer.get_pod_comparison_data(model_key)
    pod_data['QSAR'] = y_pred
    
    # Include the cumulative proportions for CDF figure.
    # One two-column frame is built per entry of pod_data.
    pod_cdf_data = {}  # initialize
    for k, pods in pod_data.items():
        sorted_values, cumulative_data = results_analyzer.generate_cdf_data(
            pods, normalize=True)
        pod_cdf_data[k] = pd.DataFrame({
            'POD' : sorted_values, 
            'Cum_Proportion' : cumulative_data
        })
    # Concatenating the dict puts its keys on the outer column level;
    # swaplevel then moves 'POD' / 'Cum_Proportion' to the outer level.
    pod_cdf_data = (
        pd.concat(pod_cdf_data, axis=1)
        .swaplevel(axis=1)
    )
    
    # Write PODs to disk
    write_parquet(
        pod_cdf_data,
        app_dir,
        app_config['data_dir'],
        app_config['pod_file_name'],
        effect_dir=effect
    )
    
    # Write features to disk
    write_parquet(
        X,
        app_dir,
        app_config['data_dir'],
        app_config['features_file_name'],
        effect_dir=effect
    )

    ## Write Figure objects to disk for subsequent modifications
    
    pod_fig, _ = plotting.pod.single_model_cdfs(
        pod_data, 
        results_analyzer
    )
    pickle_dump(
        pod_fig, 
        app_dir,
        app_config['data_dir'],
        effect,
        app_config['pod_fig_file_name']
    )
    # The figure is on disk now; close it so that figures do not pile up in
    # memory across loop iterations.
    plt.close(pod_fig)
    
    moe_fig, _ = plotting.moe.single_model_moes(
        model_key, 
        results_analyzer, 
        config.plot
    )
    pickle_dump(
        moe_fig, 
        app_dir,
        app_config['data_dir'],
        effect,
        app_config['moe_fig_file_name']
    )
    plt.close(moe_fig)  # same reason as for pod_fig
        
    moe_data = (
        pd.concat(
            results_analyzer.moe_and_prediction_intervals(model_key), 
            axis=1
        )
        .swaplevel(axis=1)  # for convenient indexing
        .rename(columns=app_config['pod_key_mapper'])  # for user friendliness
        .rename(columns=app_config['exposure_key_mapper'], level=-1)
    )
    write_parquet(
        moe_data, 
        app_dir,
        app_config['data_dir'],
        app_config['moe_file_name'],
        effect_dir=effect
    )

In [7]:
# For chemicals inside the applicability domain of SEEM3
# The path is built from its components: the backslash-separated literal
# contained invalid escape sequences ('\R', '\O', '\I', '\A', '\c') and only
# resolved to nested directories on Windows.
chem_id_file = os.path.join(
    'Input', 
    'Raw', 
    'OPERA', 
    'Input', 
    'Application', 
    'chemical-identifiers.smi'
)

# Map DTXSID to QSAR-ready SMILES
# Each row read from the file is split on the tab character into a
# (smiles, dtxsid) pair.
# NOTE(review): read_csv is called with its defaults, so the first line of
# the file is consumed as a header rather than as data - confirm that the
# file actually starts with a header line.
chem_ids = pd.Series(
    {dtxsid: smiles for smiles, dtxsid in 
     pd.read_csv(chem_id_file).squeeze().str.split('\t')}
)
chem_ids.name = 'qsar_ready_smiles'
 
write_parquet(
    pd.DataFrame(chem_ids), 
    app_dir, 
    app_config['data_dir'],
    app_config['chem_ids_file_name']
)