# Results Analysis: LCIA QSAR Modeling Framework
**Date:** August 31, 2023 <br>
**Author:** Jacob Kvasnicka

In [1]:
import matplotlib
# matplotlib.use('Agg')  # avoids rendering figures
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from raw_processing import other_sources
from config_management import UnifiedConfiguration
from data_management import DataManager
from metrics_management import MetricsManager
from results_management import ResultsManager
from results_analysis import ResultsAnalyzer
from plotting.moe import MOE_CATEGORIES

# Raw string: the Windows-style separators must not be parsed as escape
# sequences ('\C' and '\c' are invalid escapes and trigger a warning on
# recent Python versions). The resulting path string is unchanged.
# NOTE(review): backslash separators presumably only resolve on Windows;
# consider pathlib/os.path.join if the notebook must run elsewhere.
config_mapping_path = r'Input\Configuration\configuration-mapping.json'
config = UnifiedConfiguration(config_mapping_path)

# Managers are all built from the unified configuration loaded above.
data_manager = DataManager(config.data, config.path)
metrics_manager = MetricsManager(config.category_to_dict('metric'))
results_manager = ResultsManager(
    output_dir='Results',
    results_file_type=config.data.file_type
)
# The analyzer combines stored results with the input data.
results_analyzer = ResultsAnalyzer(
    results_manager,
    data_manager
)

# Model settings for each effect category. Both categories share the same
# settings apart from the target effect itself. The order of the entries
# matters because get_model_key() converts the values into a tuple.
key_for_effect = {
    effect: {
        'target_effect': effect,
        'features_source': 'opera',
        'ld50_type': 'predicted',
        'data_condition': 'missing',
        'select_features': 'true',
        'estimators': 'RandomForestRegressor',
    }
    for effect in ('general', 'repro_dev')
}

def get_model_key(effect):
    '''
    Return the model settings of an effect category as a tuple of values.

    The values follow the insertion order of `key_for_effect[effect]`.
    '''
    settings = key_for_effect[effect]
    return tuple(settings[name] for name in settings)

# Quantiles (5th, 50th, and 95th percentiles) used for the summary tables
percentiles = [0.05, 0.5, 0.95]

## Dataset Characterization

In [2]:
# Load the raw surrogate PODs from the Excel file named in the configuration
raw_surrogate_pods = other_sources.toxicity_data_and_study_counts_from_excel(
    config.path.raw_surrogate_pods_file,
    config.raw_data.tox_metric,
    'dtxsid',
    config.raw_data.surrogate_tox_data_kwargs
)

# Count the rows without missing values under each top-level column label
for effect in raw_surrogate_pods.columns.unique(level=0).tolist():
    N = raw_surrogate_pods[effect].dropna().shape[0]
    print(f'{N} chemicals for {effect}')

5209 chemicals for non-reproductive/developmental effects
4938 chemicals for reproductive/developmental effects


In [3]:
# Load the surrogate PODs file named in the configuration
surrogate_pods = pd.read_csv(config.path.surrogate_pods_file, index_col=0)

# Count the non-missing values in each column (one column per effect)
for effect, y in surrogate_pods.items():
    N = y.dropna().shape[0]
    print(f'{N} chemicals for {effect}')

2404 chemicals for general
2999 chemicals for repro_dev


In [4]:
# Count the target values returned by load_features_and_target per effect
for effect in surrogate_pods.columns:
    _, y = results_analyzer.load_features_and_target(
        **key_for_effect[effect]
    )
    N = len(y)
    print(f'{N} chemicals for {effect}')

1791 chemicals for general
2228 chemicals for repro_dev


In [5]:
# Load the application chemicals and report how many there are
application_chemicals = data_manager.load_application_chemicals()
print(len(application_chemicals), 'application chemicals')

32524 application chemicals


In [6]:
def percents_missing(X, n=5):
    '''
    Compute the percentage of samples with missing data for each feature.

    Parameters
    ----------
    X : pandas.DataFrame
        Data with one column per feature.
    n : int, default 5
        Number of features to return.

    Returns
    -------
    pandas.Series
        Percent of missing values, rounded to a whole percent, for the `n`
        features with the most missing values (in descending order).
    '''
    fraction_missing = X.isna().mean().sort_values(ascending=False)
    # Scale to percent before rounding so that products such as 0.57 * 100
    # are not reported as 56.99999999999999.
    return (fraction_missing * 100).round().head(n)

In [7]:
# Missing-data percentages for the features stored for the 'opera' source
features_path = config.path.file_for_features_source['opera']
X = pd.read_parquet(path=features_path)
percents_missing(X=X)

BioDeg_HalfLife_pred    72.0
CACO2_pred              45.0
OH_pred                 37.0
Clint_pred              22.0
KM_pred                 21.0
dtype: float64

In [8]:
# Missing-data percentages for the features loaded for each effect's model
for effect, model_settings in key_for_effect.items():
    print(effect)
    X, y = results_analyzer.load_features_and_target(**model_settings)
    print(percents_missing(X))

general
BioDeg_HalfLife_pred    74.0
CACO2_pred              49.0
OH_pred                 44.0
KOA_pred                23.0
KM_pred                 23.0
dtype: float64
repro_dev
BioDeg_HalfLife_pred    75.0
CACO2_pred              49.0
OH_pred                 45.0
KM_pred                 25.0
Clint_pred              24.0
dtype: float64


## Model Evaluation & Important Features

In [9]:
def describe_result(effect, result_type):
    '''
    Return the percentile summary of a stored result for each metric.

    Parameters
    ----------
    effect : str
        Key of `key_for_effect` that identifies the model.
    result_type : str
        Name of the result to read, e.g., 'performances'. If the name
        contains 'importances', the metrics in
        `config.plot.label_for_scoring` are summarized; otherwise those in
        `config.plot.label_for_metric` plus 'mean_absolute_error'.

    Returns
    -------
    pandas.DataFrame
        One row per metric and one column per entry of `percentiles`,
        rounded to two decimals.
    '''
    if 'importances' in result_type:
        metrics = list(config.plot.label_for_scoring)
    else:
        metrics = list(config.plot.label_for_metric)
        other_metric = 'mean_absolute_error'
        if other_metric not in metrics:
            metrics.append(other_metric)

    # Read the result unconditionally. It was previously read only inside
    # the RMSE branch, which left `performances` undefined (NameError)
    # whenever RMSE was not among the metrics.
    performances = results_analyzer.read_result(
        get_model_key(effect), result_type)

    if 'root_mean_squared_error' in metrics:
        rmse = performances['root_mean_squared_error']
        # presumably the RMSE is in log10 units, so this converts it back
        # to natural units -- TODO confirm
        gsd = 10**rmse
        performances['gsd'] = gsd
        # NOTE(review): named "squared" but the exponent is 1.96, not 2 --
        # confirm this is intended.
        performances['gsd_squared'] = gsd**1.96
        metrics.append('gsd')
        metrics.append('gsd_squared')

    desc = performances.describe(percentiles=percentiles)[metrics].round(2)

    # Keep only the percentile rows of describe() (labels such as '5%')
    return desc.loc[[k for k in desc.index if '%' in k]].T

In [10]:
# Performance summary of the model for general effects
describe_result(effect='general', result_type='performances')

Unnamed: 0_level_0,5%,50%,95%
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
root_mean_squared_error,0.7,0.71,0.74
median_absolute_error,0.41,0.42,0.42
r2_score,0.41,0.44,0.47
mean_absolute_error,0.53,0.53,0.55
gsd,5.03,5.16,5.54
gsd_squared,23.72,24.91,28.71


In [11]:
# Performance summary of the model for repro_dev effects
describe_result(effect='repro_dev', result_type='performances')

Unnamed: 0_level_0,5%,50%,95%
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
root_mean_squared_error,0.58,0.62,0.67
median_absolute_error,0.3,0.33,0.34
r2_score,0.43,0.45,0.5
mean_absolute_error,0.42,0.44,0.45
gsd,3.79,4.16,4.66
gsd_squared,13.63,16.43,20.43


In [12]:
# Set of important features for each effect's model
important_features = {
    effect: set(results_analyzer.get_important_features(get_model_key(effect)))
    for effect in key_for_effect
}

In [13]:
# Features that are important for both effects
important_features['general'] & important_features['repro_dev']

{'CATMoS_LD50_pred',
 'CombDipolPolariz',
 'ReadyBiodeg_pred_discrete',
 'Sp3Sp2HybRatio'}

In [14]:
# Features that are important only for general effects
important_features['general'] - important_features['repro_dev']

{'MP_pred',
 'P_pred',
 'TopoPolSurfAir',
 'VP_pred',
 'nbN_discrete',
 'ndHBdDon_discrete'}

In [15]:
# Features that are important only for repro_dev effects
important_features['repro_dev'] - important_features['general']

{'BCF_pred', 'FUB_pred', 'KM_pred', 'KOA_pred', 'Koc_pred', 'WS_pred'}

## Model Application

### POD cumulative frequencies

In [16]:
# PODs with prediction intervals for each effect's model
pod_data = {}
for effect in key_for_effect:
    pod_data[effect] = results_analyzer.pod_and_prediction_interval(
        get_model_key(effect), inverse_transform=True, normalize=True
    )

# Report the median POD with the interval bounds of the row nearest to it
for effect, df in pod_data.items():
    median = df['pod'].quantile(q=0.5)
    # The median may not match any row exactly, so take the closest one
    abs_diff = df['pod'].sub(median).abs()
    idx = abs_diff.idxmin()
    median_data = df.loc[idx]
    print('{}: {} ({}, {})'.format(
        effect, median, median_data['lb'], median_data['ub']))

general: 11.23102806892201 (0.7563387635693767, 166.77181913781948)
repro_dev: 30.835350038180074 (2.975160965060669, 319.52207402361194)


### Compare POD estimates (benchmarking)

In [17]:
# NOTE: Not sure whether these data are used in the paper
pod_comparison_data = {}
for effect in key_for_effect:
    pod_comparison_data[effect] = results_analyzer.get_pod_comparison_data(
        get_model_key(effect)
    )

# Quantiles of the PODs for general effects, inverse transformed from log10
(
    10 ** pd.DataFrame(pod_comparison_data['general']).quantile(q=percentiles)
).round(decimals=1)

Unnamed: 0,Authoritative,ToxValDB,QSAR
0.05,0.0,0.3,2.0
0.5,4.4,12.8,11.3
0.95,212.8,268.6,99.5


In [18]:
# Quantiles of the PODs for repro_dev effects, inverse transformed from log10
(
    10 ** pd.DataFrame(pod_comparison_data['repro_dev']).quantile(q=percentiles)
).round(decimals=1)

Unnamed: 0,Authoritative,ToxValDB,QSAR
0.05,0.1,1.1,4.9
0.5,4.3,49.0,31.6
0.95,93.1,331.8,178.0


### Margins of exposure with cumulative counts

In [19]:
# Get the upper bounds of the MOE concern categories
thres_for_concern = {
    k.lower().replace(' ', '_'): bounds[-1]
    for k, bounds in MOE_CATEGORIES.items()
}

for effect in key_for_effect:
    print(f'For {effect} effects:')

    results_for_percentile = results_analyzer.moe_and_prediction_intervals(
        get_model_key(effect))

    # Get the upper bound of exposure uncertainty
    ub_exposure_results = results_for_percentile['95th percentile (mg/kg/day)']
    # Get MOEs for the lower bound of the POD prediction interval
    moes = ub_exposure_results['lb']

    for concern, concern_threshold in thres_for_concern.items():
        where_potential_concern = moes <= concern_threshold
        # Select rows and column in a single .loc call (no chained indexing)
        cum_counts = ub_exposure_results.loc[where_potential_concern, 'cum_count']
        # Take the last value by position with .iloc: `series[-1]` on a
        # label-indexed Series is deprecated in pandas 2.x. Report 0 when no
        # row is at or below the threshold instead of raising.
        # presumably 'cum_count' is a running count ordered by MOE -- verify
        concern_count = cum_counts.iloc[-1] if len(cum_counts) else 0
        print(f'\t{concern_count} chemicals {concern}')

For general effects:
	2378 chemicals moderate_concern
	509 chemicals high_concern
For repro_dev effects:
	1512 chemicals moderate_concern
	201 chemicals high_concern


In [20]:
# Exposure estimates restricted to the application chemicals
exposure_df = data_manager.load_exposure_data().loc[application_chemicals]

# Spread between the 95th and 5th percentile exposure columns
exposure_difference = exposure_df['95th percentile (mg/kg/day)'].sub(
    exposure_df['5th percentile (mg/kg/day)']
)

# The median spread is reported as the typical uncertainty
typical_uncertainty = exposure_difference.median()

print('Typical exposure uncertainty:', round(typical_uncertainty), 'log10-units')

Typical exposure uncertainty: 4 log10-units
