# Results Analysis: LCIA QSAR Modeling Framework
**Date:** August 31, 2023 <br>
**Author:** Jacob Kvasnicka

In [1]:
import matplotlib
# matplotlib.use('Agg')  # avoids rendering figures
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from config_management import UnifiedConfiguration
from data_management import DataManager
from metrics_management import MetricsManager
from results_management import ResultsManager
from results_analysis import ResultsAnalyzer
from plotting.moe import MOE_CATEGORIES

# Use a forward slash in the relative path: it is accepted on Windows as well
# as POSIX systems, and it avoids the invalid '\c' escape sequence that a
# backslash-separated (non-raw) string literal would contain.
config_mapping_path = 'Input/configuration-mapping.json'
config = UnifiedConfiguration(config_mapping_path)

# Shared helper objects used by every section of this notebook.
data_manager = DataManager(config.data, config.path)
metrics_manager = MetricsManager(config.category_to_dict('metric'))
results_manager = ResultsManager(
    output_dir='Results',
    results_file_type=config.data.file_type
)
results_analyzer = ResultsAnalyzer(
    results_manager,
    data_manager,
    config.path.seem3_exposure_file
)

# Model settings for each effect category. The two entries differ only in
# "target_effect"; the order of the settings is significant because
# the model key is built from the values in insertion order.
key_for_effect = {
    effect: {
        "target_effect": effect,
        "features_source": "opera",
        "ld50_type": "predicted",
        "data_condition": "missing",
        "select_features": "true",
        "estimators": "RandomForestRegressor",
    }
    for effect in ('general', 'repro_dev')
}

def get_model_key(effect):
    '''
    Return the model settings for an effect category as a tuple.

    The tuple holds the values of `key_for_effect[effect]` in their
    insertion order.
    '''
    settings = key_for_effect[effect]
    return tuple(settings.values())

# Quantile levels (5th, 50th, 95th) shared by the summary tables below.
percentiles = [0.05, 0.5, 0.95]

## Dataset Characterization

In [2]:
def percents_missing(X, n=5):
    '''
    Compute the percentage of samples with missing data for each feature.

    Parameters
    ----------
    X : pandas.DataFrame
        Data with samples as rows and features as columns.
    n : int, default 5
        Number of features to return.

    Returns
    -------
    pandas.Series
        Percentage of missing values, rounded to the nearest whole percent,
        for the `n` features with the most missing values, sorted in
        descending order.
    '''
    # Scale to percent before rounding so that the result holds whole
    # numbers rather than float artifacts such as 56.99999999999999.
    percents = X.isna().mean() * 100
    return percents.sort_values(ascending=False).round().head(n)

In [3]:
# Path of the features file registered for the 'opera' source.
features_path = config.path.file_for_features_source['opera']

# Read the feature table, using the first column as the index.
X = pd.read_csv(features_path, index_col=0)

# Show the features with the highest percentages of missing values.
percents_missing(X)

BioDeg_HalfLife_pred    73.0
OH_pred                 42.0
CACO2_pred              35.0
Clint_pred              19.0
FUB_pred                13.0
dtype: float64

In [4]:
# Report the features with the most missing values in the data loaded for
# each effect category.
for effect, model_settings in key_for_effect.items():

    print(effect)

    X, y = results_analyzer.load_features_and_target(**model_settings)

    print(percents_missing(X))

general
BioDeg_HalfLife_pred    74.0
CACO2_pred              49.0
OH_pred                 44.0
KOA_pred                23.0
KM_pred                 23.0
dtype: float64
repro_dev
BioDeg_HalfLife_pred    75.0
CACO2_pred              49.0
OH_pred                 45.0
KM_pred                 25.0
Clint_pred              24.0
dtype: float64


## Model Evaluation & Important Features

In [24]:
def describe_result(effect, result_type):
    '''
    Return percentile summary statistics of a stored model result.

    Parameters
    ----------
    effect : str
        Key of `key_for_effect` identifying the model.
    result_type : str
        Name of the result passed to `results_analyzer.read_result`. If it
        contains 'importances', the summarized columns come from
        `config.plot.label_for_scoring`; otherwise they come from
        `config.plot.label_for_metric` plus 'mean_absolute_error'.

    Returns
    -------
    pandas.DataFrame
        One row per metric and one column per percentile in `percentiles`,
        rounded to two decimals. A 'gsd_squared' row is added when
        'root_mean_squared_error' is among the metrics.
    '''
    if 'importances' in result_type:
        metrics = list(config.plot.label_for_scoring)
    else:
        metrics = list(config.plot.label_for_metric)
        other_metric = 'mean_absolute_error'
        if other_metric not in metrics:
            metrics.append(other_metric)

    # Always load the result. Loading it only inside the RMSE branch left
    # `performances` undefined (NameError) whenever RMSE was not a metric.
    performances = results_analyzer.read_result(get_model_key(effect), result_type)

    if 'root_mean_squared_error' in metrics:
        metrics.append('gsd_squared')
        # Compute GSD squared.
        rmse = performances['root_mean_squared_error']
        gsd = 10**rmse  # in natural units
        performances['gsd_squared'] = gsd**2

    desc = performances.describe(percentiles=percentiles)[metrics].round(2)

    # Keep only the percentile rows (labels such as '5%', '50%', '95%').
    percentile_rows = [label for label in desc.index if '%' in label]
    return desc.loc[percentile_rows].T

In [25]:
# Percentile summary of the performance scores for the general-effect model.
describe_result(effect='general', result_type='performances')

Unnamed: 0_level_0,5%,50%,95%
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
root_mean_squared_error,0.64,0.69,0.75
median_absolute_error,0.36,0.4,0.44
r2_score,0.42,0.48,0.53
mean_absolute_error,0.48,0.52,0.55
gsd_squared,19.13,23.98,31.86


In [26]:
# Percentile summary of the performance scores for the repro_dev model.
describe_result(effect='repro_dev', result_type='performances')

Unnamed: 0_level_0,5%,50%,95%
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
root_mean_squared_error,0.53,0.58,0.72
median_absolute_error,0.28,0.31,0.34
r2_score,0.39,0.5,0.56
mean_absolute_error,0.39,0.42,0.45
gsd_squared,11.41,14.4,27.3


In [8]:
# Set of important features of the selected model, keyed by effect category.
important_features = {
    effect: set(results_analyzer.get_important_features(get_model_key(effect)))
    for effect in key_for_effect
}

In [9]:
# Features that are important for both effect categories.
important_features['general'] & important_features['repro_dev']

{'CATMoS_LD50_pred',
 'CombDipolPolariz',
 'ReadyBiodeg_pred_discrete',
 'WS_pred',
 'nbN_discrete'}

In [10]:
# Features that are important only for general effects.
important_features['general'] - important_features['repro_dev']

{'MP_pred', 'P_pred', 'TopoPolSurfAir', 'VP_pred', 'ndHBdDon_discrete'}

In [11]:
# Features that are important only for repro_dev effects.
important_features['repro_dev'] - important_features['general']

{'BCF_pred', 'FUB_pred', 'KM_pred', 'Koc_pred', 'Sp3Sp2HybRatio'}

## Sensitivity Analysis

## Model Application

### Cumulative POD distributions

In [12]:
# POD comparison data of the selected model, keyed by effect category.
pod_data = {}
for effect in key_for_effect:
    pod_data[effect] = results_analyzer.get_pod_comparison_data(get_model_key(effect))

# Inverse transform the PODs from log10
(
    pd.DataFrame(pod_data['general'])
    .quantile(percentiles)
    .rpow(10)
    .round(1)
)

Unnamed: 0,Regulatory,ToxValDB,QSAR
0.05,0.0,0.4,0.3
0.5,4.0,13.0,12.8
0.95,248.6,175.7,268.6


In [13]:
# Quantiles of the repro_dev PODs, raised as powers of 10.
(
    pd.DataFrame(pod_data['repro_dev'])
    .quantile(percentiles)
    .rpow(10)
    .round(1)
)

Unnamed: 0,Regulatory,ToxValDB,QSAR
0.05,0.1,1.9,1.1
0.5,4.3,46.7,49.0
0.95,87.9,246.8,331.8


### Margins of exposure with cumulative counts

In [27]:
# Get the upper bounds of the MOE concern categories
thres_for_concern = {
    k.lower().replace(' ', '_') : bounds[-1]
    for k, bounds in MOE_CATEGORIES.items()
}

for effect in key_for_effect:
    print(f'For {effect} effects:')

    results_for_percentile = results_analyzer.moe_and_prediction_intervals(get_model_key(effect))

    # Get the upper bound of exposure uncertainty
    ub_exposure_results = results_for_percentile['95th percentile (mg/kg/day)']
    # Get MOEs for the lower bound of the POD prediction interval
    moes = ub_exposure_results['lb']

    for concern, concern_threshold in thres_for_concern.items():
        within_threshold = moes <= concern_threshold
        # Select rows and column in a single .loc call (no chained indexing).
        cum_counts = ub_exposure_results.loc[within_threshold, 'cum_count']
        # Take the last selected value explicitly by position: `series[-1]`
        # relies on a deprecated positional fallback of Series.__getitem__.
        # Report zero when no chemical falls within the threshold instead
        # of raising on an empty selection.
        concern_count = cum_counts.iloc[-1] if not cum_counts.empty else 0
        print(f'\t{round(concern_count, -1)} chemicals {concern}')

For general effects:
	4600 chemicals potential_concern
	740 chemicals definite_concern
For repro_dev effects:
	2580 chemicals potential_concern
	280 chemicals definite_concern


In [15]:
exposure_df = results_analyzer.load_exposure_data()

# Row-wise difference between the 95th- and 5th-percentile exposure columns.
upper_exposure = exposure_df['95th percentile (mg/kg/day)']
lower_exposure = exposure_df['5th percentile (mg/kg/day)']
exposure_difference = upper_exposure - lower_exposure

# The median difference serves as the "typical" uncertainty.
typical_uncertainty = exposure_difference.median()

print(f'Typical exposure uncertainty: {round(typical_uncertainty)} log10-units')

Typical exposure uncertainty: 4 log10-units
