# Parse Raw Data: LCIA QSAR Project
**Author:** Jacob Kvasnicka <br>
**Date:** February 24, 2023

This notebook should, in principle, be run only once; any feature engineering can then be done separately.

In [6]:
from config_management import UnifiedConfiguration
from raw_processing.processor import RawDataProcessor

In [7]:
# Path to the JSON file handed to UnifiedConfiguration, relative to the
# working directory. A forward slash is used because the previous backslash
# formed the invalid escape sequence "\c" in a non-raw string literal and only
# worked as a path separator on Windows; "/" is accepted on Windows and POSIX.
config_mapping_path = 'Input/configuration-mapping.json'
config = UnifiedConfiguration(config_mapping_path)

# The processor is built from three sections of the unified configuration and
# is reused by every cell below.
raw_processor = RawDataProcessor(config.raw_data, config.data, config.path)

## Get chemical identifiers

In [3]:
# Retrieve the chemical identifiers through the shared raw-data processor.
# NOTE(review): judging by the method name, these are the chemicals that have
# labels (target values) — confirm against RawDataProcessor.
identifiers = raw_processor.get_labeled_identifiers()

# Bare last expression so the notebook renders the result.
identifiers

0       DTXSID5020281
1       DTXSID8020961
2       DTXSID0021834
3       DTXSID3032622
4       DTXSID2044347
            ...      
8313    DTXSID5057882
8314    DTXSID5057884
8315    DTXSID0057885
8316    DTXSID1057905
8317    DTXSID6057906
Name: dtxsid, Length: 6598, dtype: object

In [4]:
# Retrieve the SEEM3 identifiers through the shared raw-data processor.
# NOTE(review): presumably the full set of chemicals covered by the SEEM3
# dataset — confirm against RawDataProcessor.
seem3_identifiers = raw_processor.get_seem3_identifiers()

# Bare last expression so the notebook renders the result.
seem3_identifiers

0          DTXSID9047623
1          DTXSID0052700
2         DTXSID00583560
3         DTXSID00859050
4         DTXSID00860464
               ...      
479861     DTXSID3038307
479862     DTXSID8038300
479863     DTXSID8074158
479864     DTXSID2032180
479865     DTXSID7020895
Name: DTXSID, Length: 479866, dtype: object

In [5]:
# Process the raw dataset registered under the key 'dsstox_sdf_data'.
# NOTE(review): in the saved run this cell logged molecule-sanitization
# warnings/errors for several minutes and was stopped with a
# KeyboardInterrupt, so expect it to be slow and noisy.
dsstox_sdf_data = raw_processor.process_from_raw('dsstox_sdf_data')

# Bare last expression so the notebook renders the result.
dsstox_sdf_data

[10:02:40] Unusual charge on atom 12 number of radical electrons set to zero


[10:05:01] Unusual charge on atom 0 number of radical electrons set to zero
[10:05:01] Unusual charge on atom 0 number of radical electrons set to zero
[10:05:05] Explicit valence for atom # 1 Sb, 9, is greater than permitted
[10:05:05] ERROR: Could not sanitize molecule ending on line 1943790
[10:05:05] ERROR: Explicit valence for atom # 1 Sb, 9, is greater than permitted
[10:05:06] ERROR: SGroup SAP line too short: 'M  SAP   1  1  51  14 1' on line 2090317
[10:05:06] ERROR: moving to the beginning of the next molecule
[10:05:06] Unusual charge on atom 1 number of radical electrons set to zero
[10:05:06] Unusual charge on atom 2 number of radical electrons set to zero
[10:05:07] atom 6 has specified valence (3) smaller than the drawn valence 18.
[10:05:08] Explicit valence for atom # 0 Sb, 9, is greater than permitted
[10:05:08] ERROR: Could not sanitize molecule ending on line 2376873
[10:05:08] ERROR: Explicit valence for atom # 0 Sb, 9, is greater than permitted
[10:05:08] atom 0 h

[10:05:08] atom 0 has specified valence (3) smaller than the drawn valence 18.
[10:05:12] Unusual charge on atom 2 number of radical electrons set to zero
[10:05:12] Unusual charge on atom 3 number of radical electrons set to zero
[10:05:12] Unusual charge on atom 1 number of radical electrons set to zero
[10:05:12] atom 1 has specified valence (3) smaller than the drawn valence 18.
[10:05:12] Unusual charge on atom 3 number of radical electrons set to zero
[10:05:16] Explicit valence for atom # 0 Sb, 9, is greater than permitted
[10:05:16] ERROR: Could not sanitize molecule ending on line 3693980
[10:05:16] ERROR: Explicit valence for atom # 0 Sb, 9, is greater than permitted
[10:05:16] Explicit valence for atom # 1 Sb, 9, is greater than permitted
[10:05:16] ERROR: Could not sanitize molecule ending on line 3732955
[10:05:16] ERROR: Explicit valence for atom # 1 Sb, 9, is greater than permitted


KeyboardInterrupt: 

## Target variable: Surrogate POD [mg/(kg-d)]

In [None]:
# Process the raw dataset registered under the key 'surrogate_pods'
# (the target variable named in the section heading).
surrogate_pods = raw_processor.process_from_raw('surrogate_pods')

# Bare last expression so the notebook renders the result.
surrogate_pods

## Regulatory PODs (fully adjusted to human equivalent dose) [mg/(kg-d)]

See the tab “Data for Figure 5”: the CASRNs are in columns A (general non-cancer effects) and G (reproductive/developmental effects), and the corresponding regulatory PODs (fully adjusted to human equivalent dose) are in columns F and L, respectively.

In [None]:
%%time

# The %%time cell magic must stay on the first line of the cell; it reports
# how long the processing below takes.
# Process the raw dataset registered under the key 'regulatory_pods'.
reg_pods = raw_processor.process_from_raw('regulatory_pods')

# Bare last expression so the notebook renders the result.
reg_pods

## Experimental LD50 values

In [None]:
%%time 

# The %%time cell magic must stay on the first line of the cell; it reports
# how long the processing below takes.
# Process the raw dataset registered under the key 'experimental_ld50s'.
ld50s = raw_processor.process_from_raw('experimental_ld50s')

# Bare last expression so the notebook renders the result.
ld50s

## CompTox features: OPERA + TEST predictions

In [8]:
%%time

# The %%time cell magic must stay on the first line of the cell; it reports
# how long the processing below takes.
# Process the raw dataset registered under the key 'comptox_features'.
# In the saved output the result is a table indexed by DTXSID whose columns
# carry *_TEST_PRED and *_OPERA_PRED suffixes.
comptox_features = raw_processor.process_from_raw('comptox_features')

# Bare last expression so the notebook renders the result.
comptox_features

CPU times: total: 141 ms
Wall time: 154 ms


Unnamed: 0_level_0,AVERAGE_MASS,48HR_DAPHNIA_LC50_MOL/L_TEST_PRED,DENSITY_G/CM^3_TEST_PRED,DEVTOX_TEST_PRED,96HR_FATHEAD_MINNOW_MOL/L_TEST_PRED,FLASH_POINT_DEGC_TEST_PRED,AMES_MUTAGENICITY_TEST_PRED,ORAL_RAT_LD50_MOL/KG_TEST_PRED,SURFACE_TENSION_DYN/CM_TEST_PRED,THERMAL_CONDUCTIVITY_MW/(M*K)_TEST_PRED,...,BIODEGRADATION_HALF_LIFE_DAYS_DAYS_OPERA_PRED,BOILING_POINT_DEGC_OPERA_PRED,HENRYS_LAW_ATM-M3/MOLE_OPERA_PRED,OPERA_KM_DAYS_OPERA_PRED,OCTANOL_AIR_PARTITION_COEFF_LOGKOA_OPERA_PRED,SOIL_ADSORPTION_COEFFICIENT_KOC_L/KG_OPERA_PRED,OCTANOL_WATER_PARTITION_LOGP_OPERA_PRED,MELTING_POINT_DEGC_OPERA_PRED,VAPOR_PRESSURE_MMHG_OPERA_PRED,WATER_SOLUBILITY_MOL/L_OPERA_PRED
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DTXSID5020281,157.550,1.081430e-04,1.372,0.323,5.714790e-05,109.227,0.966,0.007079,,137.950,...,4.84081,242.050,4.947790e-06,0.332365,2.124320e+04,507.6930,246.342872,82.9942,2.203470e-02,1.317610e-03
DTXSID8020961,138.126,6.486340e-05,1.309,0.250,4.385310e-04,141.229,0.642,0.009141,,162.043,...,6.64833,331.672,1.265030e-09,0.306715,3.734737e+06,75.9962,24.436556,145.5400,3.265100e-06,4.815930e-03
DTXSID0021834,139.110,5.395110e-05,1.376,0.281,1.472310e-04,117.588,0.490,0.005140,,152.108,...,4.09570,278.910,4.207650e-10,0.251436,1.297807e+06,233.1200,81.678923,113.5640,9.970590e-05,9.679290e-02
DTXSID3032622,99.089,,1.276,0.639,,36.844,0.372,0.004457,,155.108,...,4.27009,215.029,9.095360e-08,0.288388,5.393492e+04,10.0605,2.852542,86.5110,1.396390e-03,8.066960e-01
DTXSID2044347,150.177,1.312200e-04,1.072,0.621,2.904020e-04,98.358,0.305,0.004887,36.672,139.705,...,4.91679,248.981,9.683370e-08,0.279812,1.183477e+05,98.5113,55.324819,37.4772,6.500820e-03,1.201570e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DTXSID6057879,345.460,1.291220e-05,1.232,0.860,9.183330e-07,271.676,0.563,0.001766,,,...,6.00889,417.425,1.882080e-07,1.137960,3.325830e+10,881.4960,5365.126916,181.8440,1.564390e-08,4.541010e-07
DTXSID5057880,353.860,7.144960e-06,1.256,0.846,1.023290e-06,261.844,0.204,,,,...,3.35362,415.812,2.941970e-07,1.655430,4.403520e+10,961.6230,892.585855,125.4470,6.420140e-08,5.643740e-07
DTXSID5057882,300.150,2.065380e-07,1.363,0.679,7.211070e-07,187.442,0.313,,,138.484,...,3.54031,337.893,2.333810e-07,3.627750,4.228633e+10,8156.8700,7006.481829,161.2890,2.468280e-07,1.982760e-06
DTXSID5057884,409.310,,1.434,0.933,,297.064,0.200,,,,...,6.80176,375.225,1.494930e-10,0.148214,3.513581e+09,2186.0300,808.946872,181.9450,1.335850e-09,3.015700e-05


## OPERA 2.9 features

In [9]:
# For the key 'opera_features' the processor returns two objects, unpacked
# here as a pair.
# NOTE(review): "AD" presumably stands for applicability domain, i.e. flags
# marking whether each prediction falls inside the model's domain — confirm.
AD_flags, opera_features = raw_processor.process_from_raw('opera_features')

# Only the feature table is displayed; AD_flags is kept in the namespace.
opera_features

Unnamed: 0_level_0,CERAPP_Ago_pred_discrete,CERAPP_Anta_pred_discrete,CERAPP_Bind_pred_discrete,CoMPARA_Ago_pred_discrete,CoMPARA_Anta_pred_discrete,CoMPARA_Bind_pred_discrete,CATMoS_LD50_pred,FUB_pred,Clint_pred,CACO2_pred,...,Sp3Sp2HybRatio,nbRotBd_discrete,nbHBdAcc_discrete,ndHBdDon_discrete,nbLipinskiFailures_discrete,TopoPolSurfAir,MolarRefract,CombDipolPolariz,VP_pred,WS_pred
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DTXSID5020281,0.0,0.0,0.0,0.0,0.0,0.0,236.0,0.20,399.37,,...,0.000000,1.0,0.0,0.0,0.0,43.14,9.5056,0.900,2.187762e-02,1.318257e-03
DTXSID8020961,0.0,0.0,0.0,0.0,0.0,0.0,787.0,0.33,23.59,,...,0.000000,1.0,1.0,1.0,0.0,69.16,8.1121,1.201,3.311311e-06,4.786301e-03
DTXSID0021834,0.0,0.0,0.0,0.0,0.0,0.0,228.0,0.15,25.94,,...,0.000000,1.0,0.0,1.0,0.0,63.37,6.4546,1.065,1.000000e-04,9.772372e-02
DTXSID3032622,0.0,0.0,0.0,0.0,0.0,0.0,2446.0,0.93,4.35,,...,0.250000,0.0,1.0,1.0,0.0,38.33,24.3643,0.909,1.380384e-03,8.128305e-01
DTXSID2044347,0.0,0.0,0.0,0.0,0.0,0.0,2606.0,0.15,0.00,-4.92,...,0.222222,2.0,1.0,0.0,0.0,26.30,18.0862,1.185,6.606934e-03,1.412538e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DTXSID10855048,0.0,0.0,0.0,0.0,1.0,1.0,2702.0,,,,...,0.000000,2.0,0.0,0.0,1.0,9.23,44.6146,1.900,6.760830e-08,1.445440e-09
DTXSID5047902,0.0,0.0,0.0,0.0,0.0,0.0,1587.0,,0.67,,...,1.000000,1.0,2.0,2.0,0.0,46.25,16.2111,0.530,5.011872e-02,1.621810e+01
DTXSID30785570,0.0,0.0,0.0,0.0,1.0,1.0,2702.0,,,,...,0.000000,2.0,0.0,0.0,1.0,9.23,44.6146,1.948,8.317638e-08,7.943282e-10
DTXSID5061586,0.0,0.0,0.0,0.0,0.0,0.0,14316.0,,,,...,0.970588,32.0,2.0,0.0,2.0,26.30,111.7869,0.558,4.265795e-07,6.606934e-08
