# Parse Raw Data: LCIA QSAR Project
**Author:** Jacob Kvasnicka <br>
**Date:** February 24, 2023

This module should, in principle, be run once; any feature engineering can then be done separately.

In [1]:
import pandas as pd
import numpy as np
from os import path 
import parse
from config_management import UnifiedConfiguration

import comptox
import opera
import features

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Name of the chemical-identifier column used as the index by the cells below.
index_col = 'DTXSID'

# Use a forward slash: the previous 'Input\c...' literal relied on the invalid
# escape sequence '\c' (a warning on modern Python) and only resolved on
# Windows. Forward slashes work on every platform.
config_mapping_path = 'Input/configuration-mapping.json'
config = UnifiedConfiguration(config_mapping_path)

## Chemical identifiers from CompTox

In [2]:
# Read the CompTox identifier table, indexed by the `index_col` column.
chem_identifiers = pd.read_csv(
    config.path.comptox_identifiers_file, index_col=index_col)

chem_identifiers

Unnamed: 0_level_0,INPUT,FOUND_BY,PREFERRED_NAME,DTXCID,CASRN,INCHIKEY,IUPAC_NAME,SMILES,INCHI_STRING,QSAR_READY_SMILES,MOLECULAR_FORMULA
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
DTXSID5020281,DTXSID5020281,DSSTox_Substance_Id,1-Chloro-4-nitrobenzene,DTXCID10281,100-00-5,CZGCEKJOLUNIFY-UHFFFAOYSA-N,1-Chloro-4-nitrobenzene,[O-][N+](=O)C1=CC=C(Cl)C=C1,InChI=1S/C6H4ClNO2/c7-5-1-3-6(4-2-5)8(9)10/h1-...,[O-][N+](=O)C1=CC=C(Cl)C=C1,C6H4ClNO2
DTXSID8020961,DTXSID8020961,DSSTox_Substance_Id,4-Nitrobenzenamine,DTXCID40961,100-01-6,TYMLOMAKGOJONV-UHFFFAOYSA-N,4-Nitroaniline,NC1=CC=C(C=C1)[N+]([O-])=O,InChI=1S/C6H6N2O2/c7-5-1-3-6(4-2-5)8(9)10/h1-4...,NC1=CC=C(C=C1)[N+]([O-])=O,C6H6N2O2
DTXSID0021834,DTXSID0021834,DSSTox_Substance_Id,4-Nitrophenol,DTXCID201834,100-02-7,BTJIUGUIPKRLHP-UHFFFAOYSA-N,4-Nitrophenol,OC1=CC=C(C=C1)[N+]([O-])=O,"InChI=1S/C6H5NO3/c8-6-3-1-5(2-4-6)7(9)10/h1-4,...",OC1=CC=C(C=C1)[N+]([O-])=O,C6H5NO3
DTXSID3032622,DTXSID3032622,DSSTox_Substance_Id,Hymexazol,DTXCID1012622,10004-44-1,KGVPNLBXJKTABS-UHFFFAOYSA-N,"5-Methyl-1,2-oxazol-3(2H)-one",CC1=CC(=O)NO1,"InChI=1S/C4H5NO2/c1-3-2-4(6)5-7-3/h2H,1H3,(H,5...",CC1=CC(=O)NO1,C4H5NO2
DTXSID2044347,DTXSID2044347,DSSTox_Substance_Id,4'-Methoxyacetophenone,DTXCID0024347,100-06-1,NTPLXRHDUXRPNE-UHFFFAOYSA-N,1-(4-Methoxyphenyl)ethan-1-one,COC1=CC=C(C=C1)C(C)=O,InChI=1S/C9H10O2/c1-7(10)8-3-5-9(11-2)6-4-8/h3...,COC1=CC=C(C=C1)C(C)=O,C9H10O2
...,...,...,...,...,...,...,...,...,...,...,...
DTXSID5057882,DTXSID5057882,DSSTox_Substance_Id,CP-939689,DTXCID2031669,NOCAS_57882,PYUQQPZQUSFCHK-UHFFFAOYSA-N,"1-[2-(3,4-Dichlorophenoxy)-5-fluorophenyl]etha...",CC(N)C1=CC(F)=CC=C1OC1=CC=C(Cl)C(Cl)=C1,InChI=1/C14H12Cl2FNO/c1-8(18)11-6-9(17)2-5-14(...,CC(N)C1=CC(F)=CC=C1OC1=CC=C(Cl)C(Cl)=C1,C14H12Cl2FNO
DTXSID5057884,DTXSID5057884,DSSTox_Substance_Id,CJ-013974,DTXCID6031671,NOCAS_57884,LMPBXMBUTBQPJJ-QFBILLFUSA-N,"2-(3,4-Dichlorophenyl)-N-hydroxy-N-{(1S)-2-[(3...",O[C@H]1CCN(C[C@@H](N(O)C(=O)CC2=CC(Cl)=C(Cl)C=...,InChI=1S/C20H22Cl2N2O3/c21-17-7-6-14(10-18(17)...,O[C@H]1CCN(C[C@@H](N(O)C(=O)CC2=CC(Cl)=C(Cl)C=...,C20H22Cl2N2O3
DTXSID0057885,DTXSID0057885,DSSTox_Substance_Id,CP-395919,DTXCID1031672,NOCAS_57885,DJLMIXIBPMWLNC-UHFFFAOYSA-N,"N-[2-(2-Acetamidoethyl)-1,2,3,4-tetrahydroisoq...",CC(=O)NCCN1CCC2=CC(NC(=O)C3=CC=CC=C3C3=CC=C(C=...,InChI=1S/C27H26F3N3O2/c1-18(34)31-13-15-33-14-...,CC(=O)NCCN1CCC2=CC(NC(=O)C3=CC=CC=C3C3=CC=C(C=...,C27H26F3N3O2
DTXSID1057905,DTXSID1057905,DSSTox_Substance_Id,MK 0493,,1021945-00-5,,,,,,


### Define chemicals to exclude from QSAR

In [3]:
# Collect the chemicals to leave out of QSAR modeling, based on the
# development identifier and structure files named in the configuration.
chemicals_to_exclude = opera.chemicals_to_exclude_from_qsar(
    config.path.chemical_id_dev_file,
    config.path.chemical_structures_dev_file,
)

# Report how many chemicals were flagged.
print(len(chemicals_to_exclude), 'chemicals to exclude from QSAR modeling')

1856 chemicals to exclude from QSAR modeling


## Target variable: Surrogate POD [mg/(kg-d)]

In [4]:
# Short names to use in place of the original effect-category keys.
effect_mapper = {
    'non-reproductive/developmental effects': 'general',
    'reproductive/developmental effects': 'repro_dev',
}

# Worksheet and toxicity metric to read from the raw workbook.
sheet_name, tox_metric = 'ORAL', 'POD [mg/kg-d]'

# NOTE(review): log10=True presumably returns log10-transformed PODs, and
# write_path presumably saves the parsed table -- confirm in `parse`.
surrogate_pods = parse.surrogate_toxicity_values_from_excel(
    config.path.raw_surrogate_pods_file, sheet_name, tox_metric,
    index_col.lower(),
    log10=True, effect_mapper=effect_mapper,
    write_path=config.path.surrogate_pods_file,
)

surrogate_pods

Unnamed: 0_level_0,general,repro_dev
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1
DTXSID00100074,,2.524183
DTXSID00100498,1.980348,
DTXSID001005033,,2.124155
DTXSID001006300,1.706121,
DTXSID00100756,,2.411296
...,...,...
DTXSID9096313,1.240166,
DTXSID90965533,,1.899360
DTXSID9098147,1.968909,2.411296
DTXSID9098220,,2.286296


## Regulatory PODs (fully adjusted to human equivalent dose) [mg/(kg-d)]

See the tab “Data for Figure 5” – the CASRN are in columns A (general non-cancer) and G (repro/dev effects), and the regulatory PODs (fully adjusted to human equivalent dose) are in columns F and L.

In [5]:
%%time

# Integer column positions of the (CASRN, POD) pair for each effect type.
ilocs_for_effect = {'general': [0, 5], 'repro_dev': [6, 11]}

# Build a CASRN -> `index_col` lookup so the parsed data can be re-indexed
# by `index_col` instead of CASRN.
identifiers_by_casrn = chem_identifiers.reset_index().set_index('CASRN')
chem_id_for_casrn = identifiers_by_casrn[index_col].to_dict()

reg_pods = parse.regulatory_toxicity_values_from_csv(
    config.path.raw_regulatory_pods_file, ilocs_for_effect,
    chem_id_for_casrn=chem_id_for_casrn, new_chem_id=index_col,
    write_path=config.path.regulatory_pods_file,
)

reg_pods

CPU times: total: 46.9 ms
Wall time: 32.9 ms


Unnamed: 0_level_0,general,repro_dev
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1
DTXSID5020281,-0.891,
DTXSID8020961,0.286,
DTXSID6026080,2.193,
DTXSID0021836,-0.534,
DTXSID3020596,2.028,
...,...,...
DTXSID9021762,,2.682
DTXSID0039229,,1.438
DTXSID5021386,,0.191
DTXSID3020207,,-0.763


## Oral equivalent doses for active ToxCast assays [mg/(kg-d)]

Data prepared by En-Hsuan Lu on May 28, 2023.

In [6]:
%%time

# Columns of the raw file that hold the oral equivalent doses to keep.
oed_columns = ['tox_httk.50', 'tox_httk.95']

# NOTE(review): log10=True presumably returns log10-transformed doses --
# confirm in `parse`.
oeds = parse.toxcast_expocast_from_csv(
    config.path.raw_toxcast_oeds_file, index_col,
    data_columns=oed_columns, log10=True,
    write_path=config.path.toxcast_oeds_file,
)

oeds

CPU times: total: 0 ns
Wall time: 15 ms


Unnamed: 0_level_0,tox_httk_50,tox_httk_95
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1
DTXSID9034650,-4.465181,-5.456664
DTXSID6024177,-0.965089,-1.341372
DTXSID0020606,0.694738,0.187224
DTXSID7032555,-1.380711,-2.126687
DTXSID5034270,-0.389368,-0.780627
...,...,...
DTXSID3020964,1.435501,0.856183
DTXSID0034930,-1.244328,-1.659792
DTXSID4032615,-1.797070,-2.601392
DTXSID5021831,-0.871298,-1.521485


## Experimental LD50 values

In [7]:
%%time 

# Only the median LD50 column is requested.
ld50_columns = ['median_LD50']

# NOTE(review): the original author's note says an inverse-log10
# transformation restores the original scale; no such argument is passed
# here, so this presumably happens inside the parser -- confirm in `parse`.
ld50s = parse.experimental_ld50s_from_excel(
    config.path.raw_ld50_experimental_file, chem_identifiers, index_col,
    ld50_columns=ld50_columns,
    write_path=config.path.ld50_experimental_file,
)

ld50s

CPU times: total: 797 ms
Wall time: 859 ms


Unnamed: 0_level_0,median_LD50
DTXSID,Unnamed: 1_level_1
DTXSID5020281,138.120754
DTXSID8020961,126.969502
DTXSID0021834,69.543651
DTXSID2044347,420.473799
DTXSID9059204,518.581177
...,...
DTXSID60469235,73.338453
DTXSID2033447,134.034040
DTXSID3041794,366.640543
DTXSID6052667,345.720785


## CompTox features: OPERA + TEST predictions

In [8]:
%%time

# Columns of the raw CompTox export that are not used as features.
columns_to_exclude = [
    'INPUT', 'FOUND_BY', 'PREFERRED_NAME',
    'MONOISOTOPIC_MASS',
    'OPERA_PKAA_OPERA_PRED', 'OPERA_PKAB_OPERA_PRED',
]

# NOTE(review): log10_pat='LOG' presumably selects log10-scaled columns by
# name for back-transformation -- confirm in `comptox`.
comptox_features = comptox.opera_test_predictions_from_csv(
    config.path.raw_comptox_features_file, index_col,
    chemicals_to_exclude=chemicals_to_exclude,
    columns_to_exclude=columns_to_exclude,
    log10_pat='LOG',
    write_path=config.path.file_for_features_source['comptox'],
)

comptox_features

CPU times: total: 141 ms
Wall time: 136 ms


Unnamed: 0_level_0,AVERAGE_MASS,48HR_DAPHNIA_LC50_MOL/L_TEST_PRED,DENSITY_G/CM^3_TEST_PRED,DEVTOX_TEST_PRED,96HR_FATHEAD_MINNOW_MOL/L_TEST_PRED,FLASH_POINT_DEGC_TEST_PRED,AMES_MUTAGENICITY_TEST_PRED,ORAL_RAT_LD50_MOL/KG_TEST_PRED,SURFACE_TENSION_DYN/CM_TEST_PRED,THERMAL_CONDUCTIVITY_MW/(M*K)_TEST_PRED,TETRAHYMENA_PYRIFORMIS_IGC50_MOL/L_TEST_PRED,VISCOSITY_CP_CP_TEST_PRED,ATMOSPHERIC_HYDROXYLATION_RATE_(AOH)_CM3/MOLECULE*SEC_OPERA_PRED,BIOCONCENTRATION_FACTOR_OPERA_PRED,BIODEGRADATION_HALF_LIFE_DAYS_DAYS_OPERA_PRED,BOILING_POINT_DEGC_OPERA_PRED,HENRYS_LAW_ATM-M3/MOLE_OPERA_PRED,OPERA_KM_DAYS_OPERA_PRED,OCTANOL_AIR_PARTITION_COEFF_KOA_OPERA_PRED,SOIL_ADSORPTION_COEFFICIENT_KOC_L/KG_OPERA_PRED,OCTANOL_WATER_PARTITION_P_OPERA_PRED,MELTING_POINT_DEGC_OPERA_PRED,VAPOR_PRESSURE_MMHG_OPERA_PRED,WATER_SOLUBILITY_MOL/L_OPERA_PRED
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
DTXSID5020281,157.550,1.081430e-04,1.372,0.323,5.714790e-05,109.227,0.966,0.007079,,137.950,0.000179,2.46604,7.429490e-13,20.11010,4.84081,242.050,4.947790e-06,0.332365,2.124320e+04,507.6930,246.342872,82.9942,2.203470e-02,1.317610e-03
DTXSID8020961,138.126,6.486340e-05,1.309,0.250,4.385310e-04,141.229,0.642,0.009141,,162.043,0.000520,4.89779,1.480110e-12,3.47311,6.64833,331.672,1.265030e-09,0.306715,3.734737e+06,75.9962,24.436556,145.5400,3.265100e-06,4.815930e-03
DTXSID0021834,139.110,5.395110e-05,1.376,0.281,1.472310e-04,117.588,0.490,0.005140,,152.108,0.000184,7.97995,1.155450e-12,11.35610,4.09570,278.910,4.207650e-10,0.251436,1.297807e+06,233.1200,81.678923,113.5640,9.970590e-05,9.679290e-02
DTXSID3032622,99.089,,1.276,0.639,,36.844,0.372,0.004457,,155.108,,,2.727420e-11,1.46093,4.27009,215.029,9.095360e-08,0.288388,5.393492e+04,10.0605,2.852542,86.5110,1.396390e-03,8.066960e-01
DTXSID2044347,150.177,1.312200e-04,1.072,0.621,2.904020e-04,98.358,0.305,0.004887,36.672,139.705,0.000793,2.58821,7.147800e-12,7.55058,4.91679,248.981,9.683370e-08,0.279812,1.183477e+05,98.5113,55.324819,37.4772,6.500820e-03,1.201570e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DTXSID6057879,345.460,1.291220e-05,1.232,0.860,9.183330e-07,271.676,0.563,0.001766,,,,,2.014600e-11,20.26340,6.00889,417.425,1.882080e-07,1.137960,3.325830e+10,881.4960,5365.126916,181.8440,1.564390e-08,4.541010e-07
DTXSID5057880,353.860,7.144960e-06,1.256,0.846,1.023290e-06,261.844,0.204,,,,,,1.382820e-11,96.57770,3.35362,415.812,2.941970e-07,1.655430,4.403520e+10,961.6230,892.585855,125.4470,6.420140e-08,5.643740e-07
DTXSID5057882,300.150,2.065380e-07,1.363,0.679,7.211070e-07,187.442,0.313,,,138.484,0.000009,,1.594230e-11,225.96200,3.54031,337.893,2.333810e-07,3.627750,4.228633e+10,8156.8700,7006.481829,161.2890,2.468280e-07,1.982760e-06
DTXSID5057884,409.310,,1.434,0.933,,297.064,0.200,,,,,,2.432130e-11,82.99380,6.80176,375.225,1.494930e-10,0.148214,3.513581e+09,2186.0300,808.946872,181.9450,1.335850e-09,3.015700e-05


## OPERA 2.9 features

### Training chemicals

In [9]:
%%time

# TODO: Move to input_config
def opera_file_namer(name):
    """Return the OPERA 2.9 CSV file name for `name`.

    Parameters
    ----------
    name : str
        Name to embed in the file name.

    Returns
    -------
    str
        'OPERA2.9_' + name + '.csv'
    """
    return 'OPERA2.9_' + name + '.csv'


# Pattern handed to the OPERA parsers as `log10_pat`.
opera_log10_pat = 'Log'

# Parse the OPERA output for the training chemicals. The call yields the
# applicability-domain flags first and the feature table second.
(AD_flags_train, opera_features_train) = (
    opera.parse_data_with_applicability_domains(
        config.path.raw_opera_features_dir,
        config.path.opera_mapper_file,
        opera_file_namer,
        index_name=index_col,
        discrete_columns=config.data.discrete_columns_for_source['opera'],
        discrete_suffix=config.data.discrete_column_suffix,
        log10_pat=opera_log10_pat,
    )
)

opera_features_train

CPU times: total: 1.88 s
Wall time: 2.09 s


Unnamed: 0_level_0,CERAPP_Ago_pred_discrete,CERAPP_Anta_pred_discrete,CERAPP_Bind_pred_discrete,CoMPARA_Ago_pred_discrete,CoMPARA_Anta_pred_discrete,CoMPARA_Bind_pred_discrete,CATMoS_LD50_pred,FUB_pred,Clint_pred,CACO2_pred,OH_pred,BCF_pred,BioDeg_HalfLife_pred,ReadyBiodeg_pred_discrete,HL_pred,KM_pred,KOA_pred,Koc_pred,P_pred,MP_pred,MolWeight,nbAtoms_discrete,nbHeavyAtoms_discrete,nbC_discrete,nbO_discrete,nbN_discrete,nbAromAtom_discrete,nbRing_discrete,nbHeteroRing_discrete,Sp3Sp2HybRatio,nbRotBd_discrete,nbHBdAcc_discrete,ndHBdDon_discrete,nbLipinskiFailures_discrete,TopoPolSurfAir,MolarRefract,CombDipolPolariz,VP_pred,WS_pred
DTXSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
DTXSID5020281,0.0,0.0,0.0,0.0,0.0,0.0,236.0,0.20,399.37,,7.413102e-13,19.952623,,0.0,4.897788e-06,0.331131,2.137962e+04,512.861384,245.470892,83.0,156.993056,14,10,6,2,1,6,1,0,0.000000,1,0,0,0,43.14,9.5056,0.900,2.187762e-02,1.318257e-03
DTXSID8020961,0.0,0.0,0.0,0.0,0.0,0.0,787.0,0.33,23.59,,1.479108e-12,3.467369,,0.0,1.258925e-09,0.309030,3.715352e+06,75.857758,24.547089,146.0,138.042927,16,10,6,2,2,6,1,0,0.000000,1,1,1,0,69.16,8.1121,1.201,3.311311e-06,4.786301e-03
DTXSID0021834,0.0,0.0,0.0,0.0,0.0,0.0,228.0,0.15,25.94,,1.148154e-12,11.481536,,0.0,4.265795e-10,0.251189,1.288250e+06,234.422882,81.283052,113.0,139.026943,15,10,6,3,1,6,1,0,0.000000,1,0,1,0,63.37,6.4546,1.065,1.000000e-04,9.772372e-02
DTXSID3032622,0.0,0.0,0.0,0.0,0.0,0.0,2446.0,0.93,4.35,,2.754229e-11,1.445440,,1.0,1.380384e-09,0.288403,5.370318e+04,10.000000,2.818383,87.0,99.032028,12,7,4,2,1,0,1,1,0.250000,0,1,1,0,38.33,24.3643,0.909,1.380384e-03,8.128305e-01
DTXSID2044347,0.0,0.0,0.0,0.0,0.0,0.0,2606.0,0.15,0.00,-4.92,7.079458e-12,7.585776,4.897788,1.0,6.456542e-07,0.281838,1.174898e+05,97.723722,56.234133,18.0,150.068080,21,11,9,2,0,6,1,0,0.222222,2,1,0,0,26.30,18.0862,1.185,6.606934e-03,1.412538e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DTXSID6057879,0.0,0.0,0.0,0.0,1.0,1.0,2410.0,0.17,25.37,-4.62,,20.417379,,0.0,1.621810e-08,1.148154,3.311311e+10,891.250938,5248.074602,213.0,345.139865,47,24,19,3,1,12,3,0,0.368421,5,3,1,0,63.78,45.2203,2.828,7.079458e-09,7.943282e-07
DTXSID5057880,0.0,0.0,0.0,0.0,1.0,1.0,2659.0,0.14,20.48,-4.67,,95.499259,,0.0,1.230269e-08,1.659587,4.365158e+10,954.992586,891.250938,132.0,353.085242,43,23,17,3,1,12,2,0,0.294118,5,3,1,0,63.78,46.2861,2.811,5.754399e-10,1.148154e-06
DTXSID5057882,0.0,0.0,0.0,0.0,1.0,1.0,1138.0,0.08,12.07,-5.33,1.584893e-11,223.872114,,0.0,3.467369e-10,3.630781,4.265795e+10,8128.305162,13182.567386,87.0,299.027998,31,19,14,1,1,12,2,0,0.142857,3,1,1,0,35.25,27.2415,1.587,9.120108e-05,1.584893e-06
DTXSID5057884,0.0,0.0,0.0,0.0,1.0,1.0,1296.0,0.03,5.93,-5.26,,83.176377,,0.0,1.548817e-09,0.147911,,2187.761624,354.813389,229.0,408.100748,49,27,20,3,2,12,3,1,0.350000,7,3,2,0,64.01,55.0931,2.670,2.754229e-08,4.265795e-05


### Application chemicals

In [10]:
%%time 

# Names of the structures file and the log file passed to the batch
# processor. The original author was unsure where these settings belong.
structures_file_name, log_file_name = (
    "chemical-identifiers.smi", "log-batch-run.txt")

# Process every application batch with the same mapper, file namer, and
# column settings used for the training chemicals.
(AD_flags_app, opera_features_app) = opera.process_all_batches(
    config.path.opera_application_batches_dir,
    config.path.opera_mapper_file,
    opera_file_namer,
    structures_file_name, log_file_name,
    index_name=index_col,
    discrete_columns=config.data.discrete_columns_for_source['opera'],
    discrete_suffix=config.data.discrete_column_suffix,
    log10_pat=opera_log10_pat,
)

CPU times: total: 1min 5s
Wall time: 1min 34s


In [11]:
# NOTE(review): hard-coded path; presumably the structures file listing every
# application chemical submitted to OPERA -- confirm, and consider moving it
# into the configuration.
all_chem_ids_file = 'Input/Raw/OPERA/Input/Application/chemical-identifiers.smi'    
all_chem_ids = opera.extract_dtxsid_from_structures_file(all_chem_ids_file)

# Displayed as the cell output: the number of rows in `opera_features_app` as
# a whole-number percentage of the extracted identifiers, plus the raw count.
f'{round(len(opera_features_app)/len(all_chem_ids)*100)}% ({len(opera_features_app)}) of all chemicals processed'

'94% (450334) of all chemicals processed'

### Merge all chemicals

In [12]:
# Chemicals present in both the training and application feature tables are
# removed from the application-set flags and features.
shared_index = opera_features_train.index.intersection(
    opera_features_app.index)
chem_intersection = list(shared_index)

AD_flags_app = AD_flags_app.drop(index=chem_intersection)
opera_features_app = opera_features_app.drop(index=chem_intersection)

In [13]:
# Destinations for the merged OPERA features and applicability-domain flags.
data_write_path = config.path.file_for_features_source['opera']
flags_write_path = config.path.opera_AD_file

# Stack training rows above application rows and write each table to CSV.
for frames, destination in (
    ([opera_features_train, opera_features_app], data_write_path),
    ([AD_flags_train, AD_flags_app], flags_write_path),
):
    pd.concat(frames).to_csv(destination)