# Get Chemical Identifiers: LCIA QSAR Project
**Author:** Jacob Kvasnicka <br>
**Date:** June 24, 2023

Get chemical identifiers to use as inputs to the OPERA 2.9 user interface.

In [1]:
import pandas as pd
from os import path 
import re

import pattern
from config_management import UnifiedConfiguration

# Build the path from its components rather than embedding a backslash in a
# regular string literal: '\c' is an invalid escape sequence (it triggers a
# warning on current Python versions), and a literal backslash is only a
# path separator on Windows.
config_mapping_path = path.join('Input', 'configuration-mapping.json')
config = UnifiedConfiguration(config_mapping_path)

## Model development & training

### Load the identifiers in their raw format

In [2]:
exposure_route = 'oral'

# Key-word arguments for pandas.read_excel(): the worksheet is named after
# the exposure route in upper case, and rows 0 and 1 are read as a
# two-level column header.
kwargs = dict(sheet_name=exposure_route.upper(), header=[0, 1])

raw_pods = pd.read_excel(config.path.raw_surrogate_pods_file, **kwargs)

# Discard the top header level so that columns can be selected by their
# second-level names, then keep only the two identifier columns.
identifiers = raw_pods.droplevel(level=0, axis=1)[['dtxsid', 'casrn']]

identifiers

Unnamed: 0,dtxsid,casrn
0,DTXSID5020281,100-00-5
1,DTXSID8020961,100-01-6
2,DTXSID0021834,100-02-7
3,DTXSID3032622,10004-44-1
4,DTXSID2044347,100-06-1
...,...,...
10150,"NODTXSID_NOCAS_Zeolite, synthetic, cryst","NOCAS_Zeolite, synthetic, crystalline, non fi"
10151,NODTXSID_NOCAS_zinc 2-hydroxy-5-(C13-C18,NOCAS_zinc 2-hydroxy-5-(C13-C18)-alkylbenzoat
10152,NODTXSID_NOCAS_Zyclen,NOCAS_Zyclen
10153,NODTXSID_RN: 2778-42-9,RN: 2778-42-9


### Extract the actual identifiers using regular expressions

In [3]:
# One compiled regular expression per identifier column.
# NOTE(review): as_group=True presumably wraps the pattern in a single
# capture group, which Series.str.extract() needs -- confirm in pattern.py.
pattern_for_col = dict(
    dtxsid=re.compile(pattern.dtxsid(as_group=True)),
    casrn=re.compile(pattern.casrn(as_group=True)),
)

# Overwrite each raw column with the text captured by its pattern; rows
# without a match become missing values.
for col in pattern_for_col:
    pat = pattern_for_col[col]
    extracted = identifiers[col].str.extract(pat, expand=False)
    identifiers.loc[:, col] = extracted

identifiers

Unnamed: 0,dtxsid,casrn
0,DTXSID5020281,100-00-5
1,DTXSID8020961,100-01-6
2,DTXSID0021834,100-02-7
3,DTXSID3032622,10004-44-1
4,DTXSID2044347,100-06-1
...,...,...
10150,,
10151,,
10152,,
10153,,2778-42-9


In [4]:
# Number of rows for which a CASRN could be extracted.
identifiers['casrn'].count()

7550

In [5]:
# Number of rows for which a DTXSID could be extracted.
identifiers['dtxsid'].count()

6598

In [6]:
# Keep only the rows with a DTXSID and reduce the table to that column.
# NOTE: this rebinds `identifiers` from a DataFrame to a Series, so the cell
# cannot be re-run on its own; re-run the cells above first.
identifiers = identifiers.dropna(subset=['dtxsid'])['dtxsid']

identifiers

0       DTXSID5020281
1       DTXSID8020961
2       DTXSID0021834
3       DTXSID3032622
4       DTXSID2044347
            ...      
8313    DTXSID5057882
8314    DTXSID5057884
8315    DTXSID0057885
8316    DTXSID1057905
8317    DTXSID6057906
Name: dtxsid, Length: 6598, dtype: object

In [7]:
# Save the identifiers to a TXT file (one per line, without header or
# index) for batch download in OPERA.
identifiers.to_csv(
    config.path.chemical_id_dev_file,
    sep=' ',
    index=False,
    header=False,
)

## Model application: Comparison with SEEM3 exposure predictions

In [8]:
# Read the SEEM3 exposure predictions and keep only the DTXSID column.
seem3_dtxsids = pd.read_csv(
    config.path.seem3_exposure_file,
    encoding='latin-1',
)['DTXSID']

# Save the identifiers one per line, without header or index.
seem3_dtxsids.to_csv(
    config.path.chemical_id_app_file,
    sep=' ',
    header=False,
    index=False,
)