# Standardization

In [131]:
import numpy as np
import pandas as pd

In [132]:
# Load the DAG edge list and discard rows that are exact duplicates.
# Kept as a single chain so no reference to the pre-deduplication frame lingers.
df = (
    pd.read_csv('../data/DAGs.csv')
    .drop_duplicates()
)
df.head()

Unnamed: 0,Author,Exposure,Outcome,Direction,Strength,ID,Status
0,,Diabetes,Ischemic stroke,Increase,6.0,1,Final
1,,age,Ischemic stroke,Increase,6.0,1,Final
2,,age,Diabetes,Increase,5.0,1,Final
3,,Sex (Male),Ischemic stroke,Increase,6.0,1,Final
4,,Hypertension,Ischemic stroke,Increase,6.0,1,Final


### Getting list of unique terms

In [133]:
# Every distinct label used as an exposure or as an outcome, in order of first appearance.
all_labels = pd.concat([df['Exposure'], df['Outcome']], ignore_index=True)
terms = all_labels.unique()

# Save the labels to terms.csv as a single `terms` column.
pd.DataFrame({'terms': terms}).to_csv('terms.csv', index=False)

### Import Usagi mapping, non-standard SNOMED concept additions, and extra information

**usagi_export.csv**
* `source_code_description` - Original term name
* `target_concept_id` - OHDSI ATHENA ID for term 

**non_standard_concepts_mapping.xlsx**
(USAGI only includes standard concepts from the OHDSI CDM so some manual additions had to be done)
* `source_code_description` - Original term name
* `target_concept_id` - OHDSI ATHENA ID for term 

**term_time_and_direction_switches.xlsx**
* `time` - If a concept was used multiple times to keep track of its position in the DAG
* `direction` - Whether the direction in the DAG needs to be switched (e.g. female is coded as male so the direction needs to be changed from increase to decrease or vice versa in the original DAG) 

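**! ISSUE !** Annotations are matched on the unique term text, so a `time` value assigned to a term is applied to every occurrence of that term, even when the occurrences sit at different positions in a DAG. Terms that need different time points must be renamed in the original data so that they no longer share a name (e.g. depression --> depression before stroke).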

In [134]:
# Usagi export: keep the source term and the concept ID it was mapped to.
# IDs are read as strings, like the non-standard additions below.
mapping = pd.read_csv('usagi_export.csv', dtype={'target_concept_id': str})[['source_code_description','target_concept_id']]
# A target ID of '0' is treated as "no mapping". Only the ID column is rewritten, so a
# source term whose text is literally '0' keeps its label.
mapping = mapping.assign(target_concept_id=mapping['target_concept_id'].replace('0', np.nan))

# Add non-standard snomed terms: IDs that are still missing are taken from the manual additions.
non_standard_additions = pd.read_excel('non_standard_concepts_mapping.xlsx', dtype={'target_concept_id': str})
mapping = mapping.merge(non_standard_additions, on='source_code_description', how='left', suffixes=('', '_non_standard'))
mapping['target_concept_id'] = mapping['target_concept_id'].fillna(mapping['target_concept_id_non_standard'])
mapping = mapping.drop(columns='target_concept_id_non_standard')

# Add annotations of time and direction (matched on the exact term text).
annotations = pd.read_excel('term_time_and_direction_switches.xlsx')
mapping = mapping.merge(annotations, left_on='source_code_description', right_on='term', how='left').drop(columns='term')

mapping.head()

Unnamed: 0,source_code_description,target_concept_id,time,direction
0,race,4013886,,
1,Lacunes,4046360,,
2,chemotherapy,4273629,,
3,malformations,4079975,,
4,kidney disease,198124,,


### Missing Mappings

In [135]:
# Show the rows that still have no concept ID.
mapping.loc[mapping['target_concept_id'].isna()]

Unnamed: 0,source_code_description,target_concept_id,time,direction
14,peripheral immune competence,,,
16,microglial proliferation,,,
44,nonatherosclerotic abnormalities,,,
52,LMV,,,
53,B-cell infiltration,,,
75,Cardiac Remodelling,,,
92,Other Cardiac Pathologies,,,
104,production of reactive oxygen species,,,
105,loss of brain structural integrity,,,
113,collateralization,,,


For now, fill missing mappings with original term.

In [136]:
# Where no concept ID exists, use the source term itself as the ID.
has_concept_id = mapping['target_concept_id'].notna()
mapping['target_concept_id'] = mapping['target_concept_id'].where(has_concept_id, mapping['source_code_description'])

### Match mapping codes with Athena database

In [137]:
# Columns of the Athena CONCEPT table that are carried into the mapping.
concept_columns = ['concept_id',
                   'concept_name',
                   'domain_id',
                   'vocabulary_id',
                   'concept_class_id',
                   'concept_code'
                   ]
# Parse only the needed columns (usecols) instead of loading the full table and
# discarding most of it; the trailing selection fixes the column order.
# IDs and codes are read as strings so they match the string IDs in `mapping`.
athena_concepts = pd.read_csv(
    'athena_vocabulary/CONCEPT.csv',
    sep='\t',
    usecols=concept_columns,
    dtype={'concept_id': str, 'concept_code': str},
    low_memory=False,
)[concept_columns]

# Left merge: terms whose ID is not an Athena concept ID keep their row with empty concept columns.
mapped_terms = mapping.merge(athena_concepts, how='left', left_on='target_concept_id', right_on='concept_id').drop('concept_id', axis=1)
mapped_terms.head()

Unnamed: 0,source_code_description,target_concept_id,time,direction,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code
0,race,4013886,,,Race,Observation,SNOMED,Observable Entity,103579009
1,Lacunes,4046360,,,Lacunar infarction,Condition,SNOMED,Disorder,230698000
2,chemotherapy,4273629,,,Chemotherapy,Procedure,SNOMED,Procedure,367336001
3,malformations,4079975,,,Congenital malformation,Condition,SNOMED,Disorder,276654001
4,kidney disease,198124,,,Kidney disease,Condition,SNOMED,Disorder,90708001


### Apply direction switches to arrows
Pairs that need switching:
* Blood-brain-barrier breakdown (vs. integrity)
* Cognitive function / cognition (vs. impairment)
* Female (vs. male)
* Immobility (vs. mobility)
* Low socioeconomic status (vs. socioeconomic status)
* Physical inactivity (vs. exercise)
* reduced quality of life (vs. quality of life)

In [138]:
# Source terms whose annotation marks them as needing a direction switch.
is_switch = mapped_terms['direction'] == 'switch'
switch_terms = mapped_terms.loc[is_switch, 'source_code_description']

In [139]:
def flip_direction(direction):
    """Return the opposite of an "Increase"/"Decrease" label.

    Any other value (missing, or an unrecognised label) is returned unchanged
    rather than being turned into "Increase".
    """
    if direction == "Increase":
        return "Decrease"
    if direction == "Decrease":
        return "Increase"
    return direction


def apply_switch_logic(row, terms=None):
    """Return the direction of one edge after applying the switch annotation.

    Parameters
    ----------
    row : mapping with "Exposure", "Outcome" and "Direction" entries
        One edge of the DAG table (e.g. a row passed by ``DataFrame.apply``).
    terms : iterable of str, optional
        Labels flagged for switching. Defaults to the notebook-level
        ``switch_terms``.

    Returns
    -------
    The flipped direction when exactly one end of the edge is a flagged term;
    the original direction when neither or both ends are flagged (two
    switches cancel out).

    Labels are compared for exact equality, so a flagged term such as
    "cognition" does not match a longer label that merely contains it, and a
    missing label simply counts as not flagged.
    """
    flagged = set(switch_terms if terms is None else terms)
    exposure_flagged = row["Exposure"] in flagged
    outcome_flagged = row["Outcome"] in flagged

    # Exactly one flagged end -> flip; zero or two -> unchanged.
    if exposure_flagged != outcome_flagged:
        return flip_direction(row["Direction"])
    return row["Direction"]
    
# Recompute Direction for every edge with apply_switch_logic.
# NOTE(review): this overwrites df["Direction"] in place, so running this cell a second
# time flips the same edges back; re-run from the cell that loads df instead.
df["Direction"] = df.apply(apply_switch_logic, axis=1)

### Add Time

In [140]:
def _concept_label(row):
    """Build the display label for one mapped term.

    Uses the concept name, or the original source term when no concept name is
    available, so that unmapped terms keep their label instead of becoming
    missing. If the row has a `time` value it is appended as "<name>_<time>".
    """
    name = row['concept_name']
    if pd.isna(name):
        name = row['source_code_description']
    if pd.notna(row['time']):
        return f"{name}_{row['time']}"
    return name


mapped_terms['concept_name_plus_time'] = mapped_terms.apply(_concept_label, axis=1)

### Create separate nodes for compound terms, each with the same arrows

In [141]:
# Disabled export: would write every source term that appears on more than one row of
# mapped_terms to more_than_one_terms.csv (the file read in the next cell).
# NOTE(review): presumably left commented out so an existing file is not overwritten — confirm.
# pd.DataFrame(mapped_terms[mapped_terms.source_code_description.duplicated(keep=False)].source_code_description.unique(), columns=['term']).to_csv('more_than_one_terms.csv', index=False)

In [142]:
# Source terms listed in more_than_one_terms.csv are handled as compound terms.
compound_terms = pd.read_csv('more_than_one_terms.csv')['term'].tolist()

mask = mapped_terms['source_code_description'].isin(compound_terms)

# One row per compound term; the IDs and labels of its parts are gathered into lists.
compound_rows = mapped_terms.loc[mask]
compound_term_mapping = (
    compound_rows
    .groupby('source_code_description')[['target_concept_id', 'concept_name_plus_time']]
    .agg(list)
    .reset_index()
)
compound_term_mapping

Unnamed: 0,source_code_description,target_concept_id,concept_name_plus_time
0,Carotid Endarterectomy / Angioplasty / stenting,"[4283095, 4178631, 4050288]","[Carotid endarterectomy, Angioplasty of caroti..."
1,Weight Loss and Aerobic Exercise,"[4229881, 4116678]","[Weight loss, Exercise]"
2,atherosclerosis/thrombosis,"[4306703, 4231363]","[Atherosclerosis, Thrombosis]"
3,high fat/ high carbonhydrate diet,"[4024366, 4027011]","[High fat diet, High carbohydrate diet]"
4,higher fitness/mobility post,"[44800078, 4178501]","[Physical fitness state_2.0, Mobility_2.0]"
5,higher fitness/mobility pre,"[44800078, 4178501]","[Physical fitness state_1.0, Mobility_1.0]"
6,preeclampsia and other pregancy related compli...,"[439393, 42538946, 37018765]","[Pre-eclampsia, Hypertension complicating preg..."
7,vessel stenosis/occlusion,"[4217691, 4178903]","[Stenosis, Complete obstruction]"


In [143]:
# Compound source term -> list of concept IDs for its parts.
compound_term_mapping_dict = dict(
    zip(
        compound_term_mapping['source_code_description'],
        compound_term_mapping['target_concept_id'],
    )
)

In [144]:
# All terms that are not compound: source term -> single concept ID.
normal_mapping = mapped_terms.loc[~mask]
normal_mapping_dict = dict(
    zip(
        normal_mapping['source_code_description'],
        normal_mapping['target_concept_id'],
    )
)

### Combining mapping with original DAG data

In [181]:
# Two independent working copies of the edge list (copy() is deep by default):
# one will carry concept IDs, the other concept names with time suffixes.
df_code, df_time = df.copy(), df.copy()

In [182]:
# Swap every non-compound label for its concept ID on both ends of each edge.
for column in ('Exposure', 'Outcome'):
    df_code[column] = df_code[column].replace(normal_mapping_dict)

In [183]:
def expand_compound_rows(df, exposure, outcome, compound_dict):
    """Split edges that involve compound terms into one edge per component.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table; it is not modified.
    exposure, outcome : str
        Names of the columns holding the two ends of each edge.
    compound_dict : dict
        Maps a compound term to the list of values that replace it.

    Returns
    -------
    pandas.DataFrame
        For each input row, one output row per combination of exposure and
        outcome components (a term absent from `compound_dict` counts as its
        own single component). Rows without compound terms pass through
        unchanged. Output rows keep the index label of the row they came from,
        so labels repeat for expanded rows. An empty input yields an empty
        frame with the same columns.
    """
    expanded_rows = []
    for _, row in df.iterrows():
        term1 = row[exposure]
        term2 = row[outcome]

        # Fast path: nothing to expand, keep the row as it is.
        if term1 not in compound_dict and term2 not in compound_dict:
            expanded_rows.append(row)
            continue

        # A non-compound end expands to just itself, which covers the
        # "only exposure", "only outcome" and "both" cases with one loop.
        exposure_parts = compound_dict.get(term1, [term1])
        outcome_parts = compound_dict.get(term2, [term2])
        for i in exposure_parts:
            for j in outcome_parts:
                new_row = row.copy(deep=True)
                new_row[exposure] = i
                new_row[outcome] = j
                expanded_rows.append(new_row)

    if not expanded_rows:
        # pd.DataFrame([]) would drop the column layout; keep it for callers.
        return df.iloc[0:0].copy()
    return pd.DataFrame(expanded_rows)

# Replace compound labels in the coded table by one row per component concept ID.
df_code = expand_compound_rows(
    df_code,
    exposure='Exposure',
    outcome='Outcome',
    compound_dict=compound_term_mapping_dict,
)


In [184]:
# Save the concept-ID version of the edge list (row index not written).
df_code.to_csv(path_or_buf='../data/DAGs_standardized.csv', index=False)

With time and name

In [185]:
# Same lookups as for the concept IDs, but the values are the concept labels with time suffix.
compound_term_mapping_dict_time = dict(
    zip(
        compound_term_mapping['source_code_description'],
        compound_term_mapping['concept_name_plus_time'],
    )
)
normal_mapping_dict_time = dict(
    zip(
        normal_mapping['source_code_description'],
        normal_mapping['concept_name_plus_time'],
    )
)

In [186]:
# Add standardized label columns next to the original Exposure/Outcome columns.
for source_column, standardized_column in (
    ('Exposure', 'Exposure_Standardized'),
    ('Outcome', 'Outcome_Standardized'),
):
    df_time[standardized_column] = df_time[source_column].replace(normal_mapping_dict_time)

In [188]:
# Expand compound labels in the standardized columns into one row per component label.
df_time = expand_compound_rows(
    df_time,
    exposure='Exposure_Standardized',
    outcome='Outcome_Standardized',
    compound_dict=compound_term_mapping_dict_time,
)

In [191]:
# Save the name-and-time version of the edge list (row index not written).
df_time.to_csv(path_or_buf='../data/DAGs_standardized_with_time.csv', index=False)