# Standardization

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('../data/DAGs.csv')
df.head()

Unnamed: 0,Author,Exposure,Outcome,Direction,Strength,ID,Status
0,,Diabetes,Ischemic stroke,Increase,6.0,1,Final
1,,age,Ischemic stroke,Increase,6.0,1,Final
2,,age,Diabetes,Increase,5.0,1,Final
3,,Sex (Male),Ischemic stroke,Increase,6.0,1,Final
4,,Hypertension,Ischemic stroke,Increase,6.0,1,Final


### Getting list of unique terms

In [8]:
terms = pd.concat([df.Exposure, df.Outcome], ignore_index=True).unique()
pd.DataFrame(terms, columns=['terms']).to_csv('terms.csv', index=False)

### Import Usagi mappping, non-standard SNOMED concept additions, and extra information

**usagi_export.csv**
* `source_code_description` - Original term name
* `target_concept_id` - OHDSI ATHENA ID for term 

**non_standard_concepts_mapping.xlsx**
(USAGI only includes standard concepts from the OHDSI CDM so some manual additions had to be done)
* `source_code_description` - Original term name
* `target_concept_id` - OHDSI ATHENA ID for term 

**term_time_and_direction_switches.xlsx**
* `time` - If a concept was used multiple times to keep track of its position in the DAG
* `direction` - Whether the direction in the DAG needs to be switched (e.g. female is coded as male so the direction needs to be changed from increase to decrease or vice versa in the original DAG) 

**! ISSUE !** since using unique term assigning time variable to a term may result in it being applied for multiple terms. will need to rename original variable to not have same name (e.g. depression --> depression before stroke)

In [11]:
mapping = pd.read_csv('usagi_export.csv', dtype={'target_concept_id': str})[['source_code_description','target_concept_id']]
mapping.replace('0',np.nan, inplace=True)

# Add non-standard snomed terms
non_standard_additions = pd.read_excel('non_standard_concepts_mapping.xlsx', dtype={'target_concept_id': str})
mapping = mapping.merge(non_standard_additions, on='source_code_description', how='left', suffixes=['','_non_standard'])
mapping['target_concept_id'] = mapping['target_concept_id'].fillna(mapping['target_concept_id_non_standard'])
mapping = mapping.drop('target_concept_id_non_standard', axis=1)

# Add annotations of time and direction
annotations = pd.read_excel('term_time_and_direction_switches.xlsx')
mapping = mapping.merge(annotations, left_on='source_code_description', right_on='term', how='left').drop('term', axis=1)

mapping.head()

Unnamed: 0,source_code_description,target_concept_id,time,direction
0,race,4013886,,
1,Lacunes,4046360,,
2,chemotherapy,4273629,,
3,malformations,4079975,,
4,kidney disease,198124,,


### Missing Mappings

In [12]:
mapping[mapping.target_concept_id.isna()]

Unnamed: 0,source_code_description,target_concept_id,time,direction
14,peripheral immune competence,,,
16,microglial proliferation,,,
44,nonatherosclerotic abnormalities,,,
52,LMV,,,
53,B-cell infiltration,,,
75,Cardiac Remodelling,,,
92,Other Cardiac Pathologies,,,
104,production of reactive oxygen species,,,
105,loss of brain structural integrity,,,
113,collateralization,,,


For now, fill missing mappings with original term.

In [13]:
mapping['target_concept_id'] = mapping.target_concept_id.fillna(mapping['source_code_description'])

### Match mapping codes with Athena database

In [7]:
concept_columns = ['concept_id',
                   'concept_name',
                   'domain_id',
                   'vocabulary_id',
                   'concept_class_id',
                   'concept_code'
                   ]
athena_concepts = pd.read_csv('athena_vocabulary/CONCEPT.csv', sep='\t', dtype={'concept_id': str, 'concept_code': str}, low_memory=False)[concept_columns]

mapped_terms = mapping.merge(athena_concepts, how='left', left_on='target_concept_id', right_on='concept_id').drop('concept_id', axis=1)
mapped_terms.head()

Unnamed: 0,source_code_description,target_concept_id,time,direction,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code
0,race,4013886,,,Race,Observation,SNOMED,Observable Entity,103579009
1,Lacunes,4046360,,,Lacunar infarction,Condition,SNOMED,Disorder,230698000
2,chemotherapy,4273629,,,Chemotherapy,Procedure,SNOMED,Procedure,367336001
3,malformations,4079975,,,Congenital malformation,Condition,SNOMED,Disorder,276654001
4,kidney disease,198124,,,Kidney disease,Condition,SNOMED,Disorder,90708001


### Apply direction switches to arrows
Pairs that need switching:
* Blood-brain-barrier breakdown (vs. integrity)
* Cognitive function / cognition (vs. impairment)
* Female (vs. male)
* Immobility (vs. mobility)
* Low socioeconomic status (vs. socioeconomic status)
* Physical inactivity (vs. exercise)
* reduced quality of life (vs. quality of life)

In [8]:
switch_terms = mapped_terms[mapped_terms.direction=='switch'].source_code_description

In [9]:
def flip_direction(direction):
    return "Decrease" if direction == "Increase" else "Increase"

def apply_switch_logic(row):
    # Count how many terms in `switch_terms` appear in `Exposure` or `Outcome`
    match_count = sum(term in row["Exposure"] or term in row["Outcome"] for term in switch_terms)
    
    # If there is exactly one match, flip the direction
    if match_count == 1:
        return flip_direction(row["Direction"])
    # If there are two matches (both terms are present), keep the direction unchanged
    else:
        return row["Direction"]
    
df["Direction"] = df.apply(apply_switch_logic, axis=1)

### Will create separate nodes for compound terms with same arrows

In [10]:
mapped_terms[mapped_terms.source_code_description.duplicated(keep=False)]

Unnamed: 0,source_code_description,target_concept_id,time,direction,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code
11,vessel stenosis/occlusion,4217691,,,Stenosis,Observation,SNOMED,Morph Abnormality,415582006
12,vessel stenosis/occlusion,4178903,,,Complete obstruction,Observation,SNOMED,Morph Abnormality,50173008
121,Carotid Endarterectomy / Angioplasty / stenting,4283095,,,Carotid endarterectomy,Procedure,SNOMED,Procedure,66951008
122,Carotid Endarterectomy / Angioplasty / stenting,4178631,,,Angioplasty of carotid artery,Procedure,SNOMED,Procedure,429287007
123,Carotid Endarterectomy / Angioplasty / stenting,4050288,,,Insertion of carotid artery stent,Procedure,SNOMED,Procedure,233405004
138,atherosclerosis/thrombosis,4306703,,,Atherosclerosis,Observation,SNOMED,Morph Abnormality,38716007
139,atherosclerosis/thrombosis,4231363,,,Thrombosis,Condition,SNOMED,Disorder,439127006
268,preeclampsia and other pregancy related compli...,439393,,,Pre-eclampsia,Condition,SNOMED,Disorder,398254007
269,preeclampsia and other pregancy related compli...,42538946,,,Hypertension complicating pregnancy,Condition,SNOMED,Disorder,82771000119102
270,preeclampsia and other pregancy related compli...,37018765,,,Gestational diabetes mellitus complicating pre...,Condition,SNOMED,Disorder,40801000119106


In [11]:
# pd.DataFrame(mapped_terms[mapped_terms.source_code_description.duplicated(keep=False)].source_code_description.unique(), columns=['term']).to_csv('more_than_one_terms.csv')

Option 1: Dropping duplicates

In [12]:
mapped_terms_no_dup = mapped_terms[~mapped_terms.source_code_description.duplicated(keep='first')]

Option 2: Creating new nodes with same connections for duplicates

### Combining mapping with original DAG data

In [13]:
mapped_terms_dict = mapped_terms_no_dup.set_index('source_code_description')['target_concept_id'].to_dict()

In [14]:
mapped_terms_no_dup['concept_name_plus_time'] = mapped_terms_no_dup.apply(
    lambda row: f"{row['concept_name']}{row['time']}" if pd.notna(row['time']) else row['concept_name'],
    axis=1
)
mapped_terms_dict2 = mapped_terms_no_dup.set_index('source_code_description')['concept_name_plus_time'].to_dict()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_terms_no_dup['concept_name_plus_time'] = mapped_terms_no_dup.apply(


In [15]:
mapped_terms

Unnamed: 0,source_code_description,target_concept_id,time,direction,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code
0,race,4013886,,,Race,Observation,SNOMED,Observable Entity,103579009
1,Lacunes,4046360,,,Lacunar infarction,Condition,SNOMED,Disorder,230698000
2,chemotherapy,4273629,,,Chemotherapy,Procedure,SNOMED,Procedure,367336001
3,malformations,4079975,,,Congenital malformation,Condition,SNOMED,Disorder,276654001
4,kidney disease,198124,,,Kidney disease,Condition,SNOMED,Disorder,90708001
...,...,...,...,...,...,...,...,...,...
313,inflammation,4119715,,,Inflammation,Observation,SNOMED,Qualifier Value,257552002
314,coping mechnisms,4225865,,,Coping behavior,Observation,SNOMED,Observable Entity,405056001
315,occurence of myocardial infarction,4329847,,,Myocardial infarction,Condition,SNOMED,Disorder,22298006
316,infection,432250,,,Infectious disease,Condition,SNOMED,Disorder,40733004


In [None]:
df2 = df.copy(deep=True)
df2['Exposure_Standardized'] = df.Exposure.replace(mapped_terms_dict2)
df2['Outcome_Standardized'] = df.Outcome.replace(mapped_terms_dict2)
df2.head()

Unnamed: 0,Author,Exposure,Outcome,Direction,Strength,ID,Status,Exposure_Standardized,Outcome_Standardized
0,,Diabetes,Ischemic stroke,Increase,6.0,1,Final,Diabetes mellitus,Ischemic stroke1.0
1,,age,Ischemic stroke,Increase,6.0,1,Final,Age factor,Ischemic stroke1.0
2,,age,Diabetes,Increase,5.0,1,Final,Age factor,Diabetes mellitus
3,,Sex (Male),Ischemic stroke,Increase,6.0,1,Final,Male,Ischemic stroke1.0
4,,Hypertension,Ischemic stroke,Increase,6.0,1,Final,Hypertensive disorder,Ischemic stroke1.0


In [21]:
df['Exposure'] = df.Exposure.replace(mapped_terms_dict)
df['Outcome'] = df.Outcome.replace(mapped_terms_dict)
df.tail(50)

Unnamed: 0,Author,Exposure,Outcome,Direction,Strength,ID,Status
860,,4327941,443432,Decrease,2.38,19,Draft
861,,4238738,4178501,Increase,6.84,19,Draft
862,,4238738,4052648,Decrease,6.84,19,Draft
863,,4024166,4234649,Increase,6.84,19,Draft
864,,4196427,4310996,Decrease,6.84,19,Draft
865,,4201926,4310996,Decrease,6.84,19,Draft
866,,4116678,4310996,Decrease,6.84,19,Draft
867,,4116678,4310996,Decrease,6.84,19,Draft
868,,4116678,443432,Decrease,6.84,19,Draft
869,,4024618,4306655,Decrease,4.08,19,Draft


In [None]:
df.to_csv('../data/DAGs_standardized.csv', index=False)

In [19]:
df2.to_csv('../data/DAGs_standardized_with_time.csv', index=False)

In [22]:
df2.tail(50)

Unnamed: 0,Author,Exposure,Outcome,Direction,Strength,ID,Status,Exposure_Standardized,Outcome_Standardized
860,,psychotherapy,cognitive decline,Decrease,2.38,19,Draft,Psychotherapy,Impaired cognition
861,,physiotherapy,immobility,Increase,6.84,19,Draft,Physical therapy procedure,Mobility2.0
862,,physiotherapy,severe disability,Decrease,6.84,19,Draft,Physical therapy procedure,Disability
863,,social work,reduced QoL/ADL,Increase,6.84,19,Draft,Social worker,Quality of life satisfaction
864,,secondary prevention,second stroke,Decrease,6.84,19,Draft,Secondary prevention,Ischemic stroke2.0
865,,primary prevention,first stroke,Decrease,6.84,19,Draft,Primary prevention,Ischemic stroke1.0
866,,physical exercise,first stroke,Decrease,6.84,19,Draft,Exercise,Ischemic stroke1.0
867,,physical exercise,second stroke,Decrease,6.84,19,Draft,Exercise,Ischemic stroke2.0
868,,physical exercise,cognitive decline,Decrease,6.84,19,Draft,Exercise,Impaired cognition
869,,care giving,death,Decrease,4.08,19,Draft,Care provision regime,Death
