# Data Preprocessing

### Import Data

In [1]:
import pandas as pd
import re

In [2]:
# Load the DAGs CSV into `df` and preview its first rows.
dag_csv_path = 'data/DAGs.csv'
df = pd.read_csv(dag_csv_path)
df.head()

Unnamed: 0,Author,Exposure,Outcome,Direction,Strength,ID,Status
0,,Diabetes,Ischemic stroke,Increase,6.0,1,Final
1,,age,Ischemic stroke,Increase,6.0,1,Final
2,,age,Diabetes,Increase,5.0,1,Final
3,,Sex (Male),Ischemic stroke,Increase,6.0,1,Final
4,,Hypertension,Ischemic stroke,Increase,6.0,1,Final


### Getting a List of Terms

In [3]:
# Pool every exposure and outcome label into one series, then keep each
# distinct label once (in order of first appearance).
exposure_outcome_labels = pd.concat([df.Exposure, df.Outcome], ignore_index=True)
terms = exposure_outcome_labels.unique()

# Write the distinct labels to a single-column CSV.
terms_table = pd.DataFrame(terms, columns=['terms'])
terms_table.to_csv('terms.csv', index=False)

## Standardizing Terms with SNOMED CT, LOINC, ATC
* SNOMED CT is preferred; LOINC or ATC was used when no SNOMED CT term was available

### Merge the term-to-code mapping (done manually in Excel) with the OHDSI ATHENA database

Load mapped terms
* `term` - Original term name
* `concept_code` - SNOMED, LOINC, or ATC code for term
* `concept_code_2` - Second code for concepts that don't exist on their own and are instead represented as a combination of two codes (e.g. neuron survival -> neuron + survival rate)
* `time` - Used when a concept appears multiple times, to keep track of its position in the DAG
* `direction` - Whether the direction in the DAG needs to be switched (e.g. female is coded as male, so the direction in the original DAG needs to be changed from increase to decrease or vice versa)

In [69]:
# Read the 'terms' sheet of the mapping workbook. Both code columns are forced
# to strings so the identifiers stay as text instead of being parsed as numbers.
code_column_types = {'concept_code': str, 'concept_code_2': str}
mapped_terms = pd.read_excel(
    'terms_mapped_to_codes.xlsx',
    sheet_name='terms',
    dtype=code_column_types,
)
mapped_terms.head()

Unnamed: 0,term,concept_code,concept_code_2,time,direction
0,Diabetes,73211009,,,
1,age,71395006,,,
2,Sex (Male),248153007,,,
3,Hypertension,59621000,,,
4,BMI,60621009,,,


Load Athena database

In [70]:
# Columns of the Athena CONCEPT table that the rest of the notebook uses.
concept_columns = ['concept_id','concept_name','domain_id','vocabulary_id','concept_class_id','concept_code']

# Parse only the needed columns (`usecols`) instead of loading the whole
# tab-separated table and discarding the rest afterwards. `concept_code` is
# read as a string so it can be merged with the string codes in `mapped_terms`.
# NOTE(review): pandas' default quote and NA-string handling is left unchanged;
# if concept names contain '"' or fields hold a literal 'NA', `quoting` /
# `keep_default_na` may need setting - TODO confirm against the file.
athena_concepts = pd.read_csv(
    'ohdsi_athena_vocab/CONCEPT.csv',
    sep='\t',
    usecols=concept_columns,
    dtype={'concept_code': str},
    low_memory=False,
)
# `usecols` keeps the file's own column order, so fix the order explicitly.
athena_concepts = athena_concepts[concept_columns]
athena_concepts.head()

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code
0,3541502,Adverse reaction to drug primarily affecting t...,Condition,SNOMED,Disorder,694331000000106
1,3542356,Adverse reaction to other central nervous syst...,Condition,SNOMED,Disorder,705311000000105
2,4327638,Borderline,Observation,SNOMED,Qualifier Value,75189007
3,42538812,Somatic hallucination,Condition,SNOMED,Clinical Finding,762620006
4,40629514,Stillbirth,Condition,SNOMED,Clinical Finding,76358005


Merge

In [72]:
# Look up each term's primary code in the Athena concepts. The left join keeps
# every mapped term, leaving the Athena fields empty where no code matches.
primary_concepts = pd.merge(
    mapped_terms,
    athena_concepts,
    on='concept_code',
    how='left',
)

In [74]:
# Look up each term's optional second code in the Athena concepts.
# Both inputs carry a `concept_code` column: with suffixes ('', '2') the
# unsuffixed one is the mapping's primary code and Athena's becomes `concept_code2`.
secondary_lookup = pd.merge(
    mapped_terms,
    athena_concepts,
    how='left',
    left_on='concept_code_2',
    right_on='concept_code',
    suffixes=('', '2'),
)
# Keep only the Athena descriptor columns; the primary `concept_code` is
# dropped because it is not part of the secondary lookup result.
secondary_concepts = secondary_lookup.loc[:, concept_columns].drop(columns='concept_code')

In [75]:
# The join below pairs rows purely by index position, so it is only valid while
# both lookups still hold exactly one row per row of `mapped_terms`. A left
# merge emits one row per match, so a code that occurs more than once in
# `athena_concepts` (e.g. if the same code exists under several vocabularies)
# would add rows and silently pair a term with another term's secondary concept.
assert len(primary_concepts) == len(mapped_terms), (
    'primary_concepts has extra rows: a concept_code matched several Athena concepts'
)
assert len(secondary_concepts) == len(mapped_terms), (
    'secondary_concepts has extra rows: a concept_code_2 matched several Athena concepts'
)

# Place the secondary-concept descriptors next to the primary ones; the
# secondary columns that clash by name receive the `_2` suffix.
all_terms = primary_concepts.merge(secondary_concepts, left_index=True, right_index=True, suffixes=('','_2'))
all_terms.head()

Unnamed: 0,term,concept_code,concept_code_2,time,direction,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,concept_id_2,concept_name_2,domain_id_2,vocabulary_id_2,concept_class_id_2
0,Diabetes,73211009,,,,201820.0,Diabetes mellitus,Condition,SNOMED,Disorder,,,,,
1,age,71395006,,,,4322978.0,Age factor,Observation,SNOMED,Observable Entity,,,,,
2,Sex (Male),248153007,,,,442985.0,Male,Observation,SNOMED,Clinical Finding,,,,,
3,Hypertension,59621000,,,,320128.0,Essential hypertension,Condition,SNOMED,Disorder,,,,,
4,BMI,60621009,,,,4245997.0,Body mass index,Measurement,SNOMED,Observable Entity,,,,,


### Combine mapping with original DAG data