# Data Preprocessing

### Import Data

In [1]:
import pandas as pd
import re

In [2]:
# Read data/DAGs.csv into `df` and preview the first rows.
dag_csv = 'data/DAGs.csv'
df = pd.read_csv(dag_csv)
df.head()

Unnamed: 0,Author,Exposure,Outcome,Direction,Strength,ID,Status
0,,Diabetes,Ischemic stroke,Increase,6.0,1,Final
1,,age,Ischemic stroke,Increase,6.0,1,Final
2,,age,Diabetes,Increase,5.0,1,Final
3,,Sex (Male),Ischemic stroke,Increase,6.0,1,Final
4,,Hypertension,Ischemic stroke,Increase,6.0,1,Final


### Getting a List of Terms

In [3]:
# Stack the Exposure and Outcome columns into one Series, keep each distinct
# value once (in order of first appearance), and save the list to terms.csv.
exposure_and_outcome = pd.concat([df['Exposure'], df['Outcome']], ignore_index=True)
terms = exposure_and_outcome.unique()
pd.DataFrame({'terms': terms}).to_csv('terms.csv', index=False)

### Grouping Terms

Load the SNOMED concept and qualifier codes recorded for each term (sheet `terms` of `terms_excel.xlsx`).

In [41]:
# Read both code columns as strings so they stay exact text keys for the
# later joins against Athena's string `concept_code`.
code_dtypes = {'concept_code': str, 'qualifier_code': str}
snomed_terms = pd.read_excel(
    'terms_excel.xlsx',
    sheet_name='terms',
    dtype=code_dtypes,
)
snomed_terms.head()

Unnamed: 0,terms,concept_code,time,qualifier,qualifier_code,direction
0,Diabetes,73211009,,,,
1,age,71395006,,,,
2,Sex (Male),248153007,,,,
3,Hypertension,59621000,,,,
4,BMI,60621009,,,,


Load the concept table (`CONCEPT.csv`) from the OHDSI Athena vocabulary download.

In [49]:
# Columns of the Athena CONCEPT table that the rest of the notebook uses.
concept_columns = ['concept_id','concept_name','domain_id','vocabulary_id','concept_class_id','concept_code']

# Parse only the columns we keep: CONCEPT.csv is large, and reading every
# column just to discard most of them wastes time and memory.
# `concept_code` is read as str so it can be joined against the string codes
# from the terms spreadsheet.
# NOTE(review): Athena exports are, as far as I know, tab-delimited without
# quoting, and some codes/names may be literal strings such as "NA". Confirm
# whether `quoting=csv.QUOTE_NONE` and/or `keep_default_na=False` are needed
# here; they are left at the pandas defaults to keep the loaded rows unchanged.
athena_concepts = pd.read_csv(
    'ohdsi_athena_vocab/CONCEPT.csv',
    sep='\t',
    usecols=concept_columns,
    dtype={'concept_code': str},
    low_memory=False,
)
# `usecols` keeps the file's own column order, so reorder explicitly.
athena_concepts = athena_concepts[concept_columns]
athena_concepts.head()

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code
0,3541502,Adverse reaction to drug primarily affecting t...,Condition,SNOMED,Disorder,694331000000106
1,3542356,Adverse reaction to other central nervous syst...,Condition,SNOMED,Disorder,705311000000105
2,4327638,Borderline,Observation,SNOMED,Qualifier Value,75189007
3,42538812,Somatic hallucination,Condition,SNOMED,Clinical Finding,762620006
4,40629514,Stillbirth,Condition,SNOMED,Clinical Finding,76358005


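Merge the term list with the Athena concepts: first on the primary `concept_code`, then on the `qualifier_code`, and finally combine both lookups side by side.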

In [39]:
# Look up the Athena concept for each term's primary SNOMED code.
#
# The join key is the bare concept code, but Athena holds many vocabularies and
# a code is only an identifier within its own vocabulary. Joining against the
# full table can therefore attach a concept from another vocabulary or
# duplicate term rows. Rows with a missing code are dropped from the lookup as
# well, because pandas matches null keys to each other in a merge.
# Assumes every code in the spreadsheet is a SNOMED code, as the section
# heading states -- TODO confirm.
snomed_concepts = athena_concepts[athena_concepts['vocabulary_id'] == 'SNOMED'].dropna(subset=['concept_code'])

# `validate='many_to_one'` raises if a code still maps to several concepts, so
# the result is guaranteed to have exactly one row per row of `snomed_terms`
# (the later index-aligned merge relies on this).
primary_concepts = snomed_terms.merge(
    snomed_concepts,
    how='left',
    on='concept_code',
    validate='many_to_one',
)

In [63]:
# Look up the Athena concept for each term's qualifier code.
#
# Restrict the lookup to SNOMED rows with a non-null code: a code is only an
# identifier within its vocabulary, and pandas matches null keys to each other,
# so terms without a qualifier would otherwise pair with any Athena row whose
# code was parsed as missing.
# Assumes qualifier codes are SNOMED codes, like the primary codes -- TODO confirm.
snomed_concepts = athena_concepts[athena_concepts['vocabulary_id'] == 'SNOMED'].dropna(subset=['concept_code'])

# `validate='many_to_one'` guarantees one output row per row of `snomed_terms`,
# in the same order, which the later index-aligned merge relies on.
secondary_concepts = snomed_terms.merge(
    snomed_concepts,
    how='left',
    left_on='qualifier_code',
    right_on='concept_code',
    suffixes=('', '2'),
    validate='many_to_one',
)

# Keep only the Athena descriptive columns for the qualifier. The code itself
# is already available as `qualifier_code` in the terms table.
qualifier_columns = [col for col in concept_columns if col != 'concept_code']
secondary_concepts = secondary_concepts[qualifier_columns]

In [64]:
# The two lookups are combined purely by row position, so each must contain
# exactly one row per row of `snomed_terms`. A duplicated match in either merge
# would silently pair a term with another term's qualifier; fail loudly instead.
assert len(primary_concepts) == len(snomed_terms) == len(secondary_concepts), (
    "primary/secondary concept lookups are not aligned one-to-one with snomed_terms"
)
assert primary_concepts.index.equals(secondary_concepts.index), (
    "primary and secondary concept lookups have different indexes"
)

# Place the qualifier's concept columns next to the primary ones; the
# qualifier columns receive the `_2` suffix.
all_terms = primary_concepts.merge(
    secondary_concepts,
    left_index=True,
    right_index=True,
    suffixes=('', '_2'),
)
all_terms.head()

Unnamed: 0,terms,concept_code,time,qualifier,qualifier_code,direction,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,concept_id_2,concept_name_2,domain_id_2,vocabulary_id_2,concept_class_id_2
0,Diabetes,73211009,,,,,201820.0,Diabetes mellitus,Condition,SNOMED,Disorder,,,,,
1,age,71395006,,,,,4322978.0,Age factor,Observation,SNOMED,Observable Entity,,,,,
2,Sex (Male),248153007,,,,,442985.0,Male,Observation,SNOMED,Clinical Finding,,,,,
3,Hypertension,59621000,,,,,320128.0,Essential hypertension,Condition,SNOMED,Disorder,,,,,
4,BMI,60621009,,,,,4245997.0,Body mass index,Measurement,SNOMED,Observable Entity,,,,,


### Check most used terms

In [68]:
# Ten primary concepts with the most rows in `all_terms`, most frequent first.
all_terms['concept_name'].value_counts().head(10)

concept_name
Ischemic stroke           13
Male                      12
Exercise                   6
Atrial fibrillation        5
Body mass index            5
Atherosclerosis            5
Essential hypertension     4
Diabetes mellitus          4
Age factor                 4
Inflammation               4
Name: count, dtype: int64