# Data Preprocessing

- Linking terms to SNOMED CT concepts

### Import Data

In [29]:
import json
import pandas as pd
from urllib.request import urlopen, Request
from urllib.parse import quote

In [4]:
# Load the relationship table. Per the preview below, each row pairs an
# Exposure with an Outcome plus Direction/Strength/ID/Status columns, and the
# file carries an unnamed leading column that pandas labels "Unnamed: 0".
df = pd.read_csv(filepath_or_buffer='data/DAGs.csv')
# Show only the first five rows rather than the whole frame.
df.head(n=5)

Unnamed: 0,Author,Exposure,Outcome,Direction,Strength,ID,Status
0,,Diabetes,Ischemic stroke,Increase,6.0,1,Final
1,,age,Ischemic stroke,Increase,6.0,1,Final
2,,age,Diabetes,Increase,5.0,1,Final
3,,Sex (Male),Ischemic stroke,Increase,6.0,1,Final
4,,Hypertension,Ischemic stroke,Increase,6.0,1,Final


### Getting a List of Terms

In [28]:
# Stack the Exposure column on top of the Outcome column and keep each distinct
# value once, in order of first appearance.
terms = pd.concat(
    [df['Exposure'], df['Outcome']],
    ignore_index=True,
).unique()

# Persist the term list as a one-column CSV (header "terms", no index column).
pd.DataFrame({'terms': terms}).to_csv('terms.csv', index=False)

### Querying the SNOMED CT API via Snowstorm

In [44]:
# Value sent as the User-Agent header on every API request.
user_agent = "example@example.com"
# Root of the terminology server; request paths are appended to it.
baseUrl = "https://browser.ihtsdotools.org/snowstorm/snomed-ct"
# Both values below are interpolated into the request path as
# {baseUrl}/browser/{edition}/{version}/...
edition = "MAIN"
version = "2019-07-31"

# Thin wrapper around urlopen that attaches the configured User-Agent header
def urlopen_with_header(url):
    """Open ``url`` with the module-level ``user_agent`` sent as ``User-Agent``.

    Args:
        url: Fully built request URL.

    Returns:
        Whatever ``urlopen`` returns for the prepared request; the caller is
        responsible for reading it.
    """
    request = Request(url, headers={'User-Agent': user_agent})
    return urlopen(request)

# Function to retrieve SNOMED concept for a term
def get_primary_snomed_concept(term):
    """Return the first SNOMED CT description match for ``term``.

    Issues a single description search against ``baseUrl`` for the configured
    ``edition`` and ``version`` (``conceptActive=true``,
    ``searchMode=STANDARD``, ``limit=1``) and keeps only the first item of the
    response.

    Args:
        term: Text to search for. It is converted with ``str()`` and
            percent-encoded before being placed in the query string.

    Returns:
        A dict with the keys ``"Standardized_Term"`` (the matched item's
        ``term``) and ``"SNOMED_ID"`` (the matched item's
        ``concept.conceptId``), or ``None`` when the response's ``items`` list
        is empty.

    Raises:
        KeyError: If the decoded response has no ``items`` key, or the first
            item lacks the expected fields.
        Any exception raised by ``urlopen`` or ``json.loads`` propagates
        unchanged; nothing is caught here.
    """
    url = (
        f"{baseUrl}/browser/{edition}/{version}/descriptions"
        f"?term={quote(str(term))}"
        "&conceptActive=true&groupByConcept=false&searchMode=STANDARD&offset=0&limit=1"
    )

    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body is read, even if decoding fails. This function
    # is called once per term, so unclosed responses would accumulate.
    with urlopen_with_header(url) as response:
        data = json.loads(response.read().decode('utf-8'))

    if not data['items']:
        return None

    primary_concept = data['items'][0]
    return {
        "Standardized_Term": primary_concept['term'],
        "SNOMED_ID": primary_concept['concept']['conceptId'],
    }

terms_df = pd.read_csv('terms.csv')
# Lowercase and trim so spelling variants such as "Age" / "age " share one lookup.
terms_df['normalized_term'] = terms_df['terms'].str.lower().str.strip()

# Drop missing or blank terms: a NaN would otherwise reach the API as the
# literal query "nan" (the lookup applies str() to its argument), and an empty
# string is not a meaningful search.
valid_terms_df = terms_df[terms_df['normalized_term'].fillna('').ne('')]
unique_terms = valid_terms_df['normalized_term'].unique()

# For every normalized term, remember the distinct original spellings it came
# from, so Original_Terms really holds the original text, not the normalized key.
original_spellings = (
    valid_terms_df
    .drop_duplicates(subset='terms')
    .groupby('normalized_term', sort=False)['terms']
    .agg(list)
)

term_mappings = []

# One API request per distinct normalized term; terms without a match are skipped.
for term in unique_terms:
    snomed_concept = get_primary_snomed_concept(term)
    if snomed_concept:
        snomed_concept['Original_Terms'] = original_spellings.loc[term]
        term_mappings.append(snomed_concept)

# Name the columns explicitly so the frame has the same schema even when no
# term could be matched (an empty list would otherwise yield no columns).
standardized_df = pd.DataFrame(
    term_mappings,
    columns=['Standardized_Term', 'SNOMED_ID', 'Original_Terms'],
)