<a href="https://colab.research.google.com/github/hantswilliams/r22-sdoh-interoperability/blob/main/test_notebooks/simple_r21_tests_aim1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import re
import json

## Tokenizing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

## word embeddings
import gensim
from gensim.models import Word2Vec

## cosine
from scipy.spatial.distance import cosine


In [4]:
# Download the NLTK resources this notebook needs (each call is a no-op when
# the resource is already up to date locally).
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' model; without it the tokenizing cells raise LookupError on
# a fresh kernel. 'punkt' is still fetched for older installations.
nltk.download('punkt_tab')

# English stop words as a set, for fast membership tests while filtering tokens.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Reference corpus: free-text descriptions that the following cells tokenize
# and score with TF-IDF to derive search keywords.
# NOTE(review): the "(finding)" suffix looks like SNOMED CT naming — confirm
# the source terminology.
reference_list = [
    "Limited access to nutrition supplies (finding)",
    "Noncompliance with dietary regimen (finding)",
    "Nutrition impaired due to limited access to healthful foods (finding)",
    "Mild food insecurity on United States household food security survey module (finding)",
    "Moderate food insecurity on United States household food security survey module (finding)",
    # ... (other descriptions)
    "Caregiver's noncompliance with patient's dietary regimen due to financial hardship"
]

In [6]:
# Lower-case each description's tokens and drop English stop words; the
# surviving tokens are re-joined into one string per description so they can
# be handed to the vectorizer.
all_tokens = []
for description in reference_list:
    tokens = word_tokenize(description)
    filtered_tokens = [
        lowered for lowered in map(str.lower, tokens) if lowered not in stop_words
    ]
    all_tokens.append(' '.join(filtered_tokens))

# Fit TF-IDF on the cleaned descriptions, then add up every term's weight
# across all descriptions to get a single score per term.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_tokens)
feature_names = vectorizer.get_feature_names_out()
# Summing over axis 0 yields a single row; unpack that row from the nested list.
(tfidf_values,) = tfidf_matrix.sum(axis=0).tolist()
tfidf_dict = {term: score for term, score in zip(feature_names, tfidf_values)}

print(tfidf_dict)  # Maps each term to its summed TF-IDF weight

{'access': 0.7889062102134055, 'caregiver': 0.3866311833916044, 'dietary': 0.8601407295599457, 'due': 0.6531548345643936, 'financial': 0.3866311833916044, 'finding': 1.180415192099782, 'food': 1.114664374721447, 'foods': 0.40988520714433785, 'hardship': 0.3866311833916044, 'healthful': 0.40988520714433785, 'household': 0.5573321873607235, 'impaired': 0.40988520714433785, 'insecurity': 0.5573321873607235, 'limited': 0.7889062102134055, 'mild': 0.33983077113876825, 'moderate': 0.3398307711387683, 'module': 0.5573321873607235, 'noncompliance': 0.8601407295599457, 'nutrition': 0.7889062102134055, 'patient': 0.3866311833916044, 'regimen': 0.8601407295599457, 'security': 0.5573321873607235, 'states': 0.5573321873607235, 'supplies': 0.5521787534625713, 'survey': 0.5573321873607235, 'united': 0.5573321873607235}


In [7]:
# Collect every distinct lower-cased token that is not an English stop word,
# across all reference descriptions.
keywords = set()
for description in reference_list:
    tokens = word_tokenize(description)
    filtered_tokens = [
        lowered for lowered in map(str.lower, tokens) if lowered not in stop_words
    ]
    keywords.update(filtered_tokens)
print(keywords)

{'due', 'financial', 'finding', 'healthful', 'regimen', 'household', 'survey', 'module', 'foods', 'hardship', 'supplies', "'s", ')', 'patient', 'access', 'caregiver', 'insecurity', 'noncompliance', 'states', 'security', 'mild', 'food', '(', 'limited', 'dietary', 'moderate', 'impaired', 'nutrition', 'united'}


In [8]:
# Keep only the terms whose summed TF-IDF weight is strictly above the cut-off.
threshold = 0.5
keywords = {term for term in tfidf_dict if tfidf_dict[term] > threshold}
print(keywords)

{'united', 'insecurity', 'due', 'limited', 'noncompliance', 'finding', 'dietary', 'household', 'regimen', 'security', 'states', 'supplies', 'survey', 'food', 'module', 'nutrition', 'access'}


In [11]:
# Step 2: Metadata Parsing
# Inline JSON sample holding two catalog entries (both typed "dcat:Dataset",
# with healthdata.gov landing pages). The first entry has an empty
# "description"; the second carries a full description plus keyword,
# distribution and license fields.
metadata = """[
    {
        "accessLevel": "public",
        "landingPage": "https://healthdata.gov/d/229f-a34m",
        "issued": "2023-02-22",
        "@type": "dcat:Dataset",
        "modified": "2023-02-22",
        "contactPoint": {
            "@type": "vcard:Contact",
            "fn": "HHS Office of the Chief Data Officer",
            "hasEmail": "mailto:no-reply@healthdata.gov"
        },
        "identifier": "https://healthdata.gov/api/views/229f-a34m",
        "publisher": {
            "@type": "org:Organization",
            "name": "healthdata.gov"
        },
        "description": "",
        "title": "Access to Care (2023)"
    },
    {
        "accessLevel": "public",
        "landingPage": "https://healthdata.gov/d/238m-ezg9",
        "bureauCode": ["009:00"],
        "issued": "2021-10-14",
        "@type": "dcat:Dataset",
        "modified": "2021-10-13",
        "keyword": ["drug rebate program"],
        "contactPoint": {
            "@type": "vcard:Contact",
            "fn": "Medicaid.gov",
            "hasEmail": "mailto:Medicaid.gov@cms.hhs.gov"
        },
        "publisher": {
            "@type": "org:Organization",
            "name": "Centers for Medicare & Medicaid Services"
        },
        "identifier": "e28727b2-fe6b-46cb-8617-408de290200d",
        "description": "The data below contains newly reported, active covered outpatient drugs which were reported by participating drug manufacturers since the last quarterly update of the Drug Products in the Medicaid Drug Rebate Program (MDRP) database.",
        "title": "Product Data for Newly Reported Drugs in the Medicaid Drug Rebate Program 20210726 to 20210801",
        "programCode": ["009:000"],
        "distribution": [{
            "@type": "dcat:Distribution",
            "downloadURL": "https://data.medicaid.gov/sites/default/files/uploaded_resources/mdrp-newly-rpt-drugs-aug-20210726-to-20210801.csv",
            "mediaType": "text/csv"
        }],
        "license": "http://www.usa.gov/publicdomain/label/1.0/",
        "accrualPeriodicity": "R/P10Y"
    }
]"""

# Parse the JSON text; the top-level array becomes a list with one dict per entry.
metadata_data = json.loads(metadata)

In [13]:
def check_keywords_in_metadata(metadata_entry, keywords):
    """Return the first keyword that appears as a whole word in an entry's text.

    The searched text is the entry's "description" and "title" values joined
    by a space and lower-cased. Keywords are compared as given, so they are
    expected to be lower-case already.

    Args:
        metadata_entry: Mapping that may contain "description" and "title".
            A missing key or a None value is treated as an empty string.
        keywords: Iterable of keyword strings. Empty strings are ignored.

    Returns:
        The alphabetically first keyword found in the text, or None when no
        keyword matches.
    """
    # `or ""` covers both a missing key and an explicit null in the JSON,
    # either of which would otherwise break the string handling below.
    description = metadata_entry.get("description") or ""
    title = metadata_entry.get("title") or ""
    text_to_check = f"{description} {title}".lower()

    # Iterate in sorted order so the returned keyword does not depend on set
    # iteration order, which can differ from one interpreter run to the next.
    for keyword in sorted(keywords):
        if not keyword:
            continue
        # Whole-word match: plain substring search would report "due" inside
        # "residue". Lookarounds are used instead of \b so that keywords made
        # of non-word characters are still handled consistently.
        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
        if re.search(pattern, text_to_check):
            return keyword
    return None


In [14]:
# Report every metadata entry whose title/description mentions a keyword.
for entry in metadata_data:
    matched_keyword = check_keywords_in_metadata(entry, keywords)
    if matched_keyword:
        # The matcher treats "title" as optional, so an entry can match through
        # its description alone; use .get so such an entry is still reported
        # instead of raising KeyError.
        print(f"Found a potential match in: {entry.get('title', '(untitled)')} for keyword: {matched_keyword}")

Found a potential match in: Access to Care (2023) for keyword: access
