<a href="https://colab.research.google.com/github/hantswilliams/r22-sdoh-interoperability/blob/main/test_notebooks/advanced_r21_tests_aim1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import re
import json
import numpy as np

## Tokenizing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

## word embeddings
import gensim
from gensim.models import Word2Vec
from gensim import models

## cosine
from scipy.spatial.distance import cosine


In [2]:
# Download stopwords if not available
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
reference_list = [
    "Limited access to nutrition supplies (finding)",
    "Noncompliance with dietary regimen (finding)",
    "Nutrition impaired due to limited access to healthful foods (finding)",
    "Mild food insecurity on United States household food security survey module (finding)",
    "Moderate food insecurity on United States household food security survey module (finding)",
    # ... (other descriptions)
    "Caregiver's noncompliance with patient's dietary regimen due to financial hardship"
]

In [6]:
# Tokenizing and filtering stopwords
all_tokens = []
for description in reference_list:
    tokens = word_tokenize(description)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]
    all_tokens.append(' '.join(filtered_tokens))

# Computing TF-IDF values
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_tokens)
feature_names = vectorizer.get_feature_names_out()
tfidf_values = tfidf_matrix.sum(axis=0).tolist()[0]
tfidf_dict = dict(zip(feature_names, tfidf_values))

print(tfidf_dict)  # This dictionary contains keywords with their respective TF-IDF values

{'access': 0.7889062102134055, 'caregiver': 0.3866311833916044, 'dietary': 0.8601407295599457, 'due': 0.6531548345643936, 'financial': 0.3866311833916044, 'finding': 1.180415192099782, 'food': 1.114664374721447, 'foods': 0.40988520714433785, 'hardship': 0.3866311833916044, 'healthful': 0.40988520714433785, 'household': 0.5573321873607235, 'impaired': 0.40988520714433785, 'insecurity': 0.5573321873607235, 'limited': 0.7889062102134055, 'mild': 0.33983077113876825, 'moderate': 0.3398307711387683, 'module': 0.5573321873607235, 'noncompliance': 0.8601407295599457, 'nutrition': 0.7889062102134055, 'patient': 0.3866311833916044, 'regimen': 0.8601407295599457, 'security': 0.5573321873607235, 'states': 0.5573321873607235, 'supplies': 0.5521787534625713, 'survey': 0.5573321873607235, 'united': 0.5573321873607235}


In [4]:
keywords = set()
for description in reference_list:
    tokens = word_tokenize(description)
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]
    keywords.update(filtered_tokens)
print(keywords)

{'access', 'module', ')', 'due', 'patient', 'food', 'noncompliance', 'moderate', "'s", 'healthful', 'united', 'states', 'nutrition', 'caregiver', 'regimen', '(', 'survey', 'security', 'financial', 'insecurity', 'finding', 'dietary', 'hardship', 'limited', 'mild', 'impaired', 'foods', 'supplies', 'household'}


In [8]:
# Filtering keywords based on the TF-IDF threshold
threshold = 0.5
keywords = {key for key, value in tfidf_dict.items() if value > threshold}
print(keywords)

{'united', 'access', 'module', 'states', 'insecurity', 'finding', 'due', 'nutrition', 'dietary', 'food', 'regimen', 'limited', 'supplies', 'survey', 'noncompliance', 'household', 'security'}


In [25]:
## connect to pre-trained model

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [27]:
# # unzip model, only need to do this once
# # /content/drive/MyDrive/embeddings_model/GoogleNews-vectors-negative300.bin.gz
# !gunzip '/content/drive/MyDrive/embeddings_model/GoogleNews-vectors-negative300.bin.gz'


gzip: /content/drive/MyDrive/embeddings_model/GoogleNews-vectors-negative300.bin.gz: No such file or directory


In [32]:
# Assuming you have loaded a Word2Vec model (you can train or load a pre-trained one)
from gensim import models
model = models.KeyedVectors.load_word2vec_format(
    "/content/drive/MyDrive/embeddings_model/GoogleNews-vectors-negative300.bin",
    binary=True
    )

In [45]:
# Convert keywords into embeddings
keyword_embeddings = {keyword: model[keyword] for keyword in keywords if keyword in model}


In [50]:
# Assuming `model` is your loaded Word2Vec model and `keywords` contains your list of keywords
similar_words_dict = {}

for keyword in keywords:
    # Check if keyword exists in the model
    if keyword in model:
        # Find the top 10 similar words for each keyword
        similar_words = model.most_similar(keyword, topn=10)
        # Extract only the words, not the similarity scores
        similar_words_list = [word[0] for word in similar_words]
        # Update the dictionary
        similar_words_dict[keyword] = similar_words_list

# Print the dictionary to view the results
for key, values in similar_words_dict.items():
    print(f"{key}: {values}")


united: ['unite', 'uniting', 'unified', 'chanted_HRC_HRC', 'unity', 'disunited', 'unify', 'unifying', 'unites', 'resolute']
access: ['unfettered_access', 'acess', 'acccess', 'accesss', 'accessing', 'acces', 'Access', 'accessto', 'accessible', 'accessibility']
module: ['modules', 'Module', 'Modules', '###mb_PowerBook_G4', 'JTAG_port', 'image_contrib', 'databus', 'RS###_interface', 'RS###_port', 'PMBus_interface']
states: ['States', 'countries', 'jurisdictions', 'regions', 'counties', 'nations', 'Canadian_providences', 'legislatures', 'sates', 'districts']
insecurity: ['Vigilante_roadblocks', 'Insecurity', 'instability', 'hopelessness', 'lawlessness', 'Grinding_poverty', 'anxiety', 'insecure', 'alienation', 'powerlessness']
finding: ['Finding', 'find', 'discovering', 'found', 'searching', 'finds', 'proving', 'locating', 'figuring', 'obtaining']
due: ['owing', 'Due', 'dueto', 'attributed', 'Owing', 'because', 'necessitated', 'resulting', 'caused', 'exacerbated']
nutrition: ['nutritional',

In [48]:
# Limit the display of floating point numbers
np.set_printoptions(precision=4, suppress=True)

# Print embeddings for each keyword
for keyword, embedding in keyword_embeddings.items():
    print(f"Keyword: {keyword}")
    print("Embedding:", embedding[:10], "...")  # Here we're only showing the first 10 dimensions, but you can adjust as needed.
    print("-------------------------------------------------")


Keyword: united
Embedding: [-0.0209  0.106  -0.0299  0.1582 -0.418  -0.063   0.0221 -0.2852  0.2207
 -0.0139] ...
-------------------------------------------------
Keyword: access
Embedding: [-0.0874 -0.1865 -0.1514  0.0074  0.0767  0.1602  0.0044  0.0942  0.1709
 -0.1021] ...
-------------------------------------------------
Keyword: module
Embedding: [-0.2275 -0.2393  0.019  -0.0052  0.0454  0.0684  0.0581  0.127   0.0415
 -0.1064] ...
-------------------------------------------------
Keyword: states
Embedding: [ 0.1445 -0.104   0.0674  0.1836  0.0491  0.0315 -0.2393  0.0192 -0.1348
 -0.0454] ...
-------------------------------------------------
Keyword: insecurity
Embedding: [ 0.2773  0.0645 -0.2324  0.1377 -0.1211  0.1885 -0.2061 -0.3203  0.0242
 -0.1465] ...
-------------------------------------------------
Keyword: finding
Embedding: [-0.0161  0.0317 -0.2139  0.1328 -0.0664  0.0135  0.0137  0.0027  0.0854
  0.0094] ...
-------------------------------------------------
Keyword: du

In [36]:
# Step 2: Metadata Parsing
metadata = """[
    {
        "accessLevel": "public",
        "landingPage": "https://healthdata.gov/d/229f-a34m",
        "issued": "2023-02-22",
        "@type": "dcat:Dataset",
        "modified": "2023-02-22",
        "contactPoint": {
            "@type": "vcard:Contact",
            "fn": "HHS Office of the Chief Data Officer",
            "hasEmail": "mailto:no-reply@healthdata.gov"
        },
        "identifier": "https://healthdata.gov/api/views/229f-a34m",
        "publisher": {
            "@type": "org:Organization",
            "name": "healthdata.gov"
        },
        "description": "",
        "title": "Access to Care (2023)"
    },
    {
        "accessLevel": "public",
        "landingPage": "https://healthdata.gov/d/238m-ezg9",
        "bureauCode": ["009:00"],
        "issued": "2021-10-14",
        "@type": "dcat:Dataset",
        "modified": "2021-10-13",
        "keyword": ["drug rebate program"],
        "contactPoint": {
            "@type": "vcard:Contact",
            "fn": "Medicaid.gov",
            "hasEmail": "mailto:Medicaid.gov@cms.hhs.gov"
        },
        "publisher": {
            "@type": "org:Organization",
            "name": "Centers for Medicare & Medicaid Services"
        },
        "identifier": "e28727b2-fe6b-46cb-8617-408de290200d",
        "description": "The data below contains newly reported, active covered outpatient drugs which were reported by participating drug manufacturers since the last quarterly update of the Drug Products in the Medicaid Drug Rebate Program (MDRP) database.",
        "title": "Product Data for Newly Reported Drugs in the Medicaid Drug Rebate Program 20210726 to 20210801",
        "programCode": ["009:000"],
        "distribution": [{
            "@type": "dcat:Distribution",
            "downloadURL": "https://data.medicaid.gov/sites/default/files/uploaded_resources/mdrp-newly-rpt-drugs-aug-20210726-to-20210801.csv",
            "mediaType": "text/csv"
        }],
        "license": "http://www.usa.gov/publicdomain/label/1.0/",
        "accrualPeriodicity": "R/P10Y"
    }
]"""

metadata_data = json.loads(metadata)

In [38]:
# Convert metadata texts into embeddings
def convert_to_embeddings(text, model):
    tokens = word_tokenize(text)
    embeddings = [model[token] for token in tokens if token in model]
    return embeddings

metadata_embeddings = []
for entry in metadata_data:
    text = entry.get("description", "") + " " + entry.get("title", "")
    embeddings = convert_to_embeddings(text, model)
    metadata_embeddings.append((entry, embeddings))

In [44]:
# Step 3: Matching Algorithm

def match_using_embeddings(metadata_entry, keyword_embeddings):
    for keyword, key_embedding in keyword_embeddings.items():
        for meta_embedding in metadata_entry[1]:
            similarity = 1 - cosine(key_embedding, meta_embedding)
            if similarity > 0.5:  # Threshold for cosine similarity
                return True, keyword  # Return matched keyword
    return False, None  # If no match found

# Checking each metadata entry
for entry in metadata_embeddings:
    is_match, matched_keyword = match_using_embeddings(entry, keyword_embeddings)
    if is_match:
        print(f"Found a potential match in: {entry[0]['title']} for keyword: {matched_keyword}")


Found a potential match in: Access to Care (2023) for keyword: access
