In [1]:
import pickle
import torch
import numpy as np
import gensim
from gensim.models import KeyedVectors
import os
import pandas as pd
import csv
import fasttext
import sys
import torch

In [2]:
vectors_filename = "/home/jovyan/embeddings/BioWordVec_PubMed_MIMICIII_d200.vec.bin"
model_filename = "/home/jovyan/embeddings/BioWordVec_PubMed_MIMICIII_d200.bin"

#! Remove this path once I implement PSI
deid_notes_path = "/home/jovyan/mpc_use_case/unstructured_data/deidentified_notes"

knee_keywords_output = "/home/jovyan/mpc_use_case/crypten_structured_data/knee_keywords.pt"
hip_keywords_output = "/home/jovyan/mpc_use_case/crypten_structured_data/hip_keywords.pt"

In [3]:
bioword_vector = KeyedVectors.load_word2vec_format(vectors_filename, binary=True)
print("Vectors loaded")

Vectors loaded


In [4]:
bioword_model = fasttext.load_model(model_filename)
print("Model loaded")

In [None]:

#! -----------------------------------------------------------------------------------------
#TODO Untructured data - Remove after
#! -----------------------------------------------------------------------------------------
print("\nProcessing patients and their notes based on their diagnosis")

# Fetching demographic_no for all patients from the filename of notes
files = os.listdir(deid_notes_path)
all_demographic_nos_notes = set()
oa_patients = set()
for file in files:
    demographic_no = int(file.split("-")[1].split(".")[0])
    all_demographic_nos_notes.add(demographic_no)
print("Number of patients having patient notes:", len(all_demographic_nos_notes))


Processing patients and their notes based on their diagnosis
Number of patients having patient notes: 163


In [None]:
all_demographic_nos_dxresearch = set()
oa_patients = set()

# Convert txt to csv
with open('/home/jovyan/mpc_use_case/structured_data/DxResearch.txt', 'r') as in_file:
    stripped = (line.strip() for line in in_file)
    lines = (line.split(",") for line in stripped if line)
    with open('/home/jovyan/mpc_use_case/prototype/oaTypes/DxResearch.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerows(lines)

# Fetching demographic_no of total patients and OA patients from the DxResearch table
df = pd.read_csv("/home/jovyan/mpc_use_case/prototype/oaTypes/DxResearch.csv")
# df.head()
for index, row in df.iterrows():
    no = row['demographic_no']
    all_demographic_nos_dxresearch.add(no)
    if row['dxresearch_code'] == 715:
        oa_patients.add(no)
print("\nNumber of patients listed with disease code:", len(all_demographic_nos_dxresearch))
print("Number of patients listed in disease code table as having OA:", len(oa_patients))


Number of patients listed with disease code: 163
Number of patients listed in disease code table as having OA: 33


In [None]:

#! -----------------------------------------------------------------------------------------
#TODO Where PSI comes in
#! -----------------------------------------------------------------------------------------
# Deducting the demographic_no of OA patients having notes
oa_patients_with_notes = oa_patients.intersection(all_demographic_nos_notes)
#print("Number of patients having OA and notes:", len(oa_patients_with_notes))
print("Patient IDs:", sorted(oa_patients_with_notes))

Patient IDs: [4, 5, 6, 7, 8, 9, 11, 14, 18, 26, 37, 40, 54, 58, 61, 63, 64, 76, 77, 83, 94, 101, 103, 106, 110, 115, 133, 135, 148, 150, 155, 159, 162]


In [None]:
# Lists that contain keywords that we can look for in the clinical notes
knee_oa_substrings = ['knee pain', 'pain knee', 'knee oa', 'oa knee', 'osteoarthrities knee', 'knee osteoarthritis', 
                      'kneepain', 'painknee', 'kneeoa', 'oaknee', 'osteoarthritiesknee', 'kneeosteoarthritis']
hip_oa_substrings = ['hip pain', 'pain hip', 'hip oa', 'oa hip', 'osteoarthrities hip', 'hip osteoarthritis',
                     'hippain', 'painhip', 'hipoa', 'oahip', 'osteoarthritieship', 'hiposteoarthritis']

# knee_oa_substrings = ['kneepain', 'painknee', 'kneeoa', 'oaknee', 'osteoarthritiesknee', 'kneeosteoarthritis']
# hip_oa_substrings = ['hip pain', 'pain hip', 'hip oa', 'oa hip', 'osteoarthrities hip', 'hip osteoarthritis',
#                      'hippain', 'painhip', 'hipoa', 'oahip', 'osteoarthritieship', 'hiposteoarthritis']

In [None]:
# This adds every individual word into a new list, and splits up strings that contain more than one word so that their
# individual words can be added to the list. The new list contains unique individual words that are found in any of the original
# strings. Returns a dictionary of individual words and their associated embeddings
def create_embeddings(substrings):  
    vectors = {}
    single_words =[]
    for i in substrings:
        # If the substring only contains one word
        if len(i.split()) == 1:
            try:
                word_embedding = bioword_vector[i]
            # If the word does not have an embedding already, we use the model to create one for it
            except:
                word_embedding = bioword_model.get_word_vector(i)
            vectors[i] = word_embedding
        else:
            # Split the string into two words
            multiple_words = i.split()
            multiple_words_embeddings = []
            # This creates an embedding for each word in the string
            for word in multiple_words:
                try:
                    word_embedding = bioword_vector[word]
                # If the word does not have an embedding already, we use the model to create one for it
                except:
                    word_embedding = bioword_model.get_word_vector(word)
                multiple_words_embeddings.append(word_embedding)
            vectors[i] = multiple_words_embeddings
    return vectors

knee_oa_embeddings = create_embeddings(knee_oa_substrings)
hip_oa_embeddings = create_embeddings(hip_oa_substrings)

print(knee_oa_embeddings.keys())
print(hip_oa_embeddings.keys())

dict_keys(['knee pain', 'pain knee', 'knee oa', 'oa knee', 'osteoarthrities knee', 'knee osteoarthritis', 'kneepain', 'painknee', 'kneeoa', 'oaknee', 'osteoarthritiesknee', 'kneeosteoarthritis'])
dict_keys(['hip pain', 'pain hip', 'hip oa', 'oa hip', 'osteoarthrities hip', 'hip osteoarthritis', 'hippain', 'painhip', 'hipoa', 'oahip', 'osteoarthritieship', 'hiposteoarthritis'])


In [None]:
# Convert the word embeddings into CrypTen tensors, save the encrypted tensors to a file for future use
def create_tensors(list_embeddings):
    tensors = []
    for key in list_embeddings:
        if len(key.split()) == 1:
            tensor = torch.Tensor(list_embeddings[key])
            tensors.append(tensor)
        else:
            list_tensors = []
            for i in range(len((key.split()))):
                tensor = torch.Tensor(list_embeddings[key][i])
                list_tensors.append(tensor)
            tensors.append(list_tensors)
    return tensors

knee_oa_tensors = create_tensors(knee_oa_embeddings)
hip_oa_tensors = create_tensors(hip_oa_embeddings)

print(knee_oa_tensors[0])

[tensor([-0.2603,  0.4272,  0.3095, -0.3169, -0.2128, -0.5430, -0.2666,  0.1239,
         1.5385,  0.4681, -0.7621,  0.1065,  0.5792,  0.3217,  0.3974,  0.4679,
         0.4497,  0.0905,  0.4148,  0.1793, -0.3565,  0.4682, -0.7171,  0.2464,
         0.4396, -0.1002,  0.3209, -0.0387,  1.1097,  0.1031,  0.3740,  0.4466,
         0.0312, -0.0053, -1.0080, -0.3279, -0.2582, -0.3532, -0.4033,  0.2935,
         0.2130, -0.1458, -0.7886,  0.7468, -0.4722,  0.7275,  0.5605, -0.1837,
        -1.1738, -0.2133,  0.0285, -0.0681, -1.0151,  0.5577, -0.3304,  0.3571,
        -0.0506,  0.3155, -0.3221, -0.0435,  0.5139,  0.3927,  0.3636,  0.5923,
         0.3153,  0.1941, -0.0179,  0.5383,  0.3122,  0.1515, -0.6059,  0.1711,
         0.0454,  0.2127, -0.3796, -0.3050,  0.1448,  0.7174,  0.1922, -0.6134,
         0.0054, -0.9837, -0.6054,  0.3950,  0.0085,  0.1025, -0.4495, -0.3519,
         0.7130, -0.0902,  0.2316, -0.5953,  0.1683, -0.0857, -0.5314, -0.8464,
        -0.3151,  0.5821,  0.1732, -0.8

  tensor = torch.Tensor(list_embeddings[key][i])


In [None]:
# This function creates pickle files that can be retrieved later in the MPC protocol
def create_file(filename, tensors):
    with open(filename, 'wb') as f:
        torch.save(tensors, f)
    
create_file(knee_keywords_output, knee_oa_tensors)
create_file(hip_keywords_output, hip_oa_tensors)