# Check list

- DataElements
    - ~ContactTracing~
    - ~Interview~
    - ~LocatingInformation~
    - ~PreExistingConditions~
    - ~RiskFactors~
    - ~SARSCoV2Test~
    - ~SymptomsAndClinicalCourse~
    - ~Connection to investigation~
- ~Patient~
- Investigator

# Initial Settings

In [1]:
from lxml import etree
import random
import copy
from datetime import datetime, timedelta
import pytz
from faker import Faker
from tqdm import trange
import gc

In [2]:
# https://stackoverflow.com/questions/25991860/unable-to-pass-an-lxml-etree-object-to-a-separate-process
import copyreg
from io import StringIO

def element_unpickler(data):
    """Rebuild an lxml element from the serialized XML in ``data``."""
    element = etree.fromstring(data)
    return element

def element_pickler(element):
    """Reduce ``element`` to the ``(callable, args)`` pair used for pickling.

    The element is serialized with ``etree.tostring``; ``element_unpickler``
    is named as the callable that turns the payload back into an element.
    """
    serialized = etree.tostring(element)
    return (element_unpickler, (serialized,))

# Register the reducer so pickle can handle lxml elements.
copyreg.pickle(etree._Element, element_pickler, element_unpickler)

def elementtree_unpickler(data):
    """Rebuild an lxml ``ElementTree`` from serialized XML.

    Args:
        data: The XML document as ``bytes`` (what ``elementtree_pickler``
            produces) or as ``str`` (still accepted for older payloads).

    Returns:
        The parsed ``ElementTree``.
    """
    # Local import: the notebook's import cell only brings in StringIO.
    from io import BytesIO

    stream = StringIO(data) if isinstance(data, str) else BytesIO(data)
    return etree.parse(stream)

def elementtree_pickler(tree):
    """Reduce ``tree`` to the ``(callable, args)`` pair used for pickling.

    ``tree.write`` emits bytes, so the document is collected in a ``BytesIO``;
    writing into a ``StringIO`` fails on Python 3 because a text buffer
    rejects bytes.
    """
    from io import BytesIO

    buffer = BytesIO()
    tree.write(buffer)
    return elementtree_unpickler, (buffer.getvalue(),)

# Register the reducer so pickle can handle whole lxml trees.
copyreg.pickle(etree._ElementTree, elementtree_pickler, elementtree_unpickler)

In [3]:
# IRI of the contact-tracing data-elements ontology.
_BASE_IRI = "https://ebiquity.umbc.edu/cdc-contact-tracing-data-elements#"

# Prefix -> namespace IRI. The ``None`` key holds the same IRI as "base".
NAME_SPACE = {
    "base": _BASE_IRI,
    "owl": "http://www.w3.org/2002/07/owl#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    None: _BASE_IRI,
}

In [4]:
# Single-letter answer codes used for most coded properties.
YNUR = list("YNUR")
YNPR = list("YNPR")
YN = list("YN")

# Result codes used for the SARS-CoV-2 test result properties.
PNUE = ["Pos", "Neg", "Unk", "Equi"]

# Value sets for the free-standing categorical properties.
CONGREGATE = [
    "Corrections",
    "Dorm",
    "Group home",
    "Multi-family household",
    "Multi-generational household",
]
EMPLOYED = [
    "No-retired",
    "No-student",
    "No-unable to work",
    "No-unemployed",
    "Yes",
]
HCP = [
    "1stResponder",
    "AmbulatoryCare",
    "EMS",
    "Fire",
    "Hospice",
    "Hospital",
    "LawEnforcement",
    "LongTermCare",
    "UrgentCare",
]
CONTACT = [
    "DayCare",
    "Home",
    "Hospital/MedicalCare",
    "PlaceOfWorship",
    "RetailSetting",
    "School",
    "Shelter",
    "Travel/Transit",
    "Workplace",
]
ETHNICITY = ["HispanicOrLatino", "NotHispanicOrLatino"]
GENDER = ["Female", "Male", "Other_Gender", "Unknown_Gender"]
LANGUAGE = [
    "Bengali",
    "Chinese",
    "English",
    "French",
    "German",
    "Hindi",
    "Italian",
    "Korean",
    "Portuguese",
    "Spanish",
]
# NOTE(review): "BlackOrAfricanAmerian" looks like a misspelling of
# "...American"; it is left untouched because it may have to match a name
# used elsewhere -- confirm before changing.
RACE = [
    "AlaskaNative",
    "AmericanIndian",
    "Asian",
    "BlackOrAfricanAmerian",
    "NativeHawaiianOrOtherPacificIslander",
    "White",
]

# Data Generation

## General Method

In [5]:
def create_individual(root, name, class_name):
    """Append an ``owl:NamedIndividual`` of type ``class_name`` to ``root``.

    Args:
        root: Parent element that receives the new individual.
        name: Local name of the individual; appended to the base IRI for
            ``rdf:about``.
        class_name: Local name of the class; appended to the base IRI for
            the ``rdf:type`` resource.

    Returns:
        The newly created ``NamedIndividual`` element.
    """
    owl_ns = NAME_SPACE["owl"]
    rdf_ns = NAME_SPACE["rdf"]
    base_iri = NAME_SPACE["base"]

    individual = etree.SubElement(root, f"{{{owl_ns}}}NamedIndividual")
    individual.set(f"{{{rdf_ns}}}about", f"{base_iri}{name}")

    type_element = etree.SubElement(individual, f"{{{rdf_ns}}}type")
    type_element.set(f"{{{rdf_ns}}}resource", f"{base_iri}{class_name}")
    return individual

In [6]:
def create_object_property(sub, name, obj=None, random_suffix = None, random_value = None):
    """Append an object-property element ``name`` under ``sub``.

    The ``rdf:resource`` target is the base IRI followed by:

    * ``<obj>_<suffix>`` with a random pick from ``random_suffix`` when both
      ``obj`` and ``random_suffix`` are given;
    * a random pick from ``random_value`` when neither ``obj`` nor
      ``random_suffix`` is given but ``random_value`` is;
    * ``obj`` itself in every other case (including ``obj=None``).
    """
    object_property = etree.SubElement(sub, "{%s}%s" % (NAME_SPACE[None], name))

    if obj and random_suffix:
        target = f"{obj}_{random.choice(random_suffix)}"
    elif not (obj or random_suffix) and random_value:
        target = random.choice(random_value)
    else:
        target = obj

    resource_attr = "{%s}resource" % NAME_SPACE["rdf"]
    object_property.set(resource_attr, f"{NAME_SPACE['base']}{target}")

In [7]:
def create_data_property(sub, name, data_type, value = None, random_value = None):
    """Append a typed data-property element ``name`` under ``sub``.

    Args:
        sub: Parent element (the individual that owns the property).
        name: Local name of the property, placed in the default namespace.
        data_type: IRI written to the ``rdf:datatype`` attribute.
        value: Explicit value; written as ``str(value)`` whenever it is not
            ``None``, so falsy values such as ``0`` are kept.
        random_value: Non-empty sequence to draw the value from when no
            explicit ``value`` is given.

    If neither ``value`` nor a non-empty ``random_value`` is supplied, the
    element is created without text.
    """
    data_property = etree.SubElement(sub, "{%s}%s" % (NAME_SPACE[None], name))
    data_property.set("{%s}datatype" % NAME_SPACE["rdf"], data_type)

    if value is not None:
        # Compare against None rather than truthiness: 0, False and "" are
        # legitimate values and used to be dropped silently. An explicit
        # value also wins when random_value is passed as well.
        data_property.text = str(value)
    elif random_value:
        data_property.text = str(random.choice(random_value))

In [8]:
def random_date_between(start, end):
    """Return a random whole-second datetime in ``[start, end)``.

    Both bounds are converted with ``astimezone()`` (called without an
    argument) and stripped of microseconds before the range is measured, so
    the result carries the timezone produced by that conversion.

    Raises:
        ValueError: From ``random.randrange`` when the range spans less than
            one second (``end`` not after ``start``).
    """
    lower, upper = (
        bound.astimezone().replace(microsecond=0) for bound in (start, end)
    )
    # Whole seconds in the range; exact because microseconds were dropped.
    span_seconds = (upper - lower) // timedelta(seconds=1)
    offset = random.randrange(span_seconds)
    return lower + timedelta(seconds=offset)

## DataElements > SymptomsAndClinicalCourse

In [9]:
def create_symptoms_and_clinical_course(root, id_):
    """Add a ``SymptomsAndClinicalCourse_<id_>`` individual with random answers.

    Every symptom / clinical-course property receives one code drawn from
    ``YNUR`` or ``YNPR``. ``DateOfSymptomOnset`` is a random instant between
    2021-07-01 00:00:00 and 2021-07-15 23:59:59 US/Eastern.

    Args:
        root: Element the new individual is appended to.
        id_: Integer record number, zero-padded to 7 digits in the IRI.
    """
    xsd_string = "http://www.w3.org/2001/XMLSchema#string"
    xsd_datetime = "http://www.w3.org/2001/XMLSchema#dateTime"

    symptoms = create_individual(root, f"SymptomsAndClinicalCourse_{id_:07}", "SymptomsAndClinicalCourse")

    # (property name, value set) in output order. The shared constants are
    # used instead of the string literals "YNUR"/"YNPR", which only worked
    # because random.choice picks single characters from a string.
    coded_properties = (
        ("AbdominalPain", YNUR),
        ("Chills", YNUR),
        ("Cough", YNUR),
        ("Death", YNPR),
        ("Diarrhea_GI", YNUR),
        ("ECMO", YNPR),
        ("Fatigue", YNUR),
        ("Fever", YNUR),
        ("Headache", YNUR),
        ("Hospitalized", YNPR),
        ("ICU", YNPR),
        ("LossOfSenseOfSmell", YNUR),
        ("LossOfSenseOfTaste", YNUR),
        ("Malaise", YNUR),
        ("MI", YNPR),
        ("MuscleAche", YNUR),
        ("NasalCongestion", YNUR),
        ("Pneumonia", YNUR),
        ("ShortnessOfBreath", YNUR),
        ("SoreThroat", YNUR),
        ("Stroke", YNPR),
        ("Vomiting", YNUR),
    )
    for property_name, choices in coded_properties:
        create_data_property(symptoms, property_name, xsd_string, random_value=choices)

    # pytz zones must be attached with localize(); passing them as tzinfo=
    # picks the zone's local-mean-time offset instead of EST/EDT.
    eastern = pytz.timezone('US/Eastern')
    onset = random_date_between(
        eastern.localize(datetime(2021, 7, 1, 0, 0, 0)),
        eastern.localize(datetime(2021, 7, 15, 23, 59, 59)),
    ).isoformat()
    create_data_property(symptoms, "DateOfSymptomOnset", xsd_datetime, value=onset)

## DataElements > SARSCoV2Test

In [10]:
def create_sars_cov2_test(root, id_):
    """Add a ``SARSCoV2Test_<id_>`` individual with random test data.

    ``SARSCoV2TestOccurred`` is always written. Only when it is ``'Y'`` are
    the first-test result and date (2021-07-01 .. 2021-07-10 US/Eastern)
    added, and then, with a ``YN`` coin flip, a last-test result and date
    starting 14 calendar days after the first test and ending 2021-07-31.

    Args:
        root: Element the new individual is appended to.
        id_: Integer record number, zero-padded to 7 digits in the IRI.
    """
    xsd_string = "http://www.w3.org/2001/XMLSchema#string"
    xsd_datetime = "http://www.w3.org/2001/XMLSchema#dateTime"
    # pytz zones must be attached with localize(); passing them as tzinfo=
    # picks the zone's local-mean-time offset instead of EST/EDT.
    eastern = pytz.timezone('US/Eastern')

    test = create_individual(root, f"SARSCoV2Test_{id_:07}", "SARSCoV2Test")
    test_occurred = random.choice(YNUR)
    create_data_property(test, "SARSCoV2TestOccurred", xsd_string, value=test_occurred)
    if test_occurred != 'Y':
        return

    # First test: result and date.
    create_data_property(test, "ResultOfFirstSARSCoV2Test", xsd_string, random_value=PNUE)
    first_datetime = random_date_between(
        eastern.localize(datetime(2021, 7, 1, 0, 0, 0)),
        eastern.localize(datetime(2021, 7, 10, 23, 59, 59)),
    )
    create_data_property(test, "DateOfFirstSARSCoV2Test", xsd_datetime, value=first_datetime.isoformat())

    # Optional last test.
    if random.choice(YN) != 'Y':
        return
    create_data_property(test, "ResultOfLastSARSCoV2Test", xsd_string, random_value=PNUE)

    # Earliest last-test time: midnight Eastern, 14 calendar days after the
    # first test. Date arithmetic on the Eastern calendar day replaces
    # ``first_datetime.day + 14``, which read the day in the system-local
    # zone and depended on the window staying inside July.
    first_day = first_datetime.astimezone(eastern).date()
    last_window_start = eastern.localize(
        datetime.combine(first_day + timedelta(days=14), datetime.min.time())
    )
    last_datetime = random_date_between(
        last_window_start,
        eastern.localize(datetime(2021, 7, 31, 23, 59, 59)),
    )
    create_data_property(test, "DateOfLastSARSCoV2Test", xsd_datetime, value=last_datetime.isoformat())

## DataElements > RiskFactors

In [11]:
def create_risk_factors(root, id_):
    """Add a ``RiskFactors_<id_>`` individual with random answers.

    ``CongregateSetting`` and ``HCPSetting`` are only written when the
    corresponding ``Congregate`` / ``HCP`` answer is ``'Y'``.

    Args:
        root: Element the new individual is appended to.
        id_: Integer record number, zero-padded to 7 digits in the IRI.
    """
    xsd_string = "http://www.w3.org/2001/XMLSchema#string"
    risk_factors = create_individual(root, f"RiskFactors_{id_:07}", "RiskFactors")

    def add(property_name, **source):
        # Every property of this individual is an xsd:string.
        create_data_property(risk_factors, property_name, xsd_string, **source)

    lives_in_congregate = random.choice(YNUR)
    add("Congregate", value=lives_in_congregate)
    if lives_in_congregate == 'Y':
        add("CongregateSetting", random_value=CONGREGATE)

    add("ContactWithConfirmedCovidCase", random_value=YNUR)
    add("Employed", random_value=EMPLOYED)

    is_hcp = random.choice(YNUR)
    add("HCP", value=is_hcp)
    if is_hcp == 'Y':
        add("HCPSetting", random_value=HCP)

## DataElements > PreExistingConditions

In [12]:
def create_pre_existing_conditions(root, id_, gender=None):
    """Add a ``PreExistingConditions_<id_>`` individual with random answers.

    Args:
        root: Element the new individual is appended to.
        id_: Integer record number, zero-padded to 7 digits in the IRI.
        gender: Gender of the patient. ``"Female"`` (the value found in
            ``GENDER``) and the short form ``"F"`` get a random ``Pregnant``
            answer; any other value gets ``Pregnant = 'N'``.
    """
    xsd_string = "http://www.w3.org/2001/XMLSchema#string"
    pre = create_individual(root, f"PreExistingConditions_{id_:07}", "PreExistingConditions")

    for condition in (
        "ChronicLiverDisease",
        "ChronicLungDisease",
        "ChronicRenalDisease",
        "CVD",
        "Diabetes",
        "Immunocompromised",
    ):
        create_data_property(pre, condition, xsd_string, random_value=YNPR)

    # The caller passes values from GENDER ("Female", ...); comparing only
    # against 'F' meant this branch never ran and Pregnant was always 'N'.
    if gender in ('F', 'Female'):
        create_data_property(pre, "Pregnant", xsd_string, random_value=YNPR)
    else:
        create_data_property(pre, "Pregnant", xsd_string, value='N')

    create_data_property(pre, "SevereObesity", xsd_string, random_value=YNPR)

## DataElements > Interview

In [13]:
def create_interview(root, id_):
    """Add an ``Interview_<id_>`` individual with three interview attempts.

    Attempt dates are random US/Eastern instants:

    * attempt 1: 2021-07-15 00:00:00 .. 2021-07-20 23:59:59;
    * attempt 2: from midnight 3 calendar days after attempt 1
      to 2021-07-25 23:59:59;
    * attempt 3: from midnight 3 calendar days after attempt 2
      to 2021-07-31 23:59:59.

    Each ``Interview_N_Occurred`` flag is drawn independently from ``YNPR``.

    Args:
        root: Element the new individual is appended to.
        id_: Integer record number, zero-padded to 7 digits in the IRI.
    """
    xsd_string = "http://www.w3.org/2001/XMLSchema#string"
    xsd_datetime = "http://www.w3.org/2001/XMLSchema#dateTime"
    # pytz zones must be attached with localize(); passing them as tzinfo=
    # picks the zone's local-mean-time offset instead of EST/EDT.
    eastern = pytz.timezone('US/Eastern')

    def eastern_midnight_after(moment, days_later):
        # Midnight US/Eastern, days_later days after moment's Eastern
        # calendar day. Replaces ``moment.day + N``, which read the day in
        # the system-local zone and relied on staying inside July.
        day = moment.astimezone(eastern).date() + timedelta(days=days_later)
        return eastern.localize(datetime.combine(day, datetime.min.time()))

    interview = create_individual(root, f"Interview_{id_:07}", "Interview")

    first_datetime = random_date_between(
        eastern.localize(datetime(2021, 7, 15, 0, 0, 0)),
        eastern.localize(datetime(2021, 7, 20, 23, 59, 59)),
    )
    second_datetime = random_date_between(
        eastern_midnight_after(first_datetime, 3),
        eastern.localize(datetime(2021, 7, 25, 23, 59, 59)),
    )
    third_datetime = random_date_between(
        eastern_midnight_after(second_datetime, 3),
        eastern.localize(datetime(2021, 7, 31, 23, 59, 59)),
    )

    for attempt in (1, 2, 3):
        create_data_property(interview, f"Interview_{attempt}_Occurred", xsd_string, random_value=YNPR)

    attempt_dates = (first_datetime, second_datetime, third_datetime)
    for attempt, moment in enumerate(attempt_dates, start=1):
        create_data_property(interview, f"DateOfInterviewAttempt_{attempt}", xsd_datetime,
                             value=moment.isoformat())

## DataElements > ContactTracing

In [14]:
def create_contact_tracing(root, id_):
    """Add a ``ContactTracing_<id_>`` individual with random answers.

    The household and intimate-partner counts are tied to their yes/no
    answers: a count of 1-9 when the answer is ``'Y'``, otherwise ``0``.

    Args:
        root: Element the new individual is appended to.
        id_: Integer record number, zero-padded to 7 digits in the IRI.
    """
    xsd_string = "http://www.w3.org/2001/XMLSchema#string"
    xsd_int = "http://www.w3.org/2001/XMLSchema#int"
    # random.choice draws one character, i.e. a count from 1 to 9.
    contact_counts = "123456789"

    contact = create_individual(root, f"ContactTracing_{id_:07}", "ContactTracing")

    household = random.choice(YNUR)
    create_data_property(contact, "AnyHouseholdContact", xsd_string, value=household)
    if household == 'Y':
        create_data_property(contact, "TotalNumberOfHouseholdContacts", xsd_int, random_value=contact_counts)
    else:
        create_data_property(contact, "TotalNumberOfHouseholdContacts", xsd_int, value="0")

    create_data_property(contact, "CanSelfIsolate", xsd_string, random_value=YNUR)
    create_data_property(contact, "NeedAssistanceToSelfIsolate", xsd_string, random_value=YNUR)

    intimate = random.choice(YNUR)
    # Write the answer that was actually drawn. A second independent draw
    # could record e.g. 'N' next to a non-zero partner count.
    create_data_property(contact, "AnyIntimatePartners", xsd_string, value=intimate)
    if intimate == 'Y':
        create_data_property(contact, "TotalNumberOfIntimatePartners", xsd_int, random_value=contact_counts)
    else:
        create_data_property(contact, "TotalNumberOfIntimatePartners", xsd_int, value="0")

    create_data_property(contact, "AnyOtherCloseContact", xsd_string, random_value=YNUR)

## DataElements > LocatingInformation

In [15]:
_FAKER = None  # shared Faker instance, created on first use


def _get_faker():
    """Return the shared ``Faker`` instance, creating and seeding it once."""
    global _FAKER
    if _FAKER is None:
        # Seed once from the clock. A float is used because random.seed()
        # rejects datetime objects on Python >= 3.11 (TypeError), and
        # reseeding plus rebuilding Faker for every record is wasted work.
        Faker.seed(datetime.now().timestamp())
        _FAKER = Faker()
    return _FAKER


def create_locating_information(root, id_):
    """Add a ``LocatingInformation_<id_>`` individual with fake personal data.

    Names, addresses, e-mails, phone numbers and date of birth come from
    Faker; coded answers are drawn from the module-level value sets.

    Args:
        root: Element the new individual is appended to.
        id_: Integer record number, zero-padded to 7 digits in the IRI.

    Returns:
        The gender value written to the ``Gender`` property (one of
        ``GENDER``).
    """
    xsd_string = "http://www.w3.org/2001/XMLSchema#string"
    xsd_datetime = "http://www.w3.org/2001/XMLSchema#dateTime"
    xsd_int = "http://www.w3.org/2001/XMLSchema#int"
    fake = _get_faker()

    locating = create_individual(root, f"LocatingInformation_{id_:07}", "LocatingInformation")

    create_data_property(locating, "BornInTheUnitedStates", xsd_string, random_value=YNUR)
    create_data_property(locating, "CityOfResidence", xsd_string, value=fake.city())
    # DOB is a date; it is combined with midnight to form an xsd:dateTime.
    create_data_property(locating, "DOB", xsd_datetime,
                         value=datetime.combine(fake.date_of_birth(), datetime.min.time()).isoformat())
    create_data_property(locating, "Email_1", xsd_string, value=fake.free_email())
    create_data_property(locating, "Email_2", xsd_string, value=fake.free_email())
    create_data_property(locating, "Ethnicity", xsd_string, random_value=ETHNICITY)
    create_data_property(locating, "FirstName", xsd_string, value=fake.first_name())
    create_data_property(locating, "LastName", xsd_string, value=fake.last_name())

    gender = random.choice(GENDER)
    create_data_property(locating, "Gender", xsd_string, value=gender)

    create_data_property(locating, "InterpreterUsed", xsd_string, random_value=YNUR)
    create_data_property(locating, "OkToEmail", xsd_string, random_value=YNUR)
    create_data_property(locating, "OkToText", xsd_string, random_value=YNPR)
    create_data_property(locating, "PhoneNumber_1", xsd_string, value=fake.phone_number())
    create_data_property(locating, "PhoneNumber_2", xsd_string, value=fake.phone_number())
    create_data_property(locating, "PrimaryLanguage", xsd_string, random_value=LANGUAGE)
    create_data_property(locating, "Race", xsd_string, random_value=RACE)
    create_data_property(locating, "ResidentialStreetAddress", xsd_string, value=fake.street_address())
    create_data_property(locating, "StateOfResidence", xsd_string, value=fake.state_abbr())
    # NOTE(review): the zip code is requested without naming the state drawn
    # above, so the two are presumably unrelated -- confirm whether they
    # should agree.
    create_data_property(locating, "ZipCode", xsd_int, value=fake.zipcode_in_state())

    return gender

## Investigator

In [16]:
def create_investigator(root, id_):
    """Add an ``Investigator_<id_>`` individual of class ``Investigator``.

    Placeholder: only the bare individual is created. Properties still to be
    modelled:

    * Organization -> identity trust
    * Role -> identity trust
    * behavioral trust
    """
    create_individual(root, f"Investigator_{id_}", "Investigator")
    

## Connection to Investigation

In [17]:
def connect_to_patient(root, id_):
    """Add a ``Patient_<id_>`` individual and link it to its data elements.

    The patient gets a random ``DateAssignedToInvestigation`` between
    2021-07-01 00:00:00 and 2021-07-10 23:59:59 US/Eastern and one
    ``has<Element>`` object property per data-element individual that
    carries the same zero-padded id. No separate Investigation individual is
    created.

    Args:
        root: Element the new individual is appended to.
        id_: Integer record number, zero-padded to 7 digits in the IRIs.
    """
    xsd_datetime = "http://www.w3.org/2001/XMLSchema#dateTime"
    # pytz zones must be attached with localize(); passing them as tzinfo=
    # picks the zone's local-mean-time offset instead of EST/EDT.
    eastern = pytz.timezone('US/Eastern')

    patient = create_individual(root, f"Patient_{id_:07}", "Patient")

    # Assigned date
    assigned_datetime = random_date_between(
        eastern.localize(datetime(2021, 7, 1, 0, 0, 0)),
        eastern.localize(datetime(2021, 7, 10, 23, 59, 59)),
    ).isoformat()
    create_data_property(patient, "DateAssignedToInvestigation", xsd_datetime, assigned_datetime)

    # Connections: has<Element> -> <Element>_<id_>
    for element_class in (
        "ContactTracing",
        "Interview",
        "LocatingInformation",
        "PreExistingConditions",
        "RiskFactors",
        "SARSCoV2Test",
        "SymptomsAndClinicalCourse",
    ):
        create_object_property(patient, f"has{element_class}", f"{element_class}_{id_:07}")

# Loop

In [18]:
def populate_data(i, root=None):
    """Generate every individual for record ``i`` and append it to ``root``.

    Args:
        i: Integer record number shared by all individuals of the record.
        root: Element to append to. When omitted, the root of the
            module-level ``tree`` is used, which keeps existing
            ``populate_data(i)`` calls working.
    """
    if root is None:
        # Backward-compatible fallback to the notebook-global tree.
        root = tree.getroot()

    create_interview(root, i)
    create_risk_factors(root, i)
    create_sars_cov2_test(root, i)
    create_contact_tracing(root, i)
    create_symptoms_and_clinical_course(root, i)
    # The gender drawn for the locating information drives the Pregnant
    # answer of the pre-existing conditions.
    gender = create_locating_information(root, i)
    create_pre_existing_conditions(root, i, gender=gender)
    connect_to_patient(root, i)

In [19]:
#tree = etree.parse("base/contact_tracing_base_v5_without_user.owl")
#root = tree.getroot()
#for i in trange(100):
#    populate_data(i)
#    if i%10000 == 0:
#        tree.write(f"population_{i}.owl", pretty_print=True)

In [20]:
# Number of records to generate and how many go into each output file.
size = 1000000
partition = 50000
# Ontology skeleton every output file starts from.
base_owl_path = "base/contact_tracing_base_v5_without_user.owl"

tree = etree.parse(base_owl_path)
root = tree.getroot()
for i in trange(size):
    populate_data(i)
    generated = i + 1
    partition_full = generated % partition == 0
    # Also flush after the very last record so a trailing, partly filled
    # partition is written when size is not a multiple of partition.
    if partition_full or generated == size:
        # Ceiling division: full partitions keep the numbers 1, 2, ...;
        # a trailing partial partition gets the next number.
        part_number = -(-generated // partition)
        tree.write(f"population_{part_number}.owl", pretty_print=True)
        # Start the next partition from a fresh copy of the base ontology.
        # populate_data reads the global ``tree``, so it must be rebound.
        tree = etree.parse(base_owl_path)
        root = tree.getroot()
        gc.collect()

100%|██████████| 1000000/1000000 [26:54:54<00:00, 10.32it/s]   
