# Import DPAA MIA/KIAs to Neptune Graph DB
Author: John Bonfardeci<br />
Last modified: 2021-05-12

In [1]:
import json
import logging
import pandas as pd
import re
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.structure.graph import Vertex
from gremlin_python.process.graph_traversal import GraphTraversalSource
import dask
import dask.dataframe as dd
import requests
import pyodbc

In [2]:
# Fix Tornado issue for 'RuntimeError: This event loop is already running':
# https://medium.com/@vyshali.enukonda/how-to-get-around-runtimeerror-this-event-loop-is-already-running-3f26f67e762e
import nest_asyncio
nest_asyncio.apply()

In [3]:
%graph_notebook_version

2.1.2


In [4]:
%graph_notebook_config

{
  "host": "database-1.cluster-cbqe6d3hp7zw.us-east-1.neptune.amazonaws.com",
  "port": 8182,
  "auth_mode": "DEFAULT",
  "load_from_s3_arn": "",
  "ssl": true,
  "aws_region": "us-east-1",
  "sparql": {
    "path": "sparql"
  }
}


<graph_notebook.configuration.generate_config.Configuration at 0x7fe1e843d668>

In [5]:
%status

{'status': 'healthy',
 'startTime': 'Fri May 14 02:03:27 UTC 2021',
 'dbEngineVersion': '1.0.4.1.R4',
 'role': 'writer',
 'gremlin': {'version': 'tinkerpop-3.4.8'},
 'sparql': {'version': 'sparql-1.1'},
 'labMode': {'NeptuneML': 'disabled',
  'ObjectIndex': 'disabled',
  'DFEQueryEngine': 'disabled',
  'ReadWriteConflictDetection': 'enabled'}}

In [6]:
# Constants
# Neptune "host:port"; the cells below prepend 'https://' (system API) or
# 'wss://' (Gremlin websocket) to build the actual URLs.
NEPTUNE_ENDPOINT = 'database-1-instance-1.cbqe6d3hp7zw.us-east-1.neptune.amazonaws.com:8182'
# S3 prefix under which the source CSV is read.
S3_BUCKET = 's3://dldpaas3/identification-data-set'
# Safety switch: the database-reset cell only runs when this is True.
DELETE_ALL_DATA = False

In [None]:
# Clear all data from Neptune
# https://aws.amazon.com/blogs/database/resetting-your-graph-data-in-amazon-neptune-in-seconds/
if DELETE_ALL_DATA:
    # Both reset calls go to the same system endpoint.
    system_url = 'https://'+NEPTUNE_ENDPOINT+'/system'

    # Step 1: initiate the reset; the response carries a token under
    # payload.token that must be echoed back to confirm.
    response = requests.post(system_url, json={"action": "initiateDatabaseReset"})
    token = response.json()['payload']['token']

    # Step 2: perform the reset using that token, then show the server's reply.
    response = requests.post(
        system_url,
        json={"action": "performDatabaseReset", "token": token},
    )
    j = response.json()
    print(j)

In [None]:
def setup_graph():
    """Open a Gremlin connection to Neptune and bind a traversal source to it.

    Returns:
        A ``(graph, conn)`` tuple: the remote traversal source and the
        underlying ``DriverRemoteConnection``. The caller owns ``conn`` and
        must call ``conn.close()`` when finished.

    Raises:
        RuntimeError: If the connection or the traversal source cannot be
            created. The original exception is chained as ``__cause__``.
    """
    endpoint = 'wss://'+NEPTUNE_ENDPOINT+'/gremlin'
    conn = None
    try:
        logging.info('Trying To Login...')
        conn = DriverRemoteConnection(endpoint, 'g')
        graph = traversal().withRemote(conn)
        logging.info('Successfully Logged In')
    except Exception as e:
        logging.error(e, exc_info=True)
        # Do not leak a half-initialised connection if a later step failed.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                logging.warning('Failed to close Neptune connection', exc_info=True)
        raise RuntimeError('Could not connect to Neptune') from e
    return graph, conn


def trim(s):
    """Return ``str(s)`` with leading and trailing whitespace removed."""
    text = str(s)
    return text.strip()


def create_vertex(g: GraphTraversalSource, label: str, prop_name: str, prop_value) -> Vertex:
    """Return the ``label`` vertex whose ``prop_name`` equals ``prop_value``, creating it if absent.

    Args:
        g: Traversal source bound to the graph.
        label: Vertex label to look up or create.
        prop_name: Name of the property that identifies the vertex.
        prop_value: Scalar value of that property.

    Returns:
        The existing or newly added vertex, or ``None`` when ``prop_value`` is
        a placeholder: ``None``/NaN, blank, or text starting with ``UNKNOWN``
        or ``-NA-``. Callers must check for ``None`` before adding edges.
    """
    # Missing CSV cells come out of pandas as None/NaN; str() would turn them
    # into the non-empty text 'nan' and slip past the checks below.
    if prop_value is None or pd.isna(prop_value):
        return None

    text = str(prop_value)
    if re.match(r'(UNKNOWN|-NA-)', text) or len(trim(text)) == 0:
        return None

    # Get-or-create: reuse the vertex when one already carries this value.
    existing = g.V().hasLabel(label).has(prop_name, prop_value)
    if existing.hasNext():
        return existing.next()
    return g.addV(label).property(prop_name, prop_value).next()

In [None]:
# One entry per attribute vertex attached to a person:
#   (vertex label, source column, edge label, person_is_source, required label)
# person_is_source: True  -> edge runs person -> attribute
#                   False -> edge runs attribute -> person
# required label: the edge is only added when the vertex with that label was
# also created for the same row (it must appear earlier in this table).
_PERSON_ATTRIBUTES = (
    ('identified', 'Identified', 'has_identified_status', True, None),
    ('conflict', 'icf_conflict', 'served_in', True, None),
    ('service_branch', 'icf_service_branch', 'served_under', True, 'conflict'),
    # NOTE: units reuse the 'served_in' edge label that conflicts also use.
    ('unit', 'icf_unit', 'served_in', True, 'service_branch'),
    ('rank', 'icf_rank', 'had_rank', True, None),
    ('conflict_status_code', 'icf_conflict_status_code', 'has_conflict_status_code', True, None),
    ('mccain_releasability', 'icf_mccain_releasability', 'has_mccain_releasability', True, None),
    ('individual_type', 'icf_individual_type', 'is_individual_type', True, None),
    ('case_status', 'case_status', 'has_case_status', True, None),
    ('case_category', 'case_category', 'has_case_category', True, None),
    ('notification_year', 'icf_notification_year', 'has_notification_year', True, None),
    ('notification_month', 'icf_notification_month', 'has_notification_month', True, None),
    ('pow', 'icf_pow', 'has_pow_status', True, None),
    ('loss_area_historical', 'icf_loss_area_historical', 'has_loss_area_historical', True, None),
    ('loss_area_current', 'icf_loss_area_current', 'has_loss_area_current', True, None),
    ('birth_year', 'icf_birth_year', 'has_birth_year', True, None),
    ('home_rec_state', 'icf_home_rec_state', 'has_home_rec_state', True, None),
    # NOTE: unlike the other labels this one keeps the 'icf_' prefix; it is
    # left unchanged so previously loaded data and queries keep matching.
    ('icf_religion', 'icf_religion', 'is_religion', True, None),
    ('status_description', 'icf_status_description', 'has_status_description', True, None),
    ('loe_bio_profile', 'icf_loe_bio_profile', 'has_loe_bio_profile', True, None),
    ('loe_dental_profile', 'icf_loe_dental_profile', 'has_loe_dental_profile', True, None),
    ('loe_chest_radio', 'icf_loe_chest_radio', 'has_loe_chest_radio', True, None),
    ('loe_mito_dna_ref', 'icf_loe_mito_dna_ref', 'has_loe_mito_dna_ref', True, None),
    ('loe_y_dna_ref', 'icf_loe_y_dna_ref', 'has_loe_y_dna_ref', True, None),
    ('loe_auto_dna_ref', 'icf_loe_auto_dna_ref', 'has_loe_auto_dna_ref', True, None),
    ('finding_death_date_year', 'icf_finding_death_date_year', 'has_finding_death_date_year', True, None),
    ('finding_death_date_month', 'icf_finding_death_date_month', 'has_finding_death_date_month', True, None),
    ('death_date_year', 'icf_death_date_year', 'has_death_date_year', True, None),
    ('death_date_month', 'icf_death_date_month', 'has_death_date_month', True, None),
    ('nationality', 'icf_nationality', 'has_nationality', True, None),
    ('eligible_case', 'icf_eligible_case', 'is_eligible_case', True, None),
    ('country', 'icf_country', 'from_country', True, None),
    ('is_accounting_eligible', 'icf_is_accounting_eligible', 'has_accounting_eligible', True, None),
    ('external_interests', 'icf_external_interests', 'has_external_interests', True, None),
    ('home_rec_country', 'icf_home_rec_country', 'has_home_rec_country', True, None),
    ('creator_name', 'icf_r_creator_name', 'created', False, None),
    ('year_created', 'icf_year_created', 'record_year_created', False, None),
    ('profile_ancestry', 'bio_profile_ancestry', 'has_profile_ancestry', True, None),
    ('profile_analyst_name', 'bio_profile_analyst_name', 'has_profile_analyst_name', True, None),
    ('profile_burn', 'bio_profile_burn', 'has_profile_burn', True, None),
    ('accession_number', 'accession_number', 'has_accession_number', True, None),
)


def import_data(dataframe):
    """Load every row of ``dataframe`` into the graph as a person with attribute vertices.

    For each row whose ``icf_r_object_id`` is not already present as a
    ``person`` vertex, a person vertex is created, its ``gender`` property is
    set from ``bio_profile_sex``, and one edge is added to the (shared)
    vertex of each attribute listed in ``_PERSON_ATTRIBUTES``. Attributes for
    which ``create_vertex`` returns ``None`` get no edge.

    Args:
        dataframe: Object exposing ``iterrows()`` whose rows can be indexed
            by the column names used in ``_PERSON_ATTRIBUTES``.

    Returns:
        Number of person vertices newly created by this call. Rows that were
        skipped because the person already existed are not counted.

    Raises:
        RuntimeError: Propagated from ``setup_graph`` when Neptune cannot be
            reached. Errors raised while writing propagate unchanged after
            the connection has been closed.
    """
    g, conn = setup_graph()
    ct = 0

    def person_exists(person_id):
        return g.V().hasLabel('person').has('person_id', person_id).hasNext()

    try:
        for _, row in dataframe.iterrows():
            person_id = row['icf_r_object_id']

            # Skip if this record already exists
            if person_exists(person_id):
                continue

            person = create_vertex(g, 'person', 'person_id', person_id)
            if person is None:
                # create_vertex rejects placeholder ids; no edges can be
                # attached without a person vertex.
                logging.warning('Skipping row with unusable person_id: %r', person_id)
                continue

            gender = row['bio_profile_sex']
            if not pd.isna(gender):
                # A traversal only executes once a terminal step is called.
                g.V().hasLabel('person').has('person_id', person_id) \
                    .property('gender', gender).iterate()

            created = {}
            for label, column, edge_label, person_is_source, required in _PERSON_ATTRIBUTES:
                vertex = create_vertex(g, label, 'value', row[column])
                created[label] = vertex
                if vertex is None:
                    continue
                if required is not None and created.get(required) is None:
                    continue

                if person_is_source:
                    g.addE(edge_label).from_(person).to(vertex).iterate()
                else:
                    g.addE(edge_label).from_(vertex).to(person).iterate()

            # Confirm new record was created.
            if person_exists(person_id):
                ct += 1
    finally:
        # Always close connection when done, even if a write failed.
        conn.close()
    return ct

In [None]:
if __name__ == '__main__':
    # Source columns grouped by the dtype they are read with.
    columns_by_dtype = {
        'object': (
            'icf_r_object_id',
            'icf_status_description',
            'icf_historian_comments',
            'accession_keywords',
        ),
        'int8': (
            'Identified',
            'icf_pow',
            'icf_loe_bio_profile',
            'icf_loe_dental_profile',
            'icf_loe_chest_radio',
            'icf_loe_mito_dna_ref',
            'icf_loe_y_dna_ref',
            'icf_loe_auto_dna_ref',
            'icf_is_accounting_eligible',
            'bio_profile_burn',
            'bio_profile_mangled',
        ),
        'category': (
            'icf_service_branch',
            'icf_conflict',
            'icf_unit',
            'icf_rank',
            'icf_rate',
            'icf_post_rank',
            'icf_conflict_status_code',
            'icf_mccain_releasability',
            'icf_individual_type',
            'case_status',
            'case_category',
            'icf_notification_year',
            'icf_notification_month',
            'icf_loss_area_historical',
            'icf_birth_year',
            'icf_home_rec_state',
            'icf_religion',
            'icf_finding_death_date_year',
            'icf_finding_death_date_month',
            'icf_death_date_year',
            'icf_death_date_month',
            'icf_nationality',
            'icf_eligible_case',
            'icf_loss_area_current',
            'icf_substatus_description',
            'icf_substatus_other',
            'icf_country',
            'icf_external_interests',
            'icf_home_rec_country',
            'icf_r_creator_name',
            'icf_year_created',
            'bio_profile_sex',
            'bio_profile_ancestry',
            'bio_profile_analyst_name',
            'bio_profile_mangled_description',
            'accession_number',
            'accession_country',
        ),
    }
    # Flatten into the {column: dtype} mapping that read_csv expects.
    data_schema = {
        column: dtype
        for dtype, columns in columns_by_dtype.items()
        for column in columns
    }

    data_path = S3_BUCKET+'/dpaa_identified_model_vw.csv'
    data = dd.read_csv(data_path, dtype=data_schema)

    # Compare rows read against person vertices actually created.
    import_count = import_data(data)
    difference = len(data) - import_count

    if difference > 0:
        message = 'Failed to import {0} records!'.format(difference)
    else:
        message = 'All {0} records imported!'.format(import_count)
    print(message)