In [50]:
import numpy as np
import pyTigerGraph as tg
from pyTigerGraph.pyTigerGraph import TigerGraphException
import pandas as pd
from tqdm import tqdm

In [51]:
batch_size = 10000

# Create & Test Connection

In [52]:
secret = '9tklmj5j3ubcrtr1606mk4hhkb5bcq2j'
cred = {
    "host": "https://gradient-test.i.tgcloud.io",
    "graphname": "PrescribedDrugReview",
    "username": "tigergraph",
    "password": "Gradient123"
}


token, token_life_sec, token_expir_date = tg.TigerGraphConnection(**cred).getToken(secret="9tklmj5j3ubcrtr1606mk4hhkb5bcq2j")

cred["apiToken"] = token
conn = tg.TigerGraphConnection(**cred)
print(conn.gsql('ls')) # test connection

---- Graph PrescribedDrugReview
Vertex Types:
- VERTEX Drug(PRIMARY_ID generic_name STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"
- VERTEX Disease(PRIMARY_ID disease_name STRING, description STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"
- VERTEX BrandName(PRIMARY_ID name STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"
- VERTEX Prescriber(PRIMARY_ID prescriber_npi STRING, first_name STRING, last_name STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"
- VERTEX Prescriber_Type(PRIMARY_ID prescriber_type STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"
- VERTEX Review(PRIMARY_ID review_id STRING, review STRING, rating INT, date DATETIME) WITH STATS="OUTDEGREE_BY_EDGETYPE", PRIMARY_ID_AS_ATTRIBUTE="true"
Edge Types:
- UNDIRECTED EDGE used_for(FROM Drug, TO Disease)
- UNDIRECTED EDGE associated_with(FROM Review, TO Disease)
- UNDIRECTED EDGE has_name(FROM Dru

# Create Graph

In [3]:
graph_not_exists_message = f"Graph name {cred['graphname']} cannot be found. Please provide a valid graph name via 'graph=xxx' parameter string."

try:
    conn.getSchema() # Check if graph exists
    #print(conn.gsql(f"DROP GRAPH {cred['graphname']}"))
except TigerGraphException as e:
    print(e.message)

The graph PrescribedDrugReview is dropped.


In [4]:
create_graph_query_fname = f"../core/graph/create_graph_schema_{cred['graphname']}.gsql"

with open(create_graph_query_fname, "r") as f:
    create_graph_query = f.read()

#print(conn.gsql(create_graph_query))

The graph PrescribedDrugReview is created.
Using graph 'PrescribedDrugReview'
Successfully created schema change jobs: [initGraph].

Current graph version 0
Trying to add vertex Drug.
Trying to add vertex Disease.
Trying to add vertex BrandName.
Trying to add vertex Prescriber.
Trying to add vertex Prescriber_Type.
Trying to add vertex Review.
Trying to add edge used_for.
Trying to add edge associated_with.
Trying to add edge has_name.
Trying to add edge has_prescriber_type.
Trying to add edge prescribed.
Trying to add edge has_review.
Kick off job initGraph

Graph PrescribedDrugReview update to new version 1
The job initGraph completes in 9.525 seconds!


# Load Graph

In [53]:
data_dir = '../core/dataset/output'

### Load Drug & Brand name vertices & relation

In [63]:
drug_df = pd.read_csv(f'{data_dir}/drug-ids.tsv', sep='\t')

print(
    conn.upsertVertexDataFrame(
        drug_df,
        'Drug',
        v_id = 'Gnrc_Name',
        attributes = {}
    )
)

print(
    conn.upsertVertexDataFrame(
        drug_df,
        'BrandName',
        v_id = 'Brnd_Name',
        attributes = {}
    )
)

print(
    conn.upsertEdgeDataFrame(
        drug_df,
        'Drug',
        'has_name',
        'BrandName',
        from_id = 'Gnrc_Name',
        to_id = 'Brnd_Name',
        attributes = {}
    )
)

856
1351
1381


### Load Prescriber & Type & Prescriber-Drug edges

In [64]:
prescriber_entities_df = pd.read_csv(f'{data_dir}/prescriber.tsv', sep='\t')
prescriber_type_df = pd.read_csv(f'{data_dir}/prescriber-type.tsv', sep='\t')
unique_prescriber_type_df = pd.read_csv(f'{data_dir}/unique_prescriber_type.tsv', sep='\t')
prescriber_drug_cost_edge_df = pd.read_csv(f'{data_dir}/prescriber-drug.tsv', sep='\t')

In [10]:
unique_prescriber_type_df

Unnamed: 0,Prscrbr_NPI,Prscrbr_Last_Org_Name,Prscrbr_First_Name
0,1003000126,Enkeshafi,Ardalan
1,1003000142,Khalil,Rashid
2,1003000167,Escobar,Julio
3,1003000282,Blakemore,Rosie
4,1003000423,Velotta,Jennifer
...,...,...,...
882871,1992999650,Yong,Wayne
882872,1992999759,Soriano,Dino
882873,1992999825,Deschenes,Geoffrey
882874,1992999833,Shaw,L. Noah


In [11]:
conn.upsertVertexDataFrame(
    prescriber_entities_df,
    'Prescriber',
    v_id = 'Prscrbr_NPI',
    attributes = {
        'first_name': 'Prscrbr_First_Name',
        'last_name': 'Prscrbr_Last_Org_Name'
    }
)

882876

In [12]:
conn.upsertVertexDataFrame(
    unique_prescriber_type_df,
    'Prescriber_Type',
    v_id = 'Prscrbr_Type',
    attributes = {}
)

172

In [21]:
num_batches = int(np.ceil(prescriber_type_df.shape[0] / batch_size))
for i in tqdm(range(num_batches)):
    conn.upsertEdgeDataFrame(
        prescriber_type_df[batch_size*i:batch_size*(i+1)],
        'Prescriber',
        'has_prescriber_type',
        'Prescriber_Type',
        from_id = 'Prscrbr_NPI',
        to_id = 'Prscrbr_Type',
        attributes = {}
    )

100%|██████████| 1167/1167 [36:09<00:00,  1.86s/it]


In [76]:
num_batches = int(np.ceil(prescriber_drug_cost_edge_df.shape[0] / batch_size))
for i in tqdm(range(num_batches)):
    conn.upsertEdgeDataFrame(
        prescriber_drug_cost_edge_df[batch_size*i:batch_size*(i+1)],
        'Prescriber',
        'prescribed',
        'Drug',
        from_id = 'Prscrbr_NPI',
        to_id = 'Gnrc_Name',
        attributes = {
            'total_drug_cost': 'Tot_Drug_Cst',
            'total_drug_cost_ge65': 'GE65_Tot_Drug_Cst'
        }
    )

100%|██████████| 1167/1167 [50:40<00:00,  2.61s/it]


### Load Disease Vertices

In [68]:
conditions_df = pd.read_csv(f'{data_dir}/disease-names.tsv', sep='\t')
used_for_df = pd.read_csv(f'{data_dir}/drug-disease.tsv', sep='\t')

In [69]:
print(
    conn.upsertVertexDataFrame(
        conditions_df,
        'Disease',
        v_id = 'condition',
        attributes = {}
    )
)

836


In [70]:
print(
conn.upsertEdgeDataFrame(
    used_for_df,
    'Drug',
    'used_for',
    'Disease',
    from_id = 'Gnrc_Name',
    to_id = 'condition',
    attributes = {}
))

2858


### Load Review Nodes & Edges

In [71]:
reviews_df = pd.read_csv(f'{data_dir}/review.tsv', sep='\t')
review_drug_df = pd.read_csv(f'{data_dir}/drug-review.tsv', sep='\t')
review_disease_df = pd.read_csv(f'{data_dir}/disease-review.tsv', sep='\t')

In [72]:
reviews_df = reviews_df.astype({'rating': int})
reviews_df['date'] = pd.to_datetime(reviews_df['date']).astype(str)
reviews_df.iloc[:2]

Unnamed: 0,review_id,review,rating,date
0,0,"""It has no side effect, I take it in combinati...",9,2012-05-20
1,1,"""2nd day on 5mg started to work with rock hard...",2,2015-11-28


In [73]:
num_batches = int(np.ceil(reviews_df.shape[0] / batch_size))
for i in tqdm(range(num_batches)):
    print(conn.upsertVertexDataFrame(
        reviews_df[batch_size*i:batch_size*(i+1)],
        'Review',
        v_id = 'review_id',
        attributes={
            'review': 'review',
            'rating': 'rating',
            'date': 'date'
        }
    ))

 10%|█         | 1/10 [00:04<00:42,  4.73s/it]

10000


 20%|██        | 2/10 [00:08<00:35,  4.40s/it]

10000


 30%|███       | 3/10 [00:12<00:28,  4.14s/it]

10000


 40%|████      | 4/10 [00:16<00:23,  3.90s/it]

10000


 50%|█████     | 5/10 [00:19<00:18,  3.71s/it]

10000


 60%|██████    | 6/10 [00:22<00:13,  3.38s/it]

10000


 70%|███████   | 7/10 [00:25<00:09,  3.21s/it]

10000


 80%|████████  | 8/10 [00:28<00:06,  3.08s/it]

10000


 90%|█████████ | 9/10 [00:31<00:03,  3.21s/it]

10000


100%|██████████| 10/10 [00:32<00:00,  3.26s/it]

1047





In [74]:
num_batches = int(np.ceil(review_drug_df.shape[0] / batch_size))
for i in tqdm(range(num_batches)):
    print(conn.upsertEdgeDataFrame(
        review_drug_df[batch_size*i:batch_size*(i+1)],
        'Drug',
        'has_review',
        'Review',
        from_id = 'Gnrc_Name',
        to_id = 'review_id',
        attributes={}
    ))

 10%|█         | 1/10 [00:01<00:14,  1.59s/it]

10000


 20%|██        | 2/10 [00:03<00:13,  1.63s/it]

10000


 30%|███       | 3/10 [00:04<00:11,  1.59s/it]

10000


 40%|████      | 4/10 [00:06<00:09,  1.60s/it]

10000


 50%|█████     | 5/10 [00:07<00:07,  1.59s/it]

10000


 60%|██████    | 6/10 [00:09<00:06,  1.59s/it]

10000


 70%|███████   | 7/10 [00:11<00:04,  1.60s/it]

10000


 80%|████████  | 8/10 [00:12<00:03,  1.60s/it]

10000


 90%|█████████ | 9/10 [00:14<00:01,  1.57s/it]

10000


100%|██████████| 10/10 [00:14<00:00,  1.48s/it]

1047





In [75]:
num_batches = int(np.ceil(review_disease_df.shape[0] / batch_size))
for i in tqdm(range(num_batches)):
    print(conn.upsertEdgeDataFrame(
        review_disease_df[batch_size*i:batch_size*(i+1)],
        'Review',
        'associated_with',
        'Disease',
        from_id = 'review_id',
        to_id = 'condition',
        attributes={}
    ))

 10%|█         | 1/10 [00:01<00:17,  1.92s/it]

10000


 20%|██        | 2/10 [00:03<00:15,  1.88s/it]

10000


 30%|███       | 3/10 [00:05<00:13,  1.92s/it]

10000


 40%|████      | 4/10 [00:07<00:11,  1.90s/it]

10000


 50%|█████     | 5/10 [00:09<00:09,  1.92s/it]

10000


 60%|██████    | 6/10 [00:11<00:07,  1.90s/it]

10000


 70%|███████   | 7/10 [00:13<00:06,  2.02s/it]

10000


 80%|████████  | 8/10 [00:15<00:04,  2.09s/it]

10000


 90%|█████████ | 9/10 [00:18<00:02,  2.15s/it]

10000


100%|██████████| 10/10 [00:18<00:00,  1.89s/it]

1047



