<img src="./images/1.png" width ="600">

In [4]:
import pandas as pd
import numpy as np
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import torchmetrics

## data preprocessing

In [19]:
# df = pd.read_csv('./kaggle_data(drug)/webmd.csv')
df_ = pd.read_feather('./kaggle_data/webmd.ftr')
df_.head()
print(df_.shape)

(362806, 12)


### remove null data
- remove null value it include ' '(empty value) 

In [20]:
for i in df_.columns:
    df_ = df_[df_[i]!=' ']
df_.dropna(inplace=True)
df_.reset_index(drop=True, inplace=True)

In [21]:
print(df_.shape)
df_.head()

(280127, 12)


Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,5,5,I'm a retired physician and of all the meds I ...,5,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,5,5,cleared me right up even with my throat hurtin...,5,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,25-34,Birth Control,6/15/2017,wymzya fe,163180,5,5,Haven't gotten pregnant so it does it's job. I...,2,Female,"Nausea , vomiting , headache , bloating , ...",0
3,45-54,Disease of Ovaries with Cysts,1/30/2017,wymzya fe,163180,5,5,I have take this for 5 years age 45-50 to prev...,5,Female,"Nausea , vomiting , headache , bloating , ...",0
4,55-64,Stuffy Nose,10/29/2012,"12 hour nasal relief spray, non-aerosol",9800,4,2,The 12 hour spray only works for me for 6 hours.,2,Male,"Temporary burning, stinging, dryness in the no...",0



- change value
`Effectiveness` , `EaseofUse`, `Satisfaction`
---
```
rating : 0~ 5
    
    0~3 =  0
    4~5 =  1
```


In [22]:
# change value Effectiveness, EaseofUse, Satisfaction
df_['Effectiveness'] = df_['Effectiveness'].apply(lambda x: 0 if (x<4) else 1)
df_['EaseofUse'] = df_['EaseofUse'].apply(lambda x: 0 if (x<4) else 1)
df_['Satisfaction'] = df_['Satisfaction'].apply(lambda x: 0 if (x<4) else 1)

In [23]:
df_.head()

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount
0,75 or over,Stuffy Nose,9/21/2014,25dph-7.5peh,146724,1,1,I'm a retired physician and of all the meds I ...,1,Male,"Drowsiness, dizziness , dry mouth /nose/thro...",0
1,25-34,Cold Symptoms,1/13/2011,25dph-7.5peh,146724,1,1,cleared me right up even with my throat hurtin...,1,Female,"Drowsiness, dizziness , dry mouth /nose/thro...",1
2,25-34,Birth Control,6/15/2017,wymzya fe,163180,1,1,Haven't gotten pregnant so it does it's job. I...,0,Female,"Nausea , vomiting , headache , bloating , ...",0
3,45-54,Disease of Ovaries with Cysts,1/30/2017,wymzya fe,163180,1,1,I have take this for 5 years age 45-50 to prev...,1,Female,"Nausea , vomiting , headache , bloating , ...",0
4,55-64,Stuffy Nose,10/29/2012,"12 hour nasal relief spray, non-aerosol",9800,1,0,The 12 hour spray only works for me for 6 hours.,0,Male,"Temporary burning, stinging, dryness in the no...",0


In [24]:
# get `patient_id` column
df_['patient_id'] = [i for i in range(len(df_))]

### Node to dictionary for feature

In [25]:
def get_dict(df, column : str):
    index = 0
    val_lst = [i for i in df[column].value_counts().index]
    
    dictionary = {}
    
    for i in val_lst:
        dictionary[i] = index
        index += 1
    return dictionary

- drug_id indexing for creating graph \
  -> it must start from 0~ in a row

In [26]:
# get_druc_dict
drug_dict = get_dict(df_, 'DrugId')

# get_feature_dict_of_patient
patient_feat_dict = get_dict(df_, ['Age', 'Sex'])

# get_feature_dict_of_condition
cond_dict = get_dict(df_, 'Condition')


<img src="./images/2.png" width ="600">

- drug : side_effect dictionary

In [27]:
### Drug Feature

# drug_side_dict
drug_side_dict = {}

### most frequency side effect of each drug (dictionary)
for i in drug_dict.keys():
    most_side = df_[df_['DrugId']==i]['Sides'].value_counts().index[0]
    drug_side_dict[i] = most_side


# side_dict (Cause of sides duplicate, using set)
side_dict = {}
index = 0
side_set = set()
side_set.update(drug_side_dict.values())

# side effect dictionary
for j in side_set:
    side_dict[j] = index
    index += 1


# drug_feat_dict
drug_feat_dict = {}

for i in drug_side_dict.keys():
    drug_feat_dict[drug_dict[i]] = side_dict[drug_side_dict[i]]

In [28]:
# # embedding features
# def get_embed(len_keys, lst, embed_n):

#     embedding_table = nn.Embedding(num_embeddings=len_keys, 
#                                embedding_dim=embed_n)

    
#     embed_feat = embedding_table(torch.LongTensor(lst))

#     return embed_feat

# ### Patient Feature
# # get_feature lst
# patient_f_lst = [patient_feat_dict[(df_['Age'][i], df_['Sex'][i])] for i in range(len(df_))]

# # patient embedding
# patient_embed = get_embed(len(patient_feat_dict.keys()), patient_f_lst, 10)    # ( 280127 * 10 ) -> 280127 types


# ### Condition Feature
# # condition feature lst
# cond_f_lst = [i for i in range(len(cond_dict.values()))]

# # condition embedding
# cond_embed = get_embed(len(cond_dict.values()), cond_f_lst, 10)    # ( 1584 * 10 ) -> 1584 types


# ### drug Feature
# # drug feature lst
# drug_f_lst = [i for i in drug_feat_dict.values()]

# # drug embedding
# drug_embed = get_embed(len(drug_feat_dict.keys()), drug_f_lst, 10)   # ( 4522 * 10 ) -> 1557 types

### Create Heterogeneous Graph

In [29]:
def get_n_arr(dataframe, dictionary, column):    
    num_lst = [int(dictionary[i]) for i in dataframe[column]]
    
    return np.array(num_lst)

# arr list
"""
patient_arr / drug_arr / cond_arr
"""

patient_arr = np.array(df_['patient_id'])

# mapping using dictionary
drug_arr = get_n_arr(df_, drug_dict, 'DrugId')
cond_arr = get_n_arr(df_, cond_dict, 'Condition')

label_arr = torch.tensor(df_['Satisfaction'])

- node : [`patient`, `drug`, `condition`]
- edge : 
  - (`patient`, `satisfaction`, `drug`) : `label`
  - (`condition`, `symptom`, `patient`)
  - (`drug`, `Effectiveness`, `condition`)

### train / Inference data

In [30]:
df_.shape

(280127, 13)

In [31]:
train_bool = torch.zeros(len(df_), dtype=torch.bool).bernoulli(0.9)
inf_bool = ~train_bool

In [32]:
train_index = df_.iloc[patient_arr[train_bool],:].index
print(len(train_index))
train_index

251820


Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            280114, 280115, 280116, 280117, 280118, 280119, 280121, 280122,
            280123, 280125],
           dtype='int64', length=251820)

In [33]:
inf_index = df_.iloc[patient_arr[inf_bool],:].index
print(len(inf_index))
inf_index

28307


Int64Index([    12,     38,     46,     56,     64,     69,     70,     75,
                77,     80,
            ...
            280014, 280027, 280041, 280062, 280082, 280086, 280108, 280120,
            280124, 280126],
           dtype='int64', length=28307)

In [34]:
df_t = df_.iloc[train_index, :]
df_inf = df_.iloc[inf_index, :]

In [35]:
df_t['patient_id'] = [i for i in range(len(df_t))]
df_t = df_t.reset_index(drop=True)


df_inf['patient_id'] = [i for i in range(len(df_inf))]
df_inf = df_inf.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t['patient_id'] = [i for i in range(len(df_t))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inf['patient_id'] = [i for i in range(len(df_inf))]


- drug_side_dict : 가장 빈번한 Side Effect 를 매칭시키기 위한 dictionary
    - **different drug -> same side**
- side_dict : 위에서 정의된 Side Effect 인덱스 번호

In [36]:
### train array
patient_arr_t = np.array(df_t['patient_id'])
drug_arr_t = get_n_arr(df_t, drug_dict, 'DrugId')
cond_arr_t = get_n_arr(df_t, cond_dict, 'Condition')

label_arr_t = torch.tensor(list(df_t['Satisfaction']))

In [37]:
patient_arr_t.shape

(251820,)

In [38]:
drug_arr_t.shape

(251820,)

In [39]:
### TRAIN data
hetero_graph_t = dgl.heterograph({
    ('patient', 'satisfaction', 'drug'): (patient_arr_t, drug_arr_t),
    ('condition', 'symptom', 'patient'): (cond_arr_t, patient_arr_t),
    ('drug', 'Easy', 'patient'): (drug_arr_t[df_t['EaseofUse']==1], patient_arr_t[df_t['EaseofUse']==1]),
    ('drug', 'Effectiveness', 'condition'): (drug_arr_t[df_t['Effectiveness']==1], cond_arr_t[df_t['Effectiveness']==1])
      })

In [40]:
hetero_graph_t

Graph(num_nodes={'condition': 1584, 'drug': 4522, 'patient': 251820},
      num_edges={('condition', 'symptom', 'patient'): 251820, ('drug', 'Easy', 'patient'): 186580, ('drug', 'Effectiveness', 'condition'): 147499, ('patient', 'satisfaction', 'drug'): 251820},
      metagraph=[('condition', 'patient', 'symptom'), ('patient', 'drug', 'satisfaction'), ('drug', 'patient', 'Easy'), ('drug', 'condition', 'Effectiveness')])

### Node_Feature_Embedding

In [41]:
df_t.shape

(251820, 13)

In [42]:
# embedding features
def get_embed(len_keys, lst, embed_n):

    embedding_table = nn.Embedding(num_embeddings=len_keys, 
                               embedding_dim=embed_n)

    
    embed_feat = embedding_table(torch.LongTensor(lst))

    return embed_feat

In [73]:
df_.shape

(280127, 13)

In [75]:
### Patient Feature
# get_feature lst
patient_f_lst = [patient_feat_dict[(df_['Age'][i], df_['Sex'][i])] for i in range(len(df_))]

# patient embedding
patient_embed = get_embed(len(patient_feat_dict.keys()), patient_f_lst, 10)    # ( 280127 * 10 ) -> 22 types

patient_embed_t = patient_embed[: hetero_graph_t.num_nodes('patient'), : ]

### Condition Feature
# condition feature lst
cond_f_lst = [i for i in range(len(cond_dict.values()))]

# condition embedding
cond_embed = get_embed(len(cond_dict.values()), cond_f_lst, 10)    # ( 1584 * 10 ) -> 1584 Condition types

cond_embed_t = cond_embed[:hetero_graph_t.num_nodes('condition'), : ]   # ( 1584 * 10 ) -> 1584 training Condition types

### drug Feature
# drug feature lst
drug_f_lst = [i for i in drug_feat_dict.values()]

# drug embedding
drug_embed = get_embed(len(drug_feat_dict.keys()), drug_f_lst, 10)   # ( 4522 * 10 ) -> 4522 drug types and 1557 side effect types

drug_embed_t = drug_embed[: hetero_graph_t.num_nodes('drug'), : ]    # ( 4522 * 10 ) -> 4522 training drug types and 1557 side effect types

In [76]:
print('Patient')
print(patient_embed_t.shape)

print('------------------')
print('Condition')
print(cond_embed_t.shape)

print('------------------')
print('Drug')
print(drug_embed_t.shape)

Patient
torch.Size([251820, 10])
------------------
Condition
torch.Size([1584, 10])
------------------
Drug
torch.Size([4522, 10])


In [77]:
hetero_graph_t

Graph(num_nodes={'condition': 1584, 'drug': 4522, 'patient': 251820},
      num_edges={('condition', 'symptom', 'patient'): 251820, ('drug', 'Easy', 'patient'): 186580, ('drug', 'Effectiveness', 'condition'): 147499, ('patient', 'satisfaction', 'drug'): 251820},
      metagraph=[('condition', 'patient', 'symptom'), ('patient', 'drug', 'satisfaction'), ('drug', 'patient', 'Easy'), ('drug', 'condition', 'Effectiveness')])

In [78]:
## node feature and labeling

# train data
hetero_graph_t.edges['satisfaction'].data['label'] = label_arr_t
hetero_graph_t.nodes['patient'].data['feature'] = patient_embed_t
hetero_graph_t.nodes['drug'].data['feature'] = drug_embed_t
hetero_graph_t.nodes['condition'].data['feature'] = cond_embed_t

In [79]:
hetero_graph_t

Graph(num_nodes={'condition': 1584, 'drug': 4522, 'patient': 251820},
      num_edges={('condition', 'symptom', 'patient'): 251820, ('drug', 'Easy', 'patient'): 186580, ('drug', 'Effectiveness', 'condition'): 147499, ('patient', 'satisfaction', 'drug'): 251820},
      metagraph=[('condition', 'patient', 'symptom'), ('patient', 'drug', 'satisfaction'), ('drug', 'patient', 'Easy'), ('drug', 'condition', 'Effectiveness')])

### model

In [80]:
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F

In [81]:
class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()

        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(in_feats, hid_feats)
            for rel in rel_names}, aggregate='sum')
        
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(hid_feats, out_feats)
            for rel in rel_names}, aggregate='sum')


    def forward(self, graph, inputs):
        # inputs are features of nodes
        h = self.conv1(graph, inputs)
        h = {k: F.relu(v) for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

In [82]:
class HeteroMLPPredictor(nn.Module):
    def __init__(self, in_dims, n_classes):
        super().__init__() 
        self.W = nn.Linear(in_dims * 2, n_classes)

    def apply_edges(self, edges):
        x = torch.cat([edges.src['h'], edges.dst['h']], 1)
        y = self.W(x)
        return {'score': y}

    def forward(self, graph, h):
        # h contains the node representations for each edge type computed from
        # the GNN for heterogeneous graphs defined in the node classification
        # section (Section 5.1).
        with graph.local_scope():
            graph.ndata['h'] = h   # assigns 'h' of all node types in one shot
            graph.apply_edges(self.apply_edges)
            return graph.edata['score']

In [83]:
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, rel_names, bi_pred=False):
        super().__init__()
        self.sage = RGCN(in_features, hidden_features, out_features, rel_names)
        
        if bi_pred==False:
            self.pred = HeteroMLPPredictor(out_features, len(rel_names))
        else:
            self.pred = HeteroMLPPredictor(out_features, 2)
        
    def forward(self, g, x, dec_graph):
        h = self.sage(g, x)
        h_2 = {'drug': h['drug'], 'patient': h['patient']}
        return self.pred(dec_graph, h_2)

### Masking Task

In [84]:
# edge length
num_edges = len(hetero_graph_t.edata['label'][('patient', 'satisfaction', 'drug')])

train_mask = torch.zeros(num_edges, dtype=torch.bool).bernoulli(0.8)
val_mask = ~train_mask

In [85]:
hetero_graph_t

Graph(num_nodes={'condition': 1584, 'drug': 4522, 'patient': 251820},
      num_edges={('condition', 'symptom', 'patient'): 251820, ('drug', 'Easy', 'patient'): 186580, ('drug', 'Effectiveness', 'condition'): 147499, ('patient', 'satisfaction', 'drug'): 251820},
      metagraph=[('condition', 'patient', 'symptom'), ('patient', 'drug', 'satisfaction'), ('drug', 'patient', 'Easy'), ('drug', 'condition', 'Effectiveness')])

In [86]:
dec_graph_t = hetero_graph_t['patient', :, 'drug']

label_arr_t = torch.tensor(list(df_t['Satisfaction']))

model = Model(10, 20, 5, hetero_graph_t.etypes, 1)

patient_feats_t = hetero_graph_t.nodes['patient'].data['feature']
drug_feats_t = hetero_graph_t.nodes['drug'].data['feature']
cond_feats_t = hetero_graph_t.nodes['condition'].data['feature']


node_features_t = {'patient': patient_feats_t, 'drug': drug_feats_t, 'condition': cond_feats_t}

dec_graph_t = hetero_graph_t['patient', :, 'drug']

opt = torch.optim.Adam(model.parameters())

for epoch in range(300):
    logits = model(hetero_graph_t, node_features_t, dec_graph_t)
    loss = F.cross_entropy(logits[train_mask], label_arr_t[train_mask])
    opt.zero_grad()
    loss.backward(retain_graph=True)
    opt.step()

    if epoch % 5 == 0:
        acc_val = torchmetrics.functional.accuracy(logits[val_mask], label_arr_t[val_mask])
        print(f"--------- {epoch} ---------")
        print('val_acc : ', acc_val)

--------- 0 ---------
val_acc :  tensor(0.4679)
--------- 5 ---------
val_acc :  tensor(0.4520)
--------- 10 ---------
val_acc :  tensor(0.4725)
--------- 15 ---------
val_acc :  tensor(0.4842)
--------- 20 ---------
val_acc :  tensor(0.5048)
--------- 25 ---------
val_acc :  tensor(0.5107)
--------- 30 ---------
val_acc :  tensor(0.5154)
--------- 35 ---------
val_acc :  tensor(0.5240)
--------- 40 ---------
val_acc :  tensor(0.5347)
--------- 45 ---------
val_acc :  tensor(0.5441)
--------- 50 ---------
val_acc :  tensor(0.5522)
--------- 55 ---------
val_acc :  tensor(0.5579)
--------- 60 ---------
val_acc :  tensor(0.5653)
--------- 65 ---------
val_acc :  tensor(0.5716)
--------- 70 ---------
val_acc :  tensor(0.5813)
--------- 75 ---------
val_acc :  tensor(0.5874)
--------- 80 ---------
val_acc :  tensor(0.5930)
--------- 85 ---------
val_acc :  tensor(0.6038)
--------- 90 ---------
val_acc :  tensor(0.6156)
--------- 95 ---------
val_acc :  tensor(0.6252)
--------- 100 --------

### Inference

In [61]:
df_inf.head()

Unnamed: 0,Age,Condition,Date,Drug,DrugId,EaseofUse,Effectiveness,Reviews,Satisfaction,Sex,Sides,UsefulCount,patient_id
0,35-44,Birth Control,8/20/2015,lyza,164750,1,1,I have gained weight even with working out. I ...,0,Female,"Nausea , vomiting , headache , bloating , ...",1,0
1,45-54,Abnormally Long or Heavy Periods,1/26/2013,lysteda,154120,1,1,I took Lysteda for about 18 months. It defina...,0,Female,"Nausea , vomiting , diarrhea , and muscle p...",3,1
2,35-44,Abnormally Long or Heavy Periods,10/31/2012,lysteda,154120,1,1,"This medication changed my life, I have always...",1,Female,"Nausea , vomiting , diarrhea , and muscle p...",3,2
3,25-34,Abnormally Long or Heavy Periods,6/30/2012,lysteda,154120,1,0,I had heavy periods most of my adult life. Lar...,0,Female,"Nausea , vomiting , diarrhea , and muscle p...",29,3
4,25-34,Abnormally Long or Heavy Periods,4/24/2012,lysteda,154120,1,1,my bleeding was really bad to the point that i...,1,Female,"Nausea , vomiting , diarrhea , and muscle p...",7,4


In [62]:
# ### inference array
# patient_arr_inf = np.array(df_inf['patient_id'])
# drug_arr_inf = get_n_arr(df_inf, drug_dict, 'DrugId')
# cond_arr_inf = get_n_arr(df_inf, cond_dict, 'Condition')

# label_arr_inf = torch.tensor(list(df_inf['Satisfaction']))


# ### INFERENCE data
# hetero_graph_inf = dgl.heterograph({
#     ('patient', 'satisfaction', 'drug'): (patient_arr_inf, drug_arr_inf),
#     ('condition', 'symptom', 'patient'): (cond_arr_inf, patient_arr_inf),
#     ('drug', 'Easy', 'patient'): (drug_arr_inf[df_inf['EaseofUse']==1], patient_arr_inf[df_inf['EaseofUse']==1]),
#     ('drug', 'Effectiveness', 'condition'): (drug_arr_inf[df_inf['Effectiveness']==1], cond_arr_inf[df_inf['Effectiveness']==1])
#       })

# # inference patient embedding 
# patient_feat_lst_inf = [(df_inf['Age'][i], df_inf['Sex'][i]) for i in range(len(df_inf))]
# patient_embed_inf = get_embed(patient_feat_dict, patient_feat_lst_inf, 10)


# # inference condition embedding
# cond_lst_inf = [i for i in range(max(cond_arr_inf)+1)]
# cond_embed_inf = cond_embed_table(torch.LongTensor(cond_lst_inf))


# # inference_drug_feature embedding
# drug_embed_inf = drug_embed_table(torch.LongTensor([drug_feat_dict[i] for i in range(hetero_graph_inf.num_nodes('drug'))]))

# # inference data
# hetero_graph_inf.edges['satisfaction'].data['label'] = label_arr_inf
# hetero_graph_inf.nodes['patient'].data['feature'] = patient_embed_inf
# hetero_graph_inf.nodes['drug'].data['feature'] = drug_embed_inf
# hetero_graph_inf.nodes['condition'].data['feature'] = cond_embed_inf

# dec_graph_inf = hetero_graph_inf['patient', :, 'drug']

# label_arr_inf = torch.tensor(list(df_inf['Satisfaction']))

# # model = Model(10, 20, 5, hetero_graph_inf.etypes, 1)

# patient_feats_inf = hetero_graph_inf.nodes['patient'].data['feature']
# drug_feats_inf = hetero_graph_inf.nodes['drug'].data['feature']
# cond_feats_inf = hetero_graph_inf.nodes['condition'].data['feature']

# node_features_inf = {'patient': patient_feats_inf, 'drug': drug_feats_inf, 'condition': cond_feats_inf}

# dec_graph_inf = hetero_graph_inf['patient', :, 'drug']


# model.eval()

# with torch.no_grad():
#     test_logit = model(hetero_graph_inf, node_features_inf, dec_graph_inf)
# test_logit

# torch.argmax(test_logit, dim=1).shape

# torchmetrics.functional.accuracy(test_logit, label_arr_inf)

### inference

In [63]:
df_t.shape

(251820, 13)

In [87]:
hetero_graph_t

Graph(num_nodes={'condition': 1584, 'drug': 4522, 'patient': 251820},
      num_edges={('condition', 'symptom', 'patient'): 251820, ('drug', 'Easy', 'patient'): 186580, ('drug', 'Effectiveness', 'condition'): 147499, ('patient', 'satisfaction', 'drug'): 251820},
      metagraph=[('condition', 'patient', 'symptom'), ('patient', 'drug', 'satisfaction'), ('drug', 'patient', 'Easy'), ('drug', 'condition', 'Effectiveness')])

In [88]:
df_test = df_.iloc[hetero_graph_t.num_nodes('patient'):, :]

In [89]:
### added_inference array
patient_arr_inf = np.array(df_test['patient_id'])
drug_arr_inf = get_n_arr(df_test, drug_dict, 'DrugId')
cond_arr_inf = get_n_arr(df_test, cond_dict, 'Condition')

label_arr_inf = torch.tensor(list(df_test['Satisfaction']))

In [91]:
patient_arr_inf

array([251820, 251821, 251822, ..., 280124, 280125, 280126])

In [92]:
### add edges
hetero_graph_inf = dgl.add_edges(hetero_graph_t, patient_arr_inf, drug_arr_inf, etype='satisfaction')
hetero_graph_inf = dgl.add_edges(hetero_graph_inf, cond_arr_inf, patient_arr_inf, etype='symptom')
hetero_graph_inf = dgl.add_edges(hetero_graph_inf, drug_arr_inf[df_test['EaseofUse']==1], patient_arr_inf[df_test['EaseofUse']==1], etype='Easy')
hetero_graph_inf = dgl.add_edges(hetero_graph_inf, drug_arr_inf[df_test['Effectiveness']==1], cond_arr_inf[df_test['Effectiveness']==1], etype='Effectiveness')

In [93]:
hetero_graph_inf

Graph(num_nodes={'condition': 1584, 'drug': 4522, 'patient': 280127},
      num_edges={('condition', 'symptom', 'patient'): 280127, ('drug', 'Easy', 'patient'): 206555, ('drug', 'Effectiveness', 'condition'): 164096, ('patient', 'satisfaction', 'drug'): 280127},
      metagraph=[('condition', 'patient', 'symptom'), ('patient', 'drug', 'satisfaction'), ('drug', 'patient', 'Easy'), ('drug', 'condition', 'Effectiveness')])

In [94]:
hetero_graph_t

Graph(num_nodes={'condition': 1584, 'drug': 4522, 'patient': 251820},
      num_edges={('condition', 'symptom', 'patient'): 251820, ('drug', 'Easy', 'patient'): 186580, ('drug', 'Effectiveness', 'condition'): 147499, ('patient', 'satisfaction', 'drug'): 251820},
      metagraph=[('condition', 'patient', 'symptom'), ('patient', 'drug', 'satisfaction'), ('drug', 'patient', 'Easy'), ('drug', 'condition', 'Effectiveness')])

In [95]:
# feature embedding
patient_embed_inf = patient_embed

cond_embed_inf = cond_embed

drug_embed_inf = drug_embed

In [97]:
hetero_graph_inf

Graph(num_nodes={'condition': 1584, 'drug': 4522, 'patient': 280127},
      num_edges={('condition', 'symptom', 'patient'): 280127, ('drug', 'Easy', 'patient'): 206555, ('drug', 'Effectiveness', 'condition'): 164096, ('patient', 'satisfaction', 'drug'): 280127},
      metagraph=[('condition', 'patient', 'symptom'), ('patient', 'drug', 'satisfaction'), ('drug', 'patient', 'Easy'), ('drug', 'condition', 'Effectiveness')])

In [98]:
## node feature and labeling
label_arr_inf = torch.tensor(list(df_['Satisfaction']))

# inference data
hetero_graph_inf.edges['satisfaction'].data['label'] = label_arr_inf
hetero_graph_inf.nodes['patient'].data['feature'] = patient_embed_inf
hetero_graph_inf.nodes['drug'].data['feature'] = drug_embed_inf
hetero_graph_inf.nodes['condition'].data['feature'] = cond_embed_inf

In [99]:
hetero_graph_inf

Graph(num_nodes={'condition': 1584, 'drug': 4522, 'patient': 280127},
      num_edges={('condition', 'symptom', 'patient'): 280127, ('drug', 'Easy', 'patient'): 206555, ('drug', 'Effectiveness', 'condition'): 164096, ('patient', 'satisfaction', 'drug'): 280127},
      metagraph=[('condition', 'patient', 'symptom'), ('patient', 'drug', 'satisfaction'), ('drug', 'patient', 'Easy'), ('drug', 'condition', 'Effectiveness')])

In [100]:
patient_feats_inf = hetero_graph_inf.nodes['patient'].data['feature']
drug_feats_inf = hetero_graph_inf.nodes['drug'].data['feature']
cond_feats_inf = hetero_graph_inf.nodes['condition'].data['feature']

node_features_inf = {'patient': patient_feats_inf, 'drug': drug_feats_inf, 'condition': cond_feats_inf}


In [101]:
node_features_inf = {'patient': patient_feats_inf, 'drug': drug_feats_inf, 'condition': cond_feats_inf}

dec_graph_inf = hetero_graph_inf['patient', :, 'drug']


In [102]:
model.eval()

with torch.no_grad():
    test_logit = model(hetero_graph_inf, node_features_inf, dec_graph_inf)
test_logit

tensor([[-0.2010,  0.1733],
        [-0.1880,  0.1499],
        [ 0.0773,  0.0511],
        ...,
        [ 0.7915, -0.5857],
        [-0.1894,  0.5945],
        [-0.1894,  0.5945]])

In [103]:
start = hetero_graph_t.num_nodes('patient')
torch.argmax(test_logit, dim=1)[start : start+100]

tensor([0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
        0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
        0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
        1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
        1, 1, 1, 1])

In [105]:
label_arr_inf[start : start+100]

tensor([0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 1, 1])

In [108]:
inference_acc =torchmetrics.functional.accuracy(test_logit[start:], label_arr_inf[start:])
print("acc : ", inference_acc.item())

acc :  0.6967180967330933


## etc

- `side effect` 는 나누어서 컬럼으로 부여

In [None]:
df_sides = df_['Sides'].str.split(',')
df_sides = pd.DataFrame(df_sides.tolist())
df_sides.columns = ['side'+ str(i) for i in range(17)]
df_sides.head()

In [97]:
patient_fea = df_[['Age', 'Sex']]
fea_df = pd.concat([patient_fea, df_sides], axis=1)
fea_df.head()

Unnamed: 0,Age,Sex,side0,side1,side2,side3,side4,side5,side6,side7,side8,side9,side10,side11,side12,side13,side14,side15,side16
0,75 or over,Male,Drowsiness,dizziness,dry mouth /nose/throat,headache,upset stomach,constipation,or trouble sleeping may occur.,,,,,,,,,,
1,25-34,Female,Drowsiness,dizziness,dry mouth /nose/throat,headache,upset stomach,constipation,or trouble sleeping may occur.,,,,,,,,,,
2,25-34,Female,Nausea,vomiting,headache,bloating,breast tenderness,swelling of the ankles /feet (fluid retention),or weight change may occur.,,,,,,,,,,
3,45-54,Female,Nausea,vomiting,headache,bloating,breast tenderness,swelling of the ankles /feet (fluid retention),or weight change may occur.,,,,,,,,,,
4,55-64,Male,Temporary burning,stinging,dryness in the nose,runny nose,and sneezing may occur.,,,,,,,,,,,,


### train/val/test Split

In [60]:
label_df = df_[['Satisfaction']]
x = df_.drop('Satisfaction', axis=1)

In [459]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, label_df, test_size=0.2, random_state=123, stratify=label_df)

In [462]:
x_train['Satisfaction'] = y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['Satisfaction'] = y_train
