# Imports

In [1]:
import torch
import re
import torch.nn as nn
import pandas as pd
import simple_icd_10 as icd
import numpy as np

# Feature Engineering

In [2]:
df = pd.read_csv("dataset.csv")

In [3]:
hpo_vocab_df = pd.Series(', '.join(df['hpo_features'].to_list()), name='hpo_vocab')
icd10_vocab_df = pd.Series(', '.join(df['icd10_codes'].to_list()), name='icd10_vocab')

In [4]:
hpo_vocab = set(re.split(r'\s+|;|,', hpo_vocab_df.iloc[0]))
icd10_vocab = set(re.split(r'\s+|;|,', icd10_vocab_df.iloc[0]))

# Model Preparation

In [6]:
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")
val_df = pd.read_csv("val_data.csv")

In [5]:
def convert(input, hpo_to_ix):
    idxs = []

    for w in input:
        if w in hpo_to_ix.keys():
          idxs.append(hpo_to_ix[w])




    return torch.tensor(idxs, dtype=torch.long)


def vec_to_word(input, ix_to_word):
    words = []

    for w in input:
        if w in ix_to_word.keys():
            words.append(ix_to_word[w])

    return words


In [7]:
EMDEDDING_DIM = 100

features,target = train_df['hpo_features'].copy(), train_df['icd10_codes'].copy()
val_features,val_target = val_df['hpo_features'].copy(), val_df['icd10_codes'].copy()
test_features,test_target = test_df['hpo_features'].copy(), test_df['icd10_codes'].copy()

hpo_vocab_size = len(hpo_vocab)
icd10_vocab_size = len(icd10_vocab)

hpo_to_ix = {word:ix for ix, word in enumerate(hpo_vocab)}
ix_to_word = {ix:word for ix, word in enumerate(hpo_vocab)}
icd10_to_ix = {word:ix for ix, word in enumerate(icd10_vocab)}
ix_to_icd10 = {ix:word for ix, word in enumerate(icd10_vocab)}

In [11]:
class CBOW(torch.nn.Module):
    def __init__(self, hpo_vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(hpo_vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim,128)
        self.activation_function1 = nn.ReLU()
        
        #out: 1 x hpo_vocab_size
        self.linear2 = nn.Linear(128, icd10_vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)

        return out


### Training

In [12]:

model = CBOW(hpo_vocab_size, EMDEDDING_DIM)

loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

correct_predicted_train = 0

#TRAINING
for epoch in range(100):
    train_loss = 0
    val_loss = 0

    correct_predicted_train = 0
    correct_predicted_val = 0


    for x, y in zip(features,target):

        input_vector = convert(re.split(r'\s+|;|,', x), hpo_to_ix) 
        logs = model(input_vector)
        train_loss += loss_function(logs, torch.tensor([icd10_to_ix[y]]))


        pred = [torch.argmax(logs).item()]
        pred_word = vec_to_word(pred,ix_to_icd10)
        correct_predicted_train +=  1 if pred_word[0] == y else 0


    with torch.no_grad():
        for x, y in zip(val_features, val_target):
            input_vector = convert(re.split(r'\s+|;|,', x), hpo_to_ix) 
            logs = model(input_vector)
            val_loss += loss_function(logs, torch.tensor([icd10_to_ix[y]]))

            pred = [torch.argmax(logs).item()]
            pred_word = vec_to_word(pred,ix_to_icd10)
            correct_predicted_val +=  1 if pred_word[0] == y else 0




    #optimize at the end of each epoch
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    if epoch%10 == 0:
        print(" train accuracy: {}".format(correct_predicted_train/len(target)))
        print(" val accuracy: {}".format(correct_predicted_val/len(val_df)))


 train accuracy: 0.0
 val accuracy: 0.0
 train accuracy: 0.88
 val accuracy: 0.625
 train accuracy: 0.88
 val accuracy: 0.625
 train accuracy: 0.88
 val accuracy: 0.625
 train accuracy: 0.88
 val accuracy: 0.625
 train accuracy: 0.88
 val accuracy: 0.625
 train accuracy: 0.88
 val accuracy: 0.625
 train accuracy: 0.88
 val accuracy: 0.625
 train accuracy: 0.88
 val accuracy: 0.625
 train accuracy: 0.88
 val accuracy: 0.625


### Inference

In [13]:
correct_predicted_test = 0

for x, y in zip(test_features, test_target):
    input_vector = convert(re.split(r'\s+|;|,', x), hpo_to_ix) 
    logs = model(input_vector)
    pred = [torch.argmax(logs).item()]
    pred_word = vec_to_word(pred,ix_to_icd10)

    print("predicted: {}".format(icd.get_description(pred_word[0])))

    correct_predicted_test +=  1 if pred_word[0] == y else 0

predicted: Sepsis, unspecified
predicted: Pneumonia, unspecified
predicted: Acute subendocardial myocardial infarction
predicted: Sepsis, unspecified
predicted: Sepsis, unspecified
predicted: Pneumonia, unspecified
predicted: Intracerebral haemorrhage, unspecified
predicted: Acute subendocardial myocardial infarction
predicted: Sepsis, unspecified
