In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
import altair as alt
import streamlit as st
from vega_datasets import data
import icd10

In [5]:
with zipfile.ZipFile("smaller.zip") as myzip:    
    no1 = myzip.open("smaller.csv")

#Now, we can read in the data
df = pd.read_csv(eval('no1'), index_col=0)

In [6]:
def col_to_list(column):
  """
  String to list of strings for an entire column
  Can be applied to "drugs", "diseases" and "icdcodes"
  """

  def remove_artefacts(entry):

    return "".join([c for c in entry if c not in ["[", "]", "'", "\""]])
  
  column = [remove_artefacts(e) for e in column]
  column = [e.split(", ") for e in column]

  return column


def remove_abbreviations(l):

  return [it.split(sep=" (")[0] for it in l]

In [7]:
# reformat train, validation and test data, does not work in a for loop
df["drugs"] = col_to_list(df["drugs"])
df["diseases"] = col_to_list(df["diseases"])
df["diseases"] = [remove_abbreviations(l) for l in df["diseases"]]
df["icdcodes"] = col_to_list(df["icdcodes"])

In [8]:
def get_desc(x): 
  code = icd10.find(x)
  #print(code)
  desc = ''
  if (code == None):
    desc= 'Other' 
  else: 
    desc =  code.description
  return desc



def get_chapter(x): 
  code = icd10.find(x)
  #print(code)
  desc = 'Other'

  try: 
    desc =  code.chapter
  except Exception: 
    pass 
  return desc



def get_block(x): 
  code = icd10.find(x)
  desc = 'Other'
  try: 
    desc =  code.block
  except Exception: 
    pass 

  return desc


def get_block_desc(x): 
  code = icd10.find(x)
  #print(code)
  desc = 'Other'
  try: 
    desc =  code.block_description
  except Exception: 
    pass 
  return desc

In [9]:
df['icdcodes_first'] = df['icdcodes'].apply(lambda x: x[0])

In [10]:
df['description'] = df['icdcodes_first'].apply(get_desc)
df['chapter'] = df['icdcodes_first'].apply(get_chapter)
df['block'] = df['icdcodes_first'].apply(get_block)
df['block_desc'] = df['icdcodes_first'].apply(get_block_desc)

df = df[['nct_id', 'status', 'phase', 'diseases', 'icdcodes', 'drugs',
       'smiless', 'study_date', 'country', 'participant_count', 'outcome',
       'icdcodes_first', 'description', 'chapter', 'block', 'block_desc']]

In [13]:
df

Unnamed: 0,nct_id,status,phase,diseases,icdcodes,drugs,smiless,study_date,country,participant_count,outcome,icdcodes_first,description,chapter,block,block_desc
0,NCT01288573,completed,phase 1/phase 2,"[ewings sarcoma/soft tissue sarcoma, neuroblas...","[C71.7, C71.9, C79.31, D33.0, D33.1, D33.2, D4...","[plerixafor, plerixafor, plerixafor]",['C(N1CCCNCCNCCCNCC1)C1=CC=C(CN2CCCNCCNCCCNCC2...,2011-01-28,Belgium,,1,C71.7,Malignant neoplasm of brain stem,,,
1,NCT01288573,completed,phase 1/phase 2,"[ewings sarcoma/soft tissue sarcoma, neuroblas...","[C71.7, C71.9, C79.31, D33.0, D33.1, D33.2, D4...","[plerixafor, plerixafor, plerixafor]",['C(N1CCCNCCNCCCNCC1)C1=CC=C(CN2CCCNCCNCCCNCC2...,2011-01-28,Czechia,,1,C71.7,Malignant neoplasm of brain stem,,,
2,NCT01288573,completed,phase 1/phase 2,"[ewings sarcoma/soft tissue sarcoma, neuroblas...","[C71.7, C71.9, C79.31, D33.0, D33.1, D33.2, D4...","[plerixafor, plerixafor, plerixafor]",['C(N1CCCNCCNCCCNCC1)C1=CC=C(CN2CCCNCCNCCCNCC2...,2011-01-28,Denmark,,1,C71.7,Malignant neoplasm of brain stem,,,
3,NCT01288573,completed,phase 1/phase 2,"[ewings sarcoma/soft tissue sarcoma, neuroblas...","[C71.7, C71.9, C79.31, D33.0, D33.1, D33.2, D4...","[plerixafor, plerixafor, plerixafor]",['C(N1CCCNCCNCCCNCC1)C1=CC=C(CN2CCCNCCNCCCNCC2...,2011-01-28,France,,1,C71.7,Malignant neoplasm of brain stem,,,
4,NCT01288573,completed,phase 1/phase 2,"[ewings sarcoma/soft tissue sarcoma, neuroblas...","[C71.7, C71.9, C79.31, D33.0, D33.1, D33.2, D4...","[plerixafor, plerixafor, plerixafor]",['C(N1CCCNCCNCCCNCC1)C1=CC=C(CN2CCCNCCNCCCNCC2...,2011-01-28,Germany,,1,C71.7,Malignant neoplasm of brain stem,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352647,NCT01364649,completed,phase 3,[treatment outcome],"[Z01.12, Z92.89, Z75.2, M27.59, Z53.9, Z91.19,...","[vortioxetine, escitalopram, placebo]","['CC1=CC=C(SC2=CC=CC=C2N2CCNCC2)C(C)=C1', 'CN(...",2011-05-31,Canada,217.0,1,Z01.12,Encounter for hearing conservation and treatment,XXI,Z00-Z99,Factors influencing health status and contact ...
1352648,NCT01364649,completed,phase 3,[treatment outcome],"[Z01.12, Z92.89, Z75.2, M27.59, Z53.9, Z91.19,...","[vortioxetine, escitalopram, placebo]","['CC1=CC=C(SC2=CC=CC=C2N2CCNCC2)C(C)=C1', 'CN(...",2011-05-31,Canada,207.0,1,Z01.12,Encounter for hearing conservation and treatment,XXI,Z00-Z99,Factors influencing health status and contact ...
1352649,NCT01364649,completed,phase 3,[treatment outcome],"[Z01.12, Z92.89, Z75.2, M27.59, Z53.9, Z91.19,...","[vortioxetine, escitalopram, placebo]","['CC1=CC=C(SC2=CC=CC=C2N2CCNCC2)C(C)=C1', 'CN(...",2011-05-31,Canada,217.0,1,Z01.12,Encounter for hearing conservation and treatment,XXI,Z00-Z99,Factors influencing health status and contact ...
1352650,NCT01364649,completed,phase 3,[treatment outcome],"[Z01.12, Z92.89, Z75.2, M27.59, Z53.9, Z91.19,...","[vortioxetine, escitalopram, placebo]","['CC1=CC=C(SC2=CC=CC=C2N2CCNCC2)C(C)=C1', 'CN(...",2011-05-31,Canada,207.0,1,Z01.12,Encounter for hearing conservation and treatment,XXI,Z00-Z99,Factors influencing health status and contact ...


In [11]:
# hand curation
df2.loc[df2['icdcodes_first'] == 'B00.0', 'chapter'] = 'I'
df2.loc[df2['icdcodes_first'] == 'B00.81', 'chapter'] = 'I'
df2.loc[df2['icdcodes_first'] == 'F32.A', 'chapter'] = 'V'
df2.loc[df2['icdcodes_first'] == 'M45.A2', 'chapter'] = 'XIII'
df2.loc[df2['icdcodes_first'] == 'M45.A1', 'chapter'] = 'XIII'
df2.loc[df2['icdcodes_first'] == 'M31.11', 'chapter'] = 'XIII'
df2.loc[df2['icdcodes_first'] == 'J82.83', 'chapter'] = 'X'

df2.loc[df2['chapter'].isna(), 'chapter'] = 'II'
df2.loc[df2['icdcodes_first'] == 'C7A', 'chapter'] = 'II'
df2.loc[df2['icdcodes_first'] == 'J00', 'chapter'] = 'X'
df2.loc[df2['icdcodes_first'] == 'K94', 'chapter'] = 'XI'
df2.loc[df2['icdcodes_first'] == 'O00', 'chapter'] = 'XV'
df2.loc[df2['icdcodes_first'] == 'O9A', 'chapter'] = 'XV'

NameError: name 'df2' is not defined

In [None]:
# Change n_participants to participant_count

df2 = df2.rename(columns={"n_participants": "participant_count"})


In [None]:
df3 = df3.drop_duplicates('nct_id', keep='first')