In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
import altair as alt
import streamlit as st
from vega_datasets import data
import icd10

In [5]:
# Load the trial table straight from the archive.
# The CSV member is parsed while the archive is still open, and both context
# managers close their handles once the frame has been built, so nothing is
# read from (or left dangling on) an archive that has already been closed.
with zipfile.ZipFile("smaller.zip") as myzip:
    with myzip.open("smaller.csv") as no1:
        # First CSV column is used as the frame's index.
        df = pd.read_csv(no1, index_col=0)

In [6]:
def col_to_list(column):
  """
  Turn every stringified list in ``column`` into a real list of strings.

  For each entry the characters ``[``, ``]``, ``'`` and ``"`` are removed and
  the remaining text is split on ``", "``.
  Intended for the "drugs", "diseases" and "icdcodes" columns.

  Returns a plain Python list with one list of strings per entry.
  """
  artefacts = {"[", "]", "'", "\""}

  parsed = []
  for entry in column:
    cleaned = "".join(ch for ch in entry if ch not in artefacts)
    parsed.append(cleaned.split(", "))

  return parsed


def remove_abbreviations(l):
  """
  Cut each string in ``l`` at its first `` (`` and keep only the part before it.

  Strings without `` (`` are returned unchanged. Returns a new list.
  """
  shortened = []
  for name in l:
    # Only the text before the first " (" is needed, so one split is enough.
    shortened.append(name.split(" (", 1)[0])
  return shortened

In [7]:
# Convert the stringified list columns into real Python lists.
# NOTE(review): this cell is not idempotent - running it a second time feeds
# lists (not strings) into col_to_list; re-run the load cell first.
for list_col in ("drugs", "diseases", "icdcodes"):
    df[list_col] = col_to_list(df[list_col])

# Keep only the part of each disease name that precedes the first " (".
df["diseases"] = [remove_abbreviations(names) for names in df["diseases"]]

In [8]:
def get_desc(x):
  """
  Look up the description of an ICD-10 code.

  x: the code passed to ``icd10.find``.

  Returns the ``description`` attribute of the match, or 'Other' when
  ``icd10.find`` returns None.
  """
  code = icd10.find(x)
  # Identity test: `== None` would defer to any custom __eq__ on the result.
  if code is None:
    return 'Other'
  return code.description



def get_chapter(x):
  """
  Return the ``chapter`` attribute of the ICD-10 entry found for ``x``.

  Falls back to 'Other' whenever reading the attribute raises, which includes
  the case where ``icd10.find`` returned None.
  """
  match = icd10.find(x)
  try:
    return match.chapter
  except Exception:
    # Best-effort lookup: any failure to read the attribute maps to 'Other'.
    return 'Other'



def get_block(x):
  """
  Return the ``block`` attribute of the ICD-10 entry found for ``x``.

  Falls back to 'Other' whenever reading the attribute raises, which includes
  the case where ``icd10.find`` returned None.
  """
  match = icd10.find(x)
  try:
    return match.block
  except Exception:
    # Best-effort lookup: any failure to read the attribute maps to 'Other'.
    return 'Other'


def get_block_desc(x):
  """
  Return the ``block_description`` attribute of the ICD-10 entry found for ``x``.

  Falls back to 'Other' whenever reading the attribute raises, which includes
  the case where ``icd10.find`` returned None.
  """
  match = icd10.find(x)
  try:
    return match.block_description
  except Exception:
    # Best-effort lookup: any failure to read the attribute maps to 'Other'.
    return 'Other'

In [9]:
def _first_code(codes):
    """Return the leading element of one row's ICD code list."""
    return codes[0]


# One representative code per row: the first entry of its 'icdcodes' list.
df['icdcodes_first'] = df['icdcodes'].apply(_first_code)

In [10]:
# Attach ICD-10 metadata derived from the first code of every row.
df['description'] = df['icdcodes_first'].apply(get_desc)
df['chapter'] = df['icdcodes_first'].apply(get_chapter)
df['block'] = df['icdcodes_first'].apply(get_block)
df['block_desc'] = df['icdcodes_first'].apply(get_block_desc)

# The selection below needs 'participant_count'. The notebook's only rename
# from 'n_participants' lives in a later cell, where it can no longer match
# anything, so apply it here; it is a no-op when the column already has the
# new name.
df = df.rename(columns={"n_participants": "participant_count"})

# Keep a fixed column set and order. .copy() makes the result an independent
# frame, so the .loc assignments in later cells do not raise
# SettingWithCopyWarning.
df = df[['nct_id', 'status', 'phase', 'diseases', 'icdcodes', 'drugs',
         'smiless', 'study_date', 'country', 'participant_count', 'outcome',
         'icdcodes_first', 'description', 'chapter', 'block', 'block_desc']].copy()

In [14]:
# hand curation
# Manual chapter assignments, keyed by the row's first ICD code.
chapter_overrides_pre_fill = {
    'B00.0': 'I',
    'B00.81': 'I',
    'F32.A': 'V',
    'M45.A2': 'XIII',
    'M45.A1': 'XIII',
    'M31.11': 'XIII',
    'J82.83': 'X',
}
for icd_code, chapter in chapter_overrides_pre_fill.items():
    df.loc[df['icdcodes_first'] == icd_code, 'chapter'] = chapter

# Every row whose chapter is still missing is assigned chapter 'II'.
# NOTE(review): blanket default - confirm that all remaining rows really
# belong to chapter II.
df.loc[df['chapter'].isna(), 'chapter'] = 'II'

chapter_overrides_post_fill = {
    'C7A': 'II',
    'J00': 'X',
    'K94': 'XI',
    'O00': 'XV',
    'O9A': 'XV',
}
for icd_code, chapter in chapter_overrides_post_fill.items():
    df.loc[df['icdcodes_first'] == icd_code, 'chapter'] = chapter

In [None]:
# Rename the n_participants column to participant_count.
# NOTE(review): in top-to-bottom order the frame has already been restricted
# to a column list that contains 'participant_count' and not 'n_participants',
# so this looks like a no-op here - confirm whether it was meant to run before
# that selection.
df = df.rename({"n_participants": "participant_count"}, axis="columns")


In [None]:
# Keep the first row for every nct_id.
# NOTE(review): df3 is not defined anywhere in the visible notebook - confirm
# which cell creates it, otherwise this fails on a fresh kernel.
df3 = df3.drop_duplicates(subset=['nct_id'])

In [None]:
# Page 3 dataviz

def mol2fparr(mol):
    """
    Convert a molecule into a NumPy array of its Morgan fingerprint.

    mol: object accepted by ``AllChem.GetMorganFingerprintAsBitVect``
         (presumably an RDKit Mol - confirm against the caller).

    Returns the array filled by ``DataStructs.ConvertToNumpyArray``.
    No error handling: an input the fingerprint call rejects will raise.
    """
    # Second positional argument is 2 (presumably the Morgan radius - confirm).
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2)

    # Starts as a zero-length buffer; ConvertToNumpyArray is expected to fill
    # it in place.
    bits = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(fingerprint, bits)
    return bits

In [None]:
# Two-component projection, later plotted as PCA1 / PCA2.
# random_state is pinned so the projection is reproducible: assuming PCA is
# sklearn.decomposition.PCA, the default svd_solver='auto' may choose the
# randomized solver, whose output otherwise varies between runs.
pca = PCA(n_components=2, random_state=0)

In [None]:
# Add a 'molecule' column built from 'smiless_first' (modifies df_new2 in place).
# NOTE(review): df_new2 is not defined anywhere in the visible notebook -
# confirm which cell creates it.
PandasTools.AddMoleculeColumnToFrame(
    df_new2, 'smiless_first', 'molecule', includeFingerprints=True
)

# Treat the literal string 'None' as missing, then drop every row that has any
# missing value.
df_new2 = df_new2.replace(to_replace='None', value=np.nan).dropna()

# One fingerprint array per remaining molecule, stacked into a 2-D matrix.
X = np.asarray([mol2fparr(mol) for mol in df_new2.molecule])

res = pca.fit_transform(X)
print(res.shape)

# nct_id -> block description lookup: exact duplicate rows removed, missing
# descriptions labelled 'others'.
df_desc = (
    df[['nct_id', 'block_desc']]
    .drop_duplicates(keep='first')
    .fillna('others')
)

# Store the two principal components next to the molecules.
df_new2['PCA1'] = res[:, 0]
df_new2['PCA2'] = res[:, 1]

df_final = df_new2.merge(
    df_desc, how='left', left_on='nct_id', right_on='nct_id'
)

plot_columns = ['nct_id', 'smiless_first', 'block_desc', 'PCA1', 'PCA2']
df_final2 = df_final[plot_columns]

# Only the first 5000 rows are plotted (presumably to stay within Altair's
# default row limit - confirm).
alt.Chart(df_final2[:5000]).mark_point().encode(
    x='PCA1',
    y='PCA2',
    color='block_desc',
    tooltip=['nct_id', 'smiless_first'],
).interactive()