In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
pd.options.mode.chained_assignment = None  # default='warn'
from matplotlib.colors import ListedColormap

def query_tree(tdf,search_term):
    """Return the ids found at or below ``search_term`` in the tree table.

    Parameters
    ----------
    tdf : DataFrame
        Table with a ``mesh_id`` column and a string ``mesh_treenumbers`` column.
    search_term : str
        The ``mesh_id`` whose sub-tree(s) should be collected.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated ``mesh_id`` values of every row whose tree number
        starts with one of the tree numbers of ``search_term``; empty when
        ``search_term`` is not present in ``tdf``.
    """
    # an id can sit at several places in the tree, i.e. own several tree numbers
    root_numbers = tdf.loc[tdf.mesh_id == search_term, 'mesh_treenumbers'].tolist()
    collected = []
    for root in root_numbers:
        # a row belongs to the sub-tree when its tree number begins with `root`
        in_subtree = tdf.mesh_treenumbers.str[:len(root)] == root
        collected.extend(tdf.loc[in_subtree, 'mesh_id'].to_list())
    return np.unique(collected)

def query_tags(tadf, mesh_list, time_range):
    """Select the rows of ``tadf`` tagged with any id in ``mesh_list`` within a year window.

    Parameters
    ----------
    tadf : DataFrame
        Table with at least ``mesh_id`` and numeric ``year`` columns.
    mesh_list : array-like
        Ids to keep.
    time_range : sequence
        ``[first_year, last_year]``; both entries are passed through ``int`` and
        both bounds are inclusive.

    Returns
    -------
    DataFrame
        The matching rows of ``tadf`` with their original index and columns.
    """
    first_year = int(time_range[0])
    last_year = int(time_range[1])
    has_term = np.isin(tadf.mesh_id, mesh_list)
    in_window = (tadf.year <= last_year) & (tadf.year >= first_year)
    return tadf.loc[has_term & in_window, :]

def get_counts_per_year(matchdf,time_range):
    """Count the distinct series (``geo_id``) per year, with a row for every year in range.

    Parameters
    ----------
    matchdf : DataFrame
        Table with ``geo_id`` and ``year`` columns; a series may occupy several rows.
    time_range : sequence
        ``[first_year, last_year]``; both entries are passed through ``int``.
        Both bounds are inclusive, so ``last_year`` always gets a row.

    Returns
    -------
    DataFrame
        Columns ``year`` and ``c`` (number of distinct ``geo_id`` values in that
        year, 0 where nothing was found), ordered by year.
    """
    first_year = int(time_range[0])
    last_year = int(time_range[1])
    # distinct series per year; rows with a missing year or geo_id are not counted
    cpy = matchdf.groupby('year').geo_id.nunique().reset_index(name='c')
    # make sure that all years are in df -- the end of the range is included so
    # every series has the same length regardless of which years have data
    all_years = pd.DataFrame({'year': np.arange(first_year, last_year + 1)})
    cpy_merge = pd.merge(all_years, cpy, on='year', how='outer')
    cpy_merge['c'] = cpy_merge['c'].fillna(0)
    return cpy_merge

def get_samples_per_year(matchdf,time_range):
    """Sum the number of samples per year, with a row for every year in range.

    Parameters
    ----------
    matchdf : DataFrame
        Table with ``geo_id``, ``year`` and numeric ``nsamples`` columns; a
        series may occupy several rows.
    time_range : sequence
        ``[first_year, last_year]``; both entries are passed through ``int``.
        Both bounds are inclusive, so ``last_year`` always gets a row.

    Returns
    -------
    DataFrame
        Columns ``year`` and ``nsamples`` (0 where nothing was found), ordered by year.
    """
    first_year = int(time_range[0])
    last_year = int(time_range[1])
    # collapse repeated rows of a series to a single value before summing, so a
    # series listed several times is not counted several times
    per_series = matchdf.groupby(['geo_id', 'year']).nsamples.mean().reset_index()
    spy = per_series.groupby('year').nsamples.sum().reset_index()
    # the end of the range is included so every series has the same length
    all_years = pd.DataFrame({'year': np.arange(first_year, last_year + 1)})
    spy = pd.merge(all_years, spy, on='year', how='outer')
    spy['nsamples'] = spy['nsamples'].fillna(0)
    return spy

# --- tree table (one row per mesh_id / tree number) ---
path = '../../data/final'
tree_file = 'mesh.pkl'
trdf = pd.read_pickle(os.path.join(path, tree_file))

# --- tag table: derive the year from the first four characters of `date`,
# --- then attach the tree numbers of each tagged mesh_id
tag_file ='geo_filtered.pkl'
tadf = pd.read_pickle(os.path.join(path, tag_file))
tadf = tadf.assign(year=tadf.date.str[:4].astype(int))
tadf = tadf.merge(trdf[['mesh_id', 'mesh_treenumbers']], on='mesh_id', how='left')

# --- top topics ranked by the network metrics ---
rank_file ='All_Time_Top.pkl'
rankdf = pd.read_pickle(os.path.join(path, rank_file))

# --- top topics ranked by number of series, restricted to category 'C' ---
countsdf = (
    pd.read_pickle('../../data/final/meshids_rankedby_NSeries.pkl')
    .merge(trdf, on='mesh_id', how='left')
)
countsdf = countsdf.loc[np.isin(countsdf.category, ['C'])].reset_index(drop=True)

In [None]:
# generate dataframes with samples and counts over time

use_graph_or_counts = 1 # for selection: 0 for graph, 1 for counts
ntopdisease = 200
min_year = '2000'
max_year = '2017'

if use_graph_or_counts == 0:
    attribute = 'Diseases' # ['Diseases', 'Drugs']
    metric = 'EigenCentrality' # ['Degree', 'EigenCentrality', 'PageRank']
    attr = list(rankdf.columns.levels[0])
    metrics = list(rankdf.columns.levels[1])
    mesh_ids = list(rankdf[attribute][metric][:ntopdisease])
elif use_graph_or_counts == 1:
    # NOTE(review): .loc slices by label and includes the end label, so this takes
    # ntopdisease + 1 rows of countsdf (rows, not distinct ids) -- confirm intended.
    mesh_ids = countsdf.loc[:ntopdisease,'mesh_id'].to_list()
else:
    # without this an unknown selector would silently reuse a stale `mesh_ids`
    raise ValueError('use_graph_or_counts must be 0 (graph) or 1 (counts)')


# .copy(): the columns added below belong on an independent frame, not on a slice of trdf
top_d_df = trdf.loc[np.isin(trdf.mesh_id, mesh_ids),:].copy()
# build 3 levels of hierarchy from the length of the tree-number string:
# fewer than 4 characters -> 1, 4 to 20 characters -> 2, more than 20 -> 3.
# The three steps cannot interfere because the assigned codes (1, 2, 3) are all below 4.
top_d_df['level'] = top_d_df.mesh_treenumbers.str.len()
top_d_df.loc[top_d_df['level']<4,'level'] = 1
top_d_df.loc[(top_d_df['level']>=4) & (top_d_df['level']<=20),'level'] = 2
top_d_df.loc[top_d_df['level']>20,'level'] = 3
top_d_df = top_d_df.reset_index(drop=True)

# top_d_df has one row per tree number, so an id can occur several times;
# process each (id, heading) pair once so the yearly frames hold no repeated rows
disease_pairs = top_d_df[['mesh_id', 'mesh_heading']].drop_duplicates()
mesh_ids = disease_pairs.mesh_id.to_list()
mesh_headings = disease_pairs.mesh_heading.to_list()

time_range = [min_year, max_year]
count_frames = []
sample_frames = []
for iid, ihead in zip(mesh_ids, mesh_headings):
    mesh_list = query_tree(trdf,iid)
    # tadf already carries `year`, which query_tags filters on; copy before adding a column
    matchdf = query_tags(tadf, mesh_list, time_range).copy()
    matchdf['aux'] = 1
    cpy = get_counts_per_year(matchdf,time_range)
    cpy['mesh_id'] = iid
    cpy['mesh_heading'] = ihead
    spy = get_samples_per_year(matchdf,time_range)
    spy['mesh_id'] = iid
    spy['mesh_heading'] = ihead
    count_frames.append(cpy)
    sample_frames.append(spy)

# concatenate once: DataFrame.append was removed in pandas 2.0 and re-copies on every call
countdf = pd.concat(count_frames) if count_frames else pd.DataFrame()
sampledf = pd.concat(sample_frames) if sample_frames else pd.DataFrame()

# add parent column to df
n = 1 # start with level 1
# create the column up front so it exists even when no level-1 row is present
top_d_df['parent'] = pd.Series(np.nan, index=top_d_df.index, dtype=object)
l1_tree_list = top_d_df.loc[top_d_df.level==n].mesh_treenumbers.tolist()
l1_mesh_headings = top_d_df.loc[top_d_df.level==n].mesh_heading.tolist()
for itree, iheading in zip(l1_tree_list, l1_mesh_headings):
    nchar = len(itree)
    # a level-2 row is a child when its tree number starts with the level-1 tree number
    top_d_df.loc[(top_d_df.level==n+1)&(top_d_df.mesh_treenumbers.str[:nchar]==itree),'parent'] = iheading
# countsdf can list an id more than once; de-duplicate the lookup so the merge does not multiply rows
top_d_df = pd.merge(countsdf[['mesh_id', 'n']].drop_duplicates(),top_d_df, on=['mesh_id'], how='inner')
top_d_df = top_d_df.sort_values('n', ascending=False)

top_d_df.to_pickle('../../data/final/top_diseases_for_plotting.pkl')
countdf.to_pickle('../../data/final/countsbyyear_for_plotting.pkl')
sampledf.to_pickle('../../data/final/samplesbyyear_for_plotting.pkl')

In [None]:
sampledf = pd.read_pickle('../../data/final/samplesbyyear_for_plotting.pkl')
countdf = pd.read_pickle('../../data/final/countsbyyear_for_plotting.pkl')
top_d_df = pd.read_pickle('../../data/final/top_diseases_for_plotting.pkl')
counts_or_samples = 1 # 0: plot series counts, anything else: plot sample counts


# both frames expose the plotted value as column 'c'
sampledf = sampledf.rename(index=str, columns = {'nsamples':'c'})
label_font = 12
l1_list = top_d_df.loc[top_d_df.level==1].mesh_heading.unique().tolist()
l1_array = np.array(l1_list)

if counts_or_samples==0:
    plot_df = countdf
    filename = 'disease_count_trends.pdf'
    labelz = '# Series'
    l1_array = l1_array[~np.isin(l1_array,['Neoplasms'])]
else:
    plot_df = sampledf
    filename = 'disease_sample_trends.pdf'
    labelz = '# Samples'
my_cmap_main = sns.color_palette("hsv",12).as_hex()
my_cmap_sub = sns.color_palette("hsv",15).as_hex()

# first 20 children of 'Neoplasms', minus the listed headings
neoplasm_array = np.array(top_d_df.loc[top_d_df.parent=='Neoplasms'].mesh_heading.unique())[:20]
neoplasm_remove = ['Neoplasm Metastasis','Neoplasms by Histologic Type','Neoplasms by Site','Neoplastic Processes','Carcinogenesis']
neoplasm_array = neoplasm_array[~np.isin(neoplasm_array, neoplasm_remove)]

# first 20 children of 'Immune System Diseases'
imm_array = np.array(top_d_df.loc[top_d_df.parent=='Immune System Diseases'].mesh_heading.unique())[:20]
imm_remove = []
imm_array = imm_array[~np.isin(imm_array, imm_remove)]


def _plot_trend_panel(ax, data, headings, palette, ylabel, legend_anchor):
    """Draw one per-year point plot of column ``c`` for ``headings`` on ``ax``.

    ``headings`` also fixes the hue order, so colours and legend entries follow
    the order of that array in every panel.
    """
    panel_df = data.loc[np.isin(data.mesh_heading, headings),:]
    sns.pointplot(data=panel_df, x='year', y = 'c', hue='mesh_heading',ax=ax,palette=palette, hue_order=headings)
    # pointplot puts the years at positions 0..k-1; label them with the sorted
    # years of the data actually drawn
    years = np.sort(panel_df.year.unique())
    ax.set_xticks(range(len(years)))
    ax.set_xticklabels(years, rotation=60,fontsize=label_font)
    # only resize the y tick labels; re-labelling ticks that are not pinned can
    # misplace the text and truncates fractional values
    ax.tick_params(axis='y', labelsize=label_font)
    ax.set_xlabel('Year',fontsize = label_font)
    ax.set_ylabel(ylabel,fontsize = label_font)
    ax.legend(bbox_to_anchor = legend_anchor,frameon=False)


fig,axs =plt.subplots(2,4, figsize=(20,12))

# grid cells that stay empty (room for the legends placed outside the panels)
for row, col in [(0,0), (0,2), (1,1), (0,3), (1,3)]:
    axs[row,col].remove()

# level-1 headings
_plot_trend_panel(axs[0,1], plot_df, l1_array, my_cmap_main, '', (2.75,.75))
# children of 'Neoplasms'
_plot_trend_panel(axs[1,0], plot_df, neoplasm_array, my_cmap_sub, labelz, (2,.9))
# children of 'Immune System Diseases'
_plot_trend_panel(axs[1,2], plot_df, imm_array, my_cmap_sub, labelz, (1,.9))
sns.despine()

fig.subplots_adjust(left=.1, bottom=.1, right=.9, top=.9, wspace=.2, hspace=.5)

# `papertype` is not accepted for PDF output by current Matplotlib, so it is not passed
fig.savefig(filename, orientation = 'portrait', format = 'pdf')

In [None]:
def normalise_it(df):
    """Return a copy of ``df`` with ``c_norm``, each ``c`` divided by the sum of ``c``.

    The input frame is left untouched: functions handed to ``groupby.apply``
    must not mutate the group they receive. When the sum of ``c`` is 0 the
    share is NaN (0/0).
    """
    return df.assign(c_norm=df.c / df.c.sum())

countdf = countdf.drop_duplicates()
sampledf = sampledf.drop_duplicates()
counts_or_samples = 1 # 0: series counts, 1: sample counts
my_cmap = sns.color_palette("hsv",15).as_hex()

if counts_or_samples==0:
    plot_df = countdf.loc[np.isin(countdf.mesh_heading, neoplasm_array),:]
    filename = 'area_plot_neo_counts.pdf'
    labelz = '% Series'
    l1_array = l1_array[~np.isin(l1_array,['Neoplasms'])]
elif counts_or_samples==1:
    plot_df = sampledf.loc[np.isin(sampledf.mesh_heading, neoplasm_array),:]
    filename = 'area_plot_neo_samples.pdf'
    labelz = '% Samples'


# keep years after 2001 and order by year so the x values and every per-heading
# series below run over the years in the same ascending order
l1_df = plot_df.loc[plot_df.year>2001,:].sort_values('year', kind='mergesort').reset_index(drop=True)
# share of each heading within its year; years whose total is 0 give 0/0 -> set to 0
year_totals = l1_df.groupby('year').c.transform('sum')
df_norm = l1_df.assign(c_norm=(l1_df.c / year_totals).fillna(0))

fig,ax =plt.subplots(1,1, figsize=(10,5))
# one list of yearly shares per heading
stack_data = df_norm.groupby('mesh_heading').c_norm.apply(list).reset_index(name='ts')
# stack the headings in descending order of n
stack_data = pd.merge(stack_data,countsdf[['mesh_heading', 'n']].drop_duplicates(), on='mesh_heading', how='left').sort_values('n', ascending=False)
stack_data = stack_data.reset_index(drop=True)
mh = stack_data.mesh_heading
x = df_norm.year.unique()
y = stack_data.ts.tolist()
ax.stackplot(x,y, labels=mh,colors=my_cmap)
ax.set_xlim(2002,2017)
ax.set_ylim(0,1)
ax.set_xticks(x)
ax.set_xticklabels(x, rotation=60,fontsize=label_font)
ax.set_yticks([0,.5,1])
ax.set_yticklabels([0,.5,1], fontsize=label_font)
ax.set_xlabel('Year',fontsize = label_font)
ax.set_ylabel(labelz,fontsize = label_font)
ax.legend(bbox_to_anchor=[2,1],frameon=False)

sns.despine()
fig.subplots_adjust(left=.2, bottom=.2, right=.6, top=.9, wspace=.2, hspace=.2)
# `papertype` is not accepted for PDF output by current Matplotlib, so it is not passed
fig.savefig(filename, orientation = 'landscape', format = 'pdf')

In [None]:
def normalise_it(df):
    """Return a copy of ``df`` with ``c_norm``, each ``c`` divided by the sum of ``c``.

    The input frame is left untouched: functions handed to ``groupby.apply``
    must not mutate the group they receive. When the sum of ``c`` is 0 the
    share is NaN (0/0).
    """
    return df.assign(c_norm=df.c / df.c.sum())

# make sure the sample frame exposes the plotted value as column 'c'
sampledf = sampledf.rename(index=str, columns = {'nsamples':'c'})
my_cmap_main = sns.color_palette("hsv",12).as_hex()
counts_or_samples = 1 # 0: series counts, 1: sample counts

if counts_or_samples==0:
    plot_df = countdf
    filename = 'area_plot_counts.pdf'
    labelz = '% Series'
    # NOTE(review): this trims l1_array, but the selection below uses l1_list, so
    # 'Neoplasms' is still plotted in this mode -- confirm which one is intended.
    l1_array = l1_array[~np.isin(l1_array,['Neoplasms'])]
elif counts_or_samples==1:
    plot_df = sampledf
    filename = 'area_plot_samples.pdf'
    labelz = '% Samples'


# level-1 headings after 2000, ordered by year so the x values and every
# per-heading series below run over the years in the same ascending order
l1_df = plot_df.loc[(np.isin(plot_df.mesh_heading, l1_list))& (plot_df.year>2000),:]
l1_df = l1_df.sort_values('year', kind='mergesort').reset_index(drop=True)
# share of each heading within its year; years whose total is 0 give 0/0 -> set to 0
year_totals = l1_df.groupby('year').c.transform('sum')
df_norm = l1_df.assign(c_norm=(l1_df.c / year_totals).fillna(0))

fig,ax =plt.subplots(1,1, figsize=(10,5))
# one list of yearly shares per heading
stack_data = df_norm.groupby('mesh_heading').c_norm.apply(list).reset_index(name='ts')
# de-duplicate the (heading, n) lookup so a heading listed several times in
# countsdf is not stacked several times; stack in descending order of n
stack_data = pd.merge(stack_data,countsdf[['mesh_heading', 'n']].drop_duplicates(), on='mesh_heading', how='left').sort_values('n', ascending=False)
stack_data = stack_data.reset_index(drop=True)
mh = stack_data.mesh_heading
x = df_norm.year.unique()
y = stack_data.ts.tolist()
ax.stackplot(x,y, labels=mh,colors=my_cmap_main)
ax.set_xlim(2001,2017)
ax.set_ylim(0,1)
ax.set_xticks(x)
ax.set_xticklabels(x, rotation=60,fontsize=label_font)
ax.set_yticks([0,.5,1])
ax.set_yticklabels([0,.5,1], fontsize=label_font)
ax.set_xlabel('Year',fontsize = label_font)
ax.set_ylabel(labelz,fontsize = label_font)
ax.legend(bbox_to_anchor=[2.5,1],frameon=False)

sns.despine()
fig.subplots_adjust(left=.2, bottom=.2, right=.6, top=.9, wspace=.2, hspace=.2)
# `papertype` is not accepted for PDF output by current Matplotlib, so it is not passed
fig.savefig(filename, orientation = 'landscape', format = 'pdf')

In [None]:
sampledf = pd.read_pickle('../../data/final/samplesbyyear_for_plotting.pkl')
# total samples per heading: drop repeated rows first so no (heading, year) value
# is added twice, and sum only `nsamples` rather than every column of the frame
total_samples = sampledf.drop_duplicates().groupby('mesh_heading').nsamples.sum().reset_index()
# attach the tree numbers so the level-1 headings can be picked out below
total_samples = pd.merge(total_samples, countsdf[['mesh_heading','mesh_treenumbers']], on='mesh_heading', how='left')
total_samples = total_samples[['mesh_heading', 'nsamples', 'mesh_treenumbers']].drop_duplicates()

fig,ax =plt.subplots(1,1, figsize=(20,12))
my_cmap_main = sns.color_palette("hsv",12).as_hex()
# level 1 = tree numbers of exactly three characters
l1 = total_samples.loc[total_samples.mesh_treenumbers.str.len() ==3,['mesh_heading', 'nsamples']].drop_duplicates()
l1 = l1[~np.isin(l1.mesh_heading,['Occupational Diseases','Parasitic Diseases', 'Wounds and Injuries'])]
l1 = l1.sort_values('nsamples', ascending=False)
l1 = l1[['mesh_heading','nsamples']].set_index('mesh_heading')
# the figure size comes from plt.subplots above; a figsize passed together with `ax` is not used
l1.plot(kind='pie', subplots=True, legend=False, fontsize=20, ax = ax, colors=my_cmap_main)

# `papertype` is not accepted for PDF output by current Matplotlib, so it is not passed
fig.savefig('pie.pdf', orientation = 'landscape', format = 'pdf')