In [196]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('ticks', rc={"axes.facecolor": (0, 0, 0, 0)})
sns.set_context('talk')
pd.set_option('display.max_columns', 500)

from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']

In [197]:
ecoli="../data/ec_metadata_hospital.csv"
ecoli_st="../data/ecoli_st.tsv"
paeruginosa="../data/pa_metadata_hospital.csv"
kpneumoniae="../data/kp_metadata_hospital.csv"
kpneumoniae_st="../../tenet/kpneumo/data/report_kleborate_hospital.csv"

In [198]:
ec=pd.read_csv(ecoli, sep="\t")
ec_st=pd.read_csv(ecoli_st, sep="\t", header=None)
pa=pd.read_csv(paeruginosa, sep="\t")
kp=pd.read_csv(kpneumoniae, sep="\t")
kp_st=pd.read_csv(kpneumoniae_st, sep="\t")

### **Prepare data**

In [199]:
# get data 
pa = pa[["sampleid", "samplingdate", "ST", "country", "pat_id"]]

# add hospital information
pa["hospital"] = ["CPH" if x == "DNK"
                 else "MHH"
                 for x in pa["country"]]

# drop country col
pa.drop("country", axis=1, inplace=True)

In [200]:
# get data
ec = ec[["sampleid", "samplingdate", "country", "pat_id"]]

# add hospital information
ec["hospital"] = ["CPH" if x == "DNK"
                 else "MHH"
                 for x in ec["country"]]

# drop country col
ec.drop("country", axis=1, inplace=True)

# add ST data
ec_st.rename(columns={0:"sampleid", 1:"ST"}, inplace=True)

# remove suffix and preffix info
ec_st["sampleid"] = ec_st["sampleid"].str[len("/vol/projects/jburgaya/tenet/ecoli/fastas/"):]
ec_st["sampleid"] = ec_st["sampleid"].str[:-len(".fna")]

# merge st data to metadata"
ec = pd.merge(ec, ec_st, on="sampleid", how="left")

In [201]:
# get data
kp = kp[["sampleid", "samplingdate", "country", "pat_id"]]

# add hospital information
kp["hospital"] = ["CPH" if x == "DNK"
                 else "MHH"
                 for x in kp["country"]]

# drop country col
kp.drop("country", axis=1, inplace=True)

kp_st.rename(columns={"strain":"sampleid"}, inplace=True)
kp_st=kp_st[["sampleid", "ST"]]

kp_st["ST"] = kp_st["ST"].str[len("ST"):]

kp = pd.merge(kp, kp_st, on="sampleid", how="left")

In [202]:
# remove rows with empty samplingdate
kp = kp[kp["samplingdate"].notna()]
ec = ec[ec["samplingdate"].notna()]
pa = pa[pa["samplingdate"].notna()]

In [203]:
# remove rows with same patientid for same ST and only keep one
ec = ec.drop_duplicates(subset=['pat_id', 'ST'])
kp = kp.drop_duplicates(subset=['pat_id', 'ST'])
pa = pa.drop_duplicates(subset=['pat_id', 'ST'])

In [None]:
# select study period
start_date = '2021-03-01'
end_date = '2022-10-31'
# filter
ec = ec[(ec['samplingdate'] >= start_date) & (ec['samplingdate'] <= end_date)]
kp = kp[(kp['samplingdate'] >= start_date) & (kp['samplingdate'] <= end_date)]
pa = pa[(pa['samplingdate'] >= start_date) & (pa['samplingdate'] <= end_date)]

### **Plot**

Plot ST distribution by month, create a plot for each hospital. Normalize counts. merge together ST with less than X counts into a category named "others" (for E. coli and P. aeruginosa, merge also ST="-" into others category).

#### K. pneumoniae


In [None]:
# extract month from samplingdate
kp['month'] = pd.to_datetime(kp['samplingdate']).dt.to_period('M')

# group by hospital, month, and ST to get counts
kp_g = kp.groupby(['hospital', 'month', 'ST']).size().reset_index(name='counts')

# merge ST with less than 20 counts
st_counts = kp_g.groupby('ST')['counts'].sum()
st_counts = st_counts[st_counts >= 10]
kp_g['ST'] = kp_g['ST'].apply(lambda x: x if x in st_counts.index else 'others')

# group again to normalize counts
kp_g = kp_g.groupby(['hospital', 'month', 'ST']).sum().reset_index()
kp_g['normalized_counts'] = kp_g.groupby(['hospital', 'month'])['counts'].transform(lambda x: x / x.sum())

In [None]:
# prepare data for seaborn
df_final = kp_g.pivot_table(index=['hospital', 'month'], columns='ST', values='normalized_counts', fill_value=0).reset_index()

# define a custom color palette
unique_st = df_final.columns[2:]
colors = sns.color_palette("Set3", len(unique_st)-1)
color_dict = {st: color for st, color in zip(unique_st, colors)}
color_dict['others'] = '#e6daa6'

# plot
hospitals = df_final['hospital'].unique()

for hospital in hospitals:
    df_hosp = df_final[df_final['hospital'] == hospital]
    df_hosp = df_hosp.set_index('month')
    df_hosp.drop(columns='hospital', inplace=True)
    
    ax = df_hosp.plot(kind='area', stacked=True, figsize=(14, 8), color=[color_dict[col] for col in df_hosp.columns])
    plt.title(r'$\it{K. pnuemoniae}$' + f', {hospital}')
    plt.xlabel('Month')
    plt.ylabel('Normalized counts')
    plt.ylim(0, 1)
    
    # Custom legend with two columns
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels, title='ST > 10', facecolor="white", bbox_to_anchor=(1.05, 1), loc='upper left', ncol=2)

    plt.savefig(f'../out/figures/ST_kp_{hospital}.png',
                dpi=300,
                bbox_inches='tight',
                transparent=True)
    plt.savefig(f'../out/figures/ST_kp_{hospital}.svg',
                dpi=300, bbox_inches='tight',
                transparent=True);

findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial
findfont: Generic family 'sans-serif' not found because

#### P. aeruginosa

In [None]:
# extract month from samplingdate
pa['month'] = pd.to_datetime(pa['samplingdate']).dt.to_period('M')

# group by hospital, month, and ST to get counts
pa_g = pa.groupby(['hospital', 'month', 'ST']).size().reset_index(name='counts')

# merge ST with less than 20 counts and ST == "-" into "others"
st_counts = pa_g.groupby('ST')['counts'].sum()
st_counts = st_counts[st_counts >= 15]
pa_g['ST'] = pa_g['ST'].apply(lambda x: x if x in st_counts.index and x != '-' else 'others')

# group again to normalize counts
pa_g = pa_g.groupby(['hospital', 'month', 'ST']).sum().reset_index()
pa_g['normalized_counts'] = pa_g.groupby(['hospital', 'month'])['counts'].transform(lambda x: x / x.sum())

In [None]:
# prepare data for seaborn
df_final = pa_g.pivot_table(index=['hospital', 'month'], columns='ST', values='normalized_counts', fill_value=0).reset_index()

# define a custom color palette
unique_st = df_final.columns[2:]
colors = sns.color_palette("Set3", len(unique_st)-1)
color_dict = {st: color for st, color in zip(unique_st, colors)}
color_dict['others'] = '#e6daa6'

# plot
hospitals = df_final['hospital'].unique()

for hospital in hospitals:
    df_hosp = df_final[df_final['hospital'] == hospital]
    df_hosp = df_hosp.set_index('month')
    df_hosp.drop(columns='hospital', inplace=True)
    
    df_hosp.plot(kind='area', stacked=True, figsize=(14, 8), color=[color_dict[col] for col in df_hosp.columns])
    plt.title(r'$\it{P. aeruginosa}$' + f', {hospital}')
    plt.xlabel('Month')
    plt.ylabel('Normalized counts')
    plt.ylim(0, 1)
    plt.legend(title='ST > 15', facecolor="white", bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.savefig(f'../out/figures/ST_pa_{hospital}.png',
                dpi=300,
                bbox_inches='tight',
                transparent=True)
    plt.savefig(f'../out/figures/ST_pa_{hospital}.svg',
                dpi=300, bbox_inches='tight',
                transparent=True);

#### E. coli

In [None]:
# extract month from samplingdate
ec['month'] = pd.to_datetime(ec['samplingdate']).dt.to_period('M')

# group by hospital, month, and ST to get counts
ec_grouped = ec.groupby(['hospital', 'month', 'ST']).size().reset_index(name='counts')

# merge ST with less than 100 counts and ST == "-" into "others"
st_counts = ec_grouped.groupby('ST')['counts'].sum()
st_counts = st_counts[st_counts >= 100]
ec_grouped['ST'] = ec_grouped['ST'].apply(lambda x: x if x in st_counts.index and x != '-' else 'others')

# group again to normalize counts
ec_grouped = ec_grouped.groupby(['hospital', 'month', 'ST']).sum().reset_index()
ec_grouped['normalized_counts'] = ec_grouped.groupby(['hospital', 'month'])['counts'].transform(lambda x: x / x.sum())

In [None]:
# prepare data for seaborn
df_final = ec_grouped.pivot_table(index=['hospital', 'month'], columns='ST', values='normalized_counts', fill_value=0).reset_index()

# define a custom color palette
unique_st = df_final.columns[2:]
colors = sns.color_palette("Set3", len(unique_st)-1)
color_dict = {st: color for st, color in zip(unique_st, colors)}
color_dict['others'] = '#e6daa6'

# plot
hospitals = df_final['hospital'].unique()

for hospital in hospitals:
    df_hosp = df_final[df_final['hospital'] == hospital]
    df_hosp = df_hosp.set_index('month')
    df_hosp.drop(columns='hospital', inplace=True)
    
    df_hosp.plot(kind='area', stacked=True, figsize=(14, 8), color=[color_dict[col] for col in df_hosp.columns])
    plt.title(r'$\it{E. coli}$' + f', {hospital}')
    plt.xlabel('Month')
    plt.ylabel('Normalized counts')
    plt.ylim(0, 1)
    plt.legend(title='ST > 100', facecolor="white", bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.savefig(f'../out/figures/ST_ec_{hospital}.png',
                dpi=300,
                bbox_inches='tight',
                transparent=True)
    plt.savefig(f'../out/figures/ST_ec_{hospital}.svg',
                dpi=300, bbox_inches='tight',
                transparent=True);