In [1]:
import os

import numpy as np
import pandas as pd

from natsort import natsorted

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.ticker import MultipleLocator        

In [2]:
# I/O

main = pd.read_csv(os.path.join(os.getcwd(), 'input/main.csv'))  

out = os.path.join(os.getcwd(), 'output/cells_per_patch')
if not os.path.exists(out):
    os.makedirs(out)

In [None]:
# generate histograms showing the distribution of nuclei among 14x14-pixel and 46x46-pixel image patches

for window_size in [14, 46]:
    
    print(f'Window size = {window_size}')
    
    if not os.path.exists(os.path.join(out, f'counts{window_size}.csv')):

        half_window_size = window_size / 2
        
        counts = []
        for i, j in main.iterrows():
            if i % 10000 == 0:
                print(i)
            
            # isolate cells in window_size x window_size area, append nuclei count to counts list
            x = j['X_centroid']
            y = j['Y_centroid']
            patch = (
                main[(main['Y_centroid'].between(y - half_window_size, y + half_window_size)) &
                        (main['X_centroid'].between(x - half_window_size, x + half_window_size))].copy()
            )
            counts.append(len(patch))

        plot = pd.DataFrame(
            data={f'Window Size: {window_size}x{window_size} pixels': 1, 'Cell Count': counts}
        )

        q1, q3 = np.percentile(plot['Cell Count'], [25, 75])
        iqr = q3 - q1
        plot['Q1'] = q1
        plot['Q3'] = q3
        plot['IQR'] = iqr

        plot.to_csv(os.path.join(out, f'counts{window_size}.csv'), index=False)
        
        plt.rcParams['font.family'] = 'Arial'
        sns.set_style('whitegrid')

        if window_size == 14:
            fig, ax = plt.subplots(figsize=(2.25,5))
        elif window_size == 46:
            fig, ax = plt.subplots(figsize=(7,5))

        g = sns.histplot(data=plot, x='Cell Count', discrete=True, stat='percent', color='tab:blue', alpha=1)
        
        # Get the heights of the bars (counts)
        heights = [patch.get_height() for patch in g.patches]
        
        # increase x-tick frequency
        g.axes.xaxis.set_major_locator(MultipleLocator(1))
        
        g.set_xlabel('# nuclei', fontsize=15)
        g.set_ylabel('% patches', fontsize=15)

        plt.tight_layout()
        plt.savefig(os.path.join(out, f'patch_counts_w{window_size}.pdf'))
        plt.show()
        plt.close('all')
        print()
    
    else:  # if counts.csv has already been saved
        
        plot = pd.read_csv(os.path.join(out, f'counts{window_size}.csv'))

        plt.rcParams['font.family'] = 'Arial'
        sns.set_style('whitegrid')
        
        if window_size == 14:
            fig, ax = plt.subplots(figsize=(2.25,5))
        elif window_size == 46:
            fig, ax = plt.subplots(figsize=(7,5))

        g = sns.histplot(data=plot, x='Cell Count', discrete=True, stat='percent', color='tab:blue', alpha=1)
        
        # Get the heights of the bars (counts)
        heights = [patch.get_height() for patch in g.patches]
        import pdb; pdb.set_trace()
        
        # increase x-tick frequency
        g.axes.xaxis.set_major_locator(MultipleLocator(1))
        
        g.set_xlabel('# nuclei', fontsize=15)
        g.set_ylabel('% patches', fontsize=15)
        
        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)

        plt.tight_layout()
        plt.savefig(os.path.join(out, f'patch_counts_w{window_size}.pdf'))
        plt.show()
        plt.close('all')
        print()

Window size = 14
> [0;32m/var/folders/_h/pbzrx8ss6n5f031pf4hc97_w0000gp/T/ipykernel_10351/2296146089.py[0m(81)[0;36m<module>[0;34m()[0m
[0;32m     79 [0;31m[0;34m[0m[0m
[0m[0;32m     80 [0;31m        [0;31m# increase x-tick frequency[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 81 [0;31m        [0mg[0m[0;34m.[0m[0maxes[0m[0;34m.[0m[0mxaxis[0m[0;34m.[0m[0mset_major_locator[0m[0;34m([0m[0mMultipleLocator[0m[0;34m([0m[0;36m1[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     82 [0;31m[0;34m[0m[0m
[0m[0;32m     83 [0;31m        [0mg[0m[0;34m.[0m[0mset_xlabel[0m[0;34m([0m[0;34m'# nuclei'[0m[0;34m,[0m [0mfontsize[0m[0;34m=[0m[0;36m15[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  heights


[86.25166458636855, 12.658996697961586, 1.0444627195271112, 0.04445854501584227, 0.0004174511269093171]


In [None]:
# generate histograms showing the distribution of nuclei per VAE9 image patch cluster

window_size = 14
half_window_size = window_size / 2

means = {}  # dict of mean nuceli values for bar plot below

for clus in natsorted(main['VAE9_VIG7'].unique()):

    if not os.path.exists(os.path.join(out, f'counts{window_size}_cluster{clus}.csv')):
        
        print(f'Working on cluster {clus} with window size {window_size}.')
        
        subset = main[main['VAE9_VIG7'] == clus]
        subset.reset_index(inplace=True)
        
        counts = []
        for i, j in subset.iterrows():
            if i % 10000 == 0:
                print(i)
            
            # isolate cells in window_size x window_size area, append nuclei count to counts list
            x = j['X_centroid']
            y = j['Y_centroid']
            patch = (
                main[(main['Y_centroid'].between(y - half_window_size, y + half_window_size)) &
                        (main['X_centroid'].between(x - half_window_size, x + half_window_size))].copy()
            )
            counts.append(len(patch))

        plot = pd.DataFrame(data={ 'Cell Count': counts})

        # compute mean nuclei count and add to dict
        means[clus] = [plot['Cell Count'].mean()]
        
        q1, q3 = np.percentile(plot['Cell Count'], [25, 75])
        iqr = q3 - q1
        plot['Q1'] = q1
        plot['Q3'] = q3
        plot['IQR'] = iqr
    
        plot.to_csv(os.path.join(out, f'counts{window_size}_cluster{clus}.csv'), index=False)
        
        plt.rcParams['font.family'] = 'Arial'
        sns.set_style('whitegrid')

        fig, ax = plt.subplots(figsize=(2.25,5))

        g = sns.histplot(data=plot, x='Cell Count', discrete=True, stat='percent', color='tab:blue', alpha=1)
        
        # Get the heights of the bars (counts)
        heights = [patch.get_height() for patch in g.patches]
        
        # increase x-tick frequency
        g.axes.xaxis.set_major_locator(MultipleLocator(1))
        
        g.set_xlabel('# nuclei', fontsize=15)
        g.set_ylabel('% patches', fontsize=15)
        
        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)

        plt.tight_layout()
        plt.savefig(os.path.join(out, f'patch_counts_w{window_size}_cluster{clus}.pdf'))
        plt.close('all')

    else:  # if counts.csv for current cluster has already been saved
            
        plot = pd.read_csv(os.path.join(out, f'counts{window_size}_cluster{clus}.csv'))
        
        means[clus] = [plot['Cell Count'].mean()]
        
        plt.rcParams['font.family'] = 'Arial'
        sns.set_style('whitegrid')

        fig, ax = plt.subplots(figsize=(2.25,5))

        g = sns.histplot(data=plot, x='Cell Count', discrete=True, stat='percent', color='tab:blue', alpha=1)
        
        # Get the heights of the bars (counts)
        heights = [patch.get_height() for patch in g.patches]
        
        # increase x-tick frequency
        g.axes.xaxis.set_major_locator(MultipleLocator(1))
        
        g.set_xlabel('# nuclei', fontsize=15)
        g.set_ylabel('% patches', fontsize=15)
        
        plt.xticks(fontsize=11)
        plt.yticks(fontsize=11)

        plt.tight_layout()
        plt.savefig(os.path.join(out, f'patch_counts_w{window_size}_cluster{clus}.pdf'))
        plt.close('all')

# convert means dict to a dataframe and save
means_df = pd.DataFrame.from_dict(means, orient='index')
means_df.to_csv(os.path.join(out, f'cluster_cell_count_means.csv'), index=True)

In [None]:
# generate bar plot of mean nuclei values per VAE9 image patch cluster

# read mean nuclei dataframe
means_df = pd.read_csv(os.path.join(out, 'cluster_cell_count_means.csv'), index_col=0)
means_df = pd.Series(means_df['0'])
means_df = round(means_df, 3)
means_df.sort_values(inplace=True)

# clusters associated with epithelial, immune, and stromal cells
epithelial = [0, 1, 3, 5, 6, 10, 19, 21, 22] 
immune = [4, 9, 11, 13, 14, 15, 18, 20]
stromal = [2, 3, 7, 8, 12, 16, 17, 23]

# color LUT
mpp_colors = {}
mpp_colors.update({str(k): v for k, v in zip(epithelial, ['tab:blue']*len(epithelial))})
mpp_colors.update({str(k): v for k, v in zip(immune, ['tab:orange']*len(immune))})
mpp_colors.update({str(k): v for k, v in zip(stromal, ['tab:green']*len(stromal))})

# plot
fig, ax = plt.subplots(figsize=(15,4))
ax.bar(
    x=[str(i) for i in means_df.index], height=np.log(means_df.values),
    color=[mpp_colors[str(i)] for i in means_df.index], alpha=0.5, width=0.9
)
plt.ylabel('log(mean nuclei per patch)', fontsize=18)
plt.xlabel('VAE9_VIG7 cluster',fontsize=22, labelpad=15)
ticks = plt.xticks(fontsize=18)

patches = [
    Patch(facecolor=color, alpha=0.5, edgecolor=None) for color in ['tab:blue', 'tab:orange', 'tab:green']
    ]
plt.legend(
    patches, ['epithelial', 'immune', 'stromal'],
    title=None, prop={'size': 10.5},
    labelspacing=0.01, bbox_to_anchor=[1.1, 1.03], loc='upper right'
    )
plt.grid(False)

plt.tight_layout()
plt.savefig(os.path.join(out, f'cluster_cell_count_means.pdf'))
print('Stromal tissue tends to be cellularly diffuse, while lymphoid tissue tends to be cellularly dense, and epithelial tissue is variable') 
plt.show()
plt.close('all')