# Clustered data overview

In [None]:
# Retina quality plots
%config InlineBackend.figure_format = 'retina'

## Load data

In [None]:
import pandas as pd

# Directory from which the study's clustered parquet file is read
CLUSTERED_DATA_DIR = 'data/combined/clustered/final'

In [None]:
# Study to analyse; used as the parquet file stem when the data is loaded
STUDY = 'Galson_2015a'

In [None]:
# Load the clustered sequences of the selected study.
study_parquet_path = f'{CLUSTERED_DATA_DIR}/{STUDY}.parquet'
data = pd.read_parquet(study_parquet_path)

# Column/dtype summary, followed by a preview of the first rows.
data.info()
data.head()

## Data overview

### Size

In [None]:
from bin.build_clustered_data import CLUSTER_ID_COLUMN_NAME

def size_overview(df):
    """Print the number of sequences (rows) and of distinct clusters in ``df``.

    ``df`` must contain the column named by ``CLUSTER_ID_COLUMN_NAME``.
    """
    n_sequences = len(df)
    n_clusters = df[CLUSTER_ID_COLUMN_NAME].nunique()
    print(f'Total number of sequences: {n_sequences:,}')
    print(f'Total number of clusters: {n_clusters:,}')

In [None]:
# Sequence and cluster totals for the loaded study
size_overview(data)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

CLUSTER_SIZE_COL_NAME = 'Number of sequences'
CLUSTER_SIZE_PLOT_TITLE = 'Cluster sizes (using log values)'
CLUSTER_SIZE_PLOT_YLABEL = 'Cluster size'
LARGEST_CLUSTERS_CNT = 10

def cluster_size_overview(df_cluster_sizes):
    """Print the largest cluster sizes and plot how cluster sizes are distributed.

    Two figures are drawn on a ``log(size + 1)`` scale, with tick labels
    converted back to raw sizes:

    * a histogram + KDE of the cluster sizes,
    * an area plot of the cluster sizes sorted in ascending order.

    Parameters
    ----------
    df_cluster_sizes : pandas.Series
        Cluster sizes (one value per cluster). If it is empty, a short message
        is printed and nothing is plotted.
    """
    # Local import: only the tick formatting below needs it.
    from matplotlib.ticker import FuncFormatter

    # min()/max() of an empty Series are NaN, which breaks the binning below.
    if len(df_cluster_sizes) == 0:
        print('No clusters to show.')
        return

    def raw_size_formatter():
        # Converts a tick position on the log1p scale back to a raw size.
        # A formatter is bound to a single axis, so build a fresh one per plot.
        return FuncFormatter(lambda log_value, _pos: '{:.2f}'.format(np.expm1(log_value)))

    sorted_sizes = df_cluster_sizes.sort_values()
    log_sizes = np.log1p(df_cluster_sizes)
    # log1p is monotonic, so the log of the sorted sizes is the sorted log sizes.
    sorted_log_sizes = np.log1p(sorted_sizes).reset_index(name=CLUSTER_SIZE_COL_NAME)

    # Largest clusters, biggest first
    print(f'{LARGEST_CLUSTERS_CNT} largest cluster sizes:')
    largest = sorted_sizes.reset_index(name=CLUSTER_SIZE_COL_NAME).tail(LARGEST_CLUSTERS_CNT).iloc[::-1]
    print(largest.to_string(index=False))

    # Distplot: unit-wide bins on the log scale
    plt.figure(figsize=(10, 6))
    bin_edges = np.arange(log_sizes.min(), log_sizes.max() + 1)
    ax = sns.distplot(log_sizes.values, bins=bin_edges, kde_kws={'bw': 1})
    ax.set(title=CLUSTER_SIZE_PLOT_TITLE + ' - distribution',
           xlabel=CLUSTER_SIZE_PLOT_YLABEL)
    # A formatter (rather than fixed tick labels) keeps the labels in sync with
    # the tick positions even if the ticks are recomputed at draw time.
    ax.xaxis.set_major_formatter(raw_size_formatter())
    plt.show()

    # Areaplot: clusters ordered by size along the x axis
    ax = sorted_log_sizes[CLUSTER_SIZE_COL_NAME].plot.area(figsize=(10, 6))
    ax.set(title=CLUSTER_SIZE_PLOT_TITLE, xlabel='Cluster', ylabel=CLUSTER_SIZE_PLOT_YLABEL, xticks=[])
    ax.yaxis.set_major_formatter(raw_size_formatter())
    plt.show()

In [None]:
# Group the sequences by cluster; the groupby object is reused by later cells.
clusters_gr = data.groupby(CLUSTER_ID_COLUMN_NAME)
# Number of rows (sequences) in each cluster, indexed by cluster id.
cluster_sizes = clusters_gr.size()
cluster_size_overview(cluster_sizes)

### Subject counts

In [None]:
# Number of distinct 'Subject' values in each cluster.
# NOTE(review): cdr3_logos counts subjects per ('Author', 'Subject') pair instead;
# confirm whether 'Subject' alone is unique across authors.
cluster_subj_cnt = clusters_gr['Subject'].nunique()
cluster_subj_cnt

In [None]:
from bin.plotting import barplot

# How many clusters draw their sequences from 1, 2, ... distinct subjects.
clusters_per_subject_cnt = cluster_subj_cnt.value_counts()
ax = barplot(clusters_per_subject_cnt, title='Number of source subjects in clusters')
ax.set(ylabel='Number of subjects', xlabel='Number of clusters');

### CDR3 logos of the largest clusters

In [None]:
# 'BType' value that marks a sequence as HepB-specific.
# NOTE(review): the name looks like a typo of HEPB_TYPE; kept as-is for compatibility.
HEBP_TYPE = 'HepB+B-cells'


def _hepb_fraction(cluster_rows):
    """Return the share of rows in ``cluster_rows`` whose 'BType' equals ``HEBP_TYPE``."""
    is_hepb = cluster_rows['BType'] == HEBP_TYPE
    return is_hepb.sum() / len(cluster_rows)


# HepB ratio of every cluster, indexed by cluster id.
clusters_specif = clusters_gr.apply(_hepb_fraction)

In [None]:
import matplotlib.pyplot as plt
import logomaker as lm
import math

LOGOS_CLUSTER_CNT = 10
NCOLS = 2
HEP_B_RATIO_THRESHOLD = 0.5

def cdr3_logos(cluster_ids, mark_hepb=False):
    """Draw one CDR3 sequence logo per cluster on a grid with ``NCOLS`` columns.

    Reads the notebook-level ``data``, ``cluster_sizes`` and ``clusters_specif``
    objects. Each subplot title shows the cluster id, its size, its HepB ratio
    and the number of distinct ('Author', 'Subject') pairs in the cluster.

    Parameters
    ----------
    cluster_ids : iterable
        Ids of the clusters to draw, in plotting order. If it is empty, a short
        message is printed and no figure is created.
    mark_hepb : bool, default False
        When True, titles of clusters whose HepB ratio is above
        ``HEP_B_RATIO_THRESHOLD`` are set in bold.
    """
    cluster_ids = list(cluster_ids)
    if not cluster_ids:
        # plt.subplots() rejects a grid with zero rows.
        print('No clusters to plot.')
        return

    nrows = math.ceil(len(cluster_ids) / NCOLS)

    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid,
    # so the flattening below works for any grid shape.
    fig, axs = plt.subplots(nrows, NCOLS, figsize=(10, 8), squeeze=False)
    axes = list(axs.flat)

    for cluster_id, ax in zip(cluster_ids, axes):
        cluster_data = data[data[CLUSTER_ID_COLUMN_NAME] == cluster_id]
        cluster_cdr3_sequences = cluster_data['cdr3'].values

        counts_mat = lm.alignment_to_matrix(cluster_cdr3_sequences)
        lm.Logo(counts_mat, color_scheme='chemistry', ax=ax)

        subjects_cnt = cluster_data.groupby(['Author', 'Subject']).ngroups
        hepb_ratio = clusters_specif[cluster_id]

        title = (f'CDR3 - cluster {cluster_id}. Size={cluster_sizes[cluster_id]:,}\n'
                 f'HepB_spec.={hepb_ratio:.3f} - from {subjects_cnt} subjects')
        fontweight = 'bold' if mark_hepb and hepb_ratio > HEP_B_RATIO_THRESHOLD else 'normal'

        ax.set_title(title, fontweight=fontweight)

    # Hide grid cells that received no logo (cluster count not a multiple of NCOLS).
    for unused_ax in axes[len(cluster_ids):]:
        unused_ax.set_axis_off()

    fig.tight_layout()

In [None]:
# Ids of the clusters with the most sequences, biggest first.
sizes_desc = cluster_sizes.sort_values(ascending=False)
largest_clusters = sizes_desc.head(LOGOS_CLUSTER_CNT).index.to_list()

# Bold titles flag the clusters whose HepB ratio is above the threshold.
cdr3_logos(largest_clusters, mark_hepb=True)

### CDR3 logos of the clusters with most subjects

In [None]:
# Clusters ordered by ascending number of distinct subjects.
clusters_sorted_subj = cluster_subj_cnt.sort_values()

# Area plot over the cluster rank; cluster ids are replaced by a 0..n-1 position.
subj_cnt_by_rank = clusters_sorted_subj.reset_index(drop=True)
ax = subj_cnt_by_rank.plot.area()
ax.set(title='Number of subjects in clusters', xlabel='Cluster', ylabel='Number of subjects', xticks=[]);

In [None]:
# clusters_sorted_subj is in ascending order, so tail() selects the clusters with the most subjects.
cdr3_logos(clusters_sorted_subj.tail(LOGOS_CLUSTER_CNT).index)

## HepB data overview

### HepB ratio

In [None]:
# Title of the HepB ratio area plot
RATIO_SIZE_TITLE = 'Cluster HepB ratio'


def cluster_hepb_ratio_overview(specif):
    """Show an area plot of the per-cluster HepB ratios sorted in ascending order.

    ``specif`` is a Series of ratios with one entry per cluster; its index is
    discarded for plotting, so the x axis is simply the cluster rank.
    """
    # reset_index() moves the original index into column 0; column 1 holds the ratios.
    ascending_ratios = specif.sort_values().reset_index().iloc[:, 1]

    ax = ascending_ratios.plot.area()
    ax.set(title=RATIO_SIZE_TITLE, xlabel='Cluster', ylabel='HepB ratio', xticks=[])
    plt.show()

In [None]:
# Area plot of the per-cluster HepB ratios in ascending order
cluster_hepb_ratio_overview(clusters_specif)

In [None]:
# One point per cluster: HepB ratio on x, cluster size on y; the marker size also encodes the cluster size.
ax = sns.scatterplot(x=clusters_specif, y=cluster_sizes, size=cluster_sizes)
ax.set(title='Clusters HepB ratio - size', xlabel='Cluster HepB ratio', ylabel='Cluster size')
plt.show()

### HepB data

In [None]:
# Ids (index labels) of the clusters whose HepB ratio is strictly above the threshold.
hepb_clusters_ids = clusters_specif[clusters_specif > HEP_B_RATIO_THRESHOLD].index.to_list()

In [None]:
# Number of clusters above the HepB ratio threshold versus all remaining clusters.
hepb_cluster_cnt = len(hepb_clusters_ids)
non_hepb_cluster_cnt = len(clusters_specif) - hepb_cluster_cnt
cluster_sizes_by_hepb = pd.Series(
    [hepb_cluster_cnt, non_hepb_cluster_cnt],
    index=['HepB', 'Non HepB'],
)

ax = barplot(cluster_sizes_by_hepb, title=f'Number of clusters by HepB (ratio threshold: {HEP_B_RATIO_THRESHOLD})');
ax.set(xlabel='Number of clusters');

### HepB cluster sizes

In [None]:
# Size overview restricted to the clusters listed in hepb_clusters_ids
cluster_size_overview(cluster_sizes[hepb_clusters_ids])

### HepB subject counts

In [None]:
# hepb_clusters_ids holds cluster ids (index labels), so select with label-based
# .loc; positional .iloc would pick unrelated clusters or raise IndexError.
hepb_cluster_subj_cnt = cluster_subj_cnt.loc[hepb_clusters_ids]

# How many HepB clusters draw their sequences from 1, 2, ... distinct subjects.
ax = barplot(hepb_cluster_subj_cnt.value_counts(), title='Number of source subjects in clusters')
ax.set(ylabel='Number of subjects', xlabel='Number of clusters');

### HepB CDR3 logos

In [None]:
# Per-cluster summary: size, HepB ratio and the resulting number of HepB sequences.
hepb_seq_counts = clusters_specif * cluster_sizes
hepb_clusters_df = pd.DataFrame({
    'Size': cluster_sizes,
    'HepB ratio': clusters_specif,
    'HepB seq. count': hepb_seq_counts,
})

# Clusters with the most HepB sequences come first.
sorted_hepb_clusters_df = hepb_clusters_df.sort_values(by='HepB seq. count', ascending=False)
sorted_hepb_clusters_df.head(10)

In [None]:
# Stricter cut-off for the "best" HepB clusters. It has its own name so that
# HEP_B_RATIO_THRESHOLD, which earlier cells and cdr3_logos() read, is not rebound
# and re-running those cells keeps producing the same results.
BEST_HEP_B_RATIO_THRESHOLD = 0.95

# Keep the clusters above the strict cut-off; the frame is already ordered by
# HepB sequence count, so head() returns the ones with the most HepB sequences.
is_best_hepb = sorted_hepb_clusters_df['HepB ratio'] > BEST_HEP_B_RATIO_THRESHOLD
best_hepb_clusters = sorted_hepb_clusters_df[is_best_hepb].head(LOGOS_CLUSTER_CNT)
best_hepb_clusters

In [None]:
# CDR3 logos of the clusters selected in best_hepb_clusters
cdr3_logos(best_hepb_clusters.index)