# Representation visualization

In [None]:
# Render inline matplotlib figures in the 'retina' (high-DPI) format
%config InlineBackend.figure_format = 'retina'

In [None]:
# Training split: feature matrix and label table
X_TRAIN_PATH, Y_TRAIN_PATH = (
    'data/features_data/fingerprints_subj_split/neg_subs/Circular_(Morgan)/X_train.parquet',
    'data/features_data/fingerprints_subj_split/neg_subs/Circular_(Morgan)/y_train.parquet',
)

# Balanced validation split: feature matrix and label table
X_VALID_PATH, Y_VALID_PATH = (
    'data/features_data/fingerprints_subj_split/valid/Circular_(Morgan)/X_valid_balanced.parquet',
    'data/features_data/fingerprints_subj_split/valid/Circular_(Morgan)/y_valid_balanced.parquet',
)

# Clustered dataset that is later loaded into `clustered_data`
FULL_DATA_PATH = 'data/combined/clustered/final/Galson_2015a.parquet'

# Human-readable representation name, used as the prefix of the plot titles
REPRESENTATION_TYPE = 'Circular fingerprints'

## Load data

In [None]:
import pandas as pd

# Read both feature matrices and stack them, training rows first
feature_frames = [pd.read_parquet(path) for path in (X_TRAIN_PATH, X_VALID_PATH)]
features_data = pd.concat(feature_frames)

features_data.info()
features_data.head()

In [None]:
# Pull the 'HepB' label column out of each split's label table
y_train, y_valid = (
    pd.read_parquet(path)['HepB']
    for path in (Y_TRAIN_PATH, Y_VALID_PATH)
)

# Stack the labels training-first, then validation
y_data = pd.concat([y_train, y_valid])

y_data.head()

In [None]:
# Use the HepB labels as a row mask over the stacked feature matrix.
# NOTE(review): this assumes y_data is a boolean Series whose index lines up
# with features_data's index — TODO confirm against the parquet files.
pos_features_data = features_data[y_data]

pos_features_data.info()
pos_features_data.head()

In [None]:
# Clustered dataset; later cells read its 'Cluster_ID' and 'Subject' columns
# to derive per-cluster sizes, majority subjects and the subject colour map
clustered_data = pd.read_parquet(FULL_DATA_PATH)

clustered_data.info()
clustered_data.head()

## HepB sequences - t-SNE

In [None]:
from MulticoreTSNE import MulticoreTSNE as TSNE

# Settings reused by every t-SNE run in this notebook
TSNE_COMPONENTS = 2
RANDOM_STATE = 42

# Embed the rows selected by the HepB labels, using all available cores
pos_tsne_model = TSNE(
    n_components=TSNE_COMPONENTS,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
pos_tsne_output = pos_tsne_model.fit_transform(pos_features_data)

In [None]:
import numpy as np

TRAIN_LABEL = 'Train'
VALID_LABEL = 'Valid'

def tsne_data(tsne, X_data, y_data, train_cnt, valid_cnt, clusters=None):
    """Combine a t-SNE embedding with per-cluster metadata for plotting.

    Parameters
    ----------
    tsne : array-like
        Embedding with one row per row of ``X_data`` (anything accepted by
        ``pd.DataFrame``); its columns become columns ``0``, ``1``, ...
    X_data : pd.DataFrame
        Feature matrix that was embedded. Its index values are looked up in
        the ``'Cluster_ID'`` column of ``clusters``.
    y_data : pd.Series
        HepB labels, in the same row order as ``X_data``.
    train_cnt, valid_cnt : int
        Number of leading rows that belong to the training split and number
        of trailing rows that belong to the validation split.
    clusters : pd.DataFrame, optional
        Table with ``'Cluster_ID'`` and ``'Subject'`` columns. Defaults to the
        notebook-level ``clustered_data`` frame.

    Returns
    -------
    pd.DataFrame
        The embedding columns plus ``'Size'`` (rows per cluster),
        ``'Subject'`` (most frequent subject of the cluster), ``'HepB'`` and
        ``'Dataset'`` (``TRAIN_LABEL`` / ``VALID_LABEL``).

    Raises
    ------
    ValueError
        If the embedding, features, labels and split counts disagree on the
        number of rows.
    """
    if clusters is None:
        # Fall back to the frame loaded earlier in the notebook
        clusters = clustered_data

    # Fail early (and readably) instead of deep inside a pandas assignment
    n_rows = len(X_data)
    train_cnt, valid_cnt = int(train_cnt), int(valid_cnt)
    if len(tsne) != n_rows or len(y_data) != n_rows:
        raise ValueError(
            f'Row count mismatch: tsne={len(tsne)}, X_data={n_rows}, y_data={len(y_data)}'
        )
    if train_cnt + valid_cnt != n_rows:
        raise ValueError(
            f'train_cnt + valid_cnt = {train_cnt + valid_cnt} does not match {n_rows} rows'
        )

    cl_grouped = clusters[clusters['Cluster_ID'].isin(X_data.index)].groupby('Cluster_ID')

    # Distinct subjects per cluster; a set is built directly because the
    # per-cluster values are ndarrays, which cannot be hashed/counted reliably
    subjects_per_cluster = cl_grouped['Subject'].unique()
    single_subjects = sorted({subjects[0] for subjects in subjects_per_cluster if len(subjects) == 1})

    print(f'Single subject clusters: {", ".join(single_subjects)}')

    # Most frequent subject of each cluster, reordered to follow X_data's rows
    cl_majority_subjects = cl_grouped['Subject'].agg(lambda x: x.mode().iloc[0])
    cl_majority_subjects = cl_majority_subjects.loc[X_data.index]

    print(f'Majority subjects for clusters: {", ".join(sorted(cl_majority_subjects.unique()))}')

    cluster_sizes = cl_grouped.size()
    cluster_sizes = cluster_sizes.loc[X_data.index].rename('Size')

    # Dataframe containing all the information
    df = pd.DataFrame(tsne)
    df['Size'] = cluster_sizes.values
    df['Subject'] = cl_majority_subjects.values
    df['HepB'] = y_data.values

    # Counts were cast to int above so list repetition cannot be hijacked by
    # numpy scalar multiplication
    df['Dataset'] = [TRAIN_LABEL] * train_cnt + [VALID_LABEL] * valid_cnt

    return df

In [None]:
pos_df = tsne_data(
    pos_tsne_output,
    pos_features_data,
    # Labels of exactly the rows that were embedded
    y_data.loc[pos_features_data.index],
    # Number of True labels in each split
    (y_train == True).sum(),
    (y_valid == True).sum(),
)
pos_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

# Marker area is computed as cluster size / MARKER_SIZE_DIV + DEFAULT_MARKER_SIZE
MARKER_SIZE_DIV = 10
DEFAULT_MARKER_SIZE = 5
ALPHA = 0.5
LARGEST_CLUSTERS_CNT = 5

# Marker shapes that distinguish the two splits
TRAINING_MARKER = 'o'
VALIDATION_MARKER = 'D'

# One colour per subject, assigned in reverse-sorted subject order
unique_subjects = sorted(clustered_data['Subject'].unique(), reverse=True)
color_palette = itertools.cycle(sns.color_palette(n_colors=len(unique_subjects)))

# Plot attributes
subject_colors = dict(zip(unique_subjects, color_palette))
dataset_markers = dict([
    (VALID_LABEL, VALIDATION_MARKER),
    (TRAIN_LABEL, TRAINING_MARKER),
])

# The first two colours of the default palette go to True and False respectively
hepb_color_palette = itertools.cycle(sns.color_palette())
hepb_colors = dict(zip((True, False), hepb_color_palette))
hepb_labels = dict(zip((True, False), ('HepB', 'Non-HepB')))

def _legend_handle(ax, marker, color):
    """Return an empty line artist on ``ax`` that only serves as a legend entry."""
    return ax.plot([], [], marker=marker, color=color, ls='None')[0]


def _scatter_groups(ax, tsne_df, key, colors):
    """Draw one scatter layer on ``ax`` per (``key``, 'Dataset') group of ``tsne_df``."""
    for (value, dataset), group in tsne_df.groupby([key, 'Dataset']):
        ax.scatter(group[0], group[1],
                   marker=dataset_markers[dataset],
                   # `color=` is always read as ONE colour. Passing the RGB
                   # tuple through pandas' plot.scatter ends up as matplotlib's
                   # `c=`, which is treated as per-point values to colormap
                   # whenever the group happens to contain exactly 3 points.
                   color=colors[value],
                   s=(group['Size'] / MARKER_SIZE_DIV) + DEFAULT_MARKER_SIZE,
                   linewidth=0,
                   alpha=ALPHA)


def tsne_plot(tsne_df, hepb_grouping=False):
    """Scatter-plot a t-SNE embedding produced by ``tsne_data``.

    Parameters
    ----------
    tsne_df : pd.DataFrame
        Frame with embedding columns ``0`` and ``1`` plus ``'Size'``,
        ``'Subject'``, ``'HepB'`` and ``'Dataset'`` columns.
    hepb_grouping : bool, default False
        If True, colour points by their ``'HepB'`` value (``hepb_colors``);
        otherwise colour them by ``'Subject'`` (``subject_colors``).

    Marker shape encodes the split (``dataset_markers``) and marker area is
    ``Size / MARKER_SIZE_DIV + DEFAULT_MARKER_SIZE``. The figure is drawn on a
    new 12x8 axes; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(12, 8))

    datasets = list(dataset_markers.keys())

    if hepb_grouping:
        _scatter_groups(ax, tsne_df, 'HepB', hepb_colors)
        title = f'{REPRESENTATION_TYPE} t-SNE'

        # Legend: one colour swatch per HepB value (False first, then True)
        hepb_vals = sorted(hepb_colors.keys())
        handles = [_legend_handle(ax, 'o', hepb_colors[hepb]) for hepb in hepb_vals]
        labels = [hepb_labels[hepb] for hepb in hepb_vals]
    else:
        _scatter_groups(ax, tsne_df, 'Subject', subject_colors)
        title = f'{REPRESENTATION_TYPE} t-SNE of HepB sequences'

        # Legend: one colour swatch per subject actually present in the frame
        subjects = sorted(tsne_df['Subject'].unique())
        handles = [_legend_handle(ax, 'o', subject_colors[subj]) for subj in subjects]
        labels = list(subjects)

    ax.set(title=title, xlabel='t-SNE 1', ylabel='t-SNE 2')

    # Split markers, shown in black so they are not confused with a colour group
    handles += [_legend_handle(ax, dataset_markers[dataset], 'black') for dataset in datasets]
    labels += datasets

    # Text-only entry documenting how marker area is derived from cluster size
    handles.append(_legend_handle(ax, None, None))
    labels.append(f'Size = size/{MARKER_SIZE_DIV} + {DEFAULT_MARKER_SIZE}')

    ax.legend(handles, labels)

In [None]:
# Colour by subject (the default grouping)
tsne_plot(pos_df, hepb_grouping=False)

## All sequences - t-SNE

In [None]:
# Same t-SNE settings as before, now fitted on every feature row
all_tsne_model = TSNE(
    n_components=TSNE_COMPONENTS,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
all_tsne_output = all_tsne_model.fit_transform(features_data)

In [None]:
all_df = tsne_data(
    all_tsne_output,
    features_data,
    y_data,
    # Split sizes: every training row, then every validation row
    len(y_train),
    len(y_valid),
)
all_df

In [None]:
# Colour by HepB label instead of by subject
tsne_plot(tsne_df=all_df, hepb_grouping=True)