In [None]:
from collections import defaultdict
import os.path
import re

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import colors
import pandas as pd
from sklearn.manifold import TSNE

In [None]:
# Load the all-vs-all distance table (a Mash output, going by the variable and file names).
all_vs_all_mash_fp = '../emb_stampede2/all-imicrobe-dist.txt'
with open(all_vs_all_mash_fp, 'rt') as all_vs_all_mash_file:
    # Consume the first 7 characters of the header line before pandas parses the rest.
    # NOTE(review): presumably the header starts with '#query' plus one delimiter, so
    # dropping it leaves one fewer header field than data fields and pandas then uses
    # the first data column as the row index -- confirm against the file.
    all_vs_all_mash_file.read(len('#query '))
    all_vs_all_mash_df = pd.read_table(all_vs_all_mash_file)
all_vs_all_mash_df.head()

In [None]:
# Read the similarity_limit_100 embedding file: its first line holds two integers
# (row count, column count); the remaining lines are parsed as a space-separated table.
emb_100_nodes_fp = '../emb_stampede2/all-imicrobe-dist_similarity_limit_100.emb'
with open(emb_100_nodes_fp, 'rt') as emb_file:
    row_count, column_count = map(int, emb_file.readline().split(' '))
    # One generated name per embedding column: el0, el1, ...
    emb_100_nodes_df = pd.read_table(
        emb_file,
        header=None,
        sep=' ',
        names=list(map('el{}'.format, range(column_count))))
emb_100_nodes_df.head()

In [None]:
# Project the embedding down to 2-D with t-SNE. The seed is pinned so that
# Restart Kernel -> Run All reproduces the same layout.
TSNE_RANDOM_STATE = 0
emb_tsne = TSNE(
    n_components=2,
    init='pca',
    random_state=TSNE_RANDOM_STATE).fit_transform(emb_100_nodes_df)
emb_tsne.shape

In [None]:
# Build a sample id -> project id lookup from the row labels of the distance table,
# which are expected to contain "projects/<project id>/samples/<sample id>".
project_re = re.compile(r'projects/(?P<project>\d+)/samples/(?P<sample>\d+)')


def _parse_sample_and_project(row_label):
    """Return ``(sample id, project id)`` as ints parsed from one row label.

    Raises:
        ValueError: if ``row_label`` does not match ``project_re``.
    """
    match = project_re.search(row_label)
    if match is None:
        # Fail with the offending label instead of an AttributeError on None.
        raise ValueError(
            'no project/sample ids found in row label {!r}'.format(row_label))
    return int(match.group('sample')), int(match.group('project'))


# Sanity check: show what the pattern extracts from the first row label.
print(project_re.search(all_vs_all_mash_df.index[0]).groupdict())
# If a sample id occurs in more than one row, the last row wins.
sample_to_project = dict(
    _parse_sample_and_project(row_label) for row_label in all_vs_all_mash_df.index)
print('sample_to_project has {} keys'.format(len(sample_to_project)))

In [None]:
# Scatter the two t-SNE coordinates, colouring every point by the project id that
# sample_to_project gives for the corresponding embedding row label.
tsne_x, tsne_y = emb_tsne[:, 0], emb_tsne[:, 1]
point_project_ids = [sample_to_project[sample_id] for sample_id in emb_100_nodes_df.index]
plt.scatter(tsne_x, tsne_y, c=point_project_ids)
plt.show()

In [None]:
# Scratch check: evaluate the RdBu colormap at 0.5. The value is only displayed as the
# cell output; nothing is stored.
plt.cm.RdBu(0.5)

In [None]:
def plot_n2v_emb(n2v_emb_fp, sample_to_project, random_state=None):
    """Plot a 2-D t-SNE projection of an embedding file, one scatter series per project.

    The figure is titled with the file's base name, saved next to the input as
    ``<n2v_emb_fp>.pdf`` and then shown. A one-line summary of the projects found in
    the embedding is printed at the end.

    Args:
        n2v_emb_fp: Path to a text embedding file. Its first line must hold two
            integers (row count, column count); the remaining lines are read as a
            space-separated table. NOTE(review): presumably each data row is
            "<sample id> <v1> ... <vN>", so pandas uses the leading field as the
            row index -- confirm against the files.
        sample_to_project: Mapping from embedding row label (sample id) to project id.
        random_state: Forwarded to ``TSNE``. The default ``None`` keeps the original
            unseeded behaviour; pass an int for a reproducible layout.

    Raises:
        KeyError: if a row label of the embedding is missing from ``sample_to_project``.
    """
    with open(n2v_emb_fp, 'rt') as emb_file:
        # Header line: "<row count> <column count>"; only the column count is needed.
        _row_count, column_count = [int(i) for i in emb_file.readline().split()]
        n2v_emb_df = pd.read_table(
            emb_file,
            sep=' ',
            header=None,
            names=['el{}'.format(v) for v in range(column_count)])

    emb_tsne = pd.DataFrame(
        data=TSNE(
            n_components=2,
            init='pca',
            random_state=random_state).fit_transform(n2v_emb_df),
        index=n2v_emb_df.index,
        columns=['X1', 'X2'])

    # Project sizes are counted over the whole mapping, not only over the samples
    # present in this embedding; they decide the drawing order below.
    project_sizes = defaultdict(int)
    for project in sample_to_project.values():
        project_sizes[project] += 1

    # Group the embedding's samples by project in a single pass.
    samples_by_project = defaultdict(list)
    for sample in n2v_emb_df.index:
        samples_by_project[sample_to_project[sample]].append(sample)

    # Largest projects are drawn first; ties are broken by project id so the order
    # (and therefore the colour-cycle assignment) is deterministic.
    draw_order = sorted(samples_by_project, key=lambda p: (-project_sizes[p], p))

    fig, ax = plt.subplots(figsize=(8, 8))
    for project in draw_order:
        project_points = emb_tsne.loc[samples_by_project[project]]
        # No `cmap` here: without `c` matplotlib ignores it and emits a warning, so
        # each series simply takes the next colour of the default cycle.
        ax.scatter(
            project_points.X1,
            project_points.X2,
            s=50,
            label=str(project),
            alpha=0.5)
    ax.set_title(os.path.basename(n2v_emb_fp))

    # Save before showing: Figure.show() needs a GUI backend and warns under the
    # inline backend, so plt.show() is used, and it may close the figure afterwards.
    fig.savefig(n2v_emb_fp + '.pdf', format='pdf')
    plt.show()

    projects = sorted(samples_by_project)
    print('{} projects: {}'.format(len(projects), projects))

In [None]:
# Plot the embedding produced for each similarity limit. This replaces five
# copy-pasted cells that differed only in the limit embedded in the file name.
for similarity_limit in (100, 200, 400, 800, 1600):
    plot_n2v_emb(
        '../emb_stampede2/all-imicrobe-dist_similarity_limit_{}.emb'.format(similarity_limit),
        sample_to_project)