# k-mers Overview

In [None]:
# Render inline matplotlib figures in the high-resolution 'retina' format
%config InlineBackend.figure_format = 'retina'

## Load k-mers data

In [None]:
# Dataset to analyse. The features file and the targets file share this stem,
# so switching datasets is a single edit and the two paths cannot drift apart.
DATASET_NAME = 'Galson_2015a'

# Parquet file holding the k-mer features.
KMER_DATA_PATH = f'../../data/features_data/kmers/{DATASET_NAME}.parquet'
# Parquet file holding the targets for the same dataset.
TARGETS_DATA_PATH = f'../../data/targets_data/clusters/{DATASET_NAME}.parquet'

In [None]:
import pandas as pd

# Read the k-mer feature table from the parquet file named by KMER_DATA_PATH.
data = pd.read_parquet(KMER_DATA_PATH)

# Print the column/dtype summary, then show the first rows as the cell output.
data.info()
data.head()

In [None]:
# Read the targets table; its `HepB` column supplies the labels used below.
targets = pd.read_parquet(TARGETS_DATA_PATH)

# Print the column/dtype summary, then show the first rows as the cell output.
targets.info()
targets.head()

## Subsample data

In [None]:
# Rows flagged in the `HepB` target column form the positive class.
# NOTE(review): the mask comes from `targets`, so this assumes `targets` and
# `data` share the same index — confirm.
hepb_positive_mask = targets['HepB']
positive_data = data.loc[hepb_positive_mask]

In [None]:
# Report the (rows, columns) shape of the positive subset.
print(f'Positive data shape: {positive_data.shape}')

In [None]:
# Seed reused by every stochastic step in this notebook.
RANDOM_STATE = 42
# Draw exactly as many negatives as there are positives (balanced classes).
NEGATIVE_SUBSAMPLE_SIZE = len(positive_data)

# Boolean mask of rows whose `HepB` target is False. Passing it as sampling
# weights gives positive rows weight 0, so only negative rows can be drawn.
# NOTE(review): presumably this raises if there are fewer negative rows than
# NEGATIVE_SUBSAMPLE_SIZE — confirm against the dataset.
negative_clusters = targets['HepB'].eq(False)
negative_data_subs = data.sample(
    n=NEGATIVE_SUBSAMPLE_SIZE,
    weights=negative_clusters,
    random_state=RANDOM_STATE,
)

In [None]:
# Report the shape of the negative subsample; it was requested with as many
# rows as the positive subset (NEGATIVE_SUBSAMPLE_SIZE).
print(f'Negative data shape: {negative_data_subs.shape}')

In [None]:
# Stack the positive rows on top of the sampled negative rows, then look up
# the target rows for the combined index so features and targets line up.
data_subs = pd.concat((positive_data, negative_data_subs), axis=0)
targets_subs = targets.loc[data_subs.index]
print(f'All subsampled data shape: {data_subs.shape}')

## t-SNE

In [None]:
from MulticoreTSNE import MulticoreTSNE as TSNE

# Dimensionality of the embedding (2 so it can be drawn as a scatter plot).
TSNE_COMPONENTS = 2

# Configure the embedder first, then project the subsampled feature rows.
# The seed is the same RANDOM_STATE used for the negative subsample.
# NOTE(review): n_jobs=-1 presumably requests all available cores — confirm
# against the MulticoreTSNE documentation.
tsne_model = TSNE(
    n_components=TSNE_COMPONENTS,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
tsne_output = tsne_model.fit_transform(data_subs)

In [None]:
from sklearn.utils import shuffle

# Shuffle data for nicer plot: the embedding and the targets are passed to the
# same call so their rows are permuted together and stay paired.
# NOTE: this rebinds `tsne_output` and `targets_subs`, so re-running the cell
# shuffles the already-shuffled data again.
tsne_output, targets_subs = shuffle(tsne_output, targets_subs, random_state=RANDOM_STATE)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# t-SNE plot: scatter the two embedding columns, coloured by the `HepB` target.
# Drawing on an explicitly created Axes (instead of the implicit current one)
# keeps the figure self-contained.
fig, ax = plt.subplots()
sns.scatterplot(
    x=tsne_output[:, 0],
    y=tsne_output[:, 1],
    hue=targets_subs['HepB'],
    linewidth=0,  # no marker outline
    alpha=0.5,    # semi-transparent so overlapping points remain visible
    s=20,
    ax=ax,
)
# Label both embedding axes so the figure can be read on its own.
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
ax.set_title('k-mers t-SNE');