In [1]:
import math
import cmath
import numpy as np
import pandas as pd
from numpy.linalg import svd
from scipy import io
from scipy.linalg import eigh
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans

import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

%matplotlib qt
axis_font = {'size':'16'}
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16

Load the data files. The 8 non-empty strings are mapped to points on the unit circle in the complex plane, separated by 45 degrees, and '--' is mapped to the origin. In this way we hope that the 'pure' patterns are separated by the 'hybrid' patterns, and that '--' is not biased towards any non-empty pattern. Each column of $X$ is shifted to have zero mean.

In [2]:
# Read the raw table; each entry of data['X'] wraps a single tab-separated string of codes.
data = io.loadmat('genomedata.mat')

# Code -> complex number. The eight non-empty codes sit on the unit circle,
# 45 degrees apart; the empty code '--' sits at the origin.
_inv_sqrt2 = 1.0 / math.sqrt(2)
map_gen = {
    'AA': complex(0.0, 1.0),
    'CC': complex(-1.0, 0.0),
    'GG': complex(1.0, 0.0),
    'TT': complex(0.0, -1.0),
    'AG': complex(_inv_sqrt2, _inv_sqrt2),
    'AC': complex(-_inv_sqrt2, _inv_sqrt2),
    'TC': complex(-_inv_sqrt2, -_inv_sqrt2),
    'TG': complex(_inv_sqrt2, -_inv_sqrt2),
    '--': complex(0.0, 0.0),
}

# Split every row into its whitespace-trimmed codes.
X_raw = []
for row in data['X']:
    fields = row[0][0].strip().split('\t')
    X_raw.append([field.strip() for field in fields])

# Encode the codes as complex numbers, giving an n-by-p complex matrix.
X = np.array([[map_gen[code] for code in codes] for codes in X_raw])
n, p = X.shape

# Center every column: the length-p mean vector broadcasts across the n rows.
column_mean = np.ones(p) * X.mean(axis=0)
X = X - column_mean

random_state = 8 # The random state used by the KMeans

Method 1: Dimension reduction by PCA.

In [3]:
d_pca = 2 # Number of leading (complex) principal directions kept by PCA
k_pca = 2 # The expected number of clusters for PCA

# numpy's svd factors X = U diag(s) Vh, i.e. the third output is already the
# conjugate transpose of V (unlike matlab). Undo that to get V itself.
_, s_pca, Vh_pca = svd(X)
V_pca = Vh_pca.conj().T

# Linear projection onto the first d_pca right singular vectors (complex matrix).
X_pca_complex = X @ V_pca[:, :d_pca]

# KMeans is fed real data, so every complex coordinate is unfolded into two
# adjacent real columns: (re_1, im_1, re_2, im_2, ...), giving 2 * d_pca columns.
X_pca = np.stack(
    (X_pca_complex.real, X_pca_complex.imag), axis=-1
).reshape(n, 2 * d_pca)

kmeans_pca = KMeans(n_clusters=k_pca, random_state=random_state)
label_pca = kmeans_pca.fit_predict(X_pca)

Plot the singular values of $X$ and the scatterplot matrix of the reduced-dimension data.

In [4]:
# Singular-value spectrum on a log scale, with an inset zooming in on the leading values.
fig, ax = plt.subplots(figsize=(7, 6))
axins = inset_axes(ax, 2, 3, loc=1) # inset of width 2 in and height 3 in; loc=1 is the upper-right corner

# svd returns min(n, p) singular values, so index them by their own length
# rather than by p; the two only coincide when n >= p.
sv_index = np.arange(len(s_pca))

ax.semilogy(sv_index, s_pca, linewidth=2)
ax.grid(True)
ax.set_xlabel('dimension', **axis_font)
ax.set_ylabel('singular value', **axis_font)

axins.semilogy(sv_index, s_pca, linewidth=2)
axins.grid(True)
x1, x2, y1, y2 = 0, 20, 50, 2000 # specify the limits
axins.set_xlim(x1, x2) # apply the x-limits
axins.set_ylim(y1, y2) # apply the y-limits
axins.set_xticks(np.arange(0, 20, 5))
# Draw the zoom box on the main axes and connect it to the inset.
mark_inset(ax, axins, loc1=2, loc2=3, fc="none", ec="0.5", lw=2)

# Scatterplot matrix: columns 1..2*d_pca hold the real/imaginary parts of the
# projected coordinates, colored by the KMeans label.
df_pca = pd.DataFrame({**{'cluster': label_pca}, **{col+1: X_pca[:, col] for col in range(2 * d_pca)}})
sns.set()
sns.pairplot(df_pca, hue='cluster', vars=np.arange(1, 1 + 2 * d_pca))

<seaborn.axisgrid.PairGrid at 0x7fe163866da0>

Method 2: Dimension reduction by diffusion maps and/or spectral clustering.

In [5]:
d_dif = 3 # For the diffusion map we reduce the dimension of X down to 3 for 3-D visualization
k_dif = 2 # The expected number of clusters for diffusion map
t_dif = 10 # The time scale for diffusion map
f = 10 # The factor used to set epsilon according to the median of the square pairwise Euclidean distance

# Pairwise Euclidean distances, treating each complex row as a real vector
# (real parts followed by imaginary parts).
distance = pdist(np.concatenate((X.real, X.imag), axis=1), 'euclidean')
epsilon = np.median(distance ** 2) / f
# Gaussian kernel weights; squareform expands the condensed distances to a symmetric matrix.
W = np.exp(-squareform(distance) ** 2 / epsilon)
D_inv_sqrt = np.diag(1 / np.sqrt(np.sum(W, axis=1)))
MS = np.dot(np.dot(D_inv_sqrt, W), D_inv_sqrt) # The symmetric M matrix

# eigh returns eigenvalues in ascending order. Take the d_dif largest ones after
# skipping the very largest (remember to ignore the largest eigen vector).
# Slicing the full decomposition avoids the `eigvals=` keyword, which was
# deprecated in SciPy 1.5 and is rejected by current releases.
l_all, V_all = eigh(MS)
l_dif = l_all[-(d_dif + 1):-1]
V_dif = V_all[:, -(d_dif + 1):-1]
l_dif, V_dif = l_dif[::-1], V_dif[:, ::-1] # The eigen values are now in descending order.

X_dif = np.dot(np.dot(D_inv_sqrt, V_dif), np.diag(l_dif ** t_dif)) # The reduced dimension map at time t

label_dif = KMeans(n_clusters=k_dif, random_state=random_state).fit_predict(X_dif)

The scattermatrix plot and the 3-D scatter plot of the reduced-dimension data.

In [8]:
# Scatterplot matrix of the d_dif diffusion coordinates, colored by cluster label.
df_dif = pd.DataFrame({**{'cluster': label_dif}, **{col+1: X_dif[:, col] for col in range(d_dif)}})
plot_scatterplot = sns.pairplot(df_dif, hue='cluster', vars=np.arange(1, 1 + d_dif))

# 3-D scatter of the first three diffusion coordinates.
# A fresh figure is created (no explicit number) so that an earlier figure that is
# still open under a persistent backend is not re-selected and cleared.
fig = plt.figure(figsize=(7, 6))
# Request the 3-D axes through the figure: constructing Axes3D(fig, ...) directly
# no longer attaches the axes to the figure in current matplotlib.
ax = fig.add_axes([0, 0, .95, 1], projection='3d', elev=48, azim=134)

# Map label 0 -> blue, label 1 -> green.
y = np.choose(label_dif, ['b', 'g'])
ax.scatter(X_dif[:, 0], X_dif[:, 1], X_dif[:, 2], c=y)
ax.set_xlabel('1', **axis_font)
ax.set_ylabel('2', **axis_font)
ax.set_zlabel('3', **axis_font)

<matplotlib.text.Text at 0x7fe159e168d0>

Spectral clustering method.

In [9]:
k_spec = 2 # The expected number of clusters for spectral clustering

# Unnormalized graph Laplacian L = D - W, with D the diagonal matrix of column sums of W.
L = np.diag(np.sum(W, axis=0)) - W
# eigh returns eigenvalues in ascending order, so column 1 belongs to the second smallest one.
_, V_spec = eigh(L)
X_spec = V_spec[:, 1:2] # The reduced dimension data, simply the second smallest eigen vector of L (kept 2-D for KMeans)
label_spec = KMeans(n_clusters=k_spec, random_state=random_state).fit_predict(X_spec)

The histogram of the entries of the eigenvector of $L$ associated with its second smallest eigenvalue, split by cluster.

In [10]:
# Normalized histograms of the eigenvector entries, one per cluster label.
fig, ax = plt.subplots(figsize=(7, 6))
# density=True replaces the `normed` keyword, which matplotlib's hist no longer accepts.
ax.hist(X_spec[label_spec==0, 0], 20, density=True, facecolor='blue', alpha=0.75, label='cluster 0')
ax.hist(X_spec[label_spec==1, 0], 20, density=True, facecolor='green', alpha=0.75, label='cluster 1')
ax.legend(loc=1, fontsize=16)
ax.set_xlabel('$v$', **axis_font)
# The bars are normalized, so the vertical axis is a density rather than a count.
ax.set_ylabel('density', **axis_font)

<matplotlib.text.Text at 0x7fe15a0f09b0>