In [15]:
#%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import poisson_tools as pt
import matplotlib.patches as mpatches
from scipy.special import expit
import scipy.cluster.vq as spvq
import scipy.spatial.distance as spdt
import matplotlib.cm as cm
from scipy.optimize import curve_fit
import statsmodels.api as sm
#from scipy.stats import poisson
from scipy.misc import factorial

In [16]:
def sigmoid_sampling(data, weight, bias):
    """Draw binary samples whose "on" probability is a logistic function.

    The activation is ``np.dot(data, weight) + bias``.  Each output unit is
    set to 1. with probability ``expit(activation)`` and to 0. otherwise,
    using one uniform draw per unit from NumPy's global random state.

    Returns a float array with the same shape as the activation.
    """
    activation = np.dot(data, weight) + bias
    p_on = expit(activation)
    # One uniform number per unit; a unit fires when its draw falls below p_on.
    uniform = np.random.random(p_on.shape)
    return np.where(uniform < p_on, 1., 0.)

In [17]:
def avg_distr(a, b, w, sample_num, init_v):
    """Run an alternating Gibbs chain and return its states and their mean.

    Starting from ``init_v``, every step first samples a hidden state from
    the previous visible state with ``(w, b)`` and then a new visible state
    from that hidden state with ``(w.T, a)``.  (Presumably ``a``/``b`` are
    the visible/hidden biases and ``w`` the weights of an RBM -- confirm
    against the code that wrote the checkpoints.)

    Returns
    -------
    (chain, mean) : ``chain`` has shape ``(sample_num, a.shape[0])`` with
        row 0 equal to ``init_v``; ``mean`` is the average of ``chain``
        over its first axis.
    """
    n_visible = a.shape[0]
    chain = np.zeros((sample_num, n_visible))
    chain[0] = init_v
    for g_step in range(1, sample_num):
        previous_v = chain[g_step - 1]
        hidden = sigmoid_sampling(previous_v, w, b)
        chain[g_step] = sigmoid_sampling(hidden, w.transpose(), a)
    chain_mean = np.average(chain, axis=0)
    return chain, chain_mean

In [18]:
def plot_distance(sample_nums, dis_matrix, clabel, **kwargs):
    """Plot the mean of ``dis_matrix`` with a min/max envelope on a log-x axis.

    Parameters
    ----------
    sample_nums : x coordinates, one per column of ``dis_matrix``.
    dis_matrix : 2-D array; the mean, minimum and maximum are taken over
        axis 0.
    clabel : legend label of the mean curve.
    **kwargs : ``ax`` (optional) selects the axes to draw on; every other
        keyword is forwarded to ``ax.semilogx``.

    The legend of the axes is (re)drawn in the lower-left corner.
    """
    # Resolve the target axes lazily: plt.gca() touches (and may create) the
    # current figure, so it must only run when the caller passed no axes.
    ax = kwargs.pop('ax', None)
    if ax is None:
        ax = plt.gca()
    upper = np.max(dis_matrix, 0)
    lower = np.min(dis_matrix, 0)
    avg = np.average(dis_matrix, 0)
    base_line, = ax.semilogx(sample_nums, avg, linewidth=2., label=clabel, **kwargs)
    # Shade the spread in the same colour as the mean curve.
    ax.fill_between(sample_nums, upper, lower, facecolor=base_line.get_color(),
                    alpha=.4, linewidth=0)
    ax.legend(loc='lower left', shadow=True)

In [19]:
def poisson(k, lamb):
    """Poisson probability mass ``lamb**k * exp(-lamb) / k!``.

    Evaluated in log space, ``exp(k*log(lamb) - lamb - log(k!))``, so that
    large ``k`` no longer overflows ``lamb**k`` and ``k!`` into ``inf/inf``.
    ``k`` and ``lamb`` may be scalars or arrays (they broadcast).
    ``lamb == 0`` gives 1 for ``k == 0`` and 0 otherwise; a negative
    ``lamb`` is outside the distribution's domain and yields ``nan``.
    """
    # Imported locally so the function does not depend on a factorial helper.
    from scipy.special import gammaln, xlogy
    k = np.asarray(k, dtype=float)
    # xlogy(k, lamb) is k*log(lamb) with the 0*log(0) case defined as 0;
    # gammaln(k + 1) is log(k!).
    return np.exp(xlogy(k, lamb) - gammaln(k + 1.) - lamb)
def gauss(x, A, mu, sigma):
    """Un-normalised Gaussian bump: ``A * exp(-(x - mu)**2 / (2 * sigma**2))``.

    ``A`` is the peak height reached at ``x == mu``; ``sigma`` sets the width.
    """
    offset = x - mu
    exponent = -offset**2 / (2. * sigma**2)
    return A * np.exp(exponent)

In [20]:
# Load the training set and binarise it: a pixel is "on" when its value
# exceeds 50.
train_x, train_y = pt.get_train_data()
train_x = train_x > 50

# Keep the samples of a single digit class, leaving out the last one.
digit = 5
label_list = np.array(train_y).astype(int)
index_digit = np.nonzero(label_list == digit)[0]
train_num = len(index_digit) - 1
index_train = index_digit[:train_num]
Data_v = np.array(train_x[index_train]).astype(float)

# A one-cluster k-means gives the reference centre (k_center[0] is the
# codebook); dis_D holds every training sample's distance to it.
k_center = spvq.kmeans(Data_v, 1)
dis_D = [np.linalg.norm(sample - k_center[0]) for sample in Data_v]

In [21]:
# Empirical distribution of the training distances (np.histogram's default
# binning), normalised so the bar heights sum to one.
his = np.histogram(dis_D)
mid_bin = 0.5*(his[1][1:] + his[1][:-1])
stat = his[0]*1./sum(his[0])
plt.clf()
# The x values are the LEFT bin edges, so the alignment is stated explicitly:
# matplotlib >= 2.0 centres bars on x by default, which would shift the
# histogram half a bin against the curve fitted on the bin centres.
plt.bar(his[1][:-1], stat, width=np.average(his[1][1:] - his[1][:-1]), align='edge')
# Start the fit from the sample moments.  curve_fit's default start is
# (1, 1, 1), where the model is numerically flat over the observed distances.
p0 = [stat.max(), np.mean(dis_D), np.std(dis_D)]
parameters, cov_matrix = curve_fit(gauss, mid_bin, stat, p0=p0)
x_plot = np.linspace(6, 12, 1000)
plt.xlabel('Distance to the cluster centre')
plt.ylabel('Probability')
plt.plot(x_plot, gauss(x_plot, *parameters), 'g-', lw=2)
plt.draw()
plt.savefig('plot/mnist/train_dist.pdf')


In [22]:
# Seed NumPy's global random state, which sigmoid_sampling draws from through
# np.random.random.  No later cell re-seeds, so the sampling cells below keep
# consuming the same stream and their output depends on being run in order
# after this cell.
np.random.seed(0)

In [23]:
# For every saved 'cdk' checkpoint: run a Gibbs chain from the first training
# sample, histogram the chain's distances to the training cluster centre,
# save the figure, and record [0., mean, variance] of the distances.
init_v = Data_v[0]
para = []
s_num = pow(10, 4)  # chain length, identical for every checkpoint
for step in range(100, 1000, 100):
    # NOTE: machine-specific absolute path.
    fname = '/home/liuq/apt/2ndYear/sDBN/cdk/5420_b1000_epoc%05d_cdk.npy'%(step)
    # a, b and w have different shapes, so the file presumably stores a
    # pickled object array; NumPy >= 1.16.3 refuses to load those unless
    # allow_pickle=True.  Unpickling can execute code: trusted files only.
    a, b, w = np.load(fname, allow_pickle=True)
    # Only the chain states are needed; the second return value (their mean)
    # is not used here.
    data_g = avg_distr(a, b, w, s_num, init_v)[0]
    dis = [np.linalg.norm(state - k_center[0]) for state in data_g]
    his_g = np.histogram(dis)
    mid_g = 0.5*(his_g[1][1:] + his_g[1][:-1])
    stat_g = his_g[0]*1./sum(his_g[0])
    # Third entry is the VARIANCE of the distances (not the standard deviation).
    para.append([0., np.mean(dis), np.var(dis)])
    plt.clf()
    # x values are left bin edges; matplotlib >= 2.0 would centre the bars on
    # them by default, shifting the histogram by half a bin.
    plt.bar(his_g[1][:-1], stat_g, width=np.average(his_g[1][1:] - his_g[1][:-1]), align='edge')
    plt.xlabel('Distance to the cluster centre')
    plt.ylabel('Probability')
    plt.xlim((6,12))
    plt.ylim((0,0.3))
    plt.draw()
    plt.savefig('plot/mnist/cdk_%d.pdf'%step)
para = np.array(para)

In [24]:
# Trajectory of (mean, variance) of the generated distances over the
# checkpoints, keeping only checkpoints whose mean distance exceeds 6.
ind = np.where(para[:,1]>6)[0]
plt.clf()
plt.plot(para[ind, 1], para[ind, 2])
# Reference point from the Gaussian fitted to the training histogram.  The fit
# starts from the sample moments because curve_fit's default start (1, 1, 1)
# is far from the observed distances.
p0 = [stat.max(), np.mean(dis_D), np.std(dis_D)]
parameters, cov_matrix = curve_fit(gauss, mid_bin, stat, p0=p0)
# para[:, 2] holds variances while the fit returns sigma, so sigma is squared
# to put the reference point on the same axis (this also removes the sign
# ambiguity of a fitted sigma).
plt.plot(parameters[1], parameters[2]**2, 'r.')
plt.xlabel('Mean distance to the cluster centre')
plt.ylabel('Variance of the distance')
plt.title('Parameter searching for 6K steps')
plt.draw()
plt.savefig('plot/mnist/cdk_para.pdf')

In [25]:
# For every saved 'cd' checkpoint of the b1000 run (steps 100..900): run a
# Gibbs chain from the first training sample, histogram the chain's distances
# to the training cluster centre, save the figure, and record
# [0., mean, variance] of the distances.
init_v = Data_v[0]
para = []
s_num = pow(10, 4)  # chain length, identical for every checkpoint
for step in range(100, 1000, 100):
    # NOTE: machine-specific absolute path.
    fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b1000_epoc%05d_cd.npy'%(step)
    # a, b and w have different shapes, so the file presumably stores a
    # pickled object array; NumPy >= 1.16.3 refuses to load those unless
    # allow_pickle=True.  Unpickling can execute code: trusted files only.
    a, b, w = np.load(fname, allow_pickle=True)
    # Only the chain states are needed; the second return value (their mean)
    # is not used here.
    data_g = avg_distr(a, b, w, s_num, init_v)[0]
    dis = [np.linalg.norm(state - k_center[0]) for state in data_g]
    his_g = np.histogram(dis)
    mid_g = 0.5*(his_g[1][1:] + his_g[1][:-1])
    stat_g = his_g[0]*1./sum(his_g[0])
    # Third entry is the VARIANCE of the distances (not the standard deviation).
    para.append([0., np.mean(dis), np.var(dis)])
    plt.clf()
    # x values are left bin edges; matplotlib >= 2.0 would centre the bars on
    # them by default, shifting the histogram by half a bin.
    plt.bar(his_g[1][:-1], stat_g, width=np.average(his_g[1][1:] - his_g[1][:-1]), align='edge')
    plt.xlabel('Distance to the cluster centre')
    plt.ylabel('Probability')
    plt.xlim((6,12))
    plt.ylim((0,0.3))
    plt.draw()
    plt.savefig('plot/mnist/kcd_%d.pdf'%step)
para = np.array(para)

In [26]:
# Trajectory of (mean, variance) of the generated distances over the
# checkpoints, keeping only checkpoints whose mean distance exceeds 6.
ind = np.where(para[:,1]>6)[0]
plt.clf()
plt.plot(para[ind, 1], para[ind, 2])
# Reference point from the Gaussian fitted to the training histogram.  The fit
# starts from the sample moments because curve_fit's default start (1, 1, 1)
# is far from the observed distances.
p0 = [stat.max(), np.mean(dis_D), np.std(dis_D)]
parameters, cov_matrix = curve_fit(gauss, mid_bin, stat, p0=p0)
# para[:, 2] holds variances while the fit returns sigma, so sigma is squared
# to put the reference point on the same axis (this also removes the sign
# ambiguity of a fitted sigma).
plt.plot(parameters[1], parameters[2]**2, 'r.')
plt.xlabel('Mean distance to the cluster centre')
plt.ylabel('Variance of the distance')
plt.title('Parameter searching for 6K steps')
plt.draw()
plt.savefig('plot/mnist/kcd_para.pdf')

In [27]:
# For every saved 'cd' checkpoint of the b0001 run (steps 1, 11, ..., 91): run
# a Gibbs chain from the first training sample, histogram the chain's
# distances to the training cluster centre, save the figure, and record
# [0., mean, variance] of the distances.
init_v = Data_v[0]
para = []
s_num = pow(10, 4)  # chain length, identical for every checkpoint
for step in range(1, 100, 10):
    # NOTE: machine-specific absolute path.
    fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b0001_epoc%05d_cd.npy'%(step)
    # a, b and w have different shapes, so the file presumably stores a
    # pickled object array; NumPy >= 1.16.3 refuses to load those unless
    # allow_pickle=True.  Unpickling can execute code: trusted files only.
    a, b, w = np.load(fname, allow_pickle=True)
    # Only the chain states are needed; the second return value (their mean)
    # is not used here.
    data_g = avg_distr(a, b, w, s_num, init_v)[0]
    dis = [np.linalg.norm(state - k_center[0]) for state in data_g]
    his_g = np.histogram(dis)
    mid_g = 0.5*(his_g[1][1:] + his_g[1][:-1])
    stat_g = his_g[0]*1./sum(his_g[0])
    # Third entry is the VARIANCE of the distances (not the standard deviation).
    para.append([0., np.mean(dis), np.var(dis)])
    plt.clf()
    # x values are left bin edges; matplotlib >= 2.0 would centre the bars on
    # them by default, shifting the histogram by half a bin.
    plt.bar(his_g[1][:-1], stat_g, width=np.average(his_g[1][1:] - his_g[1][:-1]), align='edge')
    plt.xlabel('Distance to the cluster centre')
    plt.ylabel('Probability')
    plt.xlim((6,12))
    plt.ylim((0,0.3))
    plt.draw()
    plt.savefig('plot/mnist/cd1_%d.pdf'%step)
para = np.array(para)

In [28]:
# Trajectory of (mean, variance) of the generated distances over the
# checkpoints, keeping only checkpoints whose mean distance exceeds 6.
ind = np.where(para[:,1]>6)[0]
plt.clf()
plt.plot(para[ind, 1], para[ind, 2])
# Reference point from the Gaussian fitted to the training histogram.  The fit
# starts from the sample moments because curve_fit's default start (1, 1, 1)
# is far from the observed distances.
p0 = [stat.max(), np.mean(dis_D), np.std(dis_D)]
parameters, cov_matrix = curve_fit(gauss, mid_bin, stat, p0=p0)
# para[:, 2] holds variances while the fit returns sigma, so sigma is squared
# to put the reference point on the same axis (this also removes the sign
# ambiguity of a fitted sigma).
plt.plot(parameters[1], parameters[2]**2, 'r.')
plt.xlabel('Mean distance to the cluster centre')
plt.ylabel('Variance of the distance')
plt.title('Parameter searching for 540K steps')
plt.draw()
plt.savefig('plot/mnist/cd1_para.pdf')

In [None]:
# For every saved 'cd' checkpoint of the b1000 run (steps 1084..18970 in
# strides of 542): run a Gibbs chain from the first training sample, histogram
# the chain's distances to the training cluster centre, save the figure, and
# record [0., mean, variance] of the distances.
init_v = Data_v[0]
para = []
s_num = pow(10, 4)  # chain length, identical for every checkpoint
for step in range(1084, 18971, 542):
    # NOTE: machine-specific absolute path.
    fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b1000_epoc%05d_cd.npy'%(step)
    # a, b and w have different shapes, so the file presumably stores a
    # pickled object array; NumPy >= 1.16.3 refuses to load those unless
    # allow_pickle=True.  Unpickling can execute code: trusted files only.
    a, b, w = np.load(fname, allow_pickle=True)
    # Only the chain states are needed; the second return value (their mean)
    # is not used here.
    data_g = avg_distr(a, b, w, s_num, init_v)[0]
    dis = [np.linalg.norm(state - k_center[0]) for state in data_g]
    his_g = np.histogram(dis)
    mid_g = 0.5*(his_g[1][1:] + his_g[1][:-1])
    stat_g = his_g[0]*1./sum(his_g[0])
    # Third entry is the VARIANCE of the distances (not the standard deviation).
    para.append([0., np.mean(dis), np.var(dis)])
    plt.clf()
    # x values are left bin edges; matplotlib >= 2.0 would centre the bars on
    # them by default, shifting the histogram by half a bin.
    plt.bar(his_g[1][:-1], stat_g, width=np.average(his_g[1][1:] - his_g[1][:-1]), align='edge')
    plt.xlabel('Distance to the cluster centre')
    plt.ylabel('Probability')
    plt.xlim((6,12))
    plt.ylim((0,0.3))
    plt.draw()
    plt.savefig('plot/mnist/kcd_%d.pdf'%step)
para = np.array(para)

In [None]:
# Trajectory of (mean, variance) of the generated distances over the
# checkpoints, keeping only checkpoints whose mean distance exceeds 6.
ind = np.where(para[:,1]>6)[0]
plt.clf()
plt.plot(para[ind, 1], para[ind, 2])
# Reference point from the Gaussian fitted to the training histogram.  The fit
# starts from the sample moments because curve_fit's default start (1, 1, 1)
# is far from the observed distances.
p0 = [stat.max(), np.mean(dis_D), np.std(dis_D)]
parameters, cov_matrix = curve_fit(gauss, mid_bin, stat, p0=p0)
# para[:, 2] holds variances while the fit returns sigma, so sigma is squared
# to put the reference point on the same axis (this also removes the sign
# ambiguity of a fitted sigma).
plt.plot(parameters[1], parameters[2]**2, 'r.')
plt.xlabel('Mean distance to the cluster centre')
plt.ylabel('Variance of the distance')
plt.title('Parameter searching for 100K steps')
plt.draw()
# This cell used to save to 'plot/mnist/cd1_para.pdf', silently overwriting
# the figure of the b0001 ('cd1') summary.  It summarises the b1000 ('kcd')
# checkpoints, so it now writes its own file.
# NOTE(review): the new file name is a suggestion -- rename to taste.
plt.savefig('plot/mnist/kcd_para_100k.pdf')