In [31]:
#%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import poisson_tools as pt
import matplotlib.patches as mpatches
from scipy.special import expit
import scipy.cluster.vq as spvq
import scipy.spatial.distance as spdt
import matplotlib.cm as cm
from scipy.optimize import curve_fit
import statsmodels.api as sm
#from scipy.stats import poisson
from scipy.misc import factorial

In [3]:
def sigmoid_sampling(data, weight, bias):
    """Draw binary samples from a layer of sigmoid units.

    The firing probability of each unit is ``expit(np.dot(data, weight) + bias)``.
    A unit is sampled as 1. when an independent uniform draw from ``[0, 1)``
    falls strictly below that probability, and as 0. otherwise.

    Parameters
    ----------
    data : array_like
        Input state(s) fed into the layer.
    weight : array_like
        Weights; must be compatible with ``np.dot(data, weight)``.
    bias : array_like or scalar
        Bias added to the weighted input.

    Returns
    -------
    numpy.ndarray
        Float array of 0./1. values shaped like the probability array.
    """
    firing_prob = expit(np.dot(data, weight) + bias)
    # One uniform draw per unit, taken from numpy's global random state.
    uniform_draw = np.random.random(firing_prob.shape)
    return np.where(uniform_draw < firing_prob, 1., 0.)

In [4]:
def avg_distr(a, b, w, sample_num, init_v):
    """Run an alternating Gibbs chain and average its visible states.

    Starting from ``init_v``, each step samples a hidden state from the
    previous visible state (weights ``w``, bias ``b``) and then a new visible
    state from that hidden state (weights ``w`` transposed, bias ``a``).

    Parameters
    ----------
    a : numpy.ndarray
        Bias used when sampling visible states; ``a.shape[0]`` sets the
        width of each stored visible state.
    b : array_like
        Bias used when sampling hidden states.
    w : numpy.ndarray
        Weights applied as ``np.dot(visible, w)``.
    sample_num : int
        Number of visible states to store, including the initial one.
    init_v : array_like
        Visible state stored as row 0 of the chain.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The full chain with shape ``(sample_num, a.shape[0])`` and its
        average over axis 0 (the initial state is part of that average).
    """
    visible_chain = np.zeros((sample_num, a.shape[0]))
    visible_chain[0] = init_v
    w_transposed = w.transpose()
    for step in range(1, sample_num):
        hidden_state = sigmoid_sampling(visible_chain[step - 1], w, b)
        visible_chain[step] = sigmoid_sampling(hidden_state, w_transposed, a)
    chain_mean = np.average(visible_chain, 0)
    return visible_chain, chain_mean

In [5]:
def plot_distance(sample_nums, dis_matrix, clabel, **kwargs):
    """Plot a mean curve with a shaded min/max envelope on a log-x axis.

    Parameters
    ----------
    sample_nums : array_like
        X positions, one per column of ``dis_matrix``.
    dis_matrix : array_like
        2-D values; the mean, minimum and maximum are taken over axis 0.
    clabel : str
        Legend label for the mean curve.
    **kwargs
        ``ax`` (optional) selects the axes to draw on; when absent the
        current pyplot axes are used. Every other keyword is forwarded to
        ``ax.semilogx``.

    Returns
    -------
    None
    """
    # Resolve the axes lazily. Writing plt.gca() as the default argument of
    # pop() would evaluate it on every call, creating/activating a pyplot
    # figure even when the caller supplied ax=... explicitly.
    ax = kwargs.pop('ax', None)
    if ax is None:
        ax = plt.gca()
    upper = np.max(dis_matrix, 0)
    lower = np.min(dis_matrix, 0)
    mean_curve = np.average(dis_matrix, 0)
    base_line, = ax.semilogx(sample_nums, mean_curve, linewidth = 2., label = clabel, **kwargs)
    # The envelope reuses the colour of the mean line so the two read as one series.
    ax.fill_between(sample_nums, upper, lower, facecolor=base_line.get_color(), alpha=.4, linewidth=0)
    ax.legend(loc='lower left', shadow=True)

In [6]:
# Load the training images and labels through the project-local poisson_tools helper.
train_x, train_y = pt.get_train_data()
# Binarise the images: a pixel counts as "on" when its value exceeds 50.
train_x = train_x > 50

# Restrict the data set to the samples labelled `digit`.
digit = 5
label_list = np.array(train_y).astype(int)
index_digit = np.where(label_list==digit)[0]
# NOTE(review): the last matching sample is left out of the subset — confirm this is intentional.
train_num = len(index_digit) - 1
index_train = index_digit[0:train_num]
# Float copy of the selected binary images; this is the training set used by every later cell.
Data_v = np.array(train_x[index_train]).astype(float)
# k-means with a single cluster; later cells use k_center[0] as the centre of the training set.
k_center = spvq.kmeans(Data_v, 1)

In [35]:
#plt.imshow(np.reshape(k_center[0],(28,28)), cmap=cm.gray_r,interpolation='none')
#plt.show()
# Euclidean distance of every training vector to the k-means centre.
dis_D = [np.linalg.norm(train_vec - k_center[0]) for train_vec in Data_v]
# Histogram with numpy's default binning, drawn as a line through the bin mid-points.
his = np.histogram(dis_D) #, bins=[0, 1, 2, 3]
bin_edges = his[1]
mid_bin = 0.5*(bin_edges[1:] + bin_edges[:-1])
plt.plot(mid_bin, his[0])
plt.show()

In [36]:
# Scatter of the per-sample distances in dis_D against their position in the list.
plt.plot(dis_D, '.')
plt.show()

In [10]:
#popt, pcov = curve_fit(poisson, his[0], his[1][0:-1])
# Intercept-only Poisson regression on the distances: the single regressor is a
# column of ones, so the fitted "const" is the log of the estimated rate.
# NOTE(review): dis_D holds continuous distances, not counts — confirm a Poisson model is intended.
intercept_only = np.ones_like(dis_D)
res = sm.Poisson(dis_D, intercept_only).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 2.027519
         Iterations 11
                          Poisson Regression Results                          
Dep. Variable:                      y   No. Observations:                 5420
Model:                        Poisson   Df Residuals:                     5419
Method:                           MLE   Df Model:                            0
Date:                Tue, 17 Nov 2015   Pseudo R-squ.:                   0.000
Time:                        18:34:01   Log-Likelihood:                -10989.
converged:                       True   LL-Null:                       -10989.
                                        LLR p-value:                       nan
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          2.1166      0.005    449.025      0.000         2.107     2.126


In [50]:
# Rate implied by the intercept-only Poisson fit: exp(const), with const = 2.1166
# copied by hand from the printed regression summary.
ld= np.exp(2.1166)
# Raw histogram counts are the fit target; the normalisation is left disabled.
normal = his[0]#*1./sum(his[0])
# Poisson probability mass function; lamb is the parameter to be fitted.
def poisson(k, lamb):
    """Return ``lamb**k / k! * exp(-lamb)`` for ``k`` (scalar or array)."""
    power_over_factorial = lamb**k/factorial(k)
    return power_over_factorial * np.exp(-lamb)
def gauss(x, A, mu, sigma):
    """Unnormalised Gaussian ``A * exp(-(x - mu)**2 / (2 * sigma**2))``.

    ``A`` is the peak height, ``mu`` the centre and ``sigma`` the width;
    ``x`` may be a scalar or an array.
    """
    exponent = -(x-mu)**2/(2.*sigma**2)
    return A*np.exp(exponent)
# fit with curve_fit
# Without p0, curve_fit starts every parameter at 1, which is far from a
# histogram of raw counts. Seed the optimiser with moment estimates taken
# from the histogram itself: peak height, weighted mean and weighted std.
counts = np.asarray(normal, dtype=float)
mu_guess = np.average(mid_bin, weights=counts)
sigma_guess = np.sqrt(np.average((mid_bin - mu_guess)**2, weights=counts))
if not sigma_guess > 0:
    # All mass sits in one bin: fall back to a single bin width so gauss()
    # never divides by zero on the first evaluation.
    sigma_guess = mid_bin[1] - mid_bin[0]
parameters, cov_matrix = curve_fit(gauss, mid_bin, normal,
                                   p0=[counts.max(), mu_guess, sigma_guess])
#popt, pcov = curve_fit(poisson.pmf(his[0], ld), his[0], his[1][0:-1])

# Overlay the fitted Gaussian (red) on the histogram line.
x_plot = np.linspace(0, 20, 1000)
plt.plot(mid_bin, normal)
#plt.plot(mid_bin, poisson(mid_bin, np.exp(2.1166)))
plt.plot(x_plot, gauss(x_plot, *parameters), 'r-', lw=2)
plt.show()

In [49]:
# Display the fitted parameters returned by curve_fit for gauss, in the order (A, mu, sigma).
parameters

array([  1.12020314e+09,   5.22004121e+08])

In [11]:
# Load three saved parameter sets. Each file unpacks into (a, b, w), the
# arguments avg_distr expects: bias for visible sampling, bias for hidden
# sampling, and the weights.
# NOTE(review): the file names presumably encode batch size ("b....") and
# epoch ("epoc....."); the trailing #903 / #001 tags are unexplained — confirm.
# NOTE(review): absolute, user-specific paths; the notebook only runs on the author's machine.
fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b1000_epoc11924_cd.npy' #903
a_K, b_K, w_K = np.load(fname)
fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b0001_epoc00013_cd.npy' #001
a_1, b_1, w_1 = np.load(fname)
#fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b0002_epoc00002_cd.npy'
fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b1000_epoc00001_cd.npy' #903
a_2, b_2, w_2 = np.load(fname)
# Every Gibbs chain below starts from the first training sample.
init_v = Data_v[0]

In [43]:
# Mean and variance of the training-set distances to the k-means centre.
dis_D = [np.linalg.norm(train_vec - k_center[0]) for train_vec in Data_v]
print('%s %s' % (np.mean(dis_D), np.var(dis_D)))

8.30321544205 0.724197642319


In [46]:
# For every saved checkpoint (epoch 11924 down to 542 in steps of 542), run a
# Gibbs chain of s_num samples and plot the histogram of the samples'
# distances to the k-means centre.
dis = {}
s_num = pow(10, 3+1)
# The key does not depend on the checkpoint, so each pass overwrites the
# previous entry and only the last checkpoint's distances stay in `dis`.
dis_key = str(s_num)
for step in range(11924, 541, -542):   # stop just below 542 so that 542 is included
    fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b1000_epoc%05d_cd.npy'%(step)
    a, b, w = np.load(fname)
    data_g, tmp = avg_distr(a, b, w, s_num, init_v)
    dis[dis_key] = [np.linalg.norm(data_g[n]-k_center[0]) for n in range(s_num)]
    his = np.histogram(dis[dis_key]) #, bins=[0, 1, 2, 3]
    bin_counts, bin_edges = his
    # Counts are drawn against the left edge of each bin.
    plt.plot(bin_edges[0:-1], bin_counts)
    plt.show()

IndexError: index 10000 is out of bounds for axis 0 with size 10000

In [66]:
# Distance histogram for the Gibbs samples of one checkpoint (epoch 1084).
step = 1084
fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b1000_epoc%05d_cd.npy'%(step)
#fname = '/home/liuq/apt/2ndYear/sDBN/theta/5420_b0001_epoc00021_cd.npy' #001
a, b, w = np.load(fname)
for pow_num in range(3, 4):
    print(pow_num)
    # Chain length is 10**(pow_num+1).
    s_num = 10 ** (pow_num + 1)
    data_g, tmp = avg_distr(a, b, w, s_num, init_v)
    dis = [np.linalg.norm(gibbs_sample - k_center[0]) for gibbs_sample in data_g]
    his = np.histogram(dis) #, bins=[0, 1, 2, 3]
    # Counts are drawn against the left edge of each bin.
    plt.plot(his[1][0:-1], his[0])
    plt.show()

4


In [68]:
# Mean and variance of the distance between Gibbs samples and the k-means
# centre, collected once per entry of the step schedule below.
s_num = 10000
mean_list = []
var_list = []

# The two original copy-pasted loops are merged into one schedule, and the
# inner sample loop no longer reuses (and clobbers) the outer loop variable.
sweep_steps = list(range(0, 1001, 100)) + list(range(1084, 11925, 542))
for sweep_step in sweep_steps:
    # NOTE(review): sweep_step is never used to choose a checkpoint; every pass
    # re-samples the same model (a_2, b_2, w_2). The schedule looks like epoch
    # numbers, so presumably a per-epoch parameter file was meant to be loaded
    # here — confirm before trusting mean.pdf / var.pdf as a training curve.
    data_g, distr_2 = avg_distr(a_2, b_2, w_2, s_num, init_v)
    dis_D = [np.linalg.norm(gibbs_sample - k_center[0]) for gibbs_sample in data_g]
    mean_list.append(np.mean(dis_D))
    var_list.append(np.var(dis_D))
# The same statistics can be obtained for (a_1, b_1, w_1) or (a_K, b_K, w_K)
# by substituting them in the avg_distr call above.

plt.plot(mean_list)
plt.savefig('mean.pdf')
# Start from an empty figure so the variance curve is saved on its own.
plt.clf()
plt.plot(var_list)
plt.savefig('var.pdf')

In [24]:
# Set-up for the convergence experiment in the next cell.
# trail_num: number of chain lengths tried (one per column of the matrices).
trail_num = 5
# test_per_trail: number of repeated runs per chain length (one per row).
test_per_trail = 10
# Filled with the chain lengths by the next cell.
sample_nums = np.array([])
# One result matrix per parameter set (a_1, a_2, a_K), shape (test_per_trail, trail_num).
dis_matrix1 = np.zeros((test_per_trail, trail_num))
dis_matrix2 = np.zeros((test_per_trail, trail_num))
dis_matrixK = np.zeros((test_per_trail, trail_num))
# Seeds numpy's global random state from this point on; cells above run unseeded.
np.random.seed(0)

In [25]:
# Distance between the mean Gibbs sample and the training-set mean, for chain
# lengths 10**1 .. 10**trail_num and test_per_trail repetitions each.

# avg_distr returns (chain, average), and dis_D now holds a list of scalar
# distances, so the former `dis_D - distr_X` could not be evaluated. The
# reference is the per-pixel mean of the training set, matching the title of
# the plot built from these matrices.
# NOTE(review): confirm the training-set mean is the intended reference.
train_mean = np.average(Data_v, 0)

# Rebuild the x-axis here so that re-running this cell does not keep
# appending to sample_nums and desynchronise it from the matrix columns.
sample_nums = np.array([pow(10, exponent+1) for exponent in range(trail_num)], dtype=float)

for i in range(trail_num):
    print(i)
    s_num = pow(10, i+1)
    for j in range(test_per_trail):
        _, distr_1 = avg_distr(a_1, b_1, w_1, s_num, init_v)
        _, distr_2 = avg_distr(a_2, b_2, w_2, s_num, init_v)
        _, distr_K = avg_distr(a_K, b_K, w_K, s_num, init_v)
        # L1 alternative: np.sum(np.abs(train_mean - distr_X))
        dis_matrix1[j, i] = np.linalg.norm(train_mean-distr_1)
        dis_matrix2[j, i] = np.linalg.norm(train_mean-distr_2)
        dis_matrixK[j, i] = np.linalg.norm(train_mean-distr_K)

0
1
2
3
4


In [27]:
# Final figure: mean distance versus chain length, with min/max envelopes.
plt.clf()
plot_distance(sample_nums, dis_matrix1, 'CD_1')
#plot_distance(sample_nums, dis_matrix2, 'CD_2')
plot_distance(sample_nums, dis_matrixK, 'CD_1K')
plt.xlabel('Sampling Number')
plt.ylabel('Euclidean Distance')
plt.title('Euclidean Distance Between Mean of Samples and Training Set')
# grid() expects a boolean; the 'on'/'off' string form is deprecated in matplotlib.
plt.grid(True)
# Save before show(): the figure may be gone once show() returns.
plt.savefig('distr.pdf')
plt.show()