In [2]:
import numpy as np
import mlgrad.model as model
import mlgrad.loss as loss
import mlgrad.func as func
import mlgrad.distance as distance
import mlgrad.avragg as avragg
import mlgrad.gd as gd
# import mlgrad.regular as regular
import mlgrad.weights as weights
import mlgrad.mlocation_scatter as mlocation_scatter

from mlgrad import averager_it, averager_fg, fg, erm_fg, sg, erm_sg, irgd, erm_irgd, erisk

#import sklearn.linear_model as sklm

import rdata

import pandas as pd
import matplotlib.pyplot as plt
# The bundled seaborn style sheets were renamed to "seaborn-v0_8-*" in
# matplotlib 3.6 and the old names were removed in a later release, so use
# whichever spelling this matplotlib installation actually provides.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn-', 'seaborn-v0_8-', 1)
    for style in ['seaborn-notebook', 'seaborn-white', 'seaborn-ticks']
])

ModuleNotFoundError: No module named 'rdata'

In [None]:
# Parse the R data file, then convert the parsed structure to Python objects.
parsed = rdata.parser.parse_file("data/HRstars.rda")
data = rdata.conversion.convert(parsed)
# Keep the 'HRstars' entry of the converted result and preview its first ten rows.
# NOTE(review): later cells index it by column name and use `.values`/`.keys()`,
# so it is presumably a pandas DataFrame — confirm.
hr = data['HRstars']
hr[:10]

In [None]:
# Histogram of the 'Para' column in 50 bins.
plt.hist(hr['Para'], bins=50)
plt.xlabel('Para')
plt.ylabel('Count')
# Ending with plt.show() (as the other plotting cells do) keeps the cell from
# echoing the (counts, bin edges, patches) tuple that plt.hist returns.
plt.show()

In [None]:
# Scatter of hr['BV'] against hr['V'], with points coloured by their BV value.
plt.figure(figsize=(15,12))
plt.scatter(hr['V'], hr['BV'], c=hr['BV'], s=9, cmap=plt.cm.nipy_spectral)
plt.xlabel('V')
# The y data is hr['BV'], so the axis is labelled 'BV' rather than 'Uncert'.
plt.ylabel('BV')
plt.ylim(-0.6,2.5)
plt.minorticks_on()
plt.show()

In [None]:
# Work on the HRstars table under a shorter alias.
df = hr

# Predictor (column "BV") and response (column "V") pulled out as arrays.
X = df["BV"].values
Y = df["V"].values

# Predictor reshaped to one row per sample and a single column.
Xs = X.reshape(-1, 1)

In [None]:
# Quick look at the raw data: V plotted against BV.
plt.figure(figsize=(6.0, 5.0))
plt.scatter(X, Y, s=1, linewidths=2.0)
plt.xlabel('BV')
plt.ylabel('V')
plt.grid(True)
plt.minorticks_on()
plt.show()

In [None]:
# Two losses, each an ErrorLoss wrapping a different base function
# (func.Square() and func.Absolute()).
# NOTE(review): presumably squared-error and absolute-error losses — confirm in mlgrad.
loss_func_sq = loss.ErrorLoss(func.Square())
loss_func_abs = loss.ErrorLoss(func.Absolute())

In [None]:
#regular_func = regular.Square()

In [None]:
# Model used for the first fit.
# NOTE(review): the argument 1 is presumably the number of input features
# (Xs has a single column) — confirm against mlgrad.model.LinearModel.
mod1 = model.LinearModel(1)
# Called before fitting; presumably initialises the model parameters — confirm.
mod1.init_param()

In [None]:
# Risk object for mod1 on (Xs, Y) with loss_func_sq, handed to erm_fg.
er1 = erisk(Xs, Y, mod1, loss_func_sq)
alg_fg1 = erm_fg(
    er1,
    h=0.01,
    tol=1.0e-6,
    verbose=0,
)
print("Completed?", alg_fg1.completed)

# Plot the values recorded in alg_fg1.lvals during the run.
plt.plot(alg_fg1.lvals)
plt.show()

In [None]:
# Second, independent model instance, used for the reweighted fit.
# NOTE(review): the argument 1 is presumably the number of input features
# (Xs has a single column) — confirm against mlgrad.model.LinearModel.
mod2 = model.LinearModel(1)
# Called before fitting; presumably initialises the model parameters — confirm.
mod2.init_param()

In [None]:
# Risk object for mod2 and the inner fg procedure that works on it.
er2 = erisk(Xs, Y, mod2, loss_func_sq)
alg_fg2 = fg(er2, h=0.001, tol=1.0e-6)

# Averaging function built from QuantileFunc(0.5, Sqrt(0.001)), wrapped in
# MWAverage, and the weights object that couples it to the risk.
avg2 = avragg.MWAverage(
    averager_it(func.QuantileFunc(0.5, func.Sqrt(0.001)))
)
wg2 = weights.MWeights(avg2, er2)

# Outer procedure combining the inner algorithm with the weights.
irgd2 = erm_irgd(alg_fg2, wg2, n_iter=21, verbose=0)

# Side-by-side traces: irgd2.lvals on the left, irgd2.n_iters on the right.
plt.figure(figsize=(10, 4))
for panel, series in enumerate((irgd2.lvals, irgd2.n_iters), start=1):
    plt.subplot(1, 2, panel)
    plt.plot(series)
plt.show()

In [None]:
# Data scatter coloured by the 'Para' column, with both fitted lines on top.
plt.figure(figsize=(10.0, 8.0))
plt.title('HRstars dataset')
sc = plt.scatter(X, Y, s=1, linewidths=2., label='data', c=df['Para'], cmap=plt.cm.gist_rainbow)
plt.colorbar(sc, label='Para')

# Evaluate both models on a 20-point grid spanning the observed X range;
# the grid is reshaped once to the (n, 1) layout passed to evaluate_all.
X0 = np.linspace(X.min(), X.max(), 20)
X0s = X0.reshape(-1, 1)
plt.plot(X0, mod1.evaluate_all(X0s), color='k', label=r'LS')
plt.plot(X0, mod2.evaluate_all(X0s), color='k', linewidth=1., marker='*', markersize=8, label=r'WM')

plt.minorticks_on()
plt.grid(1)
# X holds the "BV" column and Y the "V" column, so label the axes accordingly.
plt.xlabel('BV')
plt.ylabel('V')
plt.xlim(-0.6, 2.6)
plt.ylim(-1, 20)
plt.legend(loc='best')
plt.show()

In [None]:
# Absolute residuals of each model on the training data.
res1 = np.abs(Y - mod1.evaluate_all(Xs))
res2 = np.abs(Y - mod2.evaluate_all(Xs))

# Sort each residual array in place and draw it as its own curve.
for residuals, tag in ((res1, '1'), (res2, '2')):
    residuals.sort()
    plt.plot(residuals, label=tag)
plt.legend()
plt.show()

In [None]:
# NOTE(review): bare arithmetic whose operands are not tied to anything in the
# visible cells — looks like scratch work; confirm whether it can be dropped.
37/47

In [None]:
# Column labels of hr, leaving out the first one.
names = list(hr.keys()[1:])
# Feature matrix: those columns except the last, placed side by side.
XX = np.column_stack([hr[col].values for col in names[:-1]])

In [None]:
# Averaging function built from QuantileFunc(0.9, Sqrt(0.001)), wrapped in MWAverage.
avg3 = avragg.MWAverage(
    averager_it(func.QuantileFunc(0.9, func.Sqrt(0.001)))
)

# Location/scatter estimator driven by that average, fitted on XX.
alg = mlocation_scatter.MLocationScatterEstimator(avg3, n_iter=20)
alg.fit(XX)

In [None]:
# Plot the values stored in alg.dvals after fitting.
# NOTE(review): presumably a per-iteration convergence trace — confirm in mlgrad.
plt.plot(alg.dvals)
plt.show()

In [None]:
# Location and scatter taken from the fitted estimator, and the Mahalanobis
# distance built from the inverse of that scatter matrix.
# NOTE(review): `.base` presumably unwraps a memoryview to its underlying
# array — confirm.
loc = alg.loc.base
S = alg.S.base
distfunc = distance.MahalanobisDistance(np.linalg.inv(S))

# The same pair computed by the standard_location / standard_covariance
# helpers, with its own Mahalanobis distance.
loc0 = mlocation_scatter.standard_location(XX)
S0 = mlocation_scatter.standard_covariance(XX, loc0)
distfunc0 = distance.MahalanobisDistance(np.linalg.inv(S0))

In [None]:
# Distance of every row of XX from each of the two location estimates.
D = [distfunc(row, loc) for row in XX]
D0 = [distfunc0(row, loc0) for row in XX]

# Distances from distfunc0 on x against distances from distfunc on y.
# The axis labels are Russian for "Classical distance" / "Robust distance".
plt.scatter(D0, D, c='w', edgecolor='k', linewidth=1)
plt.xlabel('Классическое расстояние')
plt.ylabel('Робастное расстояние')
plt.xlim(0, 10)
plt.ylim(0, 5)
plt.minorticks_on()
# Disabled title/savefig lines; they reference `m` and `q`, which no visible cell defines:
# plt.title('Multivariate normal (m=%s) with %s%% outliers' % (m, q))
# plt.savefig('fig/robust_clustering_dran_multinormal_distance_%s_%s.eps' % (m, q))
plt.show()

In [None]:
# Averaging function built from QuantileFunc(0.5, Sqrt(0.001)), wrapped in MWAverage.
avg4 = avragg.MWAverage(
    averager_it(func.QuantileFunc(0.5, func.Sqrt(0.001)))
)

# Multi-location/scatter estimator constructed with second argument 3, fitted on XX.
alg2 = mlocation_scatter.MLocationsScattersEstimator(avg4, 3)
alg2.fit(XX)

In [None]:
# Plot the values stored in alg2.dvals after fitting.
# NOTE(review): presumably a per-iteration convergence trace — confirm in mlgrad.
plt.plot(alg2.dvals)
plt.show()

In [None]:
# Distances returned by the fitted multi-location estimator, plotted in ascending order.
# NOTE(review): this rebinds `D`, which an earlier cell used for the list of
# distfunc distances — confirm the reuse is intended.
D = alg2.calc_distances()
plt.plot(sorted(D))
# plt.ylim(0,200)
plt.show()

In [None]:
# Print the inverse of each matrix in alg2.scatters.
# The loop variable is deliberately not called `S`: that name already holds the
# scatter matrix taken from `alg`, and rebinding it here would silently change
# what the Mahalanobis-distance cell computes if it were re-run afterwards.
for scatter_matrix in alg2.scatters:
    print(np.linalg.inv(scatter_matrix))