In [None]:
import pandas as pd
import numpy as np
import pickle as pkl

from sklearn import linear_model
from sklearn.metrics  import mean_squared_error as mse

from bokeh.plotting import figure, show, output_notebook
from bokeh.charts import Scatter, show
output_notebook()

# with open('dataframe.pkl', 'rb') as f:
#     df = pkl.load(f)
    
# Names of the model variants referred to in this notebook.
models = [
    'regr',
    'regr_ey',
    'regr_log',
    'nb1NN',
    'nb2NN',
    'nb3NN',
]

In [None]:
# Read three pickled objects from the messy_data folder.
# NOTE: unpickling can run arbitrary code, so only open files you trust.
path = '../messy_data/'

with open(path + 'df.pkl', 'rb') as regr_file:
    df_regr = pkl.load(regr_file)

with open(path + 'dataframeALL_regr_ey.pkl', 'rb') as ey_file:
    df_ey = pkl.load(ey_file)

with open(path + 'dataframeALL_regr_log.pkl', 'rb') as log_file:
    df_log = pkl.load(log_file)

In [None]:
# Read the pickled inputs stored in the pkl_data folder.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load files that come from a trusted source.
path = '../pkl_data/'

with open(path+'avg_data.pkl', 'rb') as f:
    # Indented with spaces like the rest of the notebook (this line used a tab).
    avg_data = pkl.load(f)

with open(path+'ind_models.pkl', 'rb') as f:
    ind_models = pkl.load(f)
    
# Linear model whose parameters are assigned by hand instead of being fitted:
# intercept 0 and a single coefficient of 1.
zero = linear_model.LinearRegression()
zero.intercept_ = 0
zero.coef_ = np.array([1])

In [None]:
# Load one pickled error object per model (df_err_<model>) plus the
# "nomodel" one. Per the original note these hold the differences with and
# without the given model -- TODO confirm against the code that wrote them.
#
# The directory is assigned here as well, so this cell no longer depends on
# which of the earlier loading cells was the last one to set `path`.
# NOTE(security): pickle.load can execute arbitrary code; load trusted files only.
path = '../pkl_data/'

with open(path+'df_err_regr.pkl', 'rb') as f:
    df_err_regr = pkl.load(f)
with open(path+'df_err_regr_ey.pkl', 'rb') as f:
    df_err_regr_ey = pkl.load(f)
# with open(path+'df_err_regr_log.pkl', 'rb') as f:
#     df_err_regr_logx = pkl.load(f)
with open(path+'df_err_nb1NN.pkl', 'rb') as f:
    df_err_nb1NN = pkl.load(f)
with open(path+'df_err_nb2NN.pkl', 'rb') as f:
    df_err_nb2NN = pkl.load(f)
with open(path+'df_err_nb3NN.pkl', 'rb') as f:
    df_err_nb3NN = pkl.load(f)
with open(path+'df_err_nomodel.pkl', 'rb') as f:
    df_err_nomodel = pkl.load(f)

In [None]:
def plot_coef(models, title, name):
    """Scatter-plot intercept against first coefficient for a set of models.

    Every element of ``models.values`` must expose ``intercept_`` and
    ``coef_`` (the original note points out that the k-NN models do not,
    because they are not linear models). After showing the figure, the
    standard deviations of the intercepts and of the coefficients are printed.

    Parameters
    ----------
    models : object with a ``.values`` attribute
        Container of the models (described as a DataFrame in the original
        note).
    title : str
        Title of the figure.
    name : str
        Not used by this function.
    """
    fitted = models.values
    intercepts = [m.intercept_ for m in fitted]
    first_coefs = [m.coef_[0] for m in fitted]

    plot = figure(title=title)
    plot.xaxis.axis_label = 'Intercept'
    plot.yaxis.axis_label = 'Coefficient'
    plot.scatter(x=intercepts, y=first_coefs, color='navy', size=6, alpha=.2)
    show(plot)

    print('intercept standard deviation: ', np.std(intercepts))
    print('coefficient standard deviation: ', np.std(first_coefs))
    
def mse_count(df, model):
    """Return the mean over all rows of ``(d_A**2 + d_B**2) / 2``.

    Parameters
    ----------
    df : DataFrame-like
        Must contain the columns ``model + ' d_A'`` and ``model + ' d_B'``.
    model : str
        Prefix of the two column names.

    Raises
    ------
    ZeroDivisionError
        If the columns are empty.
    """
    errors_a = df[model + ' d_A'].values
    errors_b = df[model + ' d_B'].values

    # Accumulate row by row, in row order.
    running_total = 0
    for err_a, err_b in zip(errors_a, errors_b):
        running_total = running_total + (err_a * err_a + err_b * err_b) / 2
    return running_total / len(errors_a)

def mse_count_dAB(df, model, d='A'):
    """Return the mean of the squared values in column ``model + ' d_' + d``.

    Parameters
    ----------
    df : DataFrame-like
        Must contain the column ``model + ' d_' + d``.
    model : str
        Prefix of the column name.
    d : str, optional
        Suffix selecting the column; defaults to ``'A'``.

    Raises
    ------
    ZeroDivisionError
        If the column is empty.
    """
    errors = df[model + ' d_' + d].values

    # Accumulate value by value, in row order.
    running_total = 0
    for err in errors:
        running_total = running_total + err * err
    return running_total / len(errors)

def ind_mse_plot(df, model, n=31):
    """Scatter-plot one mean squared error value per block of ``n`` rows.

    The rows of ``df`` are taken in order and cut into consecutive blocks of
    ``n`` rows (presumably one block per participant -- TODO confirm against
    how ``df`` is built). For every complete block the plotted value is the
    mean of ``(d_A**2 + d_B**2) / 2`` over the rows of that block. Trailing
    rows that do not fill a whole block are ignored.

    Parameters
    ----------
    df : DataFrame-like
        Must contain the columns ``model + ' d_A'`` and ``model + ' d_B'``.
    model : str
        Prefix of the two column names; also used in the figure title.
    n : int, optional
        Number of rows per block. Defaults to 31, the value that used to be
        hard-coded.

    Raises
    ------
    ValueError
        If ``n`` is not a positive number.
    """
    if n <= 0:
        raise ValueError('n must be a positive integer')

    d_a = np.asarray(df[model+' d_A'].values, dtype=float)
    d_b = np.asarray(df[model+' d_B'].values, dtype=float)

    # Square the raw differences exactly once; squaring them both when
    # building the lists and again inside the loop produced fourth powers.
    row_err = (d_a**2 + d_b**2) / 2

    n_blocks = len(row_err) // n
    # Block i covers rows [i*n, (i+1)*n). The earlier slice [i*n : i*(n+1)]
    # was empty for i == 0 and never spanned n rows.
    block_mse = row_err[:n_blocks*n].reshape(n_blocks, n).mean(axis=1)

    f = figure(title="Individual MSE for "+model)
    f.xaxis.axis_label = 'Block index'
    f.yaxis.axis_label = 'MSE'
    # One glyph call with explicit x and y, instead of one call per block
    # that supplied only x.
    f.scatter(x=list(range(n_blocks)), y=block_mse.tolist(),
              color='navy', size=6, alpha=.3)
    show(f)


In [None]:
# Intercept vs. coefficient scatter for the models stored under 'regr'.
plot_coef(
    models=ind_models['regr'],
    title='intercept / coef '+"regr",
    name="regr",
)

In [None]:
# Intercept vs. coefficient scatter for the models stored under 'regr_ey'.
plot_coef(
    models=ind_models['regr_ey'],
    title='intercept / coef '+"ey",
    name='regr_ey',
)

In [None]:
# Print the mean of (d_A^2 + d_B^2)/2 for each frame / column-prefix pair.
for df, model in (
    (df_regr, 'regr'),
    (df_ey, 'regr_ey'),
    (df_log, 'regr_log'),
):
    print('MSE '+model+': ', mse_count(df, model))

In [None]:
# Print the mean of d_A^2 for each frame / column-prefix pair.
for df, model in (
    (df_regr, 'regr'),
    (df_ey, 'regr_ey'),
    (df_log, 'regr_log'),
):
    print('MSE '+model+': ', mse_count_dAB(df, model, d='A'))

In [None]:
# Print the mean of d_B^2 for each frame / column-prefix pair.
for df, model in (
    (df_regr, 'regr'),
    (df_ey, 'regr_ey'),
    (df_log, 'regr_log'),
):
    print('MSE '+model+': ', mse_count_dAB(df, model, d='B'))

Wyniki uzyskane w lutym (średnie d_A^2)
MSE regr:     1178.05986567
MSE regr_ey:  3200.93215805
MSE regr_log: 473686394475.0
MSE nb1NN:    2111.55143449
MSE nb2NN:    1618.70770597
MSE nb3NN:    1432.190942
MSE zero:     1970.91485889

In [None]:
# MSE of the fit of the individual models for every participant
# (also with leave-one-out validation).
# mean(mse(i) for i in participants) = mse(participants)

def model_mse(models):
    """Return the MSE between the models' predictions and the averaged answers.

    Model number ``i`` predicts from the single value
    ``ind_models['remain'].values[i]``. The predictions are compared with
    ``avg_data['mean']`` repeated ``len(remains) // 31`` times (31 is
    presumably the number of rows per participant -- TODO confirm).
    Original note: mse of A-1(y_A) compared with the average answers.

    Parameters
    ----------
    models : iterable
        Objects with a ``predict`` method; at most as many as there are
        entries in ``ind_models['remain']``.

    Returns
    -------
    The value returned by ``mse`` (scikit-learn's ``mean_squared_error``).
    """
    remains = ind_models['remain'].values
    predictions = []
    for i, lin_mod in enumerate(models):
        # scikit-learn estimators expect a 2-D (n_samples, n_features) input,
        # so the scalar is wrapped as one sample with one feature; a bare
        # float is rejected by current scikit-learn versions.
        predicted = lin_mod.predict(np.array([[float(remains[i])]]))
        # Flatten so that both (1,) and (1, 1) outputs yield a single number.
        predictions.append(float(np.ravel(predicted)[0]))
    reference = list(avg_data['mean']) * (len(remains) // 31)
    # mean_squared_error takes (y_true, y_pred); the value is symmetric in
    # its two arguments, so the result is unchanged.
    return mse(reference, predictions)

print('model mse regr:    ', model_mse(ind_models['inv regr'].values))
print('model mse regr_ey: ', model_mse(ind_models['inv regr_ey'].values))
# One copy of `zero` per entry of ind_models['remain'], instead of the
# hard-coded count 7564, so the call follows the size of the loaded data.
print('model mse zero:    ', model_mse(np.array([zero]*len(ind_models['remain']))))

# print("ind MSE regr standard deviation: ", np.std(ind_mse_regr))
# print("ind MSE regr_ey standard deviation: ", np.std(ind_mse_regr_ey))
# print("ind MSE regr_ey standard deviation: ", np.std(ind_mse_regr_ey))


In [None]:
# TODO: this call runs seemingly forever (never finishes).
ind_mse_plot(df=df_regr, model='regr')