In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Print numpy arrays with three digits after the decimal point.
np.set_printoptions(precision=3)

# Location of the observed data set, relative to the project root.
ROOT = ".."
filename = f"{ROOT}/data_calculated/x5_sin.csv"

# Column names: explanatory variables and the regression target.
DESCRIPTOR_NAMES = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6']
TARGET_NAME = "y"

# Read the table and split it into a raw (unscaled) descriptor matrix
# and a target vector.
df_obs = pd.read_csv(filename)
Xraw = df_obs[DESCRIPTOR_NAMES].to_numpy()
y = df_obs[TARGET_NAME].to_numpy()


In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the raw descriptors and keep it around so the same
# transformation can be applied to other data later.
# NOTE(review): the scaler is fitted on every row of Xraw, i.e. also on rows
# that the cross-validation below holds out as test folds.
scaler = StandardScaler().fit(Xraw)
X = scaler.transform(Xraw)


In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
# --- Cross-validation settings ---
N_SPLITS = 5
SHUFFLE = True
score_function = r2_score
# Regularisation strengths to scan: 20 log-spaced values from 1e-5 to 1e-1.
alpha_list = np.logspace(-5, -1, 20)

# KFold refuses a random_state when shuffle is off, so the seed is only
# passed while SHUFFLE is enabled. The splitter does not depend on alpha and
# is therefore built once; with a fixed integer seed each kf.split(X) call
# produces the same folds.
kf = KFold(n_splits=N_SPLITS, shuffle=SHUFFLE,
           random_state=1 if SHUFFLE else None)

# Containers for the per-alpha summaries; each entry is [mean, std, alpha].
train_score_list = []
test_score_list = []

# Hyperparameter loop
for alpha in alpha_list:
    reg = Lasso(alpha=alpha)
    # CV loop: per-fold scores for this alpha
    cv_train_score_list = []
    cv_test_score_list = []
    for train, test in kf.split(X):
        Xtrain, ytrain = X[train], y[train]
        Xtest, ytest = X[test], y[test]
        reg.fit(Xtrain, ytrain)
        # Score the fold on the data it was fitted to and on the held-out part.
        cv_train_score_list.append(score_function(ytrain, reg.predict(Xtrain)))
        cv_test_score_list.append(score_function(ytest, reg.predict(Xtest)))
    # Summarise the folds as mean and standard deviation for this alpha.
    train_score_list.append([np.mean(cv_train_score_list),
                             np.std(cv_train_score_list), alpha])
    test_score_list.append([np.mean(cv_test_score_list),
                            np.std(cv_test_score_list), alpha])


In [None]:
# Wrap the per-alpha CV summaries in DataFrames with explicit column names.
df_train_score = pd.DataFrame(
    train_score_list,
    columns=["mean(R2)_train", "std(R2)_train", "alpha"],
)
df_test_score = pd.DataFrame(
    test_score_list,
    columns=["mean(R2)_test", "std(R2)_test", "alpha"],
)
# Join train and test summaries into one table keyed on alpha.
df_score = pd.merge(df_train_score, df_test_score, on="alpha")


In [None]:
# Display the merged train/test CV score table (train and test columns joined on alpha).
df_score

In [None]:
from regression_misc import plot_alpha_yerror
# Visualise the CV score table with the project helper from regression_misc.
# NOTE(review): the helper is not shown here; presumably it plots the mean R2
# against alpha with the std as error bars — confirm.
plot_alpha_yerror(df_score)


In [None]:
# Pick the alpha with the highest mean held-out R2. idxmax() returns the row
# label, which is what .loc expects; a positional argmax only coincides with
# the label while the index is the default 0..n-1 range.
imax = df_score["mean(R2)_test"].idxmax()
alpha_opt = df_score.loc[imax, "alpha"]
print("alpha_opt", alpha_opt)

# Refit on the complete standardized data set with the selected alpha.
reg = Lasso(alpha=alpha_opt)
reg.fit(X, y)
print(reg.coef_, reg.intercept_)
# In-sample predictions of the refitted model.
yp = reg.predict(X)


In [None]:
from regression_misc import plot_y_yp

# Re-run the K-fold split with the selected alpha and collect, for every
# fold, the held-out targets and their predictions.
# A dedicated estimator (reg_cv) is used so that `reg` — the model fitted on
# the full data set — is not replaced by a model trained on only the last
# fold's training part; later cells call reg.predict on new data.
reg_cv = Lasso(alpha=alpha_opt)
# KFold refuses a random_state when shuffle is off, so only seed when shuffling.
kf = KFold(n_splits=N_SPLITS, shuffle=SHUFFLE,
           random_state=1 if SHUFFLE else None)
ytest_list = []
ytestp_list = []
# CV loop
for train, test in kf.split(X):
    # Fit on the training part of the fold, predict the held-out part.
    reg_cv.fit(X[train], y[train])
    ytest_list.append(y[test])
    ytestp_list.append(reg_cv.predict(X[test]))

# Visualisation: observed vs. predicted values of the held-out folds.
plot_y_yp(ytest_list, ytestp_list)


In [None]:
# Load the additional data set from the same ROOT-relative data directory
# that the observed data was read from.
filename_new = f"{ROOT}/data_calculated/x5_sin_new.csv"
df_new = pd.read_csv(filename_new)
Xraw_new = df_new.loc[:, DESCRIPTOR_NAMES].values
ynew = df_new.loc[:, TARGET_NAME].values
# Apply the already-fitted scaler (transform only, no refit) so the new
# descriptors are on the same scale as the training descriptors.
Xnew = scaler.transform(Xraw_new)
# NOTE(review): `reg` is whatever estimator is currently bound to that name;
# make sure it is the intended fitted model before relying on ynewp.
ynewp = reg.predict(Xnew)


In [None]:
# Compare observed (ynew) and predicted (ynewp) targets of the new data set.
# NOTE(review): plot_y_yp comes from regression_misc and is not shown here;
# it is called with lists of arrays elsewhere and with plain arrays here —
# confirm it accepts both.
plot_y_yp(ynew, ynewp)


In [None]:
from regression_misc import plot_x1_y
# Pass the training data with its predictions (X, y, yp) and the new data
# with its predictions (Xnew, ynew, ynewp) to the project plotting helper.
# NOTE(review): plot_x1_y is defined in regression_misc (not shown);
# presumably it plots targets against the x1 descriptor — confirm.
plot_x1_y(X, y, yp, Xnew, ynew, ynewp )

In [None]:
from sklearn.linear_model import LassoCV
# Cross-check with scikit-learn's built-in search: LassoCV scans the same
# alpha grid using a 5-fold shuffled splitter seeded with 1.
# NOTE(review): LassoCV applies its own internal selection criterion, so
# alpha_ is not guaranteed to equal the R2-based alpha_opt — confirm.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=1)
reg = LassoCV(alphas=alpha_list, cv=kf).fit(X, y)
print("alpha", reg.alpha_)
print("coef", reg.coef_)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
# Repeat the alpha scan, delegating the fold loop to cross_val_score.
# The splitter and the scorer do not depend on alpha, so they are built once;
# the fixed integer seed makes every cross_val_score call use the same folds.
r2_scorer = make_scorer(r2_score)
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=1)

result = []
for alpha in alpha_list:
    reg = Lasso(alpha=alpha)
    # One held-out R2 value per fold for this alpha.
    fold_r2 = cross_val_score(reg, X, y, cv=kf, scoring=r2_scorer)
    result.append([alpha, fold_r2.mean(), fold_r2.std()])

# Summary table of held-out scores, one row per alpha, then plot it.
result_columns = ["alpha", "mean(R2)_test", "std(R2)_test"]
df_result = pd.DataFrame(result, columns=result_columns)
plot_alpha_yerror(df_result)
