In [None]:
# NumPy/PreProcess variant of the data loading step.
# numpy is imported here so this cell runs on a fresh kernel (np is used just below).
import numpy as np

try:
    # Not used in this cell; guarded because scikit-learn 1.2 removed load_boston
    # and the bare import would otherwise abort the whole cell.
    from sklearn.datasets import load_boston
except ImportError:
    load_boston = None
import PreProcess as pp

np.set_printoptions(precision=5, suppress=True, linewidth=500)

# Load the Boston housing data; every field is read as a string and converted later.
data = np.loadtxt('./housing.data', dtype=str)
# Hand columns 0-12 except 3 and 8 to the project helper pp.scaling.
# NOTE(review): presumably this standardizes the columns - confirm in PreProcess.
x = pp.scaling(data, [0,1,2,4,5,6,7,9,10,11,12])
# Disabled alternative that would also append dummy columns for column 3:
#x = np.hstack((x, pp.to_dummy(data, 3, ['0','1'])))
# Append element [1] of pp.to_dummy's result for column 8 as extra feature columns.
# NOTE(review): presumably the one-hot matrix - confirm against PreProcess.to_dummy.
x = np.append(x, pp.to_dummy(data, [8])[1], axis=1)
# The last column of the file is the regression target.
y = data[:, -1].astype(float)
print(pp.describe(x, list(range(11))))

In [None]:
from sklearn.model_selection import ShuffleSplit

# One shuffled hold-out split: 70% of the rows for training, the rest for testing.
# A fixed random_state keeps the partition identical between runs.
ss = ShuffleSplit(n_splits=1, train_size=0.7, random_state=0)
# split() yields (train_indices, test_indices) pairs; take the single pair directly
# instead of unpacking the generator itself into two variables.
train_idx, test_idx = next(ss.split(x, y))
x_train, x_test = x[train_idx], x[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [None]:
import numpy as np

In [None]:
import pandas as pd

# Column names of housing.data mapped to the dtype each column is parsed as.
# RAD is read as object so it can be one-hot encoded below.
columns = {'CRIM':float, 'ZN':float, 'INDUS':float, 'CHAS':int, 'NOX':float, 'RM':float, 'AGE':float, 'DIS':float, 'RAD':object, 'TAX':int, 'PTRATIO':float, 'B':float, 'LSTAT':float, 'TARGET':float}
# The separator is a regex, so it is written as a raw string: '\s' is not a valid
# escape sequence in a normal string literal and triggers a warning.
df = pd.read_csv('./housing.data', header=None, sep=r'\s+', na_values='na', names=list(columns), dtype=columns)
# Regression target.
y = df['TARGET']
# Columns kept out of standardization: CHAS is used as-is, RAD is one-hot encoded.
tp = df[['CHAS','RAD']]
df = df.drop(['TARGET','CHAS','RAD'], axis=1)
# Standardize the remaining columns (population standard deviation, ddof=0).
df = (df - df.mean()) / df.std(ddof=0)
tp = pd.get_dummies(tp, dummy_na=False, columns=['RAD'])
# Final feature table: standardized columns followed by CHAS and the RAD dummies.
df = pd.concat([df,tp], axis=1, sort=False)
df.head()

In [None]:
from sklearn.model_selection import ShuffleSplit

# Convert features and target to float arrays. np.asarray accepts both a DataFrame/Series
# and an ndarray, so re-running this cell is safe (calling .to_numpy() a second time
# would fail because df/y are already arrays by then). dtype=float also avoids an
# object-dtype matrix when the frame mixes float, int and dummy columns.
df = np.asarray(df, dtype=float)
y = np.asarray(y, dtype=float)

# One shuffled hold-out split: 70% of the rows for training, the rest for testing.
ss = ShuffleSplit(n_splits=1, train_size=0.7, random_state=0)
train_idx, test_idx = next(ss.split(df, y))
x_train, x_test = df[train_idx], df[test_idx]
t_train, t_test = y[train_idx], y[test_idx]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn import tree

# Candidate regressors as (short name, estimator) pairs. The short names are the
# keys used to look up each model's entry in grid_params.
# The two randomized estimators (bagging, SGD) get a fixed random_state so that
# their scores are reproducible from run to run.
estimators = [
    ('LNR', LinearRegression()),
    ('RDG', Ridge()),
    ('LAS', Lasso()),
    ('SVR', SVR(kernel='linear')),
    ('RBF', SVR(kernel='rbf')),
    ('BAG', BaggingRegressor(tree.DecisionTreeRegressor(), random_state=0)),
    ('SGD', SGDRegressor(max_iter=1000, random_state=0))
]

In [None]:
# Hyper-parameter grid per estimator, keyed by the estimator's short name.
# Every grid is currently empty; each entry is its own dict so one model's grid
# can be filled in without touching the others.
grid_params = dict(
    LNR={},
    RDG={},
    LAS={},
    SVR={},
    RBF={},
    BAG={},
    SGD={},
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import MLUtils as ut

# Optional pipeline stages; both are switched off here.
scaler = None  # disabled: ut.scaler(0)
reductor = None  # disabled: ut.reductor(0)

# Maps (estimator name, metric name) -> metric value.
scores = {}
for name, pipeline in ut.create_pipelines(estimators, scaler, reductor).items():
    # Training: 3-fold grid search over this estimator's grid, selected by R^2.
    search_space = ut.get_params(pipeline, grid_params[name])
    est = GridSearchCV(
        pipeline,
        search_space,
        cv=3,
        scoring='r2',
        return_train_score=False,
        n_jobs=-1,
    )
    est.fit(x_train, t_train)

    # Predictions on the training and the held-out data.
    train_pred = est.predict(x_train)
    test_pred = est.predict(x_test)

    # Scores (train first, then test), recorded in that order.
    scores.update({
        (name, 'train_mse'): mean_squared_error(t_train, train_pred),
        (name, 'train_r2'): r2_score(t_train, train_pred),
        (name, 'test_mse'): mean_squared_error(t_test, test_pred),
        (name, 'test_r2'): r2_score(t_test, test_pred),
    })

# Show the scores, one (name, metric) entry per line.
for key, value in scores.items():
    print(key, value)

In [None]:
from sklearn.metrics import mean_absolute_error

# Sanity check: plain linear regression scored on its own training data.
# t_train is the target that was split together with x_train; y_train belongs to the
# separate NumPy/PreProcess split and only lines up with x_train by coincidence.
a = LinearRegression()
a.fit(x_train, t_train)
pred = a.predict(x_train)
print(r2_score(t_train, pred))
print(mean_squared_error(t_train, pred))
print(mean_absolute_error(t_train, pred))

In [None]:
import Evaluation as ev

# Same three metrics computed with the project's own Evaluation module, for comparison
# with the scikit-learn values. Scored against t_train, the target that was split
# together with the x_train used to produce pred.
print(ev.R2(t_train, pred))
print(ev.MSE(t_train, pred))
print(ev.MAE(t_train, pred))