# Sprint２課題 機械学習スクラッチ入門¶

In [20]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import random

from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDRegressor

## 【問題1】train_test_splitのスクラッチ
スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみます。以下の雛形をベースとして関数を完成させてください。

In [21]:
def scratch_train_test_split(X, y, train_size=0.8,):
    """
    検証用データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      学習データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    #ここにコードを書く
    X = np.array(X)
    y = np.array(y)
    i = random.sample(range(len(X)), k=round(len(X)*train_size))
    e = list(range(len(X)))
    w = list(set(e) - set(i))
    X_train = X_test = np.empty(X.shape[1])
    y_train = y_test = np.empty(())
    
    for a in i:
        X_train = np.vstack([X_train, X[a,:]])
        y_train = np.vstack([y_train, y[a]])
    for b in w:
        X_test = np.vstack([X_test, X[b,:]])
        y_test = np.vstack([y_test, y[b]])
        
    X_train = X_train[1:, :]
    y_train = y_train[1:, :]
    X_test = X_test[1:, :]
    y_test = y_test[1:, :]

    return X_train, X_test, y_train, y_test

In [22]:
data = load_iris()

In [23]:
iris_data = pd.DataFrame(data.data, columns=data.feature_names)
iris_data["species"] = data.target
iris_data = iris_data[iris_data["species"].isin(["1", "2"])]

In [24]:
X_iris = iris_data.drop("species", axis=1)
y_iris = iris_data["species"]

In [25]:
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]

f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))

X_sample1 = np.concatenate((f0, f1))
y_sample1 = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)

random_index = np.random.permutation(np.arange(n_samples))
X_sample1 = X_sample1[random_index]
y_sample1 = y_sample1[random_index]

In [26]:

X_sample2 = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y_sample2 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [27]:
X_train_iris, X_test_iris, y_train_iris, y_test_iris = scratch_train_test_split(X_iris, y_iris)

In [28]:
X_train_sample1, X_test_sample1, y_train_sample1, y_test_sample1 = scratch_train_test_split(X_sample1, y_sample1)

In [29]:
X_train_sample2, X_test_sample2, y_train_sample2, y_test_sample2 = scratch_train_test_split(X_sample2, y_sample2)


## 【問題2】 分類問題を解くコードの作成
上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

In [30]:
SGD = SGDClassifier(loss="log")
Dtree = DecisionTreeClassifier()
Svm = SVC()

In [31]:
SGD.fit(X_train_iris, y_train_iris.ravel())
Dtree.fit(X_train_iris, y_train_iris.ravel())
Svm.fit(X_train_iris, y_train_iris.ravel())



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [32]:
y_iris_SGD = SGD.predict(X_test_iris)
y_iris_Dtree = Dtree.predict(X_test_iris)
y_iris_Svm = Svm.predict(X_test_iris)

In [33]:
SGD_1 = SGDClassifier(loss="log")
Dtree_1 = DecisionTreeClassifier()
Svm_1 = SVC()

In [34]:
SGD_1.fit(X_train_sample1, y_train_sample1.ravel())
Dtree_1.fit(X_train_sample1, y_train_sample1.ravel())
Svm_1.fit(X_train_sample1, y_train_sample1.ravel())



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [35]:
y_sample1_SGD = SGD_1.predict(X_test_sample1)
y_sample1_Dtree = Dtree_1.predict(X_test_sample1)
y_sampel1_Svm = Svm_1.predict(X_test_sample1)

In [36]:
SGD_2 = SGDClassifier(loss="log")
Dtree_2 = DecisionTreeClassifier()
Svm_2 = SVC()

In [37]:
SGD_2.fit(X_train_sample2, y_train_sample2.ravel())
Dtree_2.fit(X_train_sample2, y_train_sample2.ravel())
Svm_2.fit(X_train_sample2, y_train_sample2.ravel())



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [46]:
#ロジスティック回帰の推定
y_sample2_SGD = SGD_2.predict(X_test_sample2)
print("ロジスティック回帰の推定",y_sample2_SGD)

#SVMの推定
y_sample2_Svm = Svm_2.predict(X_test_sample2)
print("SVMの推定",y_sample2_Svm)

#決定木の推定
y_sample2_Dtree = Dtree_2.predict(X_test_sample2)
print("決定木の推定",y_sample2_Dtree)

ロジスティック回帰の推定 [0. 0. 1. 0. 1. 0. 1. 0.]
SVMの推定 [0. 0. 1. 0. 1. 0. 1. 1.]
決定木の推定 [0. 0. 1. 0. 1. 0. 1. 0.]


## 【問題3】 回帰問題を解くコードの作成
線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [40]:
data_raw = pd.read_csv("./train.csv")

In [41]:
data = data_raw.copy(deep=True)

In [42]:
Target = data["SalePrice"]
X_house = data.loc[:, ["GrLivArea", "YearBuilt"]]
X_train_house, X_test_house, y_train_house, y_test_house = scratch_train_test_split(X_house, Target)
SGDR = SGDRegressor()

In [43]:
SGDR.fit(X_train_house, y_train_house.ravel())

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [45]:
y_house_SGDR = SGDR.predict(X_test_house)
print(y_house_SGDR)

[3.51367946e+14 4.02947950e+14 3.76713602e+14 3.21722363e+14
 3.26023821e+14 4.03857776e+14 4.30725921e+14 2.88391961e+14
 2.92389302e+14 4.61118383e+14 3.95791019e+14 3.17161769e+14
 4.08711289e+14 4.11004772e+14 3.17352080e+14 3.60449392e+14
 3.85744107e+14 4.41119943e+14 4.14198869e+14 2.74011528e+14
 4.59494736e+14 4.28516437e+14 4.07623075e+14 4.17843676e+14
 4.11879822e+14 3.88640318e+14 4.05917955e+14 3.34787240e+14
 3.98426105e+14 4.07149781e+14 4.07839219e+14 3.98941398e+14
 4.12112857e+14 4.42249162e+14 3.45579768e+14 3.73508307e+14
 4.32547828e+14 3.36546283e+14 3.75920026e+14 4.31566196e+14
 3.55925560e+14 3.55903701e+14 3.05935690e+14 3.89821203e+14
 4.23325026e+14 3.96799746e+14 4.44304104e+14 3.47241171e+14
 4.16931595e+14 3.91339987e+14 3.97905844e+14 4.23884036e+14
 3.86536421e+14 4.47950899e+14 3.09896268e+14 3.58557935e+14
 3.87129214e+14 4.39120641e+14 3.93730115e+14 4.31974908e+14
 2.26809756e+14 4.23765531e+14 4.27841903e+14 4.39208345e+14
 4.02265923e+14 4.414069