# Term1 Sprint1 授業前課題 
## コーディング課題：train_test_split, 分類・回帰パイプラインのスクラッチ

## 1. train_test_splitのスクラッチ
スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみる。  
Jupyter Notebookでコーディングを進め、完成後はpyファイルとする。  
utilsディレクトリの中にsplit.pyを作る。  
scikit-learnのtrain_test_splitと同じ動作をしているか必ずテストをする。  
[sklearn.model_selection.train_test_split — scikit-learn 0.20.0 documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html "sklearn.model_selection.train_test_split — scikit-learn 0.20.0 documentation")

In [None]:
# 以下をutilsディレクトリにsplit.pyとして作成済
def _allocate_per_class(class_counts, n_draws, rng):
    """
    Decide how many samples to draw from each class for a stratified draw.

    The ideal (fractional) share of every class is floored, and the draws
    lost to flooring are handed out one by one to the classes with the
    largest fractional remainder; ties are broken at random with ``rng``.

    Parameters
    ----------
    class_counts : ndarray, shape (n_classes, )
      number of available samples per class
    n_draws : int
      total number of samples to draw
    rng : numpy.random.RandomState
      random number generator used to break ties

    Returns
    ----------
    ndarray of int, shape (n_classes, )
      number of samples to draw from each class (sums to n_draws)
    """
    import numpy as np

    # ideal, fractional number of samples per class
    continuous = n_draws * (class_counts / class_counts.sum())
    floored = np.floor(continuous)
    need_to_add = int(n_draws - floored.sum())
    if need_to_add > 0:
        remainder = continuous - floored
        # visit the distinct remainders from the largest to the smallest
        values = np.sort(np.unique(remainder))[::-1]
        for value in values:
            inds, = np.where(remainder == value)
            add_now = min(len(inds), need_to_add)
            # choose at random which of the tied classes get one more sample
            inds = rng.choice(inds, size=add_now, replace=False)
            floored[inds] += 1
            need_to_add -= add_now
            if need_to_add == 0:
                break
    # builtin int: the np.int alias was removed in NumPy 1.24
    return floored.astype(int)


def _stratified_indices(stratify, n_train, n_test, rng):
    """
    Build shuffled train/test index arrays that keep the class ratio.

    Parameters
    ----------
    stratify : ndarray, shape (n_samples, )
      class label of every sample
    n_train : int
      number of training samples
    n_test : int
      number of test samples
    rng : numpy.random.RandomState
      random number generator

    Returns
    ----------
    ind_train : ndarray of int
      indices of the training samples
    ind_test : ndarray of int
      indices of the test samples
    """
    import numpy as np

    # Encode the labels as 0..n_classes-1 so that arbitrary label values
    # (negative numbers, gaps, strings) can be counted with bincount.
    _, y_indices = np.unique(stratify, return_inverse=True)
    class_counts = np.bincount(y_indices)
    # A stable sort keeps the original sample order inside each class.
    class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
                             np.cumsum(class_counts)[:-1])

    # number of samples to take from every class for each side of the split
    n_i = _allocate_per_class(class_counts, n_train, rng)
    class_counts_remaining = class_counts - n_i
    t_i = _allocate_per_class(class_counts_remaining, n_test, rng)

    train = []
    test = []
    for indices, count, n_tr, n_te in zip(class_indices, class_counts,
                                          n_i, t_i):
        permutation = rng.permutation(count)
        shuffled = indices.take(permutation, mode='clip')
        train.extend(shuffled[:n_tr])
        test.extend(shuffled[n_tr:n_tr + n_te])

    # The integer dtype keeps an empty selection usable as an index array.
    ind_train = rng.permutation(np.asarray(train, dtype=np.intp))
    ind_test = rng.permutation(np.asarray(test, dtype=np.intp))
    return ind_train, ind_test


def train_test_split(X, y, test_size=0.25,
                     random_state=None, shuffle=True, stratify=None):
    """
    Split the data to be learned and tested.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
      data to be learned and tested
    y : ndarray, shape (n_samples, )
      objective labels (or regression targets when stratify is None)
    test_size : float (0<test_size<1)(default: 0.25)
      set the rate of test size
    random_state : int, numpy.random.RandomState or None
      seed of the pseudo-random number generator, or the generator itself
    shuffle : boolean (default:True)
      shuffle before split or not. If False, set stratify as None.
    stratify : array-like, shape (n_samples, ) or None
      class labels used for stratified sampling

    Returns
    ----------
    X_train : ndarray, shape (n_train, n_features)
      data to be learned
    X_test : ndarray, shape (n_test, n_features)
      data to be tested
    y_train : ndarray, shape (n_train, )
      labels for X_train
    y_test : ndarray, shape (n_test, )
      labels for X_test

    Raises
    ----------
    ValueError
      if X and y (or stratify) have different numbers of samples, or if
      stratify is given while shuffle is False.
    """
    import numpy as np

    # Error if feature samples number does not corresponds to y number.
    if X.shape[0] != y.shape[0]:
        raise ValueError("X samples number({}) is not same as y {}.".format(
                X.shape[0], y.shape[0]))
    if not shuffle and stratify is not None:
        raise ValueError("If 'shuffle' parameter is False, "
                         "then 'stratify' parameter should be None.")

    n_samples = X.shape[0]
    n_train = int(np.floor((1 - test_size) * n_samples))
    n_test = n_samples - n_train

    # Without shuffling the leading rows form the training set and the
    # trailing rows the test set, as scikit-learn does.
    if not shuffle:
        return X[:n_train], X[n_train:], y[:n_train], y[n_train:]

    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(seed=random_state)

    if stratify is None:
        # plain shuffle: the head of the permutation becomes the test set
        permutation = rng.permutation(n_samples)
        ind_test = permutation[:n_test]
        ind_train = permutation[n_test:(n_test + n_train)]
    else:
        stratify = np.asarray(stratify)
        if stratify.shape[0] != n_samples:
            raise ValueError(
                "stratify samples number({}) is not same as X {}.".format(
                    stratify.shape[0], n_samples))
        ind_train, ind_test = _stratified_indices(
            stratify, n_train, n_test, rng)

    return X[ind_train], X[ind_test], y[ind_train], y[ind_test]


In [1]:
import os
import sys
import numpy as np
from sklearn.datasets import load_iris

# Make the scratch train_test_split importable and reload it so that the
# latest edits of split.py are picked up.
sys.path.append("../ml-scratch/utils/")
import split
from importlib import reload
reload(split)

# Reference implementation from scikit-learn.
from sklearn.model_selection import train_test_split as sk_train_test_split

iris = load_iris()
X = iris.data
y = iris.target

X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch = split.train_test_split(
    X, y, random_state=0, test_size=0.25)
X_train_sklearn, X_test_sklearn, y_train_sklearn, y_test_sklearn = sk_train_test_split(
    X, y, random_state=0, test_size=0.25)

# The implementation is correct when every printed value below is 0.
print("------test_size=0.25, stratify=None------")
single_run_report = (
    ("X_trainの各要素との差の絶対値の合計：", X_train_scratch, X_train_sklearn),
    ("X_testの各要素との差の絶対値の合計：", X_test_scratch, X_test_sklearn),
    ("y_trainの各要素との差の絶対値の合計：", y_train_scratch, y_train_sklearn),
    ("y_testの各要素との差の絶対値の合計：", y_test_scratch, y_test_sklearn),
)
for label, scratch_part, sklearn_part in single_run_report:
    print(label, np.abs(scratch_part - sklearn_part).sum())


# Sweep test_size from 0.02 to 0.50 (the original note says 0.01 cannot be
# used because the test set would not exceed the number of classes).
print("------test_size: 0.02~0.50, stratify=None------")
results = []
for ratio in np.arange(0.02, 0.51, 0.01):
    X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch = split.train_test_split(
        X, y, test_size=ratio, random_state=0)
    X_train_sklearn, X_test_sklearn, y_train_sklearn, y_test_sklearn = sk_train_test_split(
        X, y, test_size=ratio, random_state=0)
    results.extend([
        np.abs(X_train_scratch - X_train_sklearn).sum(),
        np.abs(X_test_scratch - X_test_sklearn).sum(),
        np.abs(y_train_scratch - y_train_sklearn).sum(),
        np.abs(y_test_scratch - y_test_sklearn).sum(),
    ])
print("全てのtest_sizeにおける、全変数の各要素との差の絶対値の合計：", sum(results))


# Same sweep with stratified sampling (stratify=y).
print("------test_size: 0.02~0.50, stratify=y------")
results = []
for ratio in np.arange(0.02, 0.51, 0.01):
    X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch = split.train_test_split(
        X, y, test_size=ratio, random_state=0, stratify=y)
    X_train_sklearn, X_test_sklearn, y_train_sklearn, y_test_sklearn = sk_train_test_split(
        X, y, test_size=ratio, random_state=0, stratify=y)
    results.extend([
        np.abs(X_train_scratch - X_train_sklearn).sum(),
        np.abs(X_test_scratch - X_test_sklearn).sum(),
        np.abs(y_train_scratch - y_train_sklearn).sum(),
        np.abs(y_test_scratch - y_test_sklearn).sum(),
    ])
print("全てのtest_sizeにおける、全変数の各要素との差の絶対値の合計：", sum(results))

------test_size=0.25, stratify=None------
X_trainの各要素との差の絶対値の合計： 0.0
X_testの各要素との差の絶対値の合計： 0.0
y_trainの各要素との差の絶対値の合計： 0
y_testの各要素との差の絶対値の合計： 0
------test_size: 0.02~0.50, stratify=None------
全てのtest_sizeにおける、全変数の各要素との差の絶対値の合計： 0.0
------test_size: 0.02~0.50, stratify=y------
全てのtest_sizeにおける、全変数の各要素との差の絶対値の合計： 0.0


## 2. 分類パイプラインの作成
分類は3種類の手法を扱う。pyファイルで実行できる分類のパイプラインを作成する。
- ロジスティック回帰
- SVM
- 決定木

データセットは3種類用意する。

1つ目は事前学習期間同様にirisデータセットとする。  
[sklearn.datasets.load_iris — scikit-learn 0.20.2 documentation](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html "sklearn.datasets.load_iris — scikit-learn 0.20.2 documentation")

2値分類としたいため、目的変数のうち以下の2クラスのみを利用し、特徴量は4種類全て使う。
- versicolor
- virginica

また、残り2つは可視化が可能な特徴量が2つのデータセットを人工的に用意する。続くコードで説明変数X,目的変数yが作成可能。「シンプルデータセット1」、「シンプルデータセット2」とする。

In [None]:
# 以下をmodelsディレクトリにpipe_clf.pyとして作成済
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


def log_reg(X_train, X_test, y_train, y_test):
    """
    Grid-search a standardized logistic regression and print its scores.

    A StandardScaler + LogisticRegression pipeline is tuned with
    GridSearchCV (cv=5) over the penalty type and the C value, using the
    liblinear solver. The fitted search object then predicts both data
    sets; the training accuracy and the test accuracy, precision, recall
    and f1 score are printed. Nothing is returned.

    Parameters
    ----------
    X_train, X_test : features of the training / test samples
    y_train, y_test : labels of the training / test samples
    """
    param_grid = {"lr__penalty" : ["l1","l2"],
                  "lr__C" : np.logspace(-3, 3, 7).tolist(),
                  "lr__solver" : ["liblinear"]}
    pipeline = Pipeline([('sc', StandardScaler()),
                         ('lr', LogisticRegression())])
    search = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=5)
    search.fit(X_train, y_train)
    pred_train = search.predict(X_train)
    pred_test = search.predict(X_test)

    print("------Logistic Regression------")
    print("訓練データの正解率: ", accuracy_score(y_train, pred_train))
    test_metrics = (("テストデータの正解率: ", accuracy_score),
                    ("テストデータの適合率: ", precision_score),
                    ("テストデータの再現率: ", recall_score),
                    ("テストデータのf1スコア: ", f1_score))
    for label, metric in test_metrics:
        print(label, metric(y_test, pred_test))
    
    
def svc(X_train, X_test, y_train, y_test):
    """
    Grid-search a standardized support vector classifier and print scores.

    A StandardScaler + SVC pipeline is tuned with GridSearchCV (cv=5) over
    the C value. The fitted search object then predicts both data sets;
    the training accuracy and the test accuracy, precision, recall and f1
    score are printed. Nothing is returned.

    Parameters
    ----------
    X_train, X_test : features of the training / test samples
    y_train, y_test : labels of the training / test samples
    """
    param_grid = {"svc__C" : np.logspace(-3, 3, 7).tolist()}
    pipeline = Pipeline([('sc', StandardScaler()),
                         ('svc', SVC())])
    search = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=5)
    search.fit(X_train, y_train)
    pred_train = search.predict(X_train)
    pred_test = search.predict(X_test)

    print("------SVC------")
    print("訓練データの正解率: ", accuracy_score(y_train, pred_train))
    test_metrics = (("テストデータの正解率: ", accuracy_score),
                    ("テストデータの適合率: ", precision_score),
                    ("テストデータの再現率: ", recall_score),
                    ("テストデータのf1スコア: ", f1_score))
    for label, metric in test_metrics:
        print(label, metric(y_test, pred_test))

def dec_tree(X_train, X_test, y_train, y_test):
    """
    Grid-search a decision tree classifier and print its scores.

    A StandardScaler + DecisionTreeClassifier pipeline is tuned with
    GridSearchCV (cv=5) over max_depth 1..10. The fitted search object
    then predicts both data sets; the training accuracy and the test
    accuracy, precision, recall and f1 score are printed. Nothing is
    returned.

    Parameters
    ----------
    X_train, X_test : features of the training / test samples
    y_train, y_test : labels of the training / test samples
    """
    param_grid = {"dtc__max_depth" : np.arange(1,11).tolist()}
    pipeline = Pipeline([('sc', StandardScaler()),
                         ('dtc', DecisionTreeClassifier())])
    search = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=5)
    search.fit(X_train, y_train)
    pred_train = search.predict(X_train)
    pred_test = search.predict(X_test)

    print("------Decision Tree Classifier------")
    print("訓練データの正解率: ", accuracy_score(y_train, pred_train))
    test_metrics = (("テストデータの正解率: ", accuracy_score),
                    ("テストデータの適合率: ", precision_score),
                    ("テストデータの再現率: ", recall_score),
                    ("テストデータのf1スコア: ", f1_score))
    for label, metric in test_metrics:
        print(label, metric(y_test, pred_test))

def random_forest(X_train, X_test, y_train, y_test):
    """
    Grid-search a random forest classifier and print its scores.

    A StandardScaler + RandomForestClassifier pipeline is tuned with
    GridSearchCV (cv=5) over max_depth 1..10 and n_estimators 1..20. The
    fitted search object then predicts both data sets; the training
    accuracy and the test accuracy, precision, recall and f1 score are
    printed. Nothing is returned.

    Parameters
    ----------
    X_train, X_test : features of the training / test samples
    y_train, y_test : labels of the training / test samples
    """
    param_grid = {"rfc__max_depth" : np.arange(1,11).tolist(),
                  "rfc__n_estimators" : np.arange(1,21).tolist()}
    pipeline = Pipeline([('sc', StandardScaler()),
                         ('rfc', RandomForestClassifier())])
    search = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=5)
    search.fit(X_train, y_train)
    pred_train = search.predict(X_train)
    pred_test = search.predict(X_test)

    print("------Random Forest Classifier------")
    print("訓練データの正解率: ", accuracy_score(y_train, pred_train))
    test_metrics = (("テストデータの正解率: ", accuracy_score),
                    ("テストデータの適合率: ", precision_score),
                    ("テストデータの再現率: ", recall_score),
                    ("テストデータのf1スコア: ", f1_score))
    for label, metric in test_metrics:
        print(label, metric(y_test, pred_test))


In [2]:
# Iris data set restricted to two classes
from sklearn.datasets import load_iris
iris = load_iris()
# Rows from index 50 onward are kept; per the original notes these are the
# versicolor and virginica samples.
X = iris.data[50:]
y = iris.target[50:]
# Re-encode the labels as 0/1: original label 1 becomes 0, anything else 1.
is_label_one = (y == 1)
y = np.where(is_label_one, 0, 1)

In [9]:
# Run the classification pipelines on the current X, y
import sys
from importlib import reload

# Import (and reload) the scratch classification pipeline module.
sys.path.append("../ml-scratch/models/")
import pipe_clf
reload(pipe_clf)

X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch = sk_train_test_split(
    X, y, random_state=0, test_size=0.25)

# Each pipeline prints its own report for the same split.
for run_pipeline in (pipe_clf.log_reg, pipe_clf.svc,
                     pipe_clf.dec_tree, pipe_clf.random_forest):
    run_pipeline(X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch)



------Logistic Regression------
訓練データの正解率:  0.9866666666666667
テストデータの正解率:  0.96
テストデータの適合率:  0.9230769230769231
テストデータの再現率:  1.0
テストデータのf1スコア:  0.9600000000000001
------SVC------
訓練データの正解率:  0.9866666666666667
テストデータの正解率:  0.92
テストデータの適合率:  0.8571428571428571
テストデータの再現率:  1.0
テストデータのf1スコア:  0.923076923076923




------Decision Tree Classifier------
訓練データの正解率:  0.9733333333333334
テストデータの正解率:  0.84
テストデータの適合率:  0.75
テストデータの再現率:  1.0
テストデータのf1スコア:  0.8571428571428571
------Random Forest Classifier------
訓練データの正解率:  0.9733333333333334
テストデータの正解率:  0.88
テストデータの適合率:  0.8
テストデータの再現率:  1.0
テストデータのf1スコア:  0.888888888888889




In [10]:
# Simple dataset 1: two 2-D Gaussian clusters labelled +1 and -1
import numpy as np

np.random.seed(seed=0)
n_samples = 500
n_per_class = n_samples // 2
f0 = [-1, 2]  # mean of the cluster labelled +1
f1 = [2, -1]  # mean of the cluster labelled -1
cov = [[1.0,0.8], [0.8, 1.0]]

# f0 / f1 are rebound from the cluster means to the drawn samples.
f0 = np.random.multivariate_normal(f0, cov, n_per_class)
f1 = np.random.multivariate_normal(f1, cov, n_per_class)

X = np.concatenate((f0, f1))
# builtin int instead of np.int: the alias was removed in NumPy 1.24
y = np.concatenate((np.ones(n_per_class), -np.ones(n_per_class))).astype(int)

# Shuffle samples and labels with the same permutation.
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

In [13]:
# Split the current X, y with scikit-learn and report every classifier.
X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch = sk_train_test_split(
    X, y, random_state=0, test_size=0.25)

for run_pipeline in (pipe_clf.log_reg, pipe_clf.svc,
                     pipe_clf.dec_tree, pipe_clf.random_forest):
    run_pipeline(X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch)

------Logistic Regression------
訓練データの正解率:  1.0
テストデータの正解率:  1.0
テストデータの適合率:  1.0
テストデータの再現率:  1.0
テストデータのf1スコア:  1.0
------SVC------
訓練データの正解率:  1.0
テストデータの正解率:  1.0
テストデータの適合率:  1.0
テストデータの再現率:  1.0
テストデータのf1スコア:  1.0




------Decision Tree Classifier------
訓練データの正解率:  1.0
テストデータの正解率:  0.992
テストデータの適合率:  0.9866666666666667
テストデータの再現率:  1.0
テストデータのf1スコア:  0.9932885906040269
------Random Forest Classifier------
訓練データの正解率:  0.9813333333333333
テストデータの正解率:  0.984
テストデータの適合率:  1.0
テストデータの再現率:  0.972972972972973
テストデータのf1スコア:  0.9863013698630138




In [14]:
# Simple dataset 2: 40 hard-coded samples with two features each
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
# Labels: the first 20 rows are class 0, the last 20 rows are class 1.
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [15]:
# Split the current X, y with scikit-learn and report every classifier.
X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch = sk_train_test_split(
    X, y, random_state=0, test_size=0.25)

for run_pipeline in (pipe_clf.log_reg, pipe_clf.svc,
                     pipe_clf.dec_tree, pipe_clf.random_forest):
    run_pipeline(X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch)

------Logistic Regression------
訓練データの正解率:  0.6333333333333333
テストデータの正解率:  0.3
テストデータの適合率:  0.3333333333333333
テストデータの再現率:  0.4
テストデータのf1スコア:  0.3636363636363636
------SVC------
訓練データの正解率:  0.7333333333333333
テストデータの正解率:  0.5
テストデータの適合率:  0.5
テストデータの再現率:  0.4
テストデータのf1スコア:  0.4444444444444445
------Decision Tree Classifier------
訓練データの正解率:  1.0
テストデータの正解率:  0.7
テストデータの適合率:  0.75
テストデータの再現率:  0.6
テストデータのf1スコア:  0.6666666666666665
------Random Forest Classifier------
訓練データの正解率:  0.9666666666666667
テストデータの正解率:  0.7
テストデータの適合率:  0.75
テストデータの再現率:  0.6
テストデータのf1スコア:  0.6666666666666665


## 3. 回帰パイプラインの作成
回帰は1種類を扱う。pyファイルで実行できる回帰のパイプラインを作成する。
- 線形回帰

データセットは事前学習期間同様にHouse Pricesコンペティションのものを使う。  
[House Prices: Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data "House Prices: Advanced Regression Techniques")

train.csvをダウンロードし、目的変数としてSalePrice、説明変数としてGrLivAreaとYearBuiltを使う。



In [None]:
# 以下をmodelsディレクトリにpipe_reg.pyとして作成済
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression


def lin_reg(X_train, X_test, y_train, y_test):
    """
    Grid-search a standardized linear regression and print test scores.

    A StandardScaler + LinearRegression pipeline is tuned with
    GridSearchCV (cv=5) over fit_intercept (False / True). The fitted
    search object predicts the test data, and the mean squared error and
    the R2 score on the test data are printed. Nothing is returned.

    Parameters
    ----------
    X_train, X_test : features of the training / test samples
    y_train, y_test : target values of the training / test samples
    """
    param_grid = {"lr__fit_intercept" : [False, True]}
    pipeline = Pipeline([('sc', StandardScaler()),
                         ('lr', LinearRegression())])
    search = GridSearchCV(pipeline, param_grid, n_jobs=-1, cv=5)
    search.fit(X_train, y_train)
    predictions = search.predict(X_test)

    print("------Linear Regression------")
    test_metrics = (("テストデータのMSE: ", mean_squared_error),
                    ("テストデータのR2_score: ", r2_score))
    for label, metric in test_metrics:
        print(label, metric(y_test, predictions))

In [19]:
import pandas as pd

# Explanatory variables taken from the CSV; SalePrice is the target.
feature_columns = ["GrLivArea","YearBuilt"]
train = pd.read_csv("train.csv")
y = train["SalePrice"]
X = train[feature_columns].values

In [20]:
# Run the regression pipeline on the current X, y
from importlib import reload
import sys

# Import (and reload) the scratch regression pipeline module.
sys.path.append("../ml-scratch/models/")
import pipe_reg
reload(pipe_reg)

X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch = sk_train_test_split(
    X, y, random_state=0, test_size=0.25)

pipe_reg.lin_reg(
    X_train_scratch, X_test_scratch, y_train_scratch, y_test_scratch)

------Linear Regression------
テストデータのMSE:  2725908099.0636373
テストデータのR2_score:  0.5871035359601553


