# Sprint2 機械学習スクラッチ入門

## 【問題1】train_test_splitのスクラッチ

In [142]:
def scratch_train_test_split(X, y, train_size=0.8,):
    """
    検証用データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      学習データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    #ここにコードを書く
    Xy=np.concatenate([X, y.reshape(-1,1)], axis=1)
    np.random.shuffle(Xy)
    train_index=int(X.shape[0]*train_size)
    X_train=Xy[:train_index, :-1]
    X_test=Xy[train_index:, :-1]
    y_train=Xy[:train_index, -1]
    y_test=Xy[train_index:, -1]

    return X_train, X_test, y_train, y_test

In [143]:
import numpy as np
X=np.arange(1,101).reshape(50,2)
y=np.arange(1,51)

In [144]:
y

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [145]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)
print(X_train)
print(y_train)

[[ 65  66]
 [ 87  88]
 [ 35  36]
 [ 73  74]
 [ 33  34]
 [ 71  72]
 [ 21  22]
 [ 83  84]
 [  1   2]
 [ 37  38]
 [  9  10]
 [ 99 100]
 [ 97  98]
 [ 51  52]
 [ 23  24]
 [ 27  28]
 [ 93  94]
 [ 55  56]
 [ 41  42]
 [ 77  78]
 [ 19  20]
 [ 95  96]
 [ 25  26]
 [ 13  14]
 [ 81  82]
 [ 45  46]
 [ 85  86]
 [ 59  60]
 [ 89  90]
 [  3   4]
 [ 57  58]
 [ 91  92]
 [  7   8]
 [ 39  40]
 [ 63  64]
 [ 69  70]
 [ 61  62]
 [ 47  48]
 [ 53  54]
 [ 43  44]]
[33 44 18 37 17 36 11 42  1 19  5 50 49 26 12 14 47 28 21 39 10 48 13  7
 41 23 43 30 45  2 29 46  4 20 32 35 31 24 27 22]


In [146]:
X_train, X_test, y_train, y_test=scratch_train_test_split(X, y, train_size=0.8,)
print(X_train.shape, y_train.shape)

(40, 2) (40,)


## 【問題2】 分類問題を解くコードの作成

### iris data

In [147]:
import pandas as pd
from sklearn.datasets import load_iris
iris=load_iris()
X=iris.data
y=iris.target

In [148]:
X=pd.DataFrame(X)
X.columns=iris.feature_names

In [149]:
y=pd.Series(y)

In [150]:
X=X.iloc[50:, :].values
y=y[50:].values

In [151]:
y=y-1

0: virgicolor    1:virginica

In [152]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [204]:
def learn_predict(X, y, model):
    X_train, X_test, y_train, y_test=scratch_train_test_split(X, y, train_size=0.8,)
    #X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2,)
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)
    accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)
    precision=precision_score(y_test, y_pred)
    recall=recall_score(y_test, y_pred)
    f1=f1_score(y_test, y_pred)
    con_mat=confusion_matrix(y_test, y_pred)
    print('accuracy: ', accuracy)
    print('precision: ', precision)
    print('recall: ', recall)
    print('f1_score: ', f1)

### ロジスティック回帰(iris)

In [205]:
log=SGDClassifier(loss="log")
learn_predict(X, y, log)

accuracy:  0.625
precision:  0.4
recall:  1.0
f1_score:  0.5714285714285715


### SVM(iris)

In [206]:
svm=SVC()
learn_predict(X, y, svm)

accuracy:  0.75
precision:  0.6
recall:  1.0
f1_score:  0.7499999999999999




### 決定木(iris)

In [207]:
tree=DecisionTreeClassifier()
learn_predict(X, y, tree)

accuracy:  0.625
precision:  0.3333333333333333
recall:  0.5
f1_score:  0.4


### データセット１

In [208]:
import numpy as np

np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]

f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))

X = np.concatenate((f0, f1))
y = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)

random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

In [209]:
X.shape

(500, 2)

### ロジスティック回帰（データセット１）

In [210]:
log=SGDClassifier(loss="log")
learn_predict(X, y, log)

accuracy:  1.0
precision:  1.0
recall:  1.0
f1_score:  1.0


### SVM（データセット１）

In [211]:
svm=SVC()
learn_predict(X, y, svm)

accuracy:  1.0
precision:  1.0
recall:  1.0
f1_score:  1.0




### 決定木（データセット１）

In [212]:
tree=DecisionTreeClassifier()
learn_predict(X, y, tree)

accuracy:  1.0
precision:  1.0
recall:  1.0
f1_score:  1.0


### データセット２

In [213]:
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [214]:
X.shape

(40, 2)

### ロジスティック回帰（データセット2）

In [215]:
log=SGDClassifier(loss="log")
learn_predict(X, y, log)

accuracy:  0.125
precision:  0.0
recall:  0.0
f1_score:  0.0


### SVM（データセット2）

In [216]:
svm=SVC()
learn_predict(X, y, svm)

accuracy:  0.75
precision:  0.75
recall:  0.75
f1_score:  0.75




### 決定木（データセット2）

In [217]:
tree=DecisionTreeClassifier()
learn_predict(X, y, tree)

accuracy:  0.625
precision:  0.6
recall:  0.75
f1_score:  0.6666666666666665


# 【問題3】 回帰問題を解くコードの作成

In [218]:
df=pd.read_csv("train.csv")

In [229]:
y=df["SalePrice"].values
X=df[["GrLivArea", "YearBuilt"]].values

In [230]:
X

array([[1710, 2003],
       [1262, 1976],
       [1786, 2001],
       ...,
       [2340, 1941],
       [1078, 1950],
       [1256, 1965]])

In [231]:
from sklearn.linear_model import SGDRegressor
lin=SGDRegressor()
X_train, X_test, y_train ,y_test=scratch_train_test_split(X, y, train_size=0.8,)
lin.fit(X_train, y_train)
y_pred=lin.predict(X_test)


In [234]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
metrics_names=['平均二乗誤差(MSE)', '平均絶対誤差(MAE)', '二乗平均平方根誤差(RMSE)', '決定係数(R2)']


mse=mean_squared_error(y_test, y_pred)
mae=mean_absolute_error(y_test, y_pred)
rmse=np.sqrt(mean_squared_error(y_test, y_pred))
r2=r2_score(y_test, y_pred)
metrics=[mse, mae, rmse, r2]
for i in range(4):
    print("{}: ".format(metrics_names[i]), metrics[i])

平均二乗誤差(MSE):  3.4640386595337706e+30
平均絶対誤差(MAE):  1786237499275577.8
二乗平均平方根誤差(RMSE):  1861192805577587.2
決定係数(R2):  -5.906909569555743e+20
