## 機械学習スクラッチ入門

In [197]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

## 【問題1】train_test_splitのスクラッチ

In [198]:
def scratch_train_test_split(X, y, train_size=0.8,):
    """
    検証用データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      学習データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    #ここにコードを書く
    Xy = np.concatenate([X , y.reshape(-1 , 1)] , axis =1) 
    np.random.shuffle(Xy)
    train_index = int(train_size * Xy.shape[0])
    
    
    X_train = Xy[:train_index , :-1]
    X_test = Xy[train_index: , :-1]
    y_train = Xy[:train_index , -1]
    y_test = Xy[train_index: , -1]
    

    return X_train, X_test, y_train, y_test

In [199]:
X = np.arange(1, 101).reshape(50,2)
y = np.arange(1,51)

In [200]:
X_train , X_test , y_train , y_test = scratch_train_test_split(
                X, y, train_size=0.8,)

In [201]:
X_train.shape

(40, 2)

In [202]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(
               X , y , test_size = 0.2)

In [203]:
X_train.shape

(40, 2)

## 【問題2】 分類問題を解くコードの作成

In [204]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

In [205]:
X = pd.DataFrame(X)
y = pd.DataFrame(y)

In [206]:
X.columns = ["sepal_length" , "sepal_width" , "petal_length" , "petal_width"]
y.columns = ["Species"]

In [207]:
df = pd.concat([X , y] , axis = 1)

In [208]:
df_cn = df.query("Species == [1,2]").loc[: , :]
X = df_cn.iloc[: , :4].values
y = df_cn.iloc[: , -1].values

In [209]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [210]:
from sklearn.metrics import accuracy_score , precision_score , recall_score, f1_score ,  confusion_matrix

In [211]:
def learn_model(X , y , model):
    X_train , X_test , y_train , y_test = scratch_train_test_split(
                    X , y , train_size = 0.8)
    model.fit(X_train , y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test , y_pred)
    precision = precision_score(y_test , y_pred)
    recall = recall_score(y_test , y_pred)
    f1 = f1_score(y_test , y_pred)
    confusion = confusion_matrix(y_test , y_pred)
    print("accuracy : " , accuracy)
    print("precision : " , precision)
    print("recall : " , recall)
    print("f1 : " , f1 )

## データセット１ iris

In [212]:
#ロジスティック回帰
log = SGDClassifier(loss = "log")
learn_model(X , y , log)

accuracy :  1.0
precision :  1.0
recall :  1.0
f1 :  1.0




In [213]:
#SVM
svm = SVC()
learn_model(X , y , svm)

accuracy :  1.0
precision :  1.0
recall :  1.0
f1 :  1.0


In [214]:
#決定木
tree = DecisionTreeClassifier()
learn_model(X , y , tree)

accuracy :  0.8
precision :  0.875
recall :  0.7
f1 :  0.7777777777777777


## データセット２

In [215]:
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]

f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))

X = np.concatenate((f0, f1))
y = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)

random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

In [216]:
X.shape

(500, 2)

In [217]:
y_train.shape

(40,)

In [218]:
#ロジスティック回帰
log = SGDClassifier(loss = "log")
learn_model(X , y , log)

accuracy :  1.0
precision :  1.0
recall :  1.0
f1 :  1.0




In [219]:
#SVM
svm = SVC()
learn_model(X , y , svm)

accuracy :  1.0
precision :  1.0
recall :  1.0
f1 :  1.0


In [220]:
#決定木
tree = DecisionTreeClassifier()
learn_model(X , y , tree)

accuracy :  1.0
precision :  1.0
recall :  1.0
f1 :  1.0


## データセット２

In [221]:
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [222]:
X.shape

(40, 2)

In [223]:
y.shape

(40,)

In [224]:
#ロジスティック回帰
log = SGDClassifier(loss = "log")
learn_model(X , y , log)

accuracy :  0.125
precision :  0.0
recall :  0.0
f1 :  0.0




In [225]:
#SVM
svm = SVC()
learn_model(X , y , svm)

accuracy :  0.75
precision :  0.75
recall :  0.75
f1 :  0.75


In [226]:
#決定木
tree = DecisionTreeClassifier()
learn_model(X , y , tree)

accuracy :  0.625
precision :  0.6
recall :  0.75
f1 :  0.6666666666666665


## 【問題3】 回帰問題を解くコードの作成

In [227]:
df = pd.read_csv("train.csv")

In [234]:
X = df.loc[: , ["GrLivArea" , "YearBuilt"]].values
y = df.loc[: , "SalePrice"].values

In [235]:
X.shape

(1460, 2)

In [236]:
y.shape

(1460,)

In [252]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor

def learn_sg(X , y):
    X_train , X_test , y_train , y_test = scratch_train_test_split(
        X , y , train_size = 0.8)
    sgdr = SGDRegressor()
    sgdr.fit(X_train , y_train)
    y_pred = sgdr.predict(X_test)
    mse_mse = mean_squared_error(y_test , y_pred)
    return ("平均二乗誤差 : ", mse_mse)

In [253]:
learn_sg(X , y)



('平均二乗誤差 : ', 4.4521624694143665e+31)