In [259]:
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# 【問題1】train_test_split のスクラッチ
まずは、scikit-learnの train_test_split をスクラッチしてみます。以下の雛形をベースに関数を実装してください。

なお、作成した関数がscikit-learnの train_test_split と同じ動作をするか必ず確認をしましょう。

In [185]:
def scratch_train_test_split(X, y, train_size=0.8):
    """検証データを分割する。
    Parameters
    ----------
    X : ndarray
      訓練データ (n_samples, n_features)
    y : ndarray
      正解値 (n_samples,)
    train_size : float
      何割をtrainとするか指定 (0 < train_size < 1)
    Returns
    -------
    X_train : ndarray
      訓練データ (n_samples, n_features)
    X_test : ndarray
      検証データ (n_samples, n_features)
    y_train : ndarray
      訓練データの正解値 (n_samples,)
    y_test : ndarray
      検証データの正解値 (n_samples,)
    """
    index_list = random.sample(range(len(X) - 1), k=int(len(X) * train_size))
    
    X_train = X[index_list,:]
    y_train = y[index_list,:]

    X_test = np.delete(X, index_list, 0)
    y_test = np.delete(y, index_list, 0)
    
    return X_train, X_test, y_train, y_test

In [186]:
X = np.arange(14).reshape(7,2)
y = np.arange(14).reshape(7,2)

#スクラッチ実装版
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.8)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5, 2)
(5, 2)
(2, 2)
(2, 2)


In [187]:
#モジュール版
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5, 2)
(5, 2)
(2, 2)
(2, 2)


# 【問題2】 分類問題を解くコードの作成
上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

In [260]:
#irisデータ
from sklearn.datasets import load_iris
data = load_iris()
X = np.array(data.data)
y = np.array(data.target).reshape(150,1)

temp = np.concatenate((X,y),axis=1)
temp = temp[temp[:,4] > 0] #virgicolorとvirginica抽出

X = temp[:,:4]
y = temp[:,-1].reshape(100,1)

In [316]:
def learning_and_result(X, y, model, loss='log'):
  '''
  ①データ分割
  ②学習、判定
  ③スコア出力
  -----
  引数：
  X = 説明変数
  y = 目的変数
  model = 学習モデル
  loss = 学習時の変数（SGDClassifierのみ適用）
  -----
  戻り値：
  なし
  '''
  X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.8)

  if model == 'SGDClassifier': #SGDClassifierのみloss変数適用
    clf = model(loss=loss)
  else:
    clf = model()

  clf.fit(X_train, y_train)
  Y_pred = clf.predict(X_test)
  
  print(f'accuracy_score: {accuracy_score(y_test, Y_pred)}')
  print(f'precision_score: {precision_score(y_test, Y_pred)}')
  print(f'recall_score: {recall_score(y_test, Y_pred)}')
  print(f'f1_score: {f1_score(y_test, Y_pred)}')
  print('confusion_matrix:') 
  print(confusion_matrix(y_test, Y_pred))

SGDClassifier

In [262]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

learning_and_result(X, y, SGDClassifier, loss='log')

accuracy_score: 0.9
precision_score: 0.8571428571428571
recall_score: 0.8571428571428571
f1_score: 0.8571428571428571
confusion_matrix:
[[ 6  1]
 [ 1 12]]


  y = column_or_1d(y, warn=True)


SVM

In [263]:
from sklearn.svm import SVC

learning_and_result(X, y, SVC)

accuracy_score: 1.0
precision_score: 1.0
recall_score: 1.0
f1_score: 1.0
confusion_matrix:
[[ 9  0]
 [ 0 11]]


  y = column_or_1d(y, warn=True)


決定木

In [264]:
from sklearn.tree import DecisionTreeClassifier

learning_and_result(X, y, DecisionTreeClassifier)

accuracy_score: 0.95
precision_score: 1.0
recall_score: 0.8888888888888888
f1_score: 0.9411764705882353
confusion_matrix:
[[ 8  1]
 [ 0 11]]


In [265]:
#シンプルデータセット1作成コード
np.random.seed(seed=0)
n_samples = 500
f0 = [-1, 2]
f1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
f0 = np.random.multivariate_normal(f0, cov, n_samples // 2)
f1 = np.random.multivariate_normal(f1, cov, n_samples // 2)
X = np.concatenate([f0, f1])
y = np.concatenate([
    np.full(n_samples // 2, 1),
    np.full(n_samples // 2, -1)
])

y = y.reshape(500,1)

SGDClassifier

In [266]:
learning_and_result(X, y, SGDClassifier, loss='log')

accuracy_score: 1.0
precision_score: 1.0
recall_score: 1.0
f1_score: 1.0
confusion_matrix:
[[54  0]
 [ 0 46]]


  y = column_or_1d(y, warn=True)


SVM

In [267]:
learning_and_result(X, y, SVC)

accuracy_score: 1.0
precision_score: 1.0
recall_score: 1.0
f1_score: 1.0
confusion_matrix:
[[54  0]
 [ 0 46]]


  y = column_or_1d(y, warn=True)


決定木

In [268]:
learning_and_result(X, y, DecisionTreeClassifier)

accuracy_score: 1.0
precision_score: 1.0
recall_score: 1.0
f1_score: 1.0
confusion_matrix:
[[49  0]
 [ 0 51]]


In [271]:
#シンプルデータセット2作成コード
X = np.array([
    [-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
    [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
    [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
    [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
    [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
    [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
    [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
    [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
    [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
    [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
    [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
    [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
    [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
    [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
    [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
    [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
    [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
    [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
    [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
    [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ],
])
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

y = y.reshape(40,1)

SGDClassifier

In [272]:
learning_and_result(X, y, SGDClassifier, loss='log')

accuracy_score: 0.375
precision_score: 0.5
recall_score: 0.4
f1_score: 0.4444444444444445
confusion_matrix:
[[1 2]
 [3 2]]


  y = column_or_1d(y, warn=True)


SVM

In [273]:
learning_and_result(X, y, SVC)

accuracy_score: 0.25
precision_score: 1.0
recall_score: 0.14285714285714285
f1_score: 0.25
confusion_matrix:
[[1 0]
 [6 1]]


  y = column_or_1d(y, warn=True)


決定木

In [274]:
learning_and_result(X, y, DecisionTreeClassifier)

accuracy_score: 0.75
precision_score: 0.6666666666666666
recall_score: 1.0
f1_score: 0.8
confusion_matrix:
[[2 2]
 [0 4]]


# 【問題3】 回帰問題を解くコードの作成
線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [None]:
import pandas as pd

df = pd.read_csv('drive/My Drive/data/house-prices-advanced-regression-techniques_train.csv')

In [342]:
X = df[['GrLivArea','YearBuilt']].values
y = df['SalePrice'].values.reshape(1460,1)

In [343]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# scaler = StandardScaler()
# scaler.fit(X)
# X = scaler.transform(X)

X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.8)

clf = SGDRegressor()
clf.fit(X_train, y_train)
Y_pred = clf.predict(X_test)

print(f'MSE: {mean_squared_error(y_test, Y_pred)}')
# print(f'MSLE: {mean_squared_log_error(y_test, Y_pred)}') 
# print(f'RMSLE: {np.sqrt(mean_squared_log_error(y_test, Y_pred))}')
print(f'MAE: {mean_absolute_error(y_test, Y_pred)}')
print(f'R2: {r2_score(y_test, Y_pred)}')

MSE: 1.0060196436676033e+31
MAE: 2985363391778918.5
R2: -1.4666736463934042e+21


  y = column_or_1d(y, warn=True)
