# 【問題1】train_test_splitのスクラッチ

In [58]:
def scratch_train_test_split(X, y, train_size=0.8,):
    """
    検証用データを分割する。

    Parameters
    ----------
    X : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    y : 次の形のndarray, shape (n_samples, )
      正解値
    train_size : float (0<train_size<1)
      何割をtrainとするか指定

    Returns
    ----------
    X_train : 次の形のndarray, shape (n_samples, n_features)
      学習データ
    X_test : 次の形のndarray, shape (n_samples, n_features)
      検証データ
    y_train : 次の形のndarray, shape (n_samples, )
      学習データの正解値
    y_test : 次の形のndarray, shape (n_samples, )
      検証データの正解値
    """
    import numpy as np
    if train_size > 1:
        raise ValueError("０−１のfloat型を指定してください。")
    if X.shape[0] == 0:
        raise ValueError("少なくとも１列以上必要です。")
    if y.shape[0] == 0:
        raise ValueError("少なくとも１要素以上必要です。")
    
    X = np.array(X)
    y = np.array(y)
    
    X_nrows = X.shape[0]
    y_nrows = y.shape[0]
    
    X_list = np.arange(0, X_nrows)
    y_list = np.arange(0, y_nrows)
    
    np.random.shuffle(X_list)
    np.random.shuffle(y_list)
    
    X_train_size = round(X_nrows * train_size, 0)
    y_train_size = round(y_nrows * train_size, 0)
    
    X_train_num = X_list[:int(X_train_size)]
    X_test_num = X_list[int(X_train_size):]
    
    y_train_num = y_list[:int(y_train_size)]
    y_test_num = y_list[int(y_train_size):]
    
    X_train = X[X_train_num,]
    X_test = X[X_test_num,]
    y_train = y[y_train_num]
    y_test = y[y_test_num]

    return X_train, X_test, y_train, y_test

In [47]:
import numpy as np

def make_random(random_state=0):
    np.random.seed(seed=random_state)
    n_samples = 500
    f0 = [-1, 2]
    f1 = [2, -1]
    cov = [[1.0,0.8], [0.8, 1.0]]

    f0 = np.random.multivariate_normal(f0, cov, int(n_samples/2))
    f1 = np.random.multivariate_normal(f1, cov, int(n_samples/2))

    X = np.concatenate((f0, f1))
    y = np.concatenate((np.ones((int(n_samples/2))), np.ones((int(n_samples/2))) *(-1))).astype(np.int)

    random_index = np.random.permutation(np.arange(n_samples))
    X = X[random_index]
    y = y[random_index]
    return X, y

In [48]:
import pandas as pd

# 【問題2】 分類問題を解くコードの作成
上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

分類問題
分類は3種類の手法をスクラッチします。

ロジスティック回帰
SVM
決定木

In [49]:
#分類問題
from sklearn.datasets import load_iris
iris = load_iris()
X1, y1 = make_random(random_state=0)
X2, y2 = make_random(random_state=100)

In [50]:
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)

In [51]:
df_iris["target"] = iris.target

In [52]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [53]:
df_data = df_iris[df_iris["target"]!=0]

In [54]:
X_iris = df_data.iloc[:, 0:4]
y_iris = df_data["target"]

In [62]:
X_iris_train, X_iris_test, y_iris_train, y_iris_test = scratch_train_test_split(
                                                       X_iris, y_iris, train_size=0.8)
X1_train, X1_test, y1_train, y1_test = scratch_train_test_split(X1, y1, train_size=0.8)
X2_train, X2_test, y2_train, y2_test = scratch_train_test_split(X2, y2, train_size=0.8)

In [67]:
data_iris = [X_iris_train, X_iris_test, y_iris_train, y_iris_test]
data_1 = [X1_train, X1_test, y1_train, y1_test]
data_2 = [X2_train, X2_test, y2_train, y2_test]
datasets = [data_iris, data_1, data_2]
names = ["iris", "random1", "random2"] 

In [87]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [88]:
logi = SGDClassifier(loss="log")
svc = SVC()
tree = DecisionTreeClassifier()
models = [logi, svc, tree]
model_names = ["LogisticRegression", "SVC", "DecisionTreeClassifier"]

In [89]:
for dataset_name, dataset in zip(names, datasets):
    for model_name, model in zip(model_names, models):
        X_train, X_test, y_train, y_test = dataset
        model.fit(X_train, y_train)
        print("Score({}-{}):{:.3f}".format(dataset_name, model_name, model.score(X_test, y_test)))

Score(iris-LogisticRegression):0.500
Score(iris-SVC):0.550
Score(iris-DecisionTreeClassifier):0.400
Score(random1-LogisticRegression):0.520
Score(random1-SVC):0.540
Score(random1-DecisionTreeClassifier):0.500
Score(random2-LogisticRegression):0.430
Score(random2-SVC):0.400
Score(random2-DecisionTreeClassifier):0.500


# 【問題3】 回帰問題を解くコードの作成
線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [84]:
#回帰問題
import pandas as pd
df_home = pd.read_csv("/Users/niikurasayaka/diveintocode-ml/Week3/data/train.csv")

In [86]:
df_home.head(4)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000


In [92]:
df_data = df_home[["GrLivArea", "YearBuilt", "SalePrice"]]

In [94]:
df_data.isnull().sum()

GrLivArea    0
YearBuilt    0
SalePrice    0
dtype: int64

In [96]:
X = df_data[["GrLivArea", "YearBuilt"]]
y = df_data["SalePrice"]

In [97]:
X_train, X_test, y_train, y_test = scratch_train_test_split(X, y, train_size=0.8)

In [102]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

In [111]:
sgdr = SGDRegressor(random_state=0)
sgdr.fit(X_train, y_train)
y_train_pred = sgdr.predict(X_train)
y_test_pred = sgdr.predict(X_test)
print("MSE_Score(train):{:.3e}".format(mean_squared_error(y_train, y_train_pred)))
print("MSE_Score(test):{:.3e}".format(mean_squared_error(y_test, y_test_pred)))

MSE_Score(train):7.619e+29
MSE_Score(test):7.954e+29


標準化

In [112]:
from sklearn.preprocessing import StandardScaler

In [114]:
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [115]:
sgdr = SGDRegressor(random_state=0)
sgdr.fit(X_train_std, y_train)
y_train_pred = sgdr.predict(X_train_std)
y_test_pred = sgdr.predict(X_test_std)
print("MSE_Score(train):{:.3e}".format(mean_squared_error(y_train, y_train_pred)))
print("MSE_Score(test):{:.3e}".format(mean_squared_error(y_test, y_test_pred)))

MSE_Score(train):5.895e+09
MSE_Score(test):7.932e+09
