# Loading Datasts
我們目前引入sklearn的boston房價資料庫

# 觀察資料型態

In [1]:
import numpy as np
from  sklearn.datasets import load_boston    # sklearn is a gigantic package, so it is not a good idea to load the entire package
boston = load_boston()                       # LOAD 起來
print(boston.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR'])


# 資料集簡介

In [2]:
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

# Feature

In [0]:
print(boston.data)
print(boston.data.shape)

# Label

In [0]:
print(boston.target)
print(boston.target.shape)

# 整理資料

In [0]:
import pandas as pd

df = pd.DataFrame(boston.data,columns = boston.feature_names)
df['PRICE'] = boston.target
df.head()

# EDA
### 以房間數量為例子


### 以 DIS 為例子

In [0]:
import matplotlib.pyplot as plt

df = pd.DataFrame(boston.data,columns = boston.feature_names)
df['PRICE'] = boston.target

print(boston.feature_names)
dis=df['DIS']
plt.hist(dis,bins=50)

In [0]:
plt.scatter(df['PRICE'], df['CRIM'],alpha=0.7)
plt.ylabel('NOX')
plt.xlabel('PRICE')


In [0]:
plt.scatter(df['PRICE'], df['RM'],alpha=0.7)
plt.ylabel('NOX')
plt.xlabel('PRICE')


In [0]:
plt.scatter(df['PRICE'], df['NOX'],alpha=0.7)
plt.ylabel('NOX')
plt.xlabel('PRICE')


### 以B為例子

In [0]:
bk=df['B']
plt.hist(bk,bins=50)

# 將資料分割成訓練集與測試集
*   test_size表示測試集在資料庫的比重 ( 通常設定0.2~0.3 )
*   shuffle = True代表打亂順序
*   可以觀察training data 與 test data 的shape發生變化


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
scaled_data = min_max_scaler.fit_transform(boston.data)    # scales everything down to values in between 0 and 1

X_train, X_test, y_train, y_test = train_test_split(scaled_data, boston.target, test_size=0.3, random_state= 12, shuffle=True)
print(X_train.shape,X_test.shape)
print(y_train.shape,y_test.shape)
print(X_train)

(354, 13) (152, 13)
(354,) (152,)
[[2.33528471e-03 2.20000000e-01 1.97947214e-01 ... 6.91489362e-01
  9.49997478e-01 5.13245033e-02]
 [3.54870660e-02 0.00000000e+00 6.46627566e-01 ... 8.08510638e-01
  8.42402542e-01 3.42163355e-01]
 [3.88906897e-03 0.00000000e+00 7.85557185e-01 ... 9.14893617e-01
  9.92889203e-01 3.54856512e-01]
 ...
 [7.81818521e-02 0.00000000e+00 6.46627566e-01 ... 8.08510638e-01
  9.93771748e-01 4.24392936e-01]
 [3.96616248e-02 0.00000000e+00 7.00879765e-01 ... 2.23404255e-01
  2.21115538e-01 3.66721854e-01]
 [4.93537813e-04 3.50000000e-01 2.05278592e-01 ... 4.57446809e-01
  9.92737909e-01 2.95253863e-01]]


# 訓練流程
1. 選擇模型
2. 建立模型
3. 訓練模型
4. 模型預測


In [0]:
from sklearn.linear_model import LinearRegression #選擇模型
model = LinearRegression()#建立模型
model.fit(X_train,y_train)#訓練模型
training = model.predict(X_train)
y_test_pred = model.predict(X_test)#模型預測

# 模型成效檢驗 Evaluation


 by R2 score



In [7]:
from sklearn import metrics
print('R2 score for training set:%.2f'%metrics.r2_score(y_train, training))
print('R2 score for accuracy:%.2f'%metrics.r2_score(y_test, y_test_pred))

R2 score for training set:0.75
R2 score for accuracy:0.71


# 改進資料

In [8]:
from  sklearn.datasets import load_boston
boston = load_boston()
boston.data = [np.hstack([boston.data[i],boston.data[i][1]/boston.data[i][7],
                          boston.data[i][1]/boston.data[i][2],boston.data[i][12]/boston.data[i][5],
                          boston.data[i][3]/boston.data[i][4],boston.data[i][6]/boston.data[i][4]])
               for i in range(boston.data.shape[0])]

print(len(boston.data[0]))

18


# 重新訓練模型

In [9]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(boston.data, boston.target, test_size=0.3, random_state= 12, shuffle=True)
print(y_train_new.shape)
print(y_test_new.shape)

(354,)
(152,)


In [10]:
model = LinearRegression()
model.fit(X_train_new,y_train_new)
training_new = model.predict(X_train_new)
y_test_pred_new = model.predict(X_test_new)
print('train evaluation :')
print('R2 score for training set:%.2f'%metrics.r2_score(y_train_new, training_new))
print('R2 score for accuracy:%.2f'%metrics.r2_score(y_test_new, y_test_pred_new))

train evaluation :
R2 score for training set:0.81
R2 score for accuracy:0.77


# 那如果 我們再增加更多feature呢？

In [11]:
from  sklearn.datasets import load_boston
boston = load_boston()
boston.data = [np.hstack([boston.data[i],np.log(boston.data[i]+1),
                                   np.power(boston.data[i],2),np.power(boston.data[i],3),
                                   np.power(boston.data[i],4),np.power(boston.data[i],5),
                                   np.power(boston.data[i],5),np.power(boston.data[i],7)])
                        for i in range(boston.data.shape[0])]
print(len(boston.data[0]))

104


In [0]:
X_train_new2, X_test_new2, y_train_new2, y_test_new2 = train_test_split(boston.data, boston.target, test_size=0.3, random_state= 12, shuffle=True)

In [13]:
model = LinearRegression()
model.fit(X_train_new2,y_train_new2)
training_new2 = model.predict(X_train_new2)
y_test_pred_new2 = model.predict(X_test_new2)
print('train evaluation :')
print('R2 score for training set:%.2f'%metrics.r2_score(y_train_new2, training_new2))
print('R2 score:%.2f'%metrics.r2_score(y_test_new2, y_test_pred_new2))

train evaluation :
R2 score for training set:0.57
R2 score:0.49


# Lasso - Regularization

In [0]:
X_train_new2, X_test_new2, y_train_new2, y_test_new2 = train_test_split(boston.data, boston.target, test_size=0.3, random_state= 12, shuffle=True)

In [15]:
from sklearn.linear_model import Lasso,Ridge
model = Lasso(alpha = 0.01,tol = 1, max_iter=1000, normalize=True)
model.fit(X_train_new2,y_train_new2)
training_new2 = model.predict(X_train_new2)
y_test_pred = model.predict(X_test_new2)
print('train evaluation :')
print('R2 score for training set:%.2f'%metrics.r2_score(y_train_new2, training_new2))
print('R2 score:%.2f'%metrics.r2_score(y_test_new2, y_test_pred))

train evaluation :
R2 score for training set:0.79
R2 score:0.75


# Ridge- Regularization

In [0]:
X_train_new2, X_test_new2, y_train_new2, y_test_new2 = train_test_split(boston.data, boston.target, test_size=0.3, random_state= 12, shuffle=True)

In [18]:
model = Ridge(alpha = 100, normalize=True, tol=1)
model.fit(X_train_new2,y_train_new2)
training_new2 = model.predict(X_train_new2)
y_test_pred = model.predict(X_test_new2)
print('train evaluation :')
print('R2 score for training set:%.2f'%metrics.r2_score(y_train_new2, training_new2))
print('R2 score:%.2f'%metrics.r2_score(y_test_new2, y_test_pred))

train evaluation :
R2 score for training set:0.23
R2 score:0.22
