## 練習時間
試著使用 sklearn datasets 的其他資料集 (wine, boston, ...)，來訓練自己的線性迴歸模型。

### HINT: 注意 label 的型態，確定資料集的目標是分類還是迴歸，再使用正確的模型訓練！

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston, load_wine, load_breast_cancer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

### Boston Housing pricing  

In [2]:
# Load the Boston housing bunch and expose its feature matrix as a labelled
# DataFrame (one column per entry in the bunch's feature_names).
boston = load_boston()
boston_df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
# Preview the first five rows.
boston_df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
# Columns this notebook treats as categorical; they are passed through
# unscaled, while every other column is standardised.
categorical_feature = ['CHAS', 'RAD']
numeric_feature = [col for col in boston_df.columns if col not in categorical_feature]

# Split BEFORE scaling: fitting the scaler on the full dataset would let
# test-set means/variances leak into the training features.
train_df, test_df, y_train, y_test = train_test_split(
    boston_df, boston.target, test_size=0.3)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
std_scaler = StandardScaler()
train_numeric = std_scaler.fit_transform(train_df[numeric_feature])
test_numeric = std_scaler.transform(test_df[numeric_feature])

# Final layout matches the original: scaled numeric columns first, followed
# by the raw categorical columns.
x_train = np.hstack((train_numeric, train_df[categorical_feature].values))
x_test = np.hstack((test_numeric, test_df[categorical_feature].values))

In [4]:
# Fit a linear regression on the training split and evaluate it on the
# held-out split.
LR = LinearRegression()
# A single fit is enough; the original fitted the same estimator twice on
# identical data, which only repeated the work.
model = LR.fit(x_train, y_train)

y_pred = model.predict(x_test)

# R^2 and mean squared error of the predictions against the true targets.
r_square = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print('R square score: {}, MSE: {}'.format(r_square, mse))

R square score: 0.6909191585057455, MSE: 26.242852255362664


### Breast Cancer

In [5]:
# Load the breast-cancer dataset bunch; later cells read its .data and .target.
breast_cancer = load_breast_cancer()

In [6]:
# Split first so that the held-out rows never influence preprocessing.
x_train, x_test, y_train, y_test = train_test_split(
    breast_cancer.data, breast_cancer.target, test_size=0.3)

# Fit the scaler on the training rows only, then reuse its statistics for
# the test rows (fitting on the full dataset would leak test information).
std_scaler = StandardScaler()
x_train = std_scaler.fit_transform(x_train)
x_test = std_scaler.transform(x_test)

LogReg = LogisticRegression(solver='lbfgs')
# Mean score over 5-fold cross-validation on the training split.
# NOTE(review): the scaler above was fit on the whole training split, so the
# CV folds still share scaling statistics; wrap scaler + model in a Pipeline
# if a strictly leak-free CV estimate is needed.
avg_score = cross_val_score(LogReg, x_train, y_train, cv=5).mean()

# Refit on the full training split and score on the held-out split.
LogReg_model = LogReg.fit(x_train, y_train)
y_pred = LogReg_model.predict(x_test)

acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print('Cross Validation average score: {}'.format(avg_score))
print('Accuray: {}, Precision: {}, Recall: {}'.format(acc, precision, recall))

Cross Validation average score: 0.9824050632911392
Accuray: 0.9707602339181286, Precision: 0.9619047619047619, Recall: 0.9901960784313726


### Wine

In [8]:
# Load the wine dataset and pull out the feature matrix and class labels.
wine = load_wine()
X, y = wine['data'], wine['target']

# Split before scaling so the scaler never sees the held-out rows.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Standardise using statistics learned from the training rows only.
std_scale = StandardScaler()
x_train = std_scale.fit_transform(x_train)
x_test = std_scale.transform(x_test)

print('x train shape: {}'.format(x_train.shape))
print('x test shape: {}'.format(x_test.shape))
# Report the label array's shape here (the original printed x_train.shape
# under the "y train" label).
print('y train shape: {}'.format(y_train.shape))
print('y test shape: {}'.format(y_test.shape))

In [9]:
# One-vs-rest logistic regression using the newton-cg solver, scored with
# 5-fold cross-validation on the training split.
LogReg = LogisticRegression(solver='newton-cg', multi_class='ovr')
fold_scores = cross_val_score(LogReg, x_train, y_train, cv=5)
avg_score = fold_scores.mean()

In [10]:
# Train on the training split, then predict class labels for the test split.
LogReg_model = LogReg.fit(X=x_train, y=y_train)
y_pred = LogReg_model.predict(X=x_test)

In [11]:
# Held-out accuracy, reported alongside the cross-validation average.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(
    'Cross Validation average score: {}'.format(avg_score),
    'Accuray: {}'.format(acc),
    sep='\n',
)

Cross Validation average score: 0.9839743589743591
Accuray: 0.9814814814814815
