# 第6回課題
前処理と特徴量選択により，SVM の最初のスコアよりも1割程度良いテストスコアを出してください．
ちょこっといじった程度では，線形回帰はスコアが変わらなかったので，参考程度に使ってください

### 必須事項
- 前処理：正規化，標準化，外れ値の排除など
- 特徴量選択: 検証は必須．増やす・減らす・変えないの結果は自由
- テストスコアの向上: mse（平均二乗誤差（MSE：Mean Square Error））で 0.41 くらいは出ると思います

### 自由事項
- 指標の変更
- パラメータの変更（モデルの変更は想定してません）

### 余談
特徴量選択で正解を用意するのは，やはり難しいなと解答を作る時に感じました．解答の方は最低限の考察と検証をしていますが，4時間かかりました( ;∀;)

## 注意事項
特徴量の分析時に分割したデータを使ってない時点でせこいというか，おかしいです．ここでは無視していますが，本来はできません．

In [53]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, make_scorer
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

For more information, read [Cortez et al., 2009].
Input variables (based on physicochemical tests):

- fixed acidity
- volatile acidity
- citric acid
- residual sugar
- chlorides
- free sulfur dioxide
- total sulfur dioxide
- density
- pH
- sulphates
- alcohol

Output variable (based on sensory data):
- quality (score between 0 and 10)

In [54]:
wine_quality_df = pd.read_csv("winequality-red.csv",delimiter=";")
print(wine_quality_df.shape)
wine_quality_df.head()

(1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [55]:
wine_quality_df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [56]:
feature_names = list(np.copy(wine_quality_df.columns))
feature_names.remove("quality")

In [57]:
X_train, X_test, y_train, y_test = \
    train_test_split(wine_quality_df[feature_names], wine_quality_df["quality"], 
                     test_size=0.3, random_state=0)
X_train.shape, X_test.shape

((1119, 11), (480, 11))

## 注意
ここで，test score まで同時に出してしまっているんですが，1つの関数で実行してしまう方が楽だったという理由だけで，これらは分けた方が良いです．
パラメータや特徴量について考えるときには，CV だけで調整すべきです

In [58]:
kfold = KFold(n_splits=5, random_state=0)
def cross_validation(model, test=True):
    global X_train, X_test, y_train, y_test, feature_names
    scores = cross_val_score(model, X_train[feature_names], y_train, cv=kfold, 
                             scoring=make_scorer(mean_squared_error))
    # 各分割におけるスコア
    print('Cross-Validation scores: {}'.format(scores))
    # スコアの平均値
    print('Average score: {}'.format(np.mean(scores)))
    if test:
        model.fit(X_train[feature_names], y_train)
        pred = model.predict(X_test[feature_names])
        print('Test score: {}'.format(mean_squared_error(y_test, pred)))

# モデルによる予測

In [59]:
linear_reg = Ridge(random_state=0)
cross_validation(linear_reg)

Cross-Validation scores: [0.48812538 0.48841541 0.42327305 0.42566563 0.36728215]
Average score: 0.43855232598926497
Test score: 0.4010466305154149


In [60]:
svm_clf = SVC(kernel="rbf", random_state=0)
cross_validation(svm_clf)

Cross-Validation scores: [0.67857143 0.82142857 0.71428571 0.75446429 0.62780269]
Average score: 0.719310538116592
Test score: 0.68125


In [61]:
svm_reg = SVR(kernel="rbf")
cross_validation(svm_reg)

Cross-Validation scores: [0.5715145  0.68175944 0.57909369 0.61098986 0.50245379]
Average score: 0.5891622546486783
Test score: 0.5085468469038228


# 以降にコードを追加

In [42]:
#特徴量間の相関関係を調べる
corr = wine_quality_df.corr().values
[[wine_quality_df.columns[i], wine_quality_df.columns[j], corr[i][j]] for i in range(wine_quality_df.shape[1]-1) for j in range(wine_quality_df.shape[1]-1) if abs(corr[i][j]) >= 0.4 and corr[i][j] != 1]

[['fixed acidity', 'citric acid', 0.6717034347641041],
 ['fixed acidity', 'density', 0.6680472921189711],
 ['fixed acidity', 'pH', -0.6829781945685299],
 ['volatile acidity', 'citric acid', -0.5524956845595839],
 ['citric acid', 'fixed acidity', 0.6717034347641041],
 ['citric acid', 'volatile acidity', -0.5524956845595839],
 ['citric acid', 'pH', -0.5419041447395132],
 ['free sulfur dioxide', 'total sulfur dioxide', 0.6676664504810212],
 ['total sulfur dioxide', 'free sulfur dioxide', 0.6676664504810212],
 ['density', 'fixed acidity', 0.6680472921189711],
 ['density', 'alcohol', -0.49617977024170085],
 ['pH', 'fixed acidity', -0.6829781945685299],
 ['pH', 'citric acid', -0.5419041447395132],
 ['alcohol', 'density', -0.49617977024170085]]

特徴量間に特に高い相関関係はみられないため、特徴量選択は行わないことにする。

In [44]:
#欠損値の確認
wine_quality_df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

欠損値がないことを確認

In [62]:
#正規化
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

X_train = pd.DataFrame(min_max_scaler.fit_transform(X_train), columns=feature_names)
X_test = pd.DataFrame(min_max_scaler.fit_transform(X_test), columns=feature_names)
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.348214,0.253425,0.29,0.075342,0.163606,0.253521,0.448763,0.523495,0.149606,0.98773,0.215385
1,0.294643,0.041096,0.37,0.0,0.06177,0.492958,0.363958,0.0,0.11811,0.042945,0.661538
2,0.1875,0.376712,0.0,0.068493,0.113523,0.295775,0.116608,0.509545,0.519685,0.226994,0.2
3,0.482143,0.130137,0.35,0.047945,0.105175,0.112676,0.077739,0.488253,0.393701,0.282209,0.430769
4,0.232143,0.226027,0.36,0.342466,0.103506,0.15493,0.286219,0.567548,0.464567,0.282209,0.323077


In [63]:
linear_reg = Ridge(random_state=0)
cross_validation(linear_reg)

Cross-Validation scores: [0.4892121  0.48830701 0.42580411 0.42564279 0.36217685]
Average score: 0.4382285727903389
Test score: 0.4492016440328416


In [64]:
svm_clf = SVC(kernel="rbf", random_state=0)
cross_validation(svm_clf)

Cross-Validation scores: [0.55803571 0.51785714 0.52232143 0.54910714 0.4529148 ]
Average score: 0.5200472453555414
Test score: 0.5145833333333333


In [65]:
svm_reg = SVR(kernel="rbf")
cross_validation(svm_reg)

Cross-Validation scores: [0.4538687  0.42541329 0.38508794 0.43251404 0.3540613 ]
Average score: 0.4101890514465901
Test score: 0.43664961125857593


In [66]:
#標準化
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()

X_train = pd.DataFrame(standard_scaler.fit_transform(X_train), columns=feature_names)
X_test = pd.DataFrame(standard_scaler.fit_transform(X_test), columns=feature_names)
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.148946,-0.21806,0.091982,-0.371725,0.450263,0.256185,2.498011,0.234476,-2.444015,7.385412,-0.573058
1,-0.195652,-1.890462,0.502669,-1.108115,-0.780595,1.838977,1.797243,-3.520329,-2.701983,-1.234462,2.151676
2,-0.884848,0.753012,-1.396757,-0.438669,-0.155077,0.535501,-0.246662,0.134418,0.587115,0.444734,-0.667015
3,1.010441,-1.189132,0.399998,-0.639503,-0.255967,-0.674868,-0.567847,-0.018302,-0.444759,0.948493,0.742331
4,-0.597683,-0.433854,0.451333,2.239114,-0.276145,-0.395552,1.154873,0.550448,0.13567,0.948493,0.084636


In [67]:
linear_reg = Ridge(random_state=0)
cross_validation(linear_reg)

Cross-Validation scores: [0.49239137 0.48985979 0.43040695 0.42268956 0.36400115]
Average score: 0.43986976362616464
Test score: 0.4054752405190062


In [68]:
svm_clf = SVC(kernel="rbf", random_state=0)
cross_validation(svm_clf)

Cross-Validation scores: [0.56696429 0.52678571 0.5        0.52678571 0.4529148 ]
Average score: 0.5146901024983984
Test score: 0.4583333333333333


In [69]:
svm_reg = SVR(kernel="rbf")
cross_validation(svm_reg)

Cross-Validation scores: [0.45148575 0.42211498 0.38000374 0.4269028  0.3493321 ]
Average score: 0.40596787342893653
Test score: 0.39248236816100573


標準化の前処理で、評価の向上が確認できた。特にSVRを使ったものでは、評価の大きな向上が確認できた。