## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

![Kaggle Result](Kaggle.PNG)

In [1]:
import os

# Show which files are available in the local "data" directory.
data_files = os.listdir("data")
print(data_files)

['test.csv', 'train.csv', 'trainLabels.csv']


In [2]:
import pandas as pd

# Load the training features, training labels and test features.
# header=None: the first row of each CSV is read as data, so columns are numbered 0..N-1.
TrSx, TrSy, TeSx = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('data/train.csv', 'data/trainLabels.csv', 'data/test.csv')
)

# Display the shapes in the order: train features, test features, train labels.
tuple(frame.shape for frame in (TrSx, TeSx, TrSy))

((1000, 40), (9000, 40), (1000, 1))

In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training features.
TrSx.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.025596,-0.024526,-0.024088,-0.002271,1.092329,-0.00625,0.497342,-0.037883,0.026391,-0.003597,...,0.030651,0.022951,-0.542491,-0.011608,-0.483507,0.033371,0.567185,0.006849,-0.892659,0.609451
std,1.008282,1.016298,0.979109,0.970575,4.538834,0.989128,2.118819,2.232256,1.001064,1.01352,...,1.011645,1.001375,2.239939,1.022456,2.121281,1.007044,2.227876,0.997635,2.022022,2.045439
min,-3.365711,-3.492086,-2.695602,-3.460471,-16.421901,-3.04125,-7.224761,-6.509084,-3.145588,-2.749812,...,-3.379194,-2.971125,-7.84089,-2.999564,-7.124105,-2.952358,-5.452254,-3.473913,-8.051722,-7.799086
25%,-0.66901,-0.693937,-0.69883,-0.617557,-1.801997,-0.732265,-0.838619,-1.604037,-0.677562,-0.68222,...,-0.659457,-0.696032,-2.121943,-0.66455,-1.879247,-0.642861,-1.059786,-0.691162,-2.220126,-0.565041
50%,0.027895,-0.033194,0.008145,0.002327,0.862818,0.027041,0.582321,0.018809,0.022092,-0.03611,...,0.049416,0.049778,-0.568262,-0.028097,-0.493575,0.037732,0.455474,0.038284,-0.85547,0.779944
75%,0.76252,0.682753,0.661434,0.640743,3.843172,0.671456,1.913664,1.438304,0.74131,0.665364,...,0.747031,0.699917,0.939348,0.651374,1.005795,0.6918,2.122157,0.693535,0.388698,1.992193
max,3.326246,3.58387,2.546507,3.088738,17.565345,3.102997,7.592666,7.130097,3.145258,3.919426,...,2.844792,3.688047,7.160379,3.353631,6.005818,3.420561,6.603499,3.492548,5.77412,6.803984


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows as a validation set; the fixed
# random_state makes the split reproducible between runs.
trsx, vasx, trsy, vasy = train_test_split(
    TrSx,
    TrSy,
    test_size=0.30,
    random_state=101,
)

# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (trsx, vasx, trsy, vasy))

((700, 40), (300, 40), (700, 1), (300, 1))

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier  # KNN
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Baseline comparison: fit each classifier on the training split and report
# its accuracy on the held-out validation split.
baseline_models = [
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(
        n_estimators=100,
        random_state=99,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
    LogisticRegression(solver='saga'),
    SVC(gamma='auto'),
    DecisionTreeClassifier(
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
]

for clf in baseline_models:
    # ravel() turns the single-column label frame into the 1-D array that fit() expects.
    clf.fit(trsx, trsy.values.ravel())
    val_pred = clf.predict(vasx)
    val_accuracy = accuracy_score(vasy, val_pred)
    print("Acuuracy:", val_accuracy, "-"*6, type(clf).__name__)

Acuuracy: 0.8066666666666666 ------ GaussianNB
Acuuracy: 0.9166666666666666 ------ KNeighborsClassifier
Acuuracy: 0.86 ------ RandomForestClassifier
Acuuracy: 0.82 ------ LogisticRegression
Acuuracy: 0.9033333333333333 ------ SVC
Acuuracy: 0.7333333333333333 ------ DecisionTreeClassifier


In [6]:
from sklearn.model_selection import cross_val_score

# Same six baseline classifiers as before, now scored with 10-fold
# cross-validation over the full labelled set instead of a single split.
cv_candidates = [
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(
        n_estimators=100,
        random_state=99,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
    LogisticRegression(solver='saga'),
    SVC(gamma='auto'),
    DecisionTreeClassifier(
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
]

for clf in cv_candidates:
    fold_scores = cross_val_score(clf, TrSx, TrSy.values.ravel(), cv=10)
    # Report the mean accuracy across the ten folds.
    print("cross_val_score:", fold_scores.mean(), "-"*6, type(clf).__name__)

cross_val_score: 0.8160000000000001 ------ GaussianNB
cross_val_score: 0.906 ------ KNeighborsClassifier
cross_val_score: 0.8690000000000001 ------ RandomForestClassifier
cross_val_score: 0.8210000000000001 ------ LogisticRegression
cross_val_score: 0.915 ------ SVC
cross_val_score: 0.7779999999999999 ------ DecisionTreeClassifier


In [7]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over forest size and tree depth for RandomForestClassifier,
# scored by 3-fold cross-validated accuracy on the training split.
n_estimators = [50, 280, 300, 320, 600]
max_depth = [None, 1, 3, 10]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

# The n_estimators / max_depth given here are only starting values; the grid
# overrides them for every candidate.
model = RandomForestClassifier(n_estimators=100,random_state=99,criterion='gini',max_depth=None,min_samples_split=2,min_samples_leaf=1)

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where supplying it raises TypeError.
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, verbose=1, cv=3)

# Pass the labels as a 1-D array (as the other cells do); a single-column
# DataFrame makes scikit-learn emit a DataConversionWarning during fitting.
grid_result = grid_search.fit(trsx, trsy.values.ravel())
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    8.8s finished
  self.best_estimator_.fit(X, y, **fit_params)


Best Accuracy: 0.861429 using {'max_depth': None, 'n_estimators': 280}


In [8]:
# Grid search for SVC: two separate sub-grids so that `gamma` is only varied
# for the RBF kernel, while the linear kernel is tuned over C alone.
param_grid = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [90, 100, 110]}
    , {'kernel': ['linear'], 'C': [95, 100, 105]}
]
model = SVC(gamma = 'auto')

# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where supplying it raises TypeError.
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1, verbose=1, cv=3)

# Pass the labels as a 1-D array (as the other cells do); a single-column
# DataFrame makes scikit-learn emit a DataConversionWarning during fitting.
grid_result = grid_search.fit(trsx, trsy.values.ravel())
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  27 | elapsed:    0.0s remaining:    0.0s


Best Accuracy: 0.881429 using {'C': 90, 'gamma': 0.001, 'kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.2min finished
  y = column_or_1d(y, warn=True)


In [9]:
from sklearn.preprocessing import Normalizer

# Re-run the 10-fold baseline comparison on normalized inputs.
# Note: Normalizer rescales each sample (row) to unit norm; it does not
# standardize individual feature columns.
norm = Normalizer()
norm_train_data = norm.fit_transform(TrSx)

normalized_candidates = [
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(
        n_estimators=100,
        random_state=99,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
    LogisticRegression(solver='saga'),
    SVC(gamma='auto'),
    DecisionTreeClassifier(
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
]

for clf in normalized_candidates:
    fold_scores = cross_val_score(clf, norm_train_data, TrSy.values.ravel(), cv=10)
    print("cross_val_score:", fold_scores.mean(), "-"*6, type(clf).__name__)

cross_val_score: 0.808 ------ GaussianNB
cross_val_score: 0.9019999999999999 ------ KNeighborsClassifier
cross_val_score: 0.8699999999999999 ------ RandomForestClassifier
cross_val_score: 0.8220000000000001 ------ LogisticRegression
cross_val_score: 0.808 ------ SVC
cross_val_score: 0.7990000000000002 ------ DecisionTreeClassifier


In [10]:
from sklearn.decomposition import PCA

# Project the 40 input features onto 12 principal components, then repeat the
# 10-fold baseline comparison on the reduced data.
# NOTE(review): PCA is fitted on every row of TrSx before cross-validation, so
# each validation fold has already influenced the projection. Wrapping PCA and
# the classifier in a Pipeline would give a leakage-free estimate.
pca = PCA(n_components=12)
pca_train_data = pca.fit_transform(TrSx)

# Fraction of the total variance captured by each retained component.
explained_variance = pca.explained_variance_ratio_
print(explained_variance)
print(pca_train_data.shape)

pca_candidates = [
    GaussianNB(),
    KNeighborsClassifier(),
    RandomForestClassifier(
        n_estimators=100,
        random_state=99,
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
    LogisticRegression(solver='saga'),
    SVC(gamma='auto'),
    DecisionTreeClassifier(
        criterion='gini',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
    ),
]

for clf in pca_candidates:
    fold_scores = cross_val_score(clf, pca_train_data, TrSy.values.ravel(), cv=10)
    print("cross_val_score:", fold_scores.mean(), "-"*6, type(clf).__name__)

[0.25054403 0.2055048  0.08026473 0.05033658 0.04895951 0.04489903
 0.0417078  0.03127934 0.02309798 0.02100099 0.01619278 0.01269123]
(1000, 12)
cross_val_score: 0.841 ------ GaussianNB
cross_val_score: 0.909 ------ KNeighborsClassifier
cross_val_score: 0.908 ------ RandomForestClassifier
cross_val_score: 0.8220000000000001 ------ LogisticRegression
cross_val_score: 0.905 ------ SVC
cross_val_score: 0.799 ------ DecisionTreeClassifier


In [11]:
# The best hold-out score came from plain KNN with default settings on the raw
# features, so that is the model used for the submission.
# It is fitted on the training split only and re-checked on the validation split.
Rule = KNeighborsClassifier()
Rule.fit(trsx, trsy.values.ravel())

ry = Rule.predict(vasx)
holdout_accuracy = accuracy_score(vasy, ry)
print("Acuuracy:", holdout_accuracy, "-"*6, type(Rule).__name__)

Acuuracy: 0.9166666666666666 ------ KNeighborsClassifier


In [12]:
# Predict labels for the unlabeled test features with the fitted KNN classifier.
RY = Rule.predict(TeSx)

In [13]:
import numpy as np

# Assemble the submission table: a 1-based row Id next to the predicted label.
submit = pd.DataFrame({
    'Id': np.arange(1, len(RY) + 1),
    'Solution': RY,
})
submit.head()

Unnamed: 0,Id,Solution
0,1,1
1,2,0
2,3,1
3,4,0
4,5,0


In [14]:
# Write the submission to JL.csv; index=False keeps the DataFrame index out of the file.
submit.to_csv("JL.csv",index=False)