In [1]:
import os
TITANIC_PATH = os.path.join("datasets", "titanic")

In [4]:
import pandas as pd

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [5]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [6]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [10]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
train_data["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [13]:
train_data["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [14]:
train_data["SibSp"].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [15]:
train_data["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [16]:
train_data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [17]:
train_data_copy = train_data.copy()

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_name):
        self.attribute_name = attribute_name
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_name]

In [19]:
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", SimpleImputer(strategy="median"))
    ])

In [20]:
num_pipeline.fit_transform(train_data)

array([[22.    ,  1.    ,  0.    ,  7.25  ],
       [38.    ,  1.    ,  0.    , 71.2833],
       [26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [28.    ,  1.    ,  2.    , 23.45  ],
       [26.    ,  0.    ,  0.    , 30.    ],
       [32.    ,  0.    ,  0.    ,  7.75  ]])

In [26]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X], index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [22]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False))
    ])

In [28]:
cat_pipeline.fit_transform(train_data)

array([[0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])

In [30]:
from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])

In [31]:
X_train = preprocess_pipeline.fit_transform(train_data)
X_train

array([[22.,  1.,  0., ...,  0.,  0.,  1.],
       [38.,  1.,  0., ...,  1.,  0.,  0.],
       [26.,  0.,  0., ...,  0.,  0.,  1.],
       ...,
       [28.,  1.,  2., ...,  0.,  0.,  1.],
       [26.,  0.,  0., ...,  1.,  0.,  0.],
       [32.,  0.,  0., ...,  0.,  1.,  0.]])

In [34]:
y_train = train_data["Survived"]

In [36]:
from sklearn.svm import SVC

svm_clf = SVC()
svm_clf.fit(X_train, y_train)

SVC()

In [37]:
from sklearn.model_selection import cross_val_score

svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)

In [38]:
svm_scores

array([0.62222222, 0.58426966, 0.68539326, 0.74157303, 0.69662921,
       0.69662921, 0.68539326, 0.71910112, 0.70786517, 0.6741573 ])

In [39]:
svm_scores.mean()

0.6813233458177278

In [40]:
# Tinh chỉnh siêu tham số gamma cho mô hình SVC
svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train, y_train)
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

0.7329588014981274

In [66]:
from sklearn.model_selection import GridSearchCV
# GridSearchCV for SVC
svc_param_grid = [{"kernel":["linear", "rbf", "sigmoid"],
                   "gamma": ["scale", "auto"],
                  }]

svc_grid_search = GridSearchCV(svm_clf, svc_param_grid, cv=5, verbose=3)
svc_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ........gamma=scale, kernel=linear;, score=0.804 total time=   8.5s
[CV 2/5] END ........gamma=scale, kernel=linear;, score=0.809 total time=   4.8s
[CV 3/5] END ........gamma=scale, kernel=linear;, score=0.787 total time=   5.1s
[CV 4/5] END ........gamma=scale, kernel=linear;, score=0.753 total time=  27.5s
[CV 5/5] END ........gamma=scale, kernel=linear;, score=0.787 total time=  34.9s
[CV 1/5] END ...........gamma=scale, kernel=rbf;, score=0.592 total time=   0.0s
[CV 2/5] END ...........gamma=scale, kernel=rbf;, score=0.713 total time=   0.0s
[CV 3/5] END ...........gamma=scale, kernel=rbf;, score=0.691 total time=   0.0s
[CV 4/5] END ...........gamma=scale, kernel=rbf;, score=0.685 total time=   0.0s
[CV 5/5] END ...........gamma=scale, kernel=rbf;, score=0.691 total time=   0.0s
[CV 1/5] END .......gamma=scale, kernel=sigmoid;, score=0.587 total time=   0.0s
[CV 2/5] END .......gamma=scale, kernel=sigmoid;,

GridSearchCV(cv=5, estimator=SVC(gamma='auto'),
             param_grid=[{'gamma': ['scale', 'auto'],
                          'kernel': ['linear', 'rbf', 'sigmoid']}],
             verbose=3)

In [67]:
svc_grid_search.best_params_

{'gamma': 'scale', 'kernel': 'linear'}

In [69]:
svc_grid_search.best_score_

0.7878601468834348

In [42]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8126466916354558

In [57]:
# GridSearchCV for RandomForestClassifier
forest_clf_param_grid = [{"n_estimators":[10, 50, 75, 100], 
               "criterion": ["gini", "entropy"], 
               "bootstrap":[True, False]
              }]

forest_clf_grid_search = GridSearchCV(forest_clf, forest_clf_param_grid, cv=5, verbose=3)
forest_clf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END bootstrap=True, criterion=gini, n_estimators=10;, score=0.754 total time=   0.0s
[CV 2/5] END bootstrap=True, criterion=gini, n_estimators=10;, score=0.787 total time=   0.0s
[CV 3/5] END bootstrap=True, criterion=gini, n_estimators=10;, score=0.831 total time=   0.0s
[CV 4/5] END bootstrap=True, criterion=gini, n_estimators=10;, score=0.798 total time=   0.0s
[CV 5/5] END bootstrap=True, criterion=gini, n_estimators=10;, score=0.820 total time=   0.0s
[CV 1/5] END bootstrap=True, criterion=gini, n_estimators=50;, score=0.760 total time=   0.0s
[CV 2/5] END bootstrap=True, criterion=gini, n_estimators=50;, score=0.798 total time=   0.0s
[CV 3/5] END bootstrap=True, criterion=gini, n_estimators=50;, score=0.854 total time=   0.0s
[CV 4/5] END bootstrap=True, criterion=gini, n_estimators=50;, score=0.775 total time=   0.0s
[CV 5/5] END bootstrap=True, criterion=gini, n_estimators=50;, score=0.831 total time=   0.0s

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid=[{'bootstrap': [True, False],
                          'criterion': ['gini', 'entropy'],
                          'n_estimators': [10, 50, 75, 100]}],
             verbose=3)

In [58]:
forest_clf_grid_search.best_params_

{'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 50}

In [59]:
forest_clf_grid_search.best_score_

0.805881614462369

In [71]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf_scores = cross_val_score(sgd_clf, X_train, y_train, cv=10)
sgd_clf_scores.mean()

0.6936704119850188

In [72]:
# GridSearchCV for SGDClassifier
sgd_clf_param_grid = [{"alpha":[0.00001, 0.0001, 0.001],
                       "fit_intercept": [True, False],
                       "shuffle":[True, False],
                       "epsilon":[0.001, 0.01, 0.1],
                       "random_state":[42],
                       "learning_rate":["constant", "optimal", "invscaling", "adaptive"]
                      }]
sgd_clf_grid_search = GridSearchCV(sgd_clf, sgd_clf_param_grid, cv=5, verbose=3)
sgd_clf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END alpha=1e-05, epsilon=0.001, fit_intercept=True, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 2/5] END alpha=1e-05, epsilon=0.001, fit_intercept=True, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 3/5] END alpha=1e-05, epsilon=0.001, fit_intercept=True, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 4/5] END alpha=1e-05, epsilon=0.001, fit_intercept=True, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 5/5] END alpha=1e-05, epsilon=0.001, fit_intercept=True, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 1/5] END alpha=1e-05, epsilon=0.001, fit_intercept=True, learning_rate=constant, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 2/5] END alpha=1e-05, epsilon=0.001, fit_intercept=True,

[CV 5/5] END alpha=1e-05, epsilon=0.001, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.742 total time=   0.0s
[CV 1/5] END alpha=1e-05, epsilon=0.001, fit_intercept=False, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 2/5] END alpha=1e-05, epsilon=0.001, fit_intercept=False, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 3/5] END alpha=1e-05, epsilon=0.001, fit_intercept=False, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 4/5] END alpha=1e-05, epsilon=0.001, fit_intercept=False, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 5/5] END alpha=1e-05, epsilon=0.001, fit_intercept=False, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 1/5] END alpha=1e-05, epsilon=0.001, fit_intercept=False, learning_rate=invscaling, random_state=42, s

[CV 2/5] END alpha=1e-05, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=True;, score=0.646 total time=   0.0s
[CV 3/5] END alpha=1e-05, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=True;, score=0.736 total time=   0.0s
[CV 4/5] END alpha=1e-05, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=True;, score=0.742 total time=   0.0s
[CV 5/5] END alpha=1e-05, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=True;, score=0.764 total time=   0.0s
[CV 1/5] END alpha=1e-05, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.765 total time=   0.0s
[CV 2/5] END alpha=1e-05, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.781 total time=   0.0s
[CV 3/5] END alpha=1e-05, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;,

[CV 4/5] END alpha=1e-05, epsilon=0.1, fit_intercept=True, learning_rate=adaptive, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 5/5] END alpha=1e-05, epsilon=0.1, fit_intercept=True, learning_rate=adaptive, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 1/5] END alpha=1e-05, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 2/5] END alpha=1e-05, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 3/5] END alpha=1e-05, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 4/5] END alpha=1e-05, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 5/5] END alpha=1e-05, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=True;, score=nan tota

[CV 2/5] END alpha=0.0001, epsilon=0.001, fit_intercept=True, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 3/5] END alpha=0.0001, epsilon=0.001, fit_intercept=True, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 4/5] END alpha=0.0001, epsilon=0.001, fit_intercept=True, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 5/5] END alpha=0.0001, epsilon=0.001, fit_intercept=True, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 1/5] END alpha=0.0001, epsilon=0.001, fit_intercept=True, learning_rate=invscaling, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 2/5] END alpha=0.0001, epsilon=0.001, fit_intercept=True, learning_rate=invscaling, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 3/5] END alpha=0.0001, epsilon=0.001, fit_intercept=True, learning_rate=invscaling, random_state=42,

[CV 2/5] END alpha=0.0001, epsilon=0.01, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=False;, score=0.792 total time=   0.0s
[CV 3/5] END alpha=0.0001, epsilon=0.01, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=False;, score=0.775 total time=   0.0s
[CV 4/5] END alpha=0.0001, epsilon=0.01, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=False;, score=0.781 total time=   0.0s
[CV 5/5] END alpha=0.0001, epsilon=0.01, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=False;, score=0.764 total time=   0.0s
[CV 1/5] END alpha=0.0001, epsilon=0.01, fit_intercept=True, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 2/5] END alpha=0.0001, epsilon=0.01, fit_intercept=True, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 3/5] END alpha=0.0001, epsilon=0.01, fit_intercept=True, learning_rate=invscaling, random_state=42, shuffle=

[CV 3/5] END alpha=0.0001, epsilon=0.1, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=True;, score=0.758 total time=   0.0s
[CV 4/5] END alpha=0.0001, epsilon=0.1, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=True;, score=0.730 total time=   0.0s
[CV 5/5] END alpha=0.0001, epsilon=0.1, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=True;, score=0.770 total time=   0.0s
[CV 1/5] END alpha=0.0001, epsilon=0.1, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=False;, score=0.782 total time=   0.0s
[CV 2/5] END alpha=0.0001, epsilon=0.1, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=False;, score=0.792 total time=   0.0s
[CV 3/5] END alpha=0.0001, epsilon=0.1, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=False;, score=0.775 total time=   0.0s
[CV 4/5] END alpha=0.0001, epsilon=0.1, fit_intercept=True, learning_rate=optimal, random_state=42, shuffle=False;, score

[CV 5/5] END alpha=0.0001, epsilon=0.1, fit_intercept=False, learning_rate=adaptive, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 1/5] END alpha=0.0001, epsilon=0.1, fit_intercept=False, learning_rate=adaptive, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 2/5] END alpha=0.0001, epsilon=0.1, fit_intercept=False, learning_rate=adaptive, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 3/5] END alpha=0.0001, epsilon=0.1, fit_intercept=False, learning_rate=adaptive, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 4/5] END alpha=0.0001, epsilon=0.1, fit_intercept=False, learning_rate=adaptive, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 5/5] END alpha=0.0001, epsilon=0.1, fit_intercept=False, learning_rate=adaptive, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 1/5] END alpha=0.001, epsilon=0.001, fit_intercept=True, learning_rate=constant, random_state=42, shuffle=True;, sc

[CV 5/5] END alpha=0.001, epsilon=0.001, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=True;, score=0.730 total time=   0.0s
[CV 1/5] END alpha=0.001, epsilon=0.001, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.715 total time=   0.0s
[CV 2/5] END alpha=0.001, epsilon=0.001, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.787 total time=   0.0s
[CV 3/5] END alpha=0.001, epsilon=0.001, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.725 total time=   0.0s
[CV 4/5] END alpha=0.001, epsilon=0.001, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.753 total time=   0.0s
[CV 5/5] END alpha=0.001, epsilon=0.001, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.742 total time=   0.0s
[CV 1/5] END alpha=0.001, epsilon=0.001, fit_intercept=False, learning_rate=invscaling, random_state=42, sh

[CV 3/5] END alpha=0.001, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.725 total time=   0.0s
[CV 4/5] END alpha=0.001, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.753 total time=   0.0s
[CV 5/5] END alpha=0.001, epsilon=0.01, fit_intercept=False, learning_rate=optimal, random_state=42, shuffle=False;, score=0.742 total time=   0.0s
[CV 1/5] END alpha=0.001, epsilon=0.01, fit_intercept=False, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 2/5] END alpha=0.001, epsilon=0.01, fit_intercept=False, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 3/5] END alpha=0.001, epsilon=0.01, fit_intercept=False, learning_rate=invscaling, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 4/5] END alpha=0.001, epsilon=0.01, fit_intercept=False, learning_rate=invscaling, random_state=42, shuffle=

[CV 4/5] END alpha=0.001, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 5/5] END alpha=0.001, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=True;, score=nan total time=   0.0s
[CV 1/5] END alpha=0.001, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 2/5] END alpha=0.001, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 3/5] END alpha=0.001, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 4/5] END alpha=0.001, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=False;, score=nan total time=   0.0s
[CV 5/5] END alpha=0.001, epsilon=0.1, fit_intercept=False, learning_rate=constant, random_state=42, shuffle=False;, score=nan

540 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\leatu\anaconda3\envs\py39\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\leatu\anaconda3\envs\py39\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 883, in fit
    return self._fit(
  File "C:\Users\leatu\anaconda3\envs\py39\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 649, in _fit
    self._validate_params()
  File "C:\Users\leatu\anaconda3\envs\py39\lib\site-packages\sklearn\linear_model\_stochastic_gradient

GridSearchCV(cv=5, estimator=SGDClassifier(random_state=42),
             param_grid=[{'alpha': [1e-05, 0.0001, 0.001],
                          'epsilon': [0.001, 0.01, 0.1],
                          'fit_intercept': [True, False],
                          'learning_rate': ['constant', 'optimal', 'invscaling',
                                            'adaptive'],
                          'random_state': [42], 'shuffle': [True, False]}],
             verbose=3)

In [73]:
sgd_clf_grid_search.best_params_

{'alpha': 0.0001,
 'epsilon': 0.001,
 'fit_intercept': True,
 'learning_rate': 'optimal',
 'random_state': 42,
 'shuffle': False}

In [74]:
sgd_clf_grid_search.best_score_

0.7788964911179461

In [78]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
knn_clf_param_grid = [{"n_neighbors":[5, 10, 20, 50, 100],
                       "weights": ["uniform", "distance"],
                       "algorithm":["auto", "ball_tree", "kd_tree", "brute"]
                      }]
knn_clf_grid_search = GridSearchCV(knn_clf, knn_clf_param_grid, cv=5, verbose=3)
knn_clf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END algorithm=auto, n_neighbors=5, weights=uniform;, score=0.670 total time=   0.0s
[CV 2/5] END algorithm=auto, n_neighbors=5, weights=uniform;, score=0.674 total time=   0.0s
[CV 3/5] END algorithm=auto, n_neighbors=5, weights=uniform;, score=0.719 total time=   0.0s
[CV 4/5] END algorithm=auto, n_neighbors=5, weights=uniform;, score=0.730 total time=   0.0s
[CV 5/5] END algorithm=auto, n_neighbors=5, weights=uniform;, score=0.730 total time=   0.0s
[CV 1/5] END algorithm=auto, n_neighbors=5, weights=distance;, score=0.698 total time=   0.0s
[CV 2/5] END algorithm=auto, n_neighbors=5, weights=distance;, score=0.708 total time=   0.0s
[CV 3/5] END algorithm=auto, n_neighbors=5, weights=distance;, score=0.725 total time=   0.0s
[CV 4/5] END algorithm=auto, n_neighbors=5, weights=distance;, score=0.697 total time=   0.0s
[CV 5/5] END algorithm=auto, n_neighbors=5, weights=distance;, score=0.736 total time=   0.0s
[CV

[CV 3/5] END algorithm=ball_tree, n_neighbors=100, weights=uniform;, score=0.674 total time=   0.0s
[CV 4/5] END algorithm=ball_tree, n_neighbors=100, weights=uniform;, score=0.691 total time=   0.0s
[CV 5/5] END algorithm=ball_tree, n_neighbors=100, weights=uniform;, score=0.680 total time=   0.0s
[CV 1/5] END algorithm=ball_tree, n_neighbors=100, weights=distance;, score=0.642 total time=   0.0s
[CV 2/5] END algorithm=ball_tree, n_neighbors=100, weights=distance;, score=0.753 total time=   0.0s
[CV 3/5] END algorithm=ball_tree, n_neighbors=100, weights=distance;, score=0.730 total time=   0.0s
[CV 4/5] END algorithm=ball_tree, n_neighbors=100, weights=distance;, score=0.742 total time=   0.0s
[CV 5/5] END algorithm=ball_tree, n_neighbors=100, weights=distance;, score=0.742 total time=   0.0s
[CV 1/5] END algorithm=kd_tree, n_neighbors=5, weights=uniform;, score=0.670 total time=   0.0s
[CV 2/5] END algorithm=kd_tree, n_neighbors=5, weights=uniform;, score=0.674 total time=   0.0s
[CV

[CV 1/5] END algorithm=brute, n_neighbors=50, weights=uniform;, score=0.581 total time=   0.0s
[CV 2/5] END algorithm=brute, n_neighbors=50, weights=uniform;, score=0.725 total time=   0.0s
[CV 3/5] END algorithm=brute, n_neighbors=50, weights=uniform;, score=0.691 total time=   0.0s
[CV 4/5] END algorithm=brute, n_neighbors=50, weights=uniform;, score=0.713 total time=   0.0s
[CV 5/5] END algorithm=brute, n_neighbors=50, weights=uniform;, score=0.702 total time=   0.0s
[CV 1/5] END algorithm=brute, n_neighbors=50, weights=distance;, score=0.631 total time=   0.0s
[CV 2/5] END algorithm=brute, n_neighbors=50, weights=distance;, score=0.764 total time=   0.0s
[CV 3/5] END algorithm=brute, n_neighbors=50, weights=distance;, score=0.742 total time=   0.0s
[CV 4/5] END algorithm=brute, n_neighbors=50, weights=distance;, score=0.742 total time=   0.0s
[CV 5/5] END algorithm=brute, n_neighbors=50, weights=distance;, score=0.725 total time=   0.0s
[CV 1/5] END algorithm=brute, n_neighbors=100

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid=[{'algorithm': ['auto', 'ball_tree', 'kd_tree',
                                        'brute'],
                          'n_neighbors': [5, 10, 20, 50, 100],
                          'weights': ['uniform', 'distance']}],
             verbose=3)

In [79]:
knn_clf_grid_search.best_params_

{'algorithm': 'auto', 'n_neighbors': 20, 'weights': 'distance'}

In [80]:
knn_clf_grid_search.best_score_

0.7262444291004959