In [1]:
from sklearn.linear_model import Ridge
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
boston = load_boston()

X, y = boston.data,  boston.target

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()

In [3]:
scaler.fit(X_train)
scaler.transform(X_train)

array([[ 0.60398631, -0.49919921,  1.03348522, ...,  0.80429621,
        -3.27266294,  1.58215899],
       [-0.38880673,  0.35563644, -1.12055367, ..., -1.61715132,
         0.22177164, -1.33403425],
       [ 0.39298676, -0.49919921,  1.03348522, ...,  0.80429621,
        -2.87489278,  1.2278422 ],
       ...,
       [ 0.00859406, -0.49919921,  1.03348522, ...,  0.80429621,
        -3.70025757,  0.62301237],
       [ 1.0683238 , -0.49919921,  1.03348522, ...,  0.80429621,
        -3.18175351,  2.95790463],
       [-0.36130738, -0.49919921, -0.16239284, ..., -0.0180822 ,
         0.37233002,  0.78633027]])

In [4]:
#以下代码的功能和上面单元格一样
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[ 0.60398631, -0.49919921,  1.03348522, ...,  0.80429621,
        -3.27266294,  1.58215899],
       [-0.38880673,  0.35563644, -1.12055367, ..., -1.61715132,
         0.22177164, -1.33403425],
       [ 0.39298676, -0.49919921,  1.03348522, ...,  0.80429621,
        -2.87489278,  1.2278422 ],
       ...,
       [ 0.00859406, -0.49919921,  1.03348522, ...,  0.80429621,
        -3.70025757,  0.62301237],
       [ 1.0683238 , -0.49919921,  1.03348522, ...,  0.80429621,
        -3.18175351,  2.95790463],
       [-0.36130738, -0.49919921, -0.16239284, ..., -0.0180822 ,
         0.37233002,  0.78633027]])

In [5]:
ridge = Ridge().fit(X_train_scaled , y_train)

In [6]:
X_test_scaled = scaler.transform(X_test)
ridge.score(X_test_scaled, y_test)

0.671907485319964

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV
import numpy as np
scores = cross_val_score(RidgeCV(), X_train, y_train, cv=10)
np.mean(scores), np.std(scores)

(0.7208983348527787, 0.10679841325869681)

In [8]:
scores = cross_val_score(RidgeCV(), X_train_scaled, y_train, cv=10)
np.mean(scores), np.std(scores)

(0.7195548803312096, 0.10542202848857646)

In [9]:
from sklearn.neighbors import KNeighborsRegressor
scores = cross_val_score(KNeighborsRegressor(), X_train, y_train, cv=10)
np.mean(scores), np.std(scores)

(0.4841041916456734, 0.18917235505485738)

In [10]:
scores = cross_val_score(KNeighborsRegressor(), X_train_scaled, y_train, cv=10)
np.mean(scores), np.std(scores)

(0.7021981056695604, 0.09895154886785795)

In [11]:
from sklearn.linear_model import Ridge

In [12]:
scores = cross_val_score(Ridge(), X_train, y_train, cv=10)
np.mean(scores), np.std(scores)

(0.7186957960440477, 0.10799154462165003)

In [13]:
from sklearn.linear_model import Ridge
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
ridge = Ridge().fit(X_train_scaled, y_train)
X_test_scaled = scaler.transform(X_test)
ridge.score(X_test_scaled, y_test)

0.6345884564889053

In [14]:
from sklearn.pipeline import make_pipeline

In [15]:
pipe = make_pipeline(StandardScaler(), Ridge())

In [16]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge',
                 Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=None, normalize=False, random_state=None,
                       solver='auto', tol=0.001))],
         verbose=False)

In [17]:
pipe.score(X_test, y_test)

0.6345884564889053

* Naming Steps

In [18]:
from sklearn.pipeline import make_pipeline
knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
print(knn_pipe.steps)

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsregressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform'))]


In [19]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', StandardScaler()), ('regressor', KNeighborsRegressor())])
print(pipe.steps)

[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('regressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform'))]


In [20]:
from sklearn.neighbors import KNeighborsRegressor
#knn_pipe = Pipeline([('scaler', StandardScaler()), ('kneighborregressor', KNeighborsRegressor())])
knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())

In [21]:
scores = cross_val_score(knn_pipe, X_train, y_train, cv=10)
np.mean(scores), np.std(scores)

(0.7455608822349746, 0.10625173604969355)

* Pipeline and GridSearchCV

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
knn_pipe = Pipeline([('scaler', StandardScaler()), ('regressor', KNeighborsRegressor())])
#knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())

In [24]:
param_grid = {'regressor__n_neighbors':range(1,10)} #Here， 字段名应该跟pipeline里面name相匹配, name__n_neighbors
#param_grid = {'kneighborsregressor__n_neighbors':range(1,10)}

In [25]:
grid = GridSearchCV(knn_pipe, param_grid=param_grid, cv=10)

In [26]:
grid.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('regressor',
                                        KNeighborsRegressor(algorithm='auto',
                                                            leaf_size=30,
                                                            metric='minkowski',
                                                            metric_params=None,
                                                            n_jobs=None,
                                                            n_neighbors=5, p=2,
                                                            weights='uniform'))],
                                verbose=Fal

In [27]:
grid.best_params_

{'regressor__n_neighbors': 7}

In [28]:
grid.score(X_test, y_test)

0.5999825126971097

* Categorical Variables

In [29]:
import pandas as pd
df = pd.DataFrame({
    'boro':['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx'],
    'vegan':['No', 'No', 'No', 'Yes', 'Yes', 'No']
})

In [30]:
df

Unnamed: 0,boro,vegan
0,Manhattan,No
1,Queens,No
2,Manhattan,No
3,Brooklyn,Yes
4,Brooklyn,Yes
5,Bronx,No


In [31]:
c=df['boro'].astype('category') #transfrom to category c is a series

In [32]:
c

0    Manhattan
1       Queens
2    Manhattan
3     Brooklyn
4     Brooklyn
5        Bronx
Name: boro, dtype: category
Categories (4, object): [Bronx, Brooklyn, Manhattan, Queens]

In [33]:
c.cat.codes # cat means category accessator, codes means code number

0    2
1    3
2    2
3    1
4    1
5    0
dtype: int8

In [34]:
df.boro_ordinal = df.boro.astype('category').cat.codes

  """Entry point for launching an IPython kernel.


In [35]:
df

Unnamed: 0,boro,vegan
0,Manhattan,No
1,Queens,No
2,Manhattan,No
3,Brooklyn,Yes
4,Brooklyn,Yes
5,Bronx,No


* However, tranforming above isn't good

In [36]:
pd.get_dummies(df['boro'])

Unnamed: 0,Bronx,Brooklyn,Manhattan,Queens
0,0,0,1,0
1,0,0,0,1
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
5,1,0,0,0


In [37]:
df.drop(labels='boro_ordinal', axis=1, inplace=True)

KeyError: "['boro_ordinal'] not found in axis"

In [38]:
pd.get_dummies(df)

Unnamed: 0,boro_Bronx,boro_Brooklyn,boro_Manhattan,boro_Queens,vegan_No,vegan_Yes
0,0,0,1,0,1,0
1,0,0,0,1,1,0
2,0,0,1,0,1,0
3,0,1,0,0,0,1
4,0,1,0,0,0,1
5,1,0,0,0,1,0


In [39]:
pd.get_dummies(df, columns=['boro'])

Unnamed: 0,vegan,boro_Bronx,boro_Brooklyn,boro_Manhattan,boro_Queens
0,No,0,0,1,0
1,No,0,0,0,1
2,No,0,0,1,0
3,Yes,0,1,0,0
4,Yes,0,1,0,0
5,No,1,0,0,0


In [40]:
df = pd.DataFrame({'salary':[103,89,142,54,63,219],
                  'boro':[0,1,0,2,2,3]})
df

Unnamed: 0,salary,boro
0,103,0
1,89,1
2,142,0
3,54,2
4,63,2
5,219,3


In [41]:
pd.get_dummies(df)

Unnamed: 0,salary,boro
0,103,0
1,89,1
2,142,0
3,54,2
4,63,2
5,219,3


In [42]:
pd.get_dummies(df, columns=['boro', 'salary'])

Unnamed: 0,boro_0,boro_1,boro_2,boro_3,salary_54,salary_63,salary_89,salary_103,salary_142,salary_219
0,1,0,0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,0,1,0
3,0,0,1,0,1,0,0,0,0,0
4,0,0,1,0,0,1,0,0,0,0
5,0,0,0,1,0,0,0,0,0,1


* Pandas Categorical Columns

In [43]:
import pandas as pd

In [44]:
df = pd.DataFrame({'salary':[103,89,142,54,63,219],
                  'boro':['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx']})

In [45]:
df

Unnamed: 0,salary,boro
0,103,Manhattan
1,89,Queens
2,142,Manhattan
3,54,Brooklyn
4,63,Brooklyn
5,219,Bronx


In [46]:
df.dtypes

salary     int64
boro      object
dtype: object

In [47]:
df['boro'] = pd.Categorical(df.boro, categories=['Manhattan', 'Queens', 'Brooklyn' , 'Bronx', 'Staten Island'])

In [48]:
df

Unnamed: 0,salary,boro
0,103,Manhattan
1,89,Queens
2,142,Manhattan
3,54,Brooklyn
4,63,Brooklyn
5,219,Bronx


In [49]:
df.dtypes

salary       int64
boro      category
dtype: object

In [50]:
pd.get_dummies(df)

Unnamed: 0,salary,boro_Manhattan,boro_Queens,boro_Brooklyn,boro_Bronx,boro_Staten Island
0,103,1,0,0,0,0
1,89,0,1,0,0,0
2,142,1,0,0,0,0
3,54,0,0,1,0,0
4,63,0,0,1,0,0
5,219,0,0,0,1,0


In [60]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

* The Future is OneHotEncoderEncoder+ColumnTransformer

In [61]:
categorical = df.dtypes == object

In [62]:
categorical

salary    False
boro      False
dtype: bool

In [66]:
preprocess = make_column_transformer(
        (StandardScaler(), ~categorical),
        (OneHotEncoder(), categorical))
model = make_pipeline(preporcess, LogisticRegression())
model

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('standardscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  salary    True
boro      True
dtype: bool),
                                                 ('onehotencoder',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                          

In [69]:
# another way
preprocess = ColumnTransformer([('scaler', StandardScaler(), ~categorical), ('one-hot', OneHotEncoder(), categorical)])
model = Pipeline([('preprocess', preprocess), ('regression', LogisticRegression())])
model

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  salary    True
boro      True
dtype: bool),
                                                 ('one-hot',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories=None,
                                                                drop=None,
                                                               