https://scikit-learn.org/stable/modules/classes.html


## SimpleImputer

In [1]:
import pandas as pd
import numpy as np

In [58]:
# Create a random dataset of 10 rows and 4 columns.
# Seed NumPy's global RNG first so that Restart & Run All rebuilds the same frame.
np.random.seed(42)
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))

# Randomly set some values to null: each cell is masked with probability 0.15.
df = df.mask(np.random.random((10, 4)) < .15)

# Make two cells of column B (rows 8 and 9) hold the same value.
# A single .loc call is used instead of chained indexing (df['B'][8] = ...):
# the chained form may write to a temporary copy, raises SettingWithCopyWarning,
# and does not modify `df` at all when pandas Copy-on-Write is enabled.
df.loc[8, 'B'] = df.loc[9, 'B']
df

Unnamed: 0,A,B,C,D
0,-1.225225,-0.441663,-0.639521,0.239312
1,-1.189839,0.244999,,0.237496
2,0.046567,2.044863,-0.492386,1.120441
3,-1.260646,-0.759354,0.084205,-0.302208
4,0.448148,0.450981,-0.341173,-0.160521
5,-0.784799,1.132403,-0.257447,1.106731
6,0.576272,0.590631,1.622067,0.961268
7,2.368502,1.003604,-0.783473,0.227584
8,1.206851,-0.533282,-1.206051,0.021001
9,1.943065,-0.533282,0.007462,1.70983


In [20]:
from sklearn.impute import SimpleImputer

# Replace every missing value with the mean of its column.
mean_imputer = SimpleImputer(strategy='mean')
mean_imputer.fit(df)
result_mean_imputer = mean_imputer.transform(df)

# By default the imputer hands back a bare array, so the column labels are
# re-attached here. NOTE: this rebinds `df` to the imputed frame.
df = pd.DataFrame(data=result_mean_imputer, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0.702285,1.384076,-0.1055,-0.630958
1,-0.638733,0.330033,1.005881,-0.192835
2,0.440931,0.330033,-0.482339,-0.243206
3,0.192037,0.086825,-1.881553,-0.013967
4,-0.079799,0.269569,-0.311806,-0.321948
5,1.072228,0.330033,-0.757691,-0.017213
6,0.190925,-0.594727,1.191518,-0.567384
7,-0.262046,0.737842,-0.067765,-0.036221
8,0.631517,0.213324,1.650627,-1.164322
9,-1.276295,0.213324,-0.919025,-0.676354


# OneHotEncoder

In [32]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Load the housing dataset from a path relative to the working directory.
# NOTE: this rebinds `df`, which earlier cells used for the random A-D frame.
df = pd.read_csv(filepath_or_buffer='datasets/housing.csv')
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [44]:
# OneHotEncoder does not accept a 1-D Series; it needs 2-D input such as [[...], [...]]

In [38]:
# Pull the categorical column out as a Series.
df_ocean = df.loc[:, 'ocean_proximity']

# The encoder expects 2-D input, so the values are passed as an
# (n_rows, 1) column vector rather than as the Series itself.
encoder = OneHotEncoder()
df_ocean_onehot = encoder.fit_transform(np.reshape(df_ocean.to_numpy(), (-1, 1)))

# Convert the encoded result to a dense array for display.
df_ocean_onehot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [43]:
# Show the (n_rows, 1) column-vector form of the Series that the encoder was given.
df_ocean.to_numpy()[:, np.newaxis]

array([['NEAR BAY'],
       ['NEAR BAY'],
       ['NEAR BAY'],
       ...,
       ['INLAND'],
       ['INLAND'],
       ['INLAND']], dtype=object)

# Масштабирование признаков

MinMax

In [24]:
from sklearn.preprocessing import MinMaxScaler 

In [25]:
# Min-max scaling: linearly map every column onto the range [0, 2].
# NOTE(review): the column labels assume `df` is the four-column A-D frame, but
# the cell that reads datasets/housing.csv also binds `df` -- confirm the
# execution order before a Restart & Run All.
scaler = MinMaxScaler(feature_range=(0, 2))
scaler.fit(df)
pd.DataFrame(data=scaler.transform(df), columns=['A', 'B', 'C', 'D'])

Unnamed: 0,A,B,C,D
0,1.684957,2.0,1.005641,0.927303
1,0.542947,0.934667,1.63493,1.689021
2,1.462387,0.934667,0.792267,1.601446
3,1.25043,0.688853,0.0,2.0
4,1.018935,0.873555,0.888826,1.464546
5,2.0,0.934667,0.636356,1.994357
6,1.249482,0.0,1.740042,1.037832
7,0.863734,1.346844,1.027008,1.96131
8,1.624691,0.816707,2.0,0.0
9,0.0,0.816707,0.545005,0.848377


Стандартизация

In [14]:
from sklearn.preprocessing import StandardScaler

In [26]:
# Standardisation: centre each column on its mean and scale it by its
# standard deviation (the StandardScaler defaults).
# NOTE(review): as with the min-max cell, this assumes `df` is currently the
# four-column A-D frame -- confirm against the execution order.
standard = StandardScaler()
pd.DataFrame(
    data=standard.fit_transform(X=df),
    columns=[*'ABCD'],
)

Unnamed: 0,A,B,C,D
0,0.919734,2.2358,-0.036976,-0.693947
1,-1.118978,1.177484e-16,1.052053,0.549459
2,0.522404,1.177484e-16,-0.406236,0.406504
3,0.144018,-0.5158853,-1.777309,1.057091
4,-0.269246,-0.1282544,-0.239133,0.183032
5,1.482149,1.177484e-16,-0.67605,1.047879
6,0.142327,-1.961571,1.233957,-0.513522
7,-0.546311,0.8650307,0.0,0.993935
8,0.812148,-0.2475599,1.683833,-2.207648
9,-2.088246,-0.2475599,-0.83414,-0.822783


# "Конвейеры трансформации"

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [65]:
# Numeric preprocessing chain: mean-impute missing values first, then
# standardise, so the scaler never sees NaNs.
num_pipelines = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('std_scaller', StandardScaler()),
])

# Run both steps in one call and re-attach the column labels for display.
pd.DataFrame(
    data=num_pipelines.fit_transform(df),
    columns=['A', 'B', 'C', 'D'],
)

Unnamed: 0,A,B,C,D
0,-1.135309,-0.885416,-0.583725,-0.440542
1,-1.107374,-0.087176,0.0,-0.443433
2,-0.131303,2.00515,-0.377564,0.961918
3,-1.163272,-1.254728,0.430341,-1.30246
4,0.185723,0.152276,-0.165687,-1.076942
5,-0.787618,0.944423,-0.048372,0.940096
6,0.286869,0.314618,2.585155,0.708568
7,1.701733,0.794696,-0.785427,-0.459209
8,0.784676,-0.991922,-1.377533,-0.788021
9,1.365874,-0.991922,0.322812,1.900026


Также можно сразу обрабатывать числовые и категориальные столбцы.

In [67]:
from copy import deepcopy
# Build a separate frame with an extra categorical column E (values cycling
# through A, B, C over the 10 rows), leaving `df` itself unchanged.
df_c = deepcopy(df).assign(E=list('ABC' * 3 + 'A'))
df_c

Unnamed: 0,A,B,C,D,E
0,-1.225225,-0.441663,-0.639521,0.239312,A
1,-1.189839,0.244999,,0.237496,B
2,0.046567,2.044863,-0.492386,1.120441,C
3,-1.260646,-0.759354,0.084205,-0.302208,A
4,0.448148,0.450981,-0.341173,-0.160521,B
5,-0.784799,1.132403,-0.257447,1.106731,C
6,0.576272,0.590631,1.622067,0.961268,A
7,2.368502,1.003604,-0.783473,0.227584,B
8,1.206851,-0.533282,-1.206051,0.021001,C
9,1.943065,-0.533282,0.007462,1.70983,A


In [69]:
from sklearn.compose import ColumnTransformer
# Route columns by type: A-D go through the numeric pipeline, E is one-hot
# encoded. The transformer outputs are concatenated in the order listed,
# which is why the resulting frame has positional (unnamed) columns.
oHe = OneHotEncoder()
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipelines, ['A', 'B', 'C', 'D']),
    ('cat', oHe, ['E']),
])

pd.DataFrame(data=full_pipeline.fit_transform(df_c))


Unnamed: 0,0,1,2,3,4,5,6
0,-1.135309,-0.885416,-0.583725,-0.440542,1.0,0.0,0.0
1,-1.107374,-0.087176,0.0,-0.443433,0.0,1.0,0.0
2,-0.131303,2.00515,-0.377564,0.961918,0.0,0.0,1.0
3,-1.163272,-1.254728,0.430341,-1.30246,1.0,0.0,0.0
4,0.185723,0.152276,-0.165687,-1.076942,0.0,1.0,0.0
5,-0.787618,0.944423,-0.048372,0.940096,0.0,0.0,1.0
6,0.286869,0.314618,2.585155,0.708568,1.0,0.0,0.0
7,1.701733,0.794696,-0.785427,-0.459209,0.0,1.0,0.0
8,0.784676,-0.991922,-1.377533,-0.788021,0.0,0.0,1.0
9,1.365874,-0.991922,0.322812,1.900026,1.0,0.0,0.0
