<a href="https://colab.research.google.com/github/jarekwan/jarwan_projekt/blob/main/kod_200925.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Location of the Titanic passenger table used throughout the notebook.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')

data = pd.read_csv(titanic_url)

# Column groups routed to the two preprocessing branches.
num_features = ["age", "fare"]
cat_features = ["sex", "embarked"]

# Numeric branch: median imputation followed by standardization.
num_pipe = Pipeline(steps=[
    ("imputation", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("numeric", num_pipe, num_features),
    ("categorical", cat_pipe, cat_features),
])

# Fit the preprocessor on its own, using only the columns it consumes.
preprocessor.fit(data[num_features + cat_features])

# Full model: preprocessing followed by a logistic regression with default settings.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression()),
])

In [None]:
# Show the assembled pipeline object as the cell output.
pipeline

In [None]:
# The target is the `survived` column; every other column stays in the feature frame.
y = data["survived"]
X = data.drop(columns=["survived"])

In [None]:
# Fit the whole pipeline (preprocessing + model) on the full feature frame and target.
pipeline.fit(X, y)

In [None]:
# Predict on the same rows the pipeline was fitted on.
pipeline.predict(X)

array([1, 0, 1, ..., 0, 0, 0])

Gotowe transformatory:
* https://scikit-learn.org/stable/api/sklearn.preprocessing.html
* https://feature-engine.trainindata.com/en/1.8.x/user_guide/wrappers/Wrapper.html

Nie są to jedyne źródła.

In [None]:
import numpy as np
# Small 3x2 integer matrix used to try out the hand-written scalers.
A = np.array([[1, 2], [2, 3], [3, 7]])

In [None]:
class MyStandardScaler:
    """First draft of a hand-written standard scaler: it can only learn statistics."""

    def fit(self, X, y=None):
        """Store the column-wise mean and standard deviation of ``X``.

        ``X`` must provide ``mean`` and ``std`` methods that accept an axis
        (the notebook passes a 2-D NumPy array). ``y`` is accepted and ignored.
        Returns the instance itself.
        """
        # fit() is where the statistics needed for the later transformation are collected
        self.means_ = X.mean(axis=0)
        self.stds_ = X.std(axis=0)
        # fit() must always return the estimator itself
        return self


In [None]:
# Create an unfitted scaler instance.
scaler = MyStandardScaler()

In [None]:
# fit() returns the scaler itself, so the instance is what the cell displays.
scaler.fit(A)

<__main__.MyStandardScaler at 0x7fbd6911d2d0>

In [None]:
# Per-column means stored by fit().
scaler.means_

array([2., 4.])

In [None]:
# Per-column standard deviations stored by fit().
scaler.stds_

array([0.81649658, 2.1602469 ])

In [None]:
class MyStandardScaler:
    """Hand-written standard scaler with separate ``fit`` and ``transform`` steps."""

    def fit(self, X, y=None):
        """Store the column-wise mean and standard deviation of ``X``; return ``self``."""
        # fit() is where the statistics needed for the transformation are collected
        self.means_ = X.mean(axis=0)
        self.stds_ = X.std(axis=0)
        # fit() must always return the estimator itself
        return self

    def transform(self, X):
        """Return ``X`` centered on the stored means and divided by the stored stds."""
        # transform() only applies the transformation and returns the new form of the data
        centered = X - self.means_
        return centered / self.stds_


In [None]:
# fit() returns the instance, so creation and fitting can be chained.
scaler = MyStandardScaler().fit(A)

# Apply the learned statistics to the same matrix.
scaler.transform(A)

array([[-1.22474487, -0.9258201 ],
       [ 0.        , -0.46291005],
       [ 1.22474487,  1.38873015]])

In [None]:
# Rebuild the two-branch preprocessor; the numeric branch now ends with the
# hand-written MyStandardScaler.
num_pipe = Pipeline(steps=[
    ("imputation", SimpleImputer(strategy="median")),
    ("scaler", MyStandardScaler()),
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("numeric", num_pipe, num_features),
    ("categorical", cat_pipe, cat_features),
])

# Fit on exactly the columns the two branches consume.
preprocessor.fit(data[num_features + cat_features])

In [None]:
# Run the fitted preprocessor on the same columns it was fitted on.
preprocessor.transform(data[num_features+cat_features])

array([[-0.03900549,  3.44258413,  0.        ,  0.        ,  1.        ],
       [-2.21595217,  2.2866387 ,  1.        ,  0.        ,  1.        ],
       [-2.13197749,  2.2866387 ,  0.        ,  0.        ,  1.        ],
       ...,
       [-0.23279919, -0.50377442,  1.        ,  0.        ,  0.        ],
       [-0.19404045, -0.50377442,  1.        ,  0.        ,  0.        ],
       [-0.03900549, -0.49120717,  1.        ,  0.        ,  1.        ]])

In [None]:
# Preprocessing followed by a default logistic regression.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression()),
])

pipeline.fit(X, y)

In [None]:
# num_pipe = Pipeline([
#     ("imputation", SimpleImputer(strategy="median")),
#     ("scaler", 'passthrough')
# ])

# cat_pipe = Pipeline([
#     ("imputer", SimpleImputer(strategy = "most_frequent")),
#     ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
# ])

# preprocessor = ColumnTransformer(transformers=[
#     ("numeric", num_pipe, num_features),
#     ("categorical", cat_pipe, cat_features)
# ])

# preprocessor.fit(data[num_features+cat_features])

In [None]:
class MyStandardScaler:
    """Hand-written standard scaler with an explicit ``fit_transform`` shortcut."""

    def fit(self, X, y=None):
        """Store the column-wise mean and standard deviation of ``X``; return ``self``."""
        # fit() is where the statistics needed for the transformation are collected
        self.means_ = X.mean(axis=0)
        self.stds_ = X.std(axis=0)
        # fit() must always return the estimator itself
        return self

    def transform(self, X):
        """Return ``X`` centered on the stored means and divided by the stored stds."""
        # transform() only applies the transformation and returns the new form of the data
        centered = X - self.means_
        return centered / self.stds_

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed ``X`` in one call."""
        self.fit(X, y)
        return self.transform(X)

    # equivalent version:
    # def fit_transform(self, X, y=None):
        # return self.fit(X, y).transform(X)

In [None]:
StandardScaler?

[0;31mInit signature:[0m [0mStandardScaler[0m[0;34m([0m[0;34m*[0m[0;34m,[0m [0mcopy[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mwith_mean[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mwith_std[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Standardize features by removing the mean and scaling to unit variance.

The standard score of a sample `x` is calculated as:

    z = (x - u) / s

where `u` is the mean of the training samples or zero if `with_mean=False`,
and `s` is the standard deviation of the training samples or one if
`with_std=False`.

Centering and scaling happen independently on each feature by computing
the relevant statistics on the samples in the training set. Mean and
standard deviation are then stored to be used on later data using
:meth:`transform`.

Standardization of a dataset is a common requirement for many
machine learning estimators: they might behave badly if the
individual features do not more or l

In [None]:
class MyStandardScaler:
    """Hand-written standard scaler whose division by the std can be switched off.

    Parameters
    ----------
    with_std : bool, default=True
        When false, data is only centered and ``stds_`` is never created.
    """

    def __init__(self, with_std=True):
        self.with_std = with_std

    def fit(self, X, y=None):
        """Store the column means, plus the column stds if ``with_std`` is set."""
        # fit() is where the statistics needed for the transformation are collected
        self.means_ = X.mean(axis=0)
        if self.with_std:
            self.stds_ = X.std(axis=0)
        # fit() must always return the estimator itself
        return self

    def transform(self, X):
        """Center ``X`` and, if ``with_std`` is set, divide by the stored stds."""
        # transform() only applies the transformation and returns the new form of the data
        centered = X - self.means_
        if not self.with_std:
            return centered
        return centered / self.stds_

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed ``X`` in one call."""
        self.fit(X, y)
        return self.transform(X)

In [None]:
# Default settings (with_std=True): fit and transform the toy matrix in one call.
scaler = MyStandardScaler()
scaler.fit_transform(A)

array([[-1.22474487, -0.9258201 ],
       [ 0.        , -0.46291005],
       [ 1.22474487,  1.38873015]])

In [None]:
class MyStandardScaler:
    """Hand-written standard scaler; ``stds_`` always exists after ``fit``.

    Parameters
    ----------
    with_std : bool, default=True
        When false, data is only centered and ``stds_`` is stored as ``None``.
    """

    def __init__(self, with_std=True):
        self.with_std = with_std

    def fit(self, X, y=None):
        """Store the column means and the column stds (or ``None``); return ``self``."""
        self.means_ = X.mean(axis=0)
        if self.with_std:
            self.stds_ = X.std(axis=0)
        else:
            self.stds_ = None
        return self

    def transform(self, X):
        """Center ``X`` and, if ``with_std`` is set, divide by the stored stds."""
        centered = X - self.means_
        return centered / self.stds_ if self.with_std else centered

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed ``X`` in one call."""
        self.fit(X, y)
        return self.transform(X)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin

In [None]:
class MyStandardScaler(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):  # in practice this is simply what we use by default
    """Standard scaler built on scikit-learn's base classes.

    No ``fit_transform`` is defined here, yet the notebook calls it on this
    class, so it comes from the inherited base classes. ``fit`` does not
    record ``n_features_in_`` in this version.

    Parameters
    ----------
    with_std : bool, default=True
        When false, data is only centered and ``stds_`` is stored as ``None``.
    """

    def __init__(self, with_std=True):
        self.with_std = with_std

    def fit(self, X, y=None):
        """Store the column means and the column stds (or ``None``); return ``self``."""
        self.means_ = X.mean(axis=0)
        if self.with_std:
            self.stds_ = X.std(axis=0)
        else:
            self.stds_ = None
        return self

    def transform(self, X):
        """Center ``X`` and, if ``with_std`` is set, divide by the stored stds."""
        centered = X - self.means_
        return centered / self.stds_ if self.with_std else centered

In [None]:
# Fresh instance of the mixin-based scaler.
scaler = MyStandardScaler()

In [None]:
# The class defines no fit_transform of its own; the call resolves to an inherited one.
scaler.fit_transform(A)

array([[-1.22474487, -0.9258201 ],
       [ 0.        , -0.46291005],
       [ 1.22474487,  1.38873015]])

In [None]:
# get_params() is inherited as well; it reports the constructor argument.
scaler.get_params()

{'with_std': True}

In [None]:
# Intentional demonstration: this raises NotFittedError (see the output below).
# NOTE(review): presumably because fit() does not set n_features_in_ yet; the next
# class version adds that attribute.
scaler.get_feature_names_out()

NotFittedError: This MyStandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
class MyStandardScaler(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):  # in practice this is simply what we use by default
    """Standard scaler that records the input width and always returns an ndarray.

    Statistics are computed with the ``mean``/``std`` methods of whatever object
    is passed to ``fit``.
    NOTE(review): for a DataFrame, pandas' ``std`` presumably uses a different
    default estimator than NumPy's - confirm the difference is intended.

    Parameters
    ----------
    with_std : bool, default=True
        When false, data is only centered and ``stds_`` is stored as ``None``.
    """

    def __init__(self, with_std=True):
        self.with_std = with_std

    def fit(self, X, y=None):
        """Store column means, column stds (or ``None``) and the number of columns."""
        self.means_ = X.mean(axis=0)
        if self.with_std:
            self.stds_ = X.std(axis=0)
        else:
            self.stds_ = None
        # number of input columns, taken from the second dimension of X
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """Center (and optionally scale) ``X``; the result is wrapped in ``np.array``."""
        centered = X - self.means_
        scaled = centered / self.stds_ if self.with_std else centered
        return np.array(scaled)

In [None]:
# fit() returns the instance, so creation and fitting can be chained.
scaler = MyStandardScaler().fit(A)

# With n_features_in_ recorded by fit(), feature names can now be produced.
scaler.get_feature_names_out()

array(['x0', 'x1'], dtype=object)

In [None]:
# Same matrix wrapped in a DataFrame; the printed values differ from the ndarray run.
# NOTE(review): presumably because pandas' std() defaults to ddof=1 while NumPy's
# defaults to ddof=0 - confirm this difference is intended.
scaler = MyStandardScaler()
scaler.fit_transform(pd.DataFrame(A))

array([[-1.        , -0.75592895],
       [ 0.        , -0.37796447],
       [ 1.        ,  1.13389342]])

In [None]:
import warnings

In [None]:
class MyStandardScaler(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):  # in practice this is simply what we use by default
    """Standard scaler that skips constant columns when dividing by the std.

    Input is converted to a float ndarray in both ``fit`` and ``transform``,
    so ndarray and DataFrame input behave the same and ``transform`` no longer
    fails on a DataFrame.

    Parameters
    ----------
    with_std : bool, default=True
        When false, data is only centered and ``stds_`` is stored as ``None``.

    Attributes
    ----------
    means_ : ndarray
        Column means seen in ``fit``.
    stds_ : ndarray or None
        Column standard deviations seen in ``fit``.
    n_features_in_ : int
        Number of columns seen in ``fit``.
    feature_names_in_ : ndarray of object
        Column labels; only set when ``fit`` received a DataFrame.
    """

    def __init__(self, with_std=True):
        self.with_std = with_std

    def fit(self, X, y=None):
        """Learn column statistics from ``X`` (2-D ndarray or DataFrame); return ``self``."""
        # A plain float array gives the same statistics regardless of the container type
        values = np.asarray(X, dtype=float)
        self.means_ = values.mean(axis=0)
        self.stds_ = values.std(axis=0) if self.with_std else None
        self.n_features_in_ = values.shape[1]
        if isinstance(X, pd.DataFrame):
            # store the labels as an object ndarray rather than a pandas Index
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X):
        """Return a new float ndarray with ``X`` centered and, optionally, scaled.

        Columns whose fitted std is zero are centered but not divided; a
        warning is emitted when such columns exist.
        """
        # The subtraction allocates a new array, so the caller's data is never modified
        scaled = np.asarray(X, dtype=float) - self.means_
        if self.with_std:
            where_std_non_zero = np.where(self.stds_ > 0)[0]

            if any(self.stds_ == 0):
                warnings.warn("There are columns with constant value! They will be skip during dividing by stds.")

            if len(where_std_non_zero) > 0:
                scaled[:, where_std_non_zero] = scaled[:, where_std_non_zero] / self.stds_[where_std_non_zero]
        return scaled

In [None]:
# Matrix whose columns are both constant (standard deviation 0).
A = np.array([[2, 2], [2, 2], [2, 2]])

In [None]:
# Constant columns: the values are only centered, because division by a zero std is skipped.
scaler = MyStandardScaler()
scaler.fit_transform(A)



array([[0., 0.],
       [0., 0.],
       [0., 0.]])

In [None]:
class MyStandardScaler(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):  # in practice this is simply what we use by default
    """Center columns on their mean and optionally divide by their standard deviation.

    All checks and index bookkeeping happen once in ``fit``; ``transform`` only
    applies the stored statistics. Input is converted to a float ndarray, so
    ndarray and DataFrame input give the same result and ``transform`` works
    for both.

    Parameters
    ----------
    with_std : bool, default=True
        When false, data is only centered and no standard deviations are computed.

    Attributes
    ----------
    means_ : ndarray
        Column means seen in ``fit``.
    stds_ : ndarray
        Column standard deviations; only set when ``with_std`` is true.
    where_std_non_zero_ : ndarray of int
        Positions of columns with a positive std; only set when ``with_std`` is true.
    any_std_non_zero_ : bool
        Whether at least one such column exists; only set when ``with_std`` is true.
    n_features_in_ : int
        Number of columns seen in ``fit``.
    feature_names_in_ : ndarray of object
        Column labels; only set when ``fit`` received a DataFrame.
    """

    def __init__(self, with_std=True):
        self.with_std = with_std

    def fit(self, X, y=None):
        """Learn column statistics from ``X``; return ``self``.

        Raises
        ------
        ValueError
            If ``X`` is neither an ndarray nor a DataFrame, or is not 2-dimensional.

        Warns when at least one column is constant (std equal to zero).
        """
        if not isinstance(X, (np.ndarray, pd.DataFrame)):
            raise ValueError(f"X must be numpy array or pandas dataframe - got {type(X)} instead.")
        # A plain float array gives the same statistics regardless of the container type
        values = np.asarray(X, dtype=float)
        if values.ndim != 2:
            raise ValueError(f"X must be 2-dimensional - got {values.ndim} dimension(s) instead.")

        self.means_ = values.mean(axis=0)
        if self.with_std:
            self.stds_ = values.std(axis=0)
            self.where_std_non_zero_ = np.where(self.stds_ > 0)[0]
            if any(self.stds_ == 0):
                warnings.warn("There are columns with constant value! They will be skip during dividing by stds.")
            self.any_std_non_zero_ = len(self.where_std_non_zero_) > 0

        self.n_features_in_ = values.shape[1]
        if isinstance(X, pd.DataFrame):
            # store the labels as an object ndarray rather than a pandas Index
            self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X):
        """Return a new float ndarray with ``X`` centered and, optionally, scaled.

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional or its column count differs from
            the one seen in ``fit``.
        """
        values = np.asarray(X, dtype=float)
        if values.ndim != 2 or values.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X must have shape (n_samples, {self.n_features_in_}) - got {values.shape} instead."
            )
        # The subtraction allocates a new array, so the caller's data is never modified
        scaled = values - self.means_
        if self.with_std and self.any_std_non_zero_:
            columns = self.where_std_non_zero_
            scaled[:, columns] = scaled[:, columns] / self.stds_[columns]
        return scaled

To nadal nie jest dopracowane. Chociażby sprawdzanie poprawności danych w fit i transform należałoby dokończyć.

In [None]:
A = np.array([[2, 1], [2, 2], [7, 2]])

# with_std switched off: the columns are only centered on their means.
scaler = MyStandardScaler(with_std=False)
scaler.fit_transform(A)

array([[-1.66666667, -0.66666667],
       [-1.66666667,  0.33333333],
       [ 3.33333333,  0.33333333]])

In [None]:
# Rebuild the two-branch preprocessor with the latest MyStandardScaler in the numeric branch.
num_pipe = Pipeline(steps=[
    ("imputation", SimpleImputer(strategy="median")),
    ("scaler", MyStandardScaler()),
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("numeric", num_pipe, num_features),
    ("categorical", cat_pipe, cat_features),
])

# Fit on exactly the columns the two branches consume.
preprocessor.fit(data[num_features + cat_features])

In [None]:
# Run the fitted preprocessor on the same columns it was fitted on.
preprocessor.transform(data[num_features+cat_features])

array([[-0.03900549,  3.44258413,  0.        ,  0.        ,  1.        ],
       [-2.21595217,  2.2866387 ,  1.        ,  0.        ,  1.        ],
       [-2.13197749,  2.2866387 ,  0.        ,  0.        ,  1.        ],
       ...,
       [-0.23279919, -0.50377442,  1.        ,  0.        ,  0.        ],
       [-0.19404045, -0.50377442,  1.        ,  0.        ,  0.        ],
       [-0.03900549, -0.49120717,  1.        ,  0.        ,  1.        ]])

In [None]:
# Output column names of the fitted preprocessor, prefixed with the branch name.
preprocessor.get_feature_names_out()

array(['numeric__age', 'numeric__fare', 'categorical__sex_male',
       'categorical__embarked_Q', 'categorical__embarked_S'], dtype=object)

In [None]:
# Preprocessing followed by a default logistic regression.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression()),
])

In [None]:
# Fit preprocessing and model together.
pipeline.fit(X, y)

In [None]:
pipeline.named_steps['preprocessor'].get_feature_names_out() # the variables that are fed into the model

array(['numeric__age', 'numeric__fare', 'categorical__sex_male',
       'categorical__embarked_Q', 'categorical__embarked_S'], dtype=object)

In [None]:
class MyMinMaxScaler(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Rescale each column so that the values seen in ``fit`` span 0 to 1.

    Attributes
    ----------
    mins_, maxs_ : column-wise minimum and maximum seen in ``fit``
    n_features_in_ : int
        Number of columns seen in ``fit``.
    """

    def fit(self, X, y=None):
        """Store the column-wise minimum and maximum of the 2-D input ``X``; return ``self``."""
        self.mins_ = X.min(0)
        self.maxs_ = X.max(0)
        # Recorded so that the inherited get_feature_names_out() can be used after fit
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """Return ``(X - min) / (max - min)`` per column.

        A column that was constant in ``fit`` has a zero range; it is divided by
        1 instead, so its fitted value maps to 0 rather than NaN/inf.
        """
        span = self.maxs_ - self.mins_
        # Adding the boolean mask turns every zero range into 1 and leaves the others unchanged
        span = span + (span == 0)
        return (X - self.mins_) / span

In [None]:
# Toy matrix for the min-max scaler.
A = np.array([[2, 1], [2, 3], [7, 2]])

In [None]:
# Fit and transform in one call; every column of the fitted data ends up spanning 0 to 1.
scaler = MyMinMaxScaler()
scaler.fit_transform(A)

array([[0. , 0. ],
       [0. , 1. ],
       [1. , 0.5]])

In [None]:
# Two-step variant: fit() returns the instance, so it can be chained with the constructor.
scaler = MyMinMaxScaler().fit(A)
scaler.transform(A)

array([[0. , 0. ],
       [0. , 1. ],
       [1. , 0.5]])

In [None]:
# A row lying outside the fitted min/max of A: the result leaves the 0..1 interval.
B = np.array([[77, -4]])
scaler.transform(B)

array([[15. , -2.5]])

In [None]:
class MyMinMaxScaler(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    """Rescale each column so that the values seen in ``fit`` span ``feature_range``.

    Parameters
    ----------
    feature_range : tuple (a, b), default=(0, 1)
        Interval that the fitted minimum and maximum are mapped onto.
    clip_out_of_range : bool, default=False
        When true, transformed values are clipped to ``[a, b]``.

    Attributes
    ----------
    mins_, maxs_ : column-wise minimum and maximum seen in ``fit``
    n_features_in_ : int
        Number of columns seen in ``fit``.
    """

    def __init__(self, feature_range=(0,1), clip_out_of_range=False):
        self.feature_range = feature_range
        self.clip_out_of_range = clip_out_of_range

    def fit(self, X, y=None):
        """Store the column-wise minimum and maximum of the 2-D input ``X``; return ``self``."""
        self.mins_ = X.min(0)
        self.maxs_ = X.max(0)
        # Recorded so that the inherited get_feature_names_out() can be used after fit
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """Map ``X`` onto ``feature_range`` using the fitted minima and maxima.

        A column that was constant in ``fit`` has a zero range; it is divided by
        1 instead, so its fitted value maps to ``a`` rather than NaN/inf.
        """
        a, b = self.feature_range
        span = self.maxs_ - self.mins_
        # Adding the boolean mask turns every zero range into 1 and leaves the others unchanged
        span = span + (span == 0)
        X = (X - self.mins_) / span
        X = a + X * (b - a)
        if self.clip_out_of_range:
            X = np.clip(X, a, b)
        return X

In [None]:
# Clipping enabled: out-of-range inputs are limited to the ends of feature_range.
scaler = MyMinMaxScaler(feature_range=(0, 1), clip_out_of_range=True)
scaler.fit(A)

B = np.array([[77, -4]])
scaler.transform(B)

array([[1., 0.]])

In [None]:
np.clip?

[0;31mSignature:[0m       [0mnp[0m[0;34m.[0m[0mclip[0m[0;34m([0m[0ma[0m[0;34m,[0m [0ma_min[0m[0;34m,[0m [0ma_max[0m[0;34m,[0m [0mout[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mCall signature:[0m  [0mnp[0m[0;34m.[0m[0mclip[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m            _ArrayFunctionDispatcher
[0;31mString form:[0m     <function clip at 0x7fbdb01012d0>
[0;31mFile:[0m            ~/.local/lib/python3.10/site-packages/numpy/core/fromnumeric.py
[0;31mDocstring:[0m      
Clip (limit) the values in an array.

Given an interval, values outside the interval are clipped to
the interval edges.  For example, if an interval of ``[0, 1]``
is specified, values smaller than 0 become 0, and values larger
than 1 become 1.

Equivalent to but faster than ``np.minimum(a_max, np.maximum(a, a_min))``.

No

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Columns used as model inputs, grouped by preprocessing branch.
num_features = ["age", "fare"]
cat_features = ["sex", "embarked"]

X = data[num_features+cat_features]
y = data["survived"]

# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# everything computed from it, reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline pipeline with scikit-learn's own StandardScaler in the numeric branch.
num_pipe = Pipeline(steps=[
    ("imputation", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("numeric", num_pipe, num_features),
    ("categorical", cat_pipe, cat_features),
])

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression()),
])

In [None]:
# Search over the numeric imputation strategy and the regularization strength C.
# Parameter names follow the nesting: pipeline step -> transformer -> step -> parameter.
param_grid = {
    "preprocessor__numeric__imputation__strategy": ["mean", "median"],
    "model__C": [0.1, 1, 10]
}

# The shuffled folds get a fixed seed so the scores and the selected parameters
# are reproducible between runs.
optimizer = GridSearchCV(pipeline, param_grid, cv=KFold(10, shuffle=True, random_state=42), scoring="accuracy", n_jobs=-1)
optimizer.fit(X_train, y_train)
optimizer.best_params_

{'model__C': 0.1, 'preprocessor__numeric__imputation__strategy': 'mean'}

Ćwiczenie

Zoptymalizować skalowanie:

* czy skalowanie do zakresu [0, 1] czy [-1, 1]
* czy z clipowaniem czy bez.

In [None]:
# Pipeline whose numeric branch ends with the hand-written MyMinMaxScaler.
num_pipe = Pipeline([
    ("imputation", SimpleImputer(strategy="median")),
    ("scaler", MyMinMaxScaler())
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(drop = "first", handle_unknown = "ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("numeric", num_pipe, num_features),
    ("categorical", cat_pipe, cat_features)
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression())
])

# Two sub-grids: the first varies the MyMinMaxScaler settings (range and clipping),
# the second replaces the whole scaler step with StandardScaler.
param_grid = [
    {
        "preprocessor__numeric__scaler__feature_range": [(0,1), (-1, 1)],
        "preprocessor__numeric__scaler__clip_out_of_range": [False, True],
    },
    {
        "preprocessor__numeric__scaler": [StandardScaler()]
    }
]

# The shuffled folds get a fixed seed so the scores and the selected parameters
# are reproducible between runs.
optimizer = GridSearchCV(pipeline, param_grid, cv=KFold(10, shuffle=True, random_state=42), scoring="accuracy", n_jobs=-1)
optimizer.fit(X_train, y_train)
optimizer.best_params_

{'preprocessor__numeric__scaler__clip_out_of_range': False,
 'preprocessor__numeric__scaler__feature_range': (0, 1)}

In [None]:
# Mean cross-validated score of every parameter combination that was tried.
optimizer.cv_results_["mean_test_score"]

array([0.76880037, 0.76687729, 0.76880037, 0.76687729, 0.76401099])

In [None]:
# Show the loaded table (the notebook renders a truncated view).
data

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [None]:
# Mean fare per passenger class, as a {pclass: mean fare} dict.
data.groupby("pclass")["fare"].mean().to_dict()

{1: 87.50899164086687, 2: 21.179196389891697, 3: 13.302888700564973}

In [None]:
class SimpleRelationalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of one column with that column's mean within a group.

    Parameters
    ----------
    grouping_feature : column label
        Column whose values define the groups.
    target_feature : column label
        Column whose missing values are filled.

    Attributes
    ----------
    group_means_ : dict
        Maps each group value seen in ``fit`` to the mean of ``target_feature``
        within that group.
    """

    def __init__(self, grouping_feature, target_feature):
        self.grouping_feature = grouping_feature
        self.target_feature = target_feature


    def fit(self, X, y=None):
        """Learn the per-group means from the DataFrame ``X``; return ``self``."""
        # The statistics must come from the frame passed in, not from a notebook
        # global: otherwise every cross-validation fold would be imputed with means
        # computed on the full data set, including its held-out rows.
        self.group_means_ = X.groupby(self.grouping_feature)[self.target_feature].mean().to_dict()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``target_feature`` values filled.

        Rows whose group was not seen in ``fit`` are left unchanged.
        """
        X = X.copy() # so that the source data is not modified!
        # Computed once: each row belongs to one group, so filling a group never
        # changes which rows of the other groups are missing
        missing = X[self.target_feature].isna()
        for group, value in self.group_means_.items():
            mask = (X[self.grouping_feature] == group) & missing
            X.loc[mask, self.target_feature] = value
        return X
        # it may be worth returning only the target_feature column, because grouping_feature is really just a helper and may be processed in another pipeline branch anyway




In [None]:
# Impute missing fares with the mean fare of the passenger's class.
imputer = SimpleRelationalImputer(grouping_feature="pclass", target_feature="fare")
imputer.fit(data)
# Inspect the learned {pclass: mean fare} mapping.
imputer.group_means_

{1: 87.50899164086687, 2: 21.179196389891697, 3: 13.30288870056497}

In [None]:
# Non-null count of the fare column before imputation.
data["fare"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1309 entries, 0 to 1308
Series name: fare
Non-Null Count  Dtype  
--------------  -----  
1309 non-null   float64
dtypes: float64(1)
memory usage: 10.4 KB


In [None]:
# Non-null count of the fare column in the frame returned by transform().
imputer.transform(data)["fare"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1309 entries, 0 to 1308
Series name: fare
Non-Null Count  Dtype  
--------------  -----  
1309 non-null   float64
dtypes: float64(1)
memory usage: 10.4 KB


In [None]:
# Re-check the source frame: transform() works on a copy, so `data` itself is not modified.
data["fare"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1309 entries, 0 to 1308
Series name: fare
Non-Null Count  Dtype  
--------------  -----  
1309 non-null   float64
dtypes: float64(1)
memory usage: 10.4 KB
