<a href="https://colab.research.google.com/github/krakowiakpawel9/ml_course/blob/master/cont/000_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn  
Website: [https://scikit-learn.org/](https://scikit-learn.org/)

In [0]:
# Optional: uncomment to upgrade scikit-learn (e.g. on a fresh Colab runtime).
# !pip install --upgrade scikit-learn

In [2]:
import numpy as np
import pandas as pd
import sklearn
sklearn.__version__

'0.22.1'

In [0]:
# Toy dataset: four text columns (size, color, gender, bought) and two
# numeric ones (price, weight).
data = dict(
    size=['XL', 'L', 'M', 'L', 'M'],
    color=['red', 'green', 'blue', 'green', 'red'],
    gender=['female', 'male', 'male', 'female', 'female'],
    price=[199.0, 89.0, 99.0, 129.0, 79.0],
    weight=[500, 450, 300, 380, 410],
    bought=['yes', 'no', 'yes', 'no', 'yes'],
)

# Keep the untouched frame in df_raw and experiment on an independent copy.
df_raw = pd.DataFrame(data)
df = df_raw.copy()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [0]:
# Column dtypes and non-null counts; the text columns are stored as object here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
size      5 non-null object
color     5 non-null object
gender    5 non-null object
price     5 non-null float64
weight    5 non-null int64
bought    5 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 368.0+ bytes


In [0]:
# Convert every text column to pandas' category dtype with a single astype
# mapping, then print the summary again to confirm the new dtypes.
df = df.astype(dict.fromkeys(['size', 'color', 'gender', 'bought'], 'category'))

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
size      5 non-null category
color     5 non-null category
gender    5 non-null category
price     5 non-null float64
weight    5 non-null int64
bought    5 non-null category
dtypes: category(4), float64(1), int64(1)
memory usage: 628.0 bytes


In [0]:
# Summary statistics; only the numeric columns (price, weight) are reported.
df.describe()

Unnamed: 0,price,weight
count,5.0,5.0
mean,119.0,408.0
std,48.476799,75.299402
min,79.0,300.0
25%,89.0,380.0
50%,99.0,410.0
75%,129.0,450.0
max,199.0,500.0


In [0]:
# Same summary, transposed so that each column becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,5.0,119.0,48.476799,79.0,89.0,99.0,129.0,199.0
weight,5.0,408.0,75.299402,300.0,380.0,410.0,450.0,500.0


In [0]:
# Summary of the category columns: count, number of distinct values,
# most frequent value (top) and how often it occurs (freq).
df.describe(include=['category']).T

Unnamed: 0,count,unique,top,freq
size,5,3,M,2
color,5,3,red,2
gender,5,2,female,3
bought,5,2,yes,3


In [0]:
from sklearn.preprocessing import LabelEncoder

# fit() learns the distinct labels and returns the encoder itself, so it can
# be chained onto the constructor; transform() then maps labels to integers.
le = LabelEncoder().fit(df['bought'])
le.transform(df['bought'])

array([1, 0, 1, 0, 1])

In [0]:
# fit_transform() fits and encodes in a single call.
le.fit_transform(df['bought'])

array([1, 0, 1, 0, 1])

In [0]:
# Labels learned during fit; a label's position in this array is its integer
# code ('no' -> 0, 'yes' -> 1).
le.classes_

array(['no', 'yes'], dtype=object)

In [0]:
# Replace the 'bought' labels in the DataFrame with their integer codes.
df['bought'] = le.fit_transform(df['bought'])
df

Unnamed: 0,size,color,gender,price,bought
0,XL,red,female,199.0,1
1,L,green,male,89.0,0
2,M,blue,male,99.0,1
3,L,green,female,129.0,0
4,M,red,female,79.0,1


In [0]:
# inverse_transform() maps the integer codes back to the original labels.
le.inverse_transform(df['bought'])

array(['yes', 'no', 'yes', 'no', 'yes'], dtype=object)

In [0]:
# Restore the original string labels in the DataFrame.
df['bought'] = le.inverse_transform(df['bought'])
df

Unnamed: 0,size,color,gender,price,bought
0,XL,red,female,199.0,yes
1,L,green,male,89.0,no
2,M,blue,male,99.0,yes
3,L,green,female,129.0,no
4,M,red,female,79.0,yes


In [0]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder works on 2-D input, hence the double brackets that select a
# one-column DataFrame rather than a Series.
encoder = OneHotEncoder()
encoder.fit(df[['size']])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [0]:
# With the default sparse=True, transform() returns a sparse matrix;
# .toarray() converts it to a dense array for display.
encoder.transform(df[['size']]).toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [0]:
# Categories found during fit; their order is the order of the one-hot columns.
encoder.categories_

[array(['L', 'M', 'XL'], dtype=object)]

In [0]:
# drop='first' omits the indicator column of the first category ('L'), so a
# row of zeros now stands for 'L' and k categories need only k-1 columns.
encoder = OneHotEncoder(drop='first')
encoder.fit_transform(df[['size']]).toarray()

array([[0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.]])

In [0]:
# The scikit-learn encoders returned new arrays, so df itself is unchanged.
df

Unnamed: 0,size,color,gender,price,bought
0,XL,red,female,199.0,yes
1,L,green,male,89.0,no
2,M,blue,male,99.0,yes
3,L,green,female,129.0,no
4,M,red,female,79.0,yes


### Pandas get_dummies()

In [0]:
# One-hot encode every text/category column at once; numeric columns are kept as-is.
pd.get_dummies(data=df)

Unnamed: 0,price,size_L,size_M,size_XL,color_blue,color_green,color_red,gender_female,gender_male,bought_no,bought_yes
0,199.0,0,0,1,0,0,1,1,0,0,1
1,89.0,1,0,0,0,1,0,0,1,1,0
2,99.0,0,1,0,1,0,0,0,1,0,1
3,129.0,1,0,0,0,1,0,1,0,1,0
4,79.0,0,1,0,0,0,1,1,0,0,1


In [0]:
# drop_first=True removes the first level of each variable (k-1 dummy columns
# for k categories).
pd.get_dummies(data=df, drop_first=True)

Unnamed: 0,price,size_M,size_XL,color_green,color_red,gender_male,bought_yes
0,199.0,0,1,0,1,0,1
1,89.0,0,0,1,0,1,0
2,99.0,1,0,0,0,1,1
3,129.0,0,0,1,0,0,0
4,79.0,1,0,0,1,0,1


In [0]:
# A single string prefix replaces every source column name in the dummy labels.
pd.get_dummies(data=df, drop_first=True, prefix='new')

Unnamed: 0,price,new_M,new_XL,new_green,new_red,new_male,new_yes
0,199.0,0,1,0,1,0,1
1,89.0,0,0,1,0,1,0
2,99.0,1,0,0,0,1,1
3,129.0,0,0,1,0,0,0
4,79.0,1,0,0,1,0,1


In [0]:
# prefix_sep changes the separator between the column name and the category.
pd.get_dummies(data=df, drop_first=True, prefix_sep='-')

Unnamed: 0,price,size-M,size-XL,color-green,color-red,gender-male,bought-yes
0,199.0,0,1,0,1,0,1
1,89.0,0,0,1,0,1,0
2,99.0,1,0,0,0,1,1
3,129.0,0,0,1,0,0,0
4,79.0,1,0,0,1,0,1


In [0]:
# columns= restricts the encoding to the listed columns; the others stay untouched.
pd.get_dummies(data=df, drop_first=True, columns=['size'])

Unnamed: 0,color,gender,price,bought,size_M,size_XL
0,red,female,199.0,yes,0,1
1,green,male,89.0,no,0,0
2,blue,male,99.0,yes,1,0
3,green,female,129.0,no,0,0
4,red,female,79.0,yes,1,0


### Standaryzacja

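`std()` w pandas – domyślnie `ddof=1` (dzielnik n − 1, nieobciążony estymator wariancji)  
`std()` w NumPy – domyślnie `ddof=0` (dzielnik n, estymator obciążony); tak samo liczy scikit-learn (`scale`, `StandardScaler`)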

In [0]:
# Show the raw prices, their mean and their standard deviation as computed
# by pandas; the first two are followed by a blank line.
price = df['price']
for shown in (price, price.mean()):
    print(shown, '\n')
print(price.std())

0    199.0
1     89.0
2     99.0
3    129.0
4     79.0
Name: price, dtype: float64 

119.0 

48.47679857416329


In [0]:
# Manual z-score: subtract the mean, then divide by the pandas standard deviation.
(df['price'] - df['price'].mean()) / df['price'].std()

0    1.650274
1   -0.618853
2   -0.412568
3    0.206284
4   -0.825137
Name: price, dtype: float64

In [0]:
def standardize(x, ddof=None):
    """Return the z-scores of ``x``: ``(x - x.mean()) / x.std()``.

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric data exposing ``mean()`` and ``std()``.
    ddof : int, optional
        Delta degrees of freedom forwarded to ``x.std()``. When omitted, the
        container's own default is used (pandas: 1, NumPy: 0), which is the
        original behaviour of this function. Pass ``ddof=0`` to reproduce
        the values returned by scikit-learn's ``scale`` / ``StandardScaler``.

    Returns
    -------
    Same container type as ``x``
        The centered and scaled values. Zero spread (constant input) is not
        special-cased.
    """
    # Only forward ddof when the caller asked for it, so that pandas and
    # NumPy inputs keep their respective defaults otherwise.
    spread = x.std() if ddof is None else x.std(ddof=ddof)
    return (x - x.mean()) / spread

# Apply the helper to the price column.
standardize(df['price'])

0    1.650274
1   -0.618853
2   -0.412568
3    0.206284
4   -0.825137
Name: price, dtype: float64

In [0]:
from sklearn.preprocessing import scale

# scale() divides by the population standard deviation (ddof=0), so its
# values differ from the pandas-based z-scores, which use ddof=1.
scale(df['price'])

array([ 1.84506242, -0.69189841, -0.4612656 ,  0.2306328 , -0.92253121])

In [0]:
from sklearn.preprocessing import StandardScaler

# fit() learns the column's mean and scale and returns the scaler, so it can
# be chained onto the constructor; transform() then applies them.
price_column = df[['price']]
scaler = StandardScaler().fit(price_column)
scaler.transform(price_column)

array([[ 1.84506242],
       [-0.69189841],
       [-0.4612656 ],
       [ 0.2306328 ],
       [-0.92253121]])

In [0]:
# pandas: sample standard deviation (ddof=1).
df[['price']].std()

price    48.476799
dtype: float64

In [0]:
# NumPy: population standard deviation (ddof=0) of the underlying array.
df[['price']].values.std()

43.3589667773576

In [0]:
# Mean learned by the scaler during fit.
scaler.mean_

array([119.])

In [0]:
# Scaling factor learned during fit; it matches the NumPy (ddof=0) standard
# deviation of the price column, not the pandas one.
scaler.scale_

array([43.35896678])

### Przygotowanie danych do modelu

In [0]:
# Starting point for the full preprocessing pass: df still holds the original
# labels and unscaled prices.
df

Unnamed: 0,size,color,gender,price,bought
0,XL,red,female,199.0,yes
1,L,green,male,89.0,no
2,M,blue,male,99.0,yes
3,L,green,female,129.0,no
4,M,red,female,79.0,yes


In [0]:
# Full preprocessing pass: encode the target as integers, standardize the
# price, then one-hot encode the remaining categorical columns (dropping the
# first level of each).
le = LabelEncoder()
scaler = StandardScaler()

df['bought'] = le.fit_transform(df['bought'])
df['price'] = scaler.fit_transform(df[['price']])

df = pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,price,bought,size_M,size_XL,color_green,color_red,gender_male
0,1.845062,1,0,1,0,1,0
1,-0.691898,0,0,0,1,0,1
2,-0.461266,1,1,0,0,0,1
3,0.230633,0,0,0,1,0,0
4,-0.922531,1,1,0,0,1,0


### Iloczyn kartezjański dwóch zmiennych

In [3]:
# Small frame with two categorical variables and one numeric measure.
data = dict(
    day=['friday', 'saturday', 'sunday'],
    weather=['sunny', 'cloudy', 'sunny'],
    sale=[2000, 3400, 2400],
)

df = pd.DataFrame(data)
df

Unnamed: 0,day,weather,sale
0,friday,sunny,2000
1,saturday,cloudy,3400
2,sunday,sunny,2400


In [18]:
# Cartesian product of the two columns: give both sides the same constant
# key, join on it (every row matches every row), then drop the helper key.
(
    df[['day']].assign(key=0)
    .merge(df[['weather']].assign(key=0), on='key')
    .drop(columns='key')
)

Unnamed: 0,day,weather
0,friday,sunny
1,friday,cloudy
2,friday,sunny
3,saturday,sunny
4,saturday,cloudy
5,saturday,sunny
6,sunday,sunny
7,sunday,cloudy
8,sunday,sunny
