<a href="https://colab.research.google.com/github/hxvrtbxvt/machine_learning/blob/main/pre_processing_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
# Toy dataset: one list per column, five rows each.
data = dict(
    size=['XL', 'L', 'M', 'L', 'M'],
    color=['red', 'green', 'blue', 'green', 'red'],
    gender=['female', 'male', 'male', 'female', 'female'],
    price=[199.0, 89.0, 99.0, 129.0, 79.0],
    weight=[500, 450, 300, 300, 410],
    bought=['yes', 'no', 'yes', 'no', 'yes'],
)

# Keep the untouched frame under df_raw; later cells work on copies of it.
df_raw = pd.DataFrame(data)
df_raw

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,300,no
4,M,red,female,79.0,410,yes


In [3]:
df = df_raw.copy()  # work on a copy so the data can always be restored to its original form
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    5 non-null      object 
 1   color   5 non-null      object 
 2   gender  5 non-null      object 
 3   price   5 non-null      float64
 4   weight  5 non-null      int64  
 5   bought  5 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 368.0+ bytes


In [4]:
# Cast the text columns to the categorical dtype and weight to float in one astype call.
# Categorical columns are easier and more efficient to work with than object columns,
# for example when building visualizations.
df = df.astype({
    'size': 'category',
    'color': 'category',
    'gender': 'category',
    'bought': 'category',
    'weight': 'float',
})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   size    5 non-null      category
 1   color   5 non-null      category
 2   gender  5 non-null      category
 3   price   5 non-null      float64 
 4   weight  5 non-null      float64 
 5   bought  5 non-null      category
dtypes: category(4), float64(2)
memory usage: 740.0 bytes


In [5]:
# Summary statistics for the numeric columns (price, weight).
df.describe()

Unnamed: 0,price,weight
count,5.0,5.0
mean,119.0,392.0
std,48.476799,89.833179
min,79.0,300.0
25%,89.0,300.0
50%,99.0,410.0
75%,129.0,450.0
max,199.0,500.0


In [6]:
# Same summary, transposed so that each variable is a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,5.0,119.0,48.476799,79.0,89.0,99.0,129.0,199.0
weight,5.0,392.0,89.833179,300.0,300.0,410.0,450.0,500.0


In [7]:
# Summary of the categorical columns: count, number of unique values,
# the most frequent value (top) and how often it occurs (freq).
df.describe(include=['category']).T

Unnamed: 0,count,unique,top,freq
size,5,3,L,2
color,5,3,green,2
gender,5,2,female,3
bought,5,2,yes,3


In [10]:
# NOTE(review): the saved output below already shows `bought` as 0/1, yet the cell that
# encodes it comes later in the notebook - the cells appear to have been run out of order.
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,1
1,L,green,male,89.0,450.0,0
2,M,blue,male,99.0,300.0,1
3,L,green,female,129.0,300.0,0
4,M,red,female,79.0,410.0,1


In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['bought'])
# fit() fits the encoder to the labels; transform() then converts them into the form used by a model.
# Both steps can also be done in one call: le.fit_transform(df['bought'])
le.transform(df['bought'])

array([1, 0, 1, 0, 1])

# Top - wartość, która się pojawia najczęściej 

In [9]:
# Replace the yes/no labels in `bought` with their integer codes (no -> 0, yes -> 1).
df['bought'] = le.fit_transform(df['bought'])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,1
1,L,green,male,89.0,450.0,0
2,M,blue,male,99.0,300.0,1
3,L,green,female,129.0,300.0,0
4,M,red,female,79.0,410.0,1


In [11]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense array instead of a sparse matrix.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the new name first and fall back to the old one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoder.fit(df[['size']])

OneHotEncoder(sparse=False)

In [12]:
# The result is a NumPy matrix with one indicator column per category of `size`.
encoder.transform(df[['size']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [14]:
# Column order of the encoded matrix: a 1 in the first column marks 'L',
# a 1 in the second column marks 'M', and a 1 in the third column marks 'XL'.
encoder.categories_

[array(['L', 'M', 'XL'], dtype=object)]

In [13]:
# drop='first' removes the first category's column, so the remaining columns are not
# perfectly collinear (the dropped category is implied when all others are 0).
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so try the new name first and fall back to the old one on older installations.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
encoder.fit(df[['size']])
encoder.transform(df[['size']])   # two columns are returned instead of three

array([[0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 0.],
       [1., 0.]])

In [15]:
# df itself is unchanged: the OneHotEncoder calls above only returned new arrays.
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,1
1,L,green,male,89.0,450.0,0
2,M,blue,male,99.0,300.0,1
3,L,green,female,129.0,300.0,0
4,M,red,female,79.0,410.0,1


In [16]:
# pandas get_dummies() - pozwala korzystać z kodowania bardziej elastycznie niż OneHotEncoder

In [17]:
# Every categorical column is encoded; a 1 marks the category that occurs in that row.
pd.get_dummies(data=df)

Unnamed: 0,price,weight,bought,size_L,size_M,size_XL,color_blue,color_green,color_red,gender_female,gender_male
0,199.0,500.0,1,0,0,1,0,0,1,1,0
1,89.0,450.0,0,1,0,0,0,1,0,0,1
2,99.0,300.0,1,0,1,0,1,0,0,0,1
3,129.0,300.0,0,1,0,0,0,1,0,1,0
4,79.0,410.0,1,0,1,0,0,0,1,1,0


In [19]:
# drop_first=True omits the first level of each categorical column
# (here size_L, color_blue and gender_female).
pd.get_dummies(data=df, drop_first=True)

Unnamed: 0,price,weight,bought,size_M,size_XL,color_green,color_red,gender_male
0,199.0,500.0,1,0,1,0,1,0
1,89.0,450.0,0,0,0,1,0,1
2,99.0,300.0,1,1,0,0,0,1
3,129.0,300.0,0,0,0,1,0,0
4,79.0,410.0,1,1,0,0,1,0


In [20]:
# prefix='new' replaces the source column name in every generated dummy column name.
pd.get_dummies(data=df,drop_first=True, prefix='new' )

Unnamed: 0,price,weight,bought,new_M,new_XL,new_green,new_red,new_male
0,199.0,500.0,1,0,1,0,1,0
1,89.0,450.0,0,0,0,1,0,1
2,99.0,300.0,1,1,0,0,0,1
3,129.0,300.0,0,0,0,1,0,0
4,79.0,410.0,1,1,0,0,1,0


In [21]:
# Encode only a single column; the other categorical columns are left as they are.
pd.get_dummies(data=df,drop_first=True,columns=['size'])

Unnamed: 0,color,gender,price,weight,bought,size_M,size_XL
0,red,female,199.0,500.0,1,0,1
1,green,male,89.0,450.0,0,0,0
2,blue,male,99.0,300.0,1,1,0
3,green,female,129.0,300.0,0,0,0
4,red,female,79.0,410.0,1,1,0


In [22]:
# standaryzacja 

In [23]:
# Print the price column followed by its mean and its standard deviation (4 decimal places).
print(f"{df['price']}\n")
print(f"Średnia: {df['price'].mean()}")
print(f"Odchylenie standardowe: {df['price'].std():.4f}")

0    199.0
1     89.0
2     99.0
3    129.0
4     79.0
Name: price, dtype: float64

Średnia: 119.0
Odchylenie standardowe: 48.4768


In [24]:
# Manual standardization: subtract the column mean, then divide by the column's standard deviation.
df['price'].sub(df['price'].mean()).div(df['price'].std())

0    1.650274
1   -0.618853
2   -0.412568
3    0.206284
4   -0.825137
Name: price, dtype: float64

In [26]:
def standardsize(x, ddof=None):
  """Standardize ``x`` to z-scores: ``(x - mean) / std``.

  Parameters
  ----------
  x : object exposing ``mean()`` and ``std()`` (e.g. pandas Series/DataFrame, NumPy array)
      Values to standardize.
  ddof : int, optional
      Delta degrees of freedom forwarded to ``x.std(ddof=...)``. When left as
      ``None`` the container's own default is used, which is the original
      behaviour of this function (pandas: ddof=1, NumPy: ddof=0). Pass
      ``ddof=0`` on a pandas object to divide by the population standard
      deviation instead.

  Returns
  -------
  Same kind of object as ``x - x.mean()``, holding the standardized values.
  """
  # Only forward ddof when the caller asked for it, so the default call stays
  # identical to the original for every input type.
  std = x.std() if ddof is None else x.std(ddof=ddof)
  return (x - x.mean()) / std

standardsize(df['price'])

0    1.650274
1   -0.618853
2   -0.412568
3    0.206284
4   -0.825137
Name: price, dtype: float64

In [27]:
from sklearn.preprocessing import scale

# These values differ slightly from the manual result above: the difference is between
# the biased (population) and the unbiased (sample) estimator of the standard deviation.
scale(df['price'])

array([ 1.84506242, -0.69189841, -0.4612656 ,  0.2306328 , -0.92253121])

In [28]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the price column (fit() returns the fitted scaler itself),
# then standardize that same column.
scaler = StandardScaler().fit(df[['price']])
scaler.transform(df[['price']])

array([[ 1.84506242],
       [-0.69189841],
       [-0.4612656 ],
       [ 0.2306328 ],
       [-0.92253121]])

In [29]:
# Despite its name, scaler_price is fitted on both price and weight.
# The standardized values overwrite the original columns of df.
scaler_price = StandardScaler()
df[['price', 'weight']] = scaler_price.fit_transform(df[['price', 'weight']])

df


Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,1.845062,1.344132,1
1,L,green,male,-0.691898,0.721849,0
2,M,blue,male,-0.461266,-1.145002,1
3,L,green,female,0.230633,-1.145002,0
4,M,red,female,-0.922531,0.224022,1


In [30]:
# Start again from an untouched copy of the raw data.
df = df_raw.copy()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,300,no
4,M,red,female,79.0,410,yes


In [32]:
# Full preprocessing pipeline applied to the fresh copy of the raw data.
le = LabelEncoder()

# 1. Encode the yes/no `bought` column as integers.
df['bought'] = le.fit_transform(df['bought'])

# 2. Standardize the numeric columns in place.
scaler = StandardScaler()
df[['price', 'weight']] = scaler.fit_transform(df[['price', 'weight']])

# 3. One-hot encode the remaining text columns, dropping the first level of each.
df = pd.get_dummies(data=df,drop_first=True)

df

Unnamed: 0,price,weight,bought,size_M,size_XL,color_green,color_red,gender_male
0,1.845062,1.344132,1,0,1,0,1,0
1,-0.691898,0.721849,0,0,0,1,0,1
2,-0.461266,-1.145002,1,1,0,0,0,1
3,0.230633,-1.145002,0,0,0,1,0,0
4,-0.922531,0.224022,1,1,0,0,1,0
