<a href="https://colab.research.google.com/github/krakowiakpawel9/ml_course/blob/master/cont/001_missing_values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn  
Website: [https://scikit-learn.org/](https://scikit-learn.org/)

In [0]:
# Optional (e.g. on Colab): uncomment to upgrade scikit-learn in this kernel's environment.
# %pip install --upgrade scikit-learn

In [2]:
import numpy as np
import pandas as pd
import sklearn
sklearn.__version__

'0.22.1'

In [3]:
# Small toy dataset; np.nan marks the missing entries in
# 'size', 'gender', 'price' and 'weight'.
data = {
    'size': ['XL', 'L', 'M', np.nan, 'M', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red', 'green'],
    'gender': ['female', 'male', np.nan, 'female', 'female', 'male'],
    'price': [199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    'weight': [500, 450, 300, np.nan, 410, np.nan],
    'bought': ['yes', 'no', 'yes', 'no', 'yes', 'no'],
}

# df_raw stays untouched as the reference; df is the working copy.
df_raw = pd.DataFrame(data)
df = df_raw.copy(deep=True)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [4]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
size      5 non-null object
color     6 non-null object
gender    5 non-null object
price     5 non-null float64
weight    4 non-null float64
bought    6 non-null object
dtypes: float64(2), object(4)
memory usage: 416.0+ bytes


In [5]:
# Element-wise mask: True marks a missing value.
df.isnull()

Unnamed: 0,size,color,gender,price,weight,bought
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,True,False,False
3,True,False,False,False,True,False
4,False,False,False,False,False,False
5,False,False,False,False,True,False


In [6]:
# Number of missing values per column (each True counts as 1).
df.isnull().sum()

size      1
color     0
gender    1
price     1
weight    2
bought    0
dtype: int64

In [29]:
# Total number of missing values in the whole frame.
df.isnull().sum().sum()

5

In [7]:
# Fraction of missing values per column (missing count divided by number of rows).
df.isnull().sum() / len(df)

size      0.166667
color     0.000000
gender    0.166667
price     0.166667
weight    0.333333
bought    0.000000
dtype: float64

In [8]:
from sklearn.impute import SimpleImputer

# strategy: 'mean', 'median', 'most_frequent', 'constant'
# Fit on the 'weight' column only. The double brackets pass a one-column
# DataFrame (2D) rather than a Series.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df[['weight']])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [9]:
# Fill value learned during fit, one per fitted column: here the mean of the
# observed weights, (500 + 450 + 300 + 410) / 4 = 415.
imputer.statistics_

array([415.])

In [10]:
# transform returns a NumPy array with the NaNs replaced; the result is only
# displayed here, nothing is written back to df yet.
imputer.transform(df[['weight']])

array([[500.],
       [450.],
       [300.],
       [415.],
       [410.],
       [415.]])

In [11]:
# Write the imputed values back so the 'weight' column of df no longer has NaNs.
df['weight'] = imputer.transform(df[['weight']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [12]:
# 'constant' strategy: the missing price is replaced by the given fill_value (99.0).
# The result is only displayed; it is not stored in df.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=99.0)
imputer.fit_transform(df[['price']])

array([[199.],
       [ 89.],
       [ 99.],
       [129.],
       [ 79.],
       [ 89.]])

In [13]:
# 'constant' also works on a string column: the missing size becomes 'L'.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='L')
imputer.fit_transform(df[['size']])

array([['XL'],
       ['L'],
       ['M'],
       ['L'],
       ['M'],
       ['M']], dtype=object)

In [14]:
# The two fit_transform results above were not assigned, so df still has its
# NaNs in 'size', 'gender' and 'price'; only 'weight' was imputed earlier.
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [15]:
# 'most_frequent' fills with the most common value of the column:
# 'M' occurs three times in 'size', so the missing entry becomes 'M'.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit_transform(df[['size']])

array([['XL'],
       ['L'],
       ['M'],
       ['M'],
       ['M'],
       ['M']], dtype=object)

In [20]:
# Start over from the untouched original: the imputed 'weight' values are discarded.
df = df_raw.copy()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [28]:
# After the reset the per-column missing counts match the original data again.
df.isnull().sum()

size      1
color     0
gender    1
price     1
weight    2
bought    0
dtype: int64

In [32]:
# Boolean mask: True where 'weight' is missing.
pd.isnull(df['weight'])

0    False
1    False
2    False
3     True
4    False
5     True
Name: weight, dtype: bool

In [33]:
# Use the mask as a row filter: only rows where 'weight' is missing.
df[pd.isnull(df['weight'])]

Unnamed: 0,size,color,gender,price,weight,bought
3,,green,female,129.0,,no
5,M,green,male,89.0,,no


In [34]:
# ~ negates the mask: only rows where 'weight' is present.
df[~pd.isnull(df['weight'])]

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
4,M,red,female,79.0,410.0,yes


In [35]:
# pd.notnull gives the inverse of the pd.isnull mask directly, no negation needed.
pd.notnull(df['weight'])

0     True
1     True
2     True
3    False
4     True
5    False
Name: weight, dtype: bool

In [36]:
# Selects the same rows as the ~pd.isnull(...) filter.
df[pd.notnull(df['weight'])]

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
4,M,red,female,79.0,410.0,yes


In [37]:
# The filters only returned new frames; df still has all six rows and its NaNs.
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [38]:
# A single fill value is applied to every column, so the numeric 'price' and
# 'weight' columns end up holding the string as well. ('brak' is Polish for
# "none"/"missing".) The result is only displayed; df is not reassigned.
df.fillna(value='brak')

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199,500,yes
1,L,green,male,89,450,no
2,M,blue,brak,brak,300,yes
3,brak,green,female,129,brak,no
4,M,red,female,79,410,yes
5,M,green,male,89,brak,no


In [39]:
# Likewise, 0.0 is written into the text columns 'size' and 'gender',
# not only into the numeric ones.
df.fillna(value=0.0)

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,0,0.0,300.0,yes
3,0,green,female,129.0,0.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,0.0,no


In [40]:
# Fill the missing size with 'L' and assign the result back to the column.
# Calling fillna(inplace=True) on the df['size'] selection is chained
# assignment: pandas >= 2.2 warns about it, and with Copy-on-Write
# (the default in pandas 3.0) it no longer updates df at all.
df['size'] = df['size'].fillna(value='L')
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,L,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [41]:
# Drop every row that still contains at least one NaN; only the fully
# populated rows remain. The result is displayed, df is not reassigned.
df.dropna()

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
4,M,red,female,79.0,410.0,yes
