### <a name='0'></a>Import libraries

In [1]:
import numpy as np
import pandas as pd

# Echo the installed NumPy version so the environment behind the recorded outputs is visible.
np.__version__

'1.19.2'

### <a name='1'></a> Exercise 201
From the _data_ dictionary below, create a _DataFrame_ object and assign it to the _df_ variable.

In [2]:
# Raw records for the exercises: one key per column, six rows each.
# np.nan marks a missing entry, both in the text and in the numeric columns.
data = {
    'size': [
        'XL', 'L', 'M', np.nan, 'M', 'M',
    ],
    'color': [
        'red', 'green', 'blue', 'green', 'red', 'green',
    ],
    'gender': [
        'female', 'male', np.nan, 'female', 'female', 'male',
    ],
    'price': [
        199.0, 89.0, np.nan, 129.0, 79.0, 89.0,
    ],
    'weight': [
        500, 450, 300, np.nan, 410, np.nan,
    ],
    'bought': [
        'yes', 'no', 'yes', 'no', 'yes', 'no',
    ],
}

# Column order follows the dictionary's key order.
df = pd.DataFrame(data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


Display basic information about the _df_ object.

In [3]:
# Print a summary of the frame: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    5 non-null      object 
 1   color   6 non-null      object 
 2   gender  5 non-null      object 
 3   price   5 non-null      float64
 4   weight  4 non-null      float64
 5   bought  6 non-null      object 
dtypes: float64(2), object(4)
memory usage: 416.0+ bytes


### <a name='2'></a> Exercise 202
Count the missing values for each variable.

In [4]:
# Flag every missing cell, then add the flags up column by column.
df.isna().sum()

size      1
color     0
gender    1
price     1
weight    2
bought    0
dtype: int64

Check the share of missing values for each variable, expressed as a percentage rounded to whole numbers.

In [5]:
# Missing entries per column divided by the row count, scaled to percent and
# rounded to zero decimals.
(df.isna().sum() / len(df) * 100).round(0)

size      17.0
color      0.0
gender    17.0
price     17.0
weight    33.0
bought     0.0
dtype: float64

### <a name='3'></a> Exercise 203
Using the _scikit-learn_ machine learning library and the _SimpleImputer_ class, fill in the missing data for the _weight_ variable with an average value. Permanently assign changes to the _df_ object.

In [6]:
from sklearn.impute import SimpleImputer

# Mean imputation: fitting learns the mean of the observed weights and
# transforming writes that mean into every NaN slot of the column.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

# The imputer expects 2-D input (hence the double brackets) and returns a
# 2-D (n_rows, 1) array. Flatten it and assign through bracket indexing:
# attribute assignment (df.weight = ...) only works when the column already
# exists and its name does not clash with a DataFrame attribute.
df['weight'] = imp_mean.fit_transform(df[['weight']]).ravel()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


Display the mean value that was inserted in place of the missing values in the _weight_ column.

In [7]:
# statistics_ holds the fill value the fitted imputer learned for each input column
# (a single entry here, because only the weight column was passed in).
imp_mean.statistics_

array([415.])

### <a name='4'></a> Exercise 204
Using the _scikit-learn_ machine learning library and the _SimpleImputer_ class, fill in the missing data for the _price_ variable with a constant value of 99.0. Permanently assign changes to the _df_ object.

In [8]:
# SimpleImputer is already imported in the mean-imputation cell above, so it
# is not imported again here.

# Constant imputation: every missing price is replaced by fill_value.
# The imputer gets its own name so the fitted mean imputer (imp_mean) is not
# overwritten; otherwise re-running the cell that shows imp_mean.statistics_
# would display 99.0 instead of the mean.
imp_constant = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=99.0)

# fit_transform returns a 2-D (n_rows, 1) array; flatten it and assign through
# bracket indexing rather than attribute assignment.
df['price'] = imp_constant.fit_transform(df[['price']]).ravel()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


### <a name='5'></a> Exercise 205
Using the _scikit-learn_ machine learning library and the _SimpleImputer_ class, fill in the missing data for the _size_ variable with the most frequently appearing element. Permanently assign changes to the _df_ object.

In [9]:
# SimpleImputer is already imported in the mean-imputation cell above, so it
# is not imported again here.

# Mode imputation: the most frequent observed size replaces the missing one.
# A dedicated name keeps the fitted mean imputer (imp_mean) intact.
imp_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Bracket indexing is required here: `size` is also a built-in DataFrame
# property (the number of cells), so `df.size = ...` only reaches the column
# through a pandas fallback, and reading `df.size` never returns the column.
# fit_transform returns a 2-D (n_rows, 1) array, hence the ravel().
df['size'] = imp_most_frequent.fit_transform(df[['size']]).ravel()
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,M,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


### <a name='6'></a> Exercise 206
Reload the dictionary into the _df_ object before proceeding to the next exercises.

In [10]:
# Rebuild the frame from the untouched dictionary, discarding the imputed
# values written into df by the previous exercises.
df = pd.DataFrame(data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


Select all rows of the _df_ object for which the _weight_ variable is _np.nan_.

In [11]:
# Boolean mask: keep only the rows whose weight is missing.
df.loc[df['weight'].isna()]

Unnamed: 0,size,color,gender,price,weight,bought
3,,green,female,129.0,,no
5,M,green,male,89.0,,no


Select all rows of the _df_ object for which the _weight_ variable is not _np.nan_.

In [12]:
# Boolean mask: keep only the rows whose weight is present.
df.loc[df['weight'].notna()]

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
4,M,red,female,79.0,410.0,yes


### <a name='7'></a> Exercise 207
Fill all missing values in the _df_ object with the value 'missing'. Do not assign the result back to the _df_ variable.

In [13]:
# Returns a new frame with every NaN replaced by the label; df itself is left unchanged.
df.fillna(value='missing')

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199,500,yes
1,L,green,male,89,450,no
2,M,blue,missing,missing,300,yes
3,missing,green,female,129,missing,no
4,M,red,female,79,410,yes
5,M,green,male,89,missing,no


### <a name='8'></a> Exercise 208
Drop the rows that contain missing values from the _df_ object. Do not assign the result back to the _df_ variable.

In [14]:
# Defaults spelled out: drop a row as soon as any of its values is missing.
# Returns a new frame; df itself is left unchanged.
df.dropna(axis='index', how='any')

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
4,M,red,female,79.0,410.0,yes


### <a name='9'></a> Exercise 209
Drop the rows of the _df_ object that do not have at least 5 defined (non-missing) values, i.e. rows that contain two or more missing values (e.g. _nan_). Do not assign the result back to the _df_ variable.

In [15]:
# thresh is the minimum number of non-missing values a row needs to be kept;
# with six columns, rows with two or more gaps are dropped.
df.dropna(axis='index', thresh=5)

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


### <a name='10'></a> Exercise 210
Drop the rows of the _df_ object that do not have at least 5 defined (non-missing) values, i.e. rows that contain two or more missing values (e.g. _nan_), and fill the remaining gaps with a constant value of 400.0. Do not assign the result back to the _df_ variable.

In [16]:
# Step 1: keep rows with at least five non-missing values.
# Step 2: replace whatever gaps remain with 400.0 (applies to every column).
(
    df
    .dropna(axis='index', thresh=5)
    .fillna(value=400.0)
)

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,400.0,no
