# Dealing With `Missing Data`:

In [29]:
import numpy as np 
import pandas as pd 

In [30]:
# Toy data set for the examples below: column B is complete, while columns
# A, C and D each contain one or more NaN gaps.
sample_dataset = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[1, 2, 3, 4, 5],
    C=[1, 2, 3, np.nan, np.nan],
    D=[1, np.nan, np.nan, np.nan, 5],
)
sample_df = pd.DataFrame(sample_dataset)

In [31]:
sample_df
# The NaN entries are the null values, i.e. the missing data.

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


## There are many ways to display `missing data`:

### 1. by `isna()` 

In [32]:
# Boolean mask with the same shape as sample_df: True wherever a cell is NaN.
sample_df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,True,False,False,True
3,False,False,True,True
4,False,False,True,False


In [33]:
# Summing the boolean mask column-wise gives the number of missing values per column.
sample_df.isna().sum()

A    1
B    0
C    2
D    3
dtype: int64

In [34]:
# Per-column flag: True if the column contains at least one missing value.
sample_df.isna().any()

A     True
B    False
C     True
D     True
dtype: bool

### 2. by `info()`:

In [35]:
# info() reports the non-null count per column; a count below the total number
# of entries means that column has missing data.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      float64
 1   B       5 non-null      int64  
 2   C       3 non-null      float64
 3   D       2 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 292.0 bytes


## Removing Missing Data by `dropna()`:

In [36]:
# With default arguments, dropna() drops every row that contains at least one NaN.
sample_df.dropna() 

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0


In [None]:
# The thresh argument of dropna() sets the minimum number of non-null values a
# row must contain in order to be kept.

sample_df.dropna(thresh=1)  # keep rows with at least 1 non-null value; every row qualifies here, so nothing is dropped.

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


## Filling in the missing data:

### `fillna()`:
* #### `fillna()` replaces the NaN values with the provided value.
* #### `inplace=False` is the default, so a new DataFrame is returned and the original is left unchanged.

In [39]:
# Replace every NaN with 0. The result is a new DataFrame; sample_df itself is not modified.
sample_df.fillna(value=0)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,0.0
2,0.0,3,3.0,0.0
3,4.0,4,0.0,0.0
4,5.0,5,0.0,5.0


In [40]:
# Alternatively, pass a dict that maps each column name to its own fill value.
values = dict(A=0, B=100, C=300, D=400)

# inplace=False (the default) returns a filled copy and leaves sample_df as it was.
sample_df.fillna(values, inplace=False)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,400.0
2,0.0,3,3.0,400.0
3,4.0,4,300.0,400.0
4,5.0,5,300.0,5.0


In [41]:
# Work on a copy so that the in-place fill below does not alter sample_df.
df=sample_df.copy()

# inplace=True modifies df itself rather than handing back a filled copy.
df.fillna(value=0, inplace=True)

df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,0.0
2,0.0,3,3.0,0.0
3,4.0,4,0.0,0.0
4,5.0,5,0.0,5.0


In [42]:
# Mean imputation: each column's NaN values are replaced by the mean of that
# column's non-missing values.
sample_df.fillna(sample_df.mean())

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,3.0
2,3.0,3,3.0,3.0
3,4.0,4,2.0,3.0
4,5.0,5,2.0,5.0
