## <font color="maroon"><h4 align="center">Handling Missing Data - fillna, interpolate, dropna</font>

In [2]:
import pandas as pd
df = pd.read_csv("resources/weather_data.csv",parse_dates=['day'])
type(df.day[0])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,-99999,7,Sunny
2,2017-01-03,28,-99999,Snow
3,2017-01-04,-99999,7,0
4,2017-01-05,32,-99999,Rain
5,2017-01-06,31,2,Sunny
6,2017-01-06,34,5,0


In [3]:
df.set_index('day',inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


## <font color="blue">fillna</font>

<font color="purple">**Fill all NaN with one specific value**</font>

In [4]:
new_df = df.fillna(0)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


<font color="purple">**Fill na using column names and dict**</font>

In [5]:
new_df = df.fillna({
        'temperature': 0,
        'windspeed': 0,
        'event': 'No Event'
    })
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


<font color="purple">**Use method to determine how to fill na values**</font>

In [6]:
new_df = df.fillna(method="ffill")
new_df

  new_df = df.fillna(method="ffill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


In [7]:
new_df = df.fillna(method="bfill")
new_df

  new_df = df.fillna(method="bfill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


<font color="purple">**Use of axis**</font>

In [8]:
new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"
new_df

  new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


<font color="purple">**limit parameter**</font>

In [9]:
new_df = df.fillna(method="ffill",limit=1)
new_df

  new_df = df.fillna(method="ffill",limit=1)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


### <font color="blue">interpolate</font>

In [10]:
new_df = df.interpolate()
new_df

  new_df = df.interpolate()


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


In [11]:
new_df = df.interpolate(method="time") 
new_df

  new_df = df.interpolate(method="time")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


**Notice that in above temperature on 2017-01-04 is 29 instead of 30 (in plain linear interpolate)**

**There are many other methods for interpolation such as quadratic, piecewise_polynomial, cubic etc. 
Just google "dataframe interpolate" to see complete documentation**

### <font color="blue">dropna</font>

In [12]:
new_df = df.dropna()
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


In [13]:
new_df = df.dropna(how='all')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


In [14]:
new_df = df.dropna(thresh=1)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32,6,Rain
2017-01-02,-99999,7,Sunny
2017-01-03,28,-99999,Snow
2017-01-04,-99999,7,0
2017-01-05,32,-99999,Rain
2017-01-06,31,2,Sunny
2017-01-06,34,5,0


### <font color="blue">Inserting Missing Dates</font>

In [16]:
dt = pd.date_range("01-01-2017","01-11-2017")
idx = pd.DatetimeIndex(dt)