# Imputing time-series data


In [1]:
#import packages
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#read_data
airquality = pd.read_csv("../datasets/air-quality.csv" , parse_dates=['Date'] , index_col='Date')
airquality.head()

Unnamed: 0_level_0,Ozone,Solar,Wind,Temp
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1976-05-01,41.0,190.0,7.4,67
1976-05-02,36.0,118.0,8.0,72
1976-05-03,12.0,149.0,12.6,74
1976-05-04,18.0,313.0,11.5,62
1976-05-05,,,14.3,56


### Airquality Dataset


In [3]:
airquality.isnull().sum()

Ozone    37
Solar     7
Wind      0
Temp      0
dtype: int64

In [4]:
airquality.isnull().mean() * 100

Ozone    24.183007
Solar     4.575163
Wind      0.000000
Temp      0.000000
dtype: float64

In [5]:
airquality['Ozone'][30:40]

Date
1976-05-31    37.0
1976-06-01     NaN
1976-06-02     NaN
1976-06-03     NaN
1976-06-04     NaN
1976-06-05     NaN
1976-06-06     NaN
1976-06-07    29.0
1976-06-08     NaN
1976-06-09    71.0
Name: Ozone, dtype: float64

#### Ffill method

#### pad is the same as 'ffill'

In [6]:
#Replace NaN s with last observed value
airquality_ffill = airquality.fillna(method='ffill')

In [7]:
airquality_ffill['Ozone'][30:40]

Date
1976-05-31    37.0
1976-06-01    37.0
1976-06-02    37.0
1976-06-03    37.0
1976-06-04    37.0
1976-06-05    37.0
1976-06-06    37.0
1976-06-07    29.0
1976-06-08    29.0
1976-06-09    71.0
Name: Ozone, dtype: float64

### Bfill method


### ackfill is the same as 'bfill

In [11]:
#Replace NaN s with next observed value
airquality_bfill = airquality.fillna(method='bfill' )

In [13]:
airquality_bfill['Ozone'][30:40]

Date
1976-05-31    37.0
1976-06-01    29.0
1976-06-02    29.0
1976-06-03    29.0
1976-06-04    29.0
1976-06-05    29.0
1976-06-06    29.0
1976-06-07    29.0
1976-06-08    71.0
1976-06-09    71.0
Name: Ozone, dtype: float64

## The .interpolate() method


### Linear interpolation


#### The .interpolate() method extends the sequence of values to the missing values


The attribute method in .interpolate() can be set to
<br>'linear'<br>
'quadratic'
<br>'nearest'

In [14]:
airquality_inter_linear = airquality.interpolate(method='linear' )

In [15]:
airquality_inter_linear['Ozone'][30:40]

Date
1976-05-31    37.000000
1976-06-01    35.857143
1976-06-02    34.714286
1976-06-03    33.571429
1976-06-04    32.428571
1976-06-05    31.285714
1976-06-06    30.142857
1976-06-07    29.000000
1976-06-08    50.000000
1976-06-09    71.000000
Name: Ozone, dtype: float64

### Quadratic interpolation


In [16]:
airquality_inter_quad = airquality.interpolate(method='quadratic' )

In [17]:
airquality_inter_quad['Ozone'][30:40]

Date
1976-05-31    37.000000
1976-06-01   -38.361123
1976-06-02   -79.352735
1976-06-03   -85.974836
1976-06-04   -62.354606
1976-06-05   -33.255133
1976-06-06    -2.803598
1976-06-07    29.000000
1976-06-08    62.155660
1976-06-09    71.000000
Name: Ozone, dtype: float64

### Nearest value imputation

In [19]:
airquality_inter_nearest = airquality.interpolate(method='nearest' )

In [20]:
airquality_inter_nearest['Ozone'][30:40]

Date
1976-05-31    37.0
1976-06-01    37.0
1976-06-02    37.0
1976-06-03    37.0
1976-06-04    29.0
1976-06-05    29.0
1976-06-06    29.0
1976-06-07    29.0
1976-06-08    29.0
1976-06-09    71.0
Name: Ozone, dtype: float64

## Exercise

### Filling missing time-series data


In [21]:
# Print prior to imputing missing values
print(airquality[30:40])

# Fill NaNs using forward fill
airquality.fillna(method="ffill", inplace=True)

# Print after imputing missing values
print(airquality[30:40])

            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01    NaN  286.0   8.6    78
1976-06-02    NaN  287.0   9.7    74
1976-06-03    NaN  242.0  16.1    67
1976-06-04    NaN  186.0   9.2    84
1976-06-05    NaN  220.0   8.6    85
1976-06-06    NaN  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08    NaN  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90
            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01   37.0  286.0   8.6    78
1976-06-02   37.0  287.0   9.7    74
1976-06-03   37.0  242.0  16.1    67
1976-06-04   37.0  186.0   9.2    84
1976-06-05   37.0  220.0   8.6    85
1976-06-06   37.0  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08   29.0  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90


In [22]:
# Print prior to imputing missing values
print(airquality[30:40])

# Fill NaNs using backward fill
airquality.fillna(method="bfill", inplace=True)

# Print after imputing missing values
print(airquality[30:40])

            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01   37.0  286.0   8.6    78
1976-06-02   37.0  287.0   9.7    74
1976-06-03   37.0  242.0  16.1    67
1976-06-04   37.0  186.0   9.2    84
1976-06-05   37.0  220.0   8.6    85
1976-06-06   37.0  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08   29.0  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90
            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01   37.0  286.0   8.6    78
1976-06-02   37.0  287.0   9.7    74
1976-06-03   37.0  242.0  16.1    67
1976-06-04   37.0  186.0   9.2    84
1976-06-05   37.0  220.0   8.6    85
1976-06-06   37.0  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08   29.0  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90


### Impute with interpolate method


In [23]:
# Print prior to interpolation
print(airquality[30:40])

# Interpolate the NaNs linearly
airquality.interpolate(method='linear', inplace=True)

# Print after interpolation
print(airquality[30:40])

            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01   37.0  286.0   8.6    78
1976-06-02   37.0  287.0   9.7    74
1976-06-03   37.0  242.0  16.1    67
1976-06-04   37.0  186.0   9.2    84
1976-06-05   37.0  220.0   8.6    85
1976-06-06   37.0  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08   29.0  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90
            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01   37.0  286.0   8.6    78
1976-06-02   37.0  287.0   9.7    74
1976-06-03   37.0  242.0  16.1    67
1976-06-04   37.0  186.0   9.2    84
1976-06-05   37.0  220.0   8.6    85
1976-06-06   37.0  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08   29.0  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90


In [24]:
# Print prior to interpolation
print(airquality[30:40])

# Interpolate the NaNs quadratically
airquality.interpolate(method='quadratic', inplace=True)

# Print after interpolation
print(airquality[30:40])

            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01   37.0  286.0   8.6    78
1976-06-02   37.0  287.0   9.7    74
1976-06-03   37.0  242.0  16.1    67
1976-06-04   37.0  186.0   9.2    84
1976-06-05   37.0  220.0   8.6    85
1976-06-06   37.0  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08   29.0  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90
            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01   37.0  286.0   8.6    78
1976-06-02   37.0  287.0   9.7    74
1976-06-03   37.0  242.0  16.1    67
1976-06-04   37.0  186.0   9.2    84
1976-06-05   37.0  220.0   8.6    85
1976-06-06   37.0  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08   29.0  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90


In [26]:
# Print prior to interpolation
print(airquality[30:40])

# Interpolate the NaNs with nearest value
airquality.interpolate(method="nearest" , inplace=True)

# Print after interpolation
print(airquality[30:40])

            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01   37.0  286.0   8.6    78
1976-06-02   37.0  287.0   9.7    74
1976-06-03   37.0  242.0  16.1    67
1976-06-04   37.0  186.0   9.2    84
1976-06-05   37.0  220.0   8.6    85
1976-06-06   37.0  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08   29.0  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90
            Ozone  Solar  Wind  Temp
Date                                
1976-05-31   37.0  279.0   7.4    76
1976-06-01   37.0  286.0   8.6    78
1976-06-02   37.0  287.0   9.7    74
1976-06-03   37.0  242.0  16.1    67
1976-06-04   37.0  186.0   9.2    84
1976-06-05   37.0  220.0   8.6    85
1976-06-06   37.0  264.0  14.3    79
1976-06-07   29.0  127.0   9.7    82
1976-06-08   29.0  273.0   6.9    87
1976-06-09   71.0  291.0  13.8    90
