In [2]:
# import


In [7]:
"""
None: Pythonic missing data
"""

# None objects as missing values

object
int32


In [1]:
# Object-dtype arrays fall back to slow Python-level operations, and aggregations such as sum() fail when the data contain None


NameError: name 'arr2' is not defined

In [15]:
"""
NaN: Missing Numerical Data
"""


dtype('float64')

In [18]:
# Arithmetic with NaN will be another NaN


nan
nan
nan


In [19]:
# NaN-aware NumPy aggregations (e.g. np.nansum, np.nanmax, np.nanmin) ignore missing values


8.0
4.0
1.0


In [20]:
# Pandas automatically converts the None to a NaN value.


0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

### Pandas methods for operating on null values
+ isnull()
+ notnull()
+ dropna()
+ fillna()

In [21]:
"""
Detecting null values
"""


0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64 



0    False
1     True
2    False
3     True
dtype: bool

In [22]:
# notnull()


0     True
1    False
2     True
3    False
dtype: bool

In [31]:
"""
Dropping null values
"""
# make_df(): helper that builds a small, self-describing DataFrame
def make_df(cols, ind):
    """Build a DataFrame whose cells spell out their own coordinates.

    Every cell holds ``str(column label) + str(row label)``, so the value
    at row 1 of column 'B' is the string 'B1'.

    Parameters
    ----------
    cols : iterable
        Column labels (iterating a string yields one label per character).
    ind : sequence
        Row labels; also used as the index of the result.

    Returns
    -------
    pd.DataFrame
        One column per entry of `cols`, one row per entry of `ind`.
    """
    table = {}
    for col in cols:
        # `!s` forces str() on both labels, matching plain concatenation
        table[col] = [f"{col!s}{row!s}" for row in ind]
    return pd.DataFrame(table, index=ind)


df = make_df(cols='ABC', ind=[0, 1, 2])
df

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [32]:
# Introduce NaN values


Unnamed: 0,A,B,C
0,A0,,C0
1,A1,B1,C1
2,,B2,C2


In [36]:
# dropna()


    A   B   C
1  A1  B1  C1 

    A   B   C
1  A1  B1  C1
    C
0  C0
1  C1
2  C2


In [40]:
# Introduce more NaN values


Unnamed: 0,A,B,C
0,A0,,
1,A1,B1,
2,,B2,


In [43]:
# how='any' (default): drop a row/column if it contains any null value
# how='all': drop a row/column only if all of its values are null


Empty DataFrame
Columns: [A, B, C]
Index: [] 

     A    B
0   A0  NaN
1   A1   B1
2  NaN   B2 



In [55]:
# thresh: minimum number of non-null values a row/column must contain to be kept


    A   B   C
1  A1  B1 NaN 

     A    B
0   A0  NaN
1   A1   B1
2  NaN   B2


In [57]:
"""
Filling Null Values
"""

# Create a Series

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [58]:
# Fill null values with a certain value

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [59]:
# Forward-fill = LOCF (last observation carried forward)


a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [60]:
# Backward-fill = NOCB (next observation carried backward)


a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [61]:
# Import the class from the standard library: the `pandas.datetime` alias is
# deprecated (it emits a FutureWarning) and is gone in newer pandas releases.
from datetime import datetime
from matplotlib import pyplot as plt

"""
Load AirQualityUCI Data
"""

def parser(x):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp string into a datetime.

    Passed to `pd.read_csv` below as the `date_parser` callback.

    Parameters
    ----------
    x : str
        Timestamp text in the format '%Y-%m-%d %H:%M:%S'.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    ValueError
        If `x` does not match the expected format.
    """
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

input_file = './data/AirQualityUCI_refined.csv'

# Column 0 holds the timestamps: parse it with `parser` and use it as the index.
df = pd.read_csv(input_file,
                 index_col=[0],
                 parse_dates=[0],
                 date_parser=parser)



  from pandas import datetime


In [62]:
# Print a concise summary of the loaded frame: index range, per-column
# non-null counts (which reveal the columns with missing values) and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9357 entries, 2004-03-10 18:00:00 to 2005-04-04 14:00:00
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CO(GT)         7765 non-null   float64
 1   PT08.S1(CO)    8991 non-null   float64
 2   PT08.S2(NMHC)  8991 non-null   float64
 3   NOx(GT)        7718 non-null   float64
 4   PT08.S3(NOx)   8991 non-null   float64
 5   NO2(GT)        7715 non-null   float64
 6   PT08.S4(NO2)   8991 non-null   float64
 7   PT08.S5(O3)    8991 non-null   float64
 8   RH             8991 non-null   float64
 9   AH             8991 non-null   float64
 10  C6H6(GT)       9357 non-null   float64
dtypes: float64(11)
memory usage: 877.2 KB


In [63]:
# Visualization setup
# `%matplotlib` without an argument selects a GUI backend (the recorded
# output of this cell reports Qt5Agg), so figures open in separate windows.
%matplotlib
# NOTE(review): this option configures the inline backend only; with a GUI
# backend active it presumably has no effect -- confirm whether
# `%matplotlib inline` was intended instead.
%config InlineBackend.figure_format = 'svg'

from matplotlib import pyplot as plt
# Default figure size as [width, height]
plt.rcParams['figure.figsize'] = [10, 5]
plt.ion() # enable the interactive mode

import seaborn as sns
sns.set()  # set plot styles

Using matplotlib backend: Qt5Agg


In [65]:
# Visualize the missing values across the entire dataset
import missingno

missingno.matrix(df)

<AxesSubplot:>

In [64]:
# Plot the raw CO(GT) series against the frame's datetime index
# NOTE(review): missing values presumably appear as gaps in the line -- confirm
df['CO(GT)'].plot()

<AxesSubplot:xlabel='Datetime'>

In [66]:
# Imputation of the CO(GT) series with four simple strategies.
# Every pandas call below returns a new Series, so the original column is
# left untouched and no explicit .copy() is needed.
co_series = df['CO(GT)']

# LOCF: last observation carried forward
imp_locf = co_series.ffill()
# NOCB: next observation carried backward
imp_nocb = co_series.bfill()
# Linear interpolation (the default method of interpolate())
imp_linear = co_series.interpolate()
# Mean substitution: every gap is filled with the mean of the observed values
imp_mean = co_series.fillna(co_series.mean())

In [72]:
# k-nn imputation
from sklearn.impute import KNNImputer

# Impute each missing value from its 2 nearest neighbours
# (n_neighbors=2 is an explicit choice; scikit-learn's default is 5).
imputer = KNNImputer(n_neighbors=2)

# fit_transform returns a new array (KNNImputer copies its input by default),
# so df itself is not modified and does not need to be copied first.
imp_knn = imputer.fit_transform(df.values)



In [73]:
# Re-attach the row index and column labels that were lost when the frame
# was turned into a plain array for k-nn imputation.
# Both are taken from df itself, so this cell only needs df and imp_knn.
imp_df = pd.DataFrame(imp_knn, index=df.index, columns=df.columns)

In [74]:
# Visualizing the imputed results
# Draw on a fresh, explicit figure: with the interactive backend, bare
# plt.plot() calls would land on whichever figure happens to be current
# (for example one left open by an earlier plotting cell).
fig, ax = plt.subplots()

# (series, legend label, zorder) -- a higher zorder is drawn on top, so the
# observed values stay visible above every imputed curve.
curves = [
    (df['CO(GT)'], 'actual', 10),
    (imp_linear, 'linear interpolation', 3),
    (imp_nocb, 'nocb', 2),
    (imp_locf, 'locf', 1),
    (imp_mean, 'mean substitution', 4),
    (imp_df['CO(GT)'], 'k-nearest neighbor', 5),
]
for series, label, zorder in curves:
    ax.plot(series, label=label, zorder=zorder)

# Title and axis labels so the figure can be read on its own
ax.set(title='CO(GT): observed values vs. imputation methods',
       xlabel='Datetime',
       ylabel='CO(GT)')
ax.legend(loc='best')
plt.show()