# Compare missing value imputation methods

on just one target

In [7]:
%load_ext rich

import pathlib
import numpy as np
import pandas as pd

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error as mae, make_scorer

import seaborn as sns
import matplotlib.pyplot as plt

from collections.abc import Callable

# Plot styling for the whole notebook: "notebook" scaling context and the
# colour-blind-friendly seaborn palette as the default colour cycle.
sns.set_context("notebook")
sns.set_palette("colorblind")

The rich extension is already loaded. To reload it, use:
  %reload_ext rich


In [2]:
# Dataset root under the user's home directory; the stream-nitrogen CSV sits
# in its own subfolder below it.
basedir = pathlib.Path.home().joinpath("data", "available_datasets")
stream_csv = basedir.joinpath("stream_nitrogen", "SINC_input_data_EDI.csv")
df_stream = pd.read_csv(stream_csv)

In [3]:
# Preview the first rows to check the column layout of the raw table.
df_stream.head()

Unnamed: 0,Site,Datetime,Sample_Type,TP,TSP,SRP,TN,NO3,NH4,PP,...,EXO_fDOM_120h_lag,Press_kpa_120h_lag,SoilMoisture_120h_lag,EXO_SpCond_120delta,EXO_Temp_120delta,EXO_Turb_120delta,EXO_pH_120delta,EXO_fDOM_120delta,Press_kpa_120delta,SoilMoisture_120delta
0,TS_East,2018-01-22 16:35:00,Baseflow,5.692,3.5179,2.7809,0.5512,0.552,0.005,2.1741,...,,,0.1115,,,,,,,0.0125
1,TS_East,2018-02-28 15:10:00,Baseflow,7.0481,3.1527,1.9507,0.3439,0.362,0.0185,3.8954,...,4.988937,,0.135,24.75075,1.3185,-0.7675,0.01,-0.233648,,0.008
2,TS_East,2018-03-20 14:10:00,Baseflow,12.4426,2.7,0.5,0.6172,0.587,0.005,9.7426,...,3.308868,2.041,0.122,61.96175,-2.02375,-1.65325,0.0215,-0.647593,-0.2125,0.0045
3,TS_East,2018-04-04 14:00:00,Storm Event,10.823,3.2835,1.4783,0.4033,0.248,0.005,7.5395,...,11.785599,3.18525,0.128,83.5815,0.60375,-8.772,0.09775,-2.180811,-0.8195,-0.02
4,TS_East,2018-04-04 18:00:00,Storm Event,30.3826,4.7192,2.0857,0.4653,0.195,0.005,25.6634,...,11.587968,3.25025,0.1245,47.6135,0.353,1.2585,0.0235,0.785274,-0.179,-0.0025


In [4]:
# Derive numeric hour-of-day and month features from the timestamp before the
# raw string column is dropped. Parsing the column as datetimes (instead of
# slicing characters out of the string) fails loudly on a malformed timestamp
# rather than silently picking up the wrong digits.
# NOTE: this cell is not idempotent - re-running it without reloading
# `df_stream` raises KeyError because "Datetime" has already been dropped.
timestamps = pd.to_datetime(df_stream["Datetime"])
df_stream["hour_of_day"] = timestamps.dt.hour.astype(float)
df_stream["month"] = timestamps.dt.month.astype(float)

# Keep the sample type labels (e.g. "Baseflow", "Storm Event") aside; the
# column itself is removed from the predictors below.
event = df_stream["Sample_Type"].values

# Split the measured concentrations off as regression targets.
target_cols = ["TP", "TSP", "SRP", "TN", "NO3", "NH4", "PP"]
targets = df_stream[target_cols].copy()

# Everything that is neither an identifier column nor a target stays as a
# predictor; one drop call replaces the per-column in-place drops.
# NOTE(review): "Unnamed: 0" (looks like a row index saved into the CSV) is
# still among the predictors - confirm that is intended.
df_stream = df_stream.drop(columns=["Site", "Datetime", "Sample_Type", *target_cols])
df_stream.shape

(680, 65)

In [5]:
# Count missing values per target column.
targets.isna().sum()


TP     0
TSP    0
SRP    0
TN     0
NO3    0
NH4    0
PP     0
dtype: int64

In [6]:
# Count missing values per predictor column, most incomplete columns first.
df_stream.isna().sum().sort_values(ascending=False)


EXO_fDOM_120delta        275
EXO_Turb_120delta        274
SoilMoisture_120delta    273
EXO_pH_120delta          269
EXO_Temp_120delta        268
                        ... 
Press_kpa_5h_lag         169
Press_kpa_120h_lag       166
Press_kpa_24h_lag        162
hour_of_day                0
month                      0
Length: 65, dtype: int64

In [19]:
def evaluate_approach(f_impute: Callable, random_state: int = 0) -> float:
    """Score an imputation strategy by the cross-validated error of a downstream model.

    The imputer is applied to a copy of the notebook-level ``df_stream``
    predictors, and an ``ExtraTreesRegressor`` is then evaluated against the
    notebook-level ``targets`` with 5-fold cross-validation.

    NOTE(review): ``targets`` as built in this notebook holds all seven target
    columns, so the score averages the error over every target, while the
    notebook title says "just one target" - confirm which is intended.

    Parameters
    ----------
    f_impute : Callable
        Receives a DataFrame of predictors and returns the imputed predictors.
    random_state : int, default 0
        Seed for the tree ensemble. Fixing it makes scores of different
        imputers comparable; without it they would differ by random noise
        between runs.

    Returns
    -------
    float
        Mean absolute error averaged over the five folds. The scorer is built
        with ``make_scorer`` defaults, so the value is the raw (positive) MAE
        and lower is better.
    """
    # Work on a copy so an imputer that mutates its input cannot alter df_stream.
    X = f_impute(df_stream.copy())
    model = ExtraTreesRegressor(
        n_estimators=75, max_depth=7, n_jobs=-1, random_state=random_state
    )
    cv_results = cross_validate(model, X, targets, cv=5, scoring=make_scorer(mae))
    return np.mean(cv_results["test_score"])

In [20]:
# Results registry: imputation function name -> score from evaluate_approach.
imputation_scores = {}
# Baselines: median imputation and mean imputation
def f_median_impute(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing entries with the median of their column.

    Only columns that contain at least one missing value get a fill value;
    the median is taken over that column's non-missing entries. The result
    of ``fillna`` is returned, so the input frame is not modified.
    """
    medians = {
        name: np.nanmedian(series)
        for name, series in df.items()
        if series.isna().any()
    }
    return df.fillna(medians)
def f_mean_impute(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing entries with the mean of their column.

    Only columns that contain at least one missing value get a fill value;
    the mean is taken over that column's non-missing entries. The result of
    ``fillna`` is returned, so the input frame is not modified.
    """
    means = {
        name: np.nanmean(series)
        for name, series in df.items()
        if series.isna().any()
    }
    return df.fillna(means)



In [21]:
# Score each baseline imputer and store the result under the function's name.
baseline_imputers = (f_median_impute, f_mean_impute)
for func in baseline_imputers:
    imputation_scores[func.__name__] = evaluate_approach(func)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(np.nanmedian(df[col]), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(np.nanmean(df[col]), inplace=True)
