In [26]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics

from zoo.zouwu.preprocessing.impute.LastFill import LastFill
from zoo.zouwu.preprocessing.impute.MeanFill import MeanFill
from zoo.zouwu.preprocessing.impute.MFFill import MFFill

# Prepare data

In [2]:
raw_df = pd.read_csv("data/data.csv")

In [3]:
df = pd.DataFrame(pd.to_datetime(raw_df.StartTime))

In [4]:
raw_df.AvgRate.str[-4:].unique()
# Unify AvgRate value
df['AvgRate'] = raw_df.AvgRate.apply(lambda x: float(x[:-4]) if x.endswith("Mbps") else float(x[:-4]) * 1000)
df["total"] = raw_df["total"]
df.set_index("StartTime", inplace=True)

In [5]:
df.head()

Unnamed: 0_level_0,AvgRate,total
StartTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,306.23,275605455598
2018-01-01 02:00:00,285.03,256527692256
2018-01-01 04:00:00,247.39,222652190823
2018-01-01 06:00:00,211.55,190396029658
2018-01-01 08:00:00,234.82,211340468977


choose a part of raw data to do evaluate

In [6]:
sampled_df = df["2019-01-01 00:00:00":"2019-06-30 23:00:00"]

In [7]:
full_idx = pd.date_range(start=sampled_df.index.min(), end=sampled_df.index.max(), freq='2H')
sampled_df = df.reindex(full_idx)
print("no. of n/a values:")
print(sampled_df.isna().sum())

no. of n/a values:
AvgRate    0
total      0
dtype: int64


In [8]:
sampled_df

Unnamed: 0,AvgRate,total
2019-01-01 00:00:00,271.53,244373500495
2019-01-01 02:00:00,208.49,187638154842
2019-01-01 04:00:00,175.88,158293382443
2019-01-01 06:00:00,229.20,206284366389
2019-01-01 08:00:00,264.95,238459494810
...,...,...
2019-06-30 14:00:00,618.90,557013671978
2019-06-30 16:00:00,533.31,479975450009
2019-06-30 18:00:00,475.86,428273046817
2019-06-30 20:00:00,448.12,403312442571


# Data Imputation with Last Seen Value

use LastFill method to fill missing values for all data

In [9]:
last_fill = LastFill()

In [10]:
full_idx = pd.date_range(start=df.index.min(), end=df.index.max(), freq='2H')
df1 = df.reindex(full_idx)
print("no. of n/a values:")
print(df1.isna().sum())

no. of n/a values:
AvgRate    327
total      327
dtype: int64


In [11]:
filled_df = last_fill.impute(df)

In [12]:
full_idx = pd.date_range(start=df.index.min(), end=df.index.max(), freq='2H')
fill_df = filled_df.reindex(full_idx)
print("no. of n/a values:")
print(filled_df.isna().sum())

no. of n/a values:
AvgRate    0
total      0
dtype: int64


In [13]:
mse_10 = last_fill.evaluate(sampled_df, 0.1)

In [14]:
mse_10

[2004.2239946593002, 1.6857280722708464e+21]

In [15]:
mse_50 = last_fill.evaluate(sampled_df, 0.5)

In [16]:
mse_50

[28298.18132730203, 2.4879356854775073e+22]

# Data Imputation with Mean Value

In [17]:
print("no. of n/a values:")
print(df.isna().sum())

no. of n/a values:
AvgRate    0
total      0
dtype: int64


In [20]:
mean_fill = MeanFill()
meanfilled_df = mean_fill.impute(df)

In [21]:
print("no. of n/a values:")
print(meanfilled_df.isna().sum())

no. of n/a values:
AvgRate    0
total      0
dtype: int64


In [22]:
meanfilled_df

Unnamed: 0_level_0,AvgRate,total
StartTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,306.23,2.756055e+11
2018-01-01 02:00:00,285.03,2.565277e+11
2018-01-01 04:00:00,247.39,2.226522e+11
2018-01-01 06:00:00,211.55,1.903960e+11
2018-01-01 08:00:00,234.82,2.113405e+11
...,...,...
2019-12-31 14:00:00,255.08,2.295759e+11
2019-12-31 16:00:00,192.44,1.731922e+11
2019-12-31 18:00:00,209.65,1.886873e+11
2019-12-31 20:00:00,211.22,1.900964e+11


In [23]:
mse_10_mean = mean_fill.evaluate(sampled_df, 0.1)
mse_10_mean

[3502.068483994105, 2.432470198648963e+21]

In [24]:
mse_50_mean = mean_fill.evaluate(sampled_df, 0.5)
mse_50_mean

[12717.288743855159, 1.071286351880671e+22]

# Data Imputation with Matrix Factorization

In [25]:
print("no. of n/a values:")
print(df.isna().sum())

no. of n/a values:
AvgRate    0
total      0
dtype: int64


In [27]:
mf_fill = MFFill()

In [28]:
mffilled_df = mf_fill.impute(df)

Iteration: 20 ; error = 284.6870
Iteration: 40 ; error = 70.2494
Iteration: 60 ; error = 22.2271
Iteration: 80 ; error = 9.5587
Iteration: 100 ; error = 6.1154


In [29]:
print("no. of n/a values:")
print(mffilled_df.isna().sum())

no. of n/a values:
AvgRate    0
total      0
dtype: int64


In [30]:
mffilled_df

Unnamed: 0_level_0,AvgRate,total
StartTime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,306.23,2.756055e+11
2018-01-01 02:00:00,285.03,2.565277e+11
2018-01-01 04:00:00,247.39,2.226522e+11
2018-01-01 06:00:00,211.55,1.903960e+11
2018-01-01 08:00:00,234.82,2.113405e+11
...,...,...
2019-12-31 14:00:00,255.08,2.295759e+11
2019-12-31 16:00:00,192.44,1.731922e+11
2019-12-31 18:00:00,209.65,1.886873e+11
2019-12-31 20:00:00,211.22,1.900964e+11


In [31]:
mse_10_mf = mf_fill.evaluate(sampled_df, 0.1)
mse_10_mf

Iteration: 20 ; error = 64.7660
Iteration: 40 ; error = 27.6155
Iteration: 60 ; error = 12.9124
Iteration: 80 ; error = 6.4798
Iteration: 100 ; error = 3.4140
Iteration: 20 ; error = 58.1982
Iteration: 40 ; error = 25.2981
Iteration: 60 ; error = 11.8837
Iteration: 80 ; error = 5.9676
Iteration: 100 ; error = 3.1420


[1058.8989823030895, 8.321470851195326e+20]

In [32]:
mse_50_mf = mf_fill.evaluate(sampled_df, 0.5)
mse_50_mf

Iteration: 20 ; error = 58.5414
Iteration: 40 ; error = 25.7080
Iteration: 60 ; error = 11.9246
Iteration: 80 ; error = 5.9255
Iteration: 100 ; error = 3.1383
Iteration: 20 ; error = 36.8112
Iteration: 40 ; error = 16.3261
Iteration: 60 ; error = 7.6513
Iteration: 80 ; error = 3.9271
Iteration: 100 ; error = 2.0146


[5202.396991892464, 3.918369241813551e+21]