In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [17]:
# Read the table from disk and rescale it with MinMaxScaler (default
# settings), keeping the original column labels on the rescaled frame.
raw_frame = pd.read_csv('./data/electric/normal.csv')
scaler = MinMaxScaler()
data_norm = scaler.fit_transform(raw_frame)
data = pd.DataFrame(data=data_norm, columns=raw_frame.columns)
data.head()

Unnamed: 0,precio,demanda,porcentaje,eolica,hidraulica,solar
0,0.507176,0.228012,0.508747,0.346219,0.309066,0.026074
1,0.44961,0.155475,0.528468,0.330156,0.284614,0.025928
2,0.376443,0.109446,0.528729,0.341617,0.244005,0.025928
3,0.373479,0.074617,0.492106,0.321261,0.232198,0.026001
4,0.372075,0.052909,0.471223,0.280541,0.251557,0.025928


In [18]:
def generate_missing_data(d, missing_rate=0.25, rng=None):
    """Randomly blank out entries of ``d`` to simulate missing values.

    One uniform random number is drawn per cell; the cell is replaced by
    NaN when that number falls below ``missing_rate``.

    Parameters
    ----------
    d : pandas.DataFrame
        Complete two-dimensional table to corrupt. It is not modified.
    missing_rate : float, default 0.25
        Threshold applied to the uniform draws, i.e. the probability that
        an individual cell is removed.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global NumPy random state
        (``np.random.rand``) is used.

    Returns
    -------
    missing_mask : numpy.ndarray of bool
        Same shape as ``d``; True where a value was removed.
    missing_data : pandas.DataFrame
        Values of ``d`` with NaN at the masked positions, carrying the
        column labels of ``d`` and a default integer index.
    """
    # Work on the argument itself rather than a notebook-level global, so
    # the helper behaves correctly for whichever frame the caller passes.
    n_rows, n_cols = d.shape
    if rng is None:
        draws = np.random.rand(n_rows, n_cols)
    else:
        draws = rng.random((n_rows, n_cols))
    missing_mask = draws < missing_rate
    missing_data = pd.DataFrame(np.where(missing_mask, np.nan, d), columns=d.columns)
    return missing_mask, missing_data

# Carry imputation

In [10]:
# Baseline: carry the previous observed value forward, then backfill any
# gaps that remain. Repeated over 5 freshly drawn random masks.
res = []
for i in range(5):
    missing_mask, missing_data = generate_missing_data(data)
    # .ffill()/.bfill() are the supported spelling; fillna(method=...) is
    # deprecated in recent pandas releases and emits a FutureWarning.
    carry_imputation = missing_data.ffill().bfill()
    # Score only the entries that were artificially removed.
    results = mean_squared_error(data.values[missing_mask], carry_imputation.values[missing_mask])
    res.append(results)

print(f'Results: {np.mean(res)} +- {np.std(res)}')


Results: 0.007182878243679319 +- 0.0002952152769976712


# Mean imputation

In [11]:
# Baseline: fill each gap using the per-column means of the corrupted frame.
# The error is measured on the removed entries only, over 5 random masks.
res = []
for _ in range(5):
    missing_mask, missing_data = generate_missing_data(data)
    column_means = missing_data.mean()
    mean_imputation = missing_data.fillna(column_means)
    truth = data.values[missing_mask]
    filled = mean_imputation.values[missing_mask]
    res.append(mean_squared_error(truth, filled))

print(f'Results: {np.mean(res)} +- {np.std(res)}')

Results: 0.05014863037172135 +- 0.0005052916034427229


# Zero imputation

In [12]:
# Baseline: write 0 into every gap and score the removed entries,
# repeating the experiment on 5 random masks.
res = []
run = 0
while run < 5:
    missing_mask, missing_data = generate_missing_data(data)
    zero_imputation = missing_data.fillna(0)
    expected = data.values[missing_mask]
    imputed = zero_imputation.values[missing_mask]
    res.append(mean_squared_error(expected, imputed))
    run += 1

print(f'Results: {np.mean(res)} +- {np.std(res)}')

Results: 0.25248181149597443 +- 0.003926736144059718


# Linear interpolation

In [13]:
# Baseline: interpolate each column with pandas' default `interpolate()`,
# then backfill whatever is still missing. Repeated over 5 random masks at
# the default missing rate.
res = []
for i in range(5):
    missing_mask, missing_data = generate_missing_data(data)
    # .bfill() is the supported spelling; fillna(method='bfill') is
    # deprecated in recent pandas releases and emits a FutureWarning.
    linear_interpolation = missing_data.interpolate().bfill()
    # Score only the entries that were artificially removed.
    results = mean_squared_error(data.values[missing_mask], linear_interpolation.values[missing_mask])
    res.append(results)

print(f'Results: {np.mean(res)} +- {np.std(res)}')

Results: 0.0012057555962061083 +- 8.25929836402565e-05


In [24]:
# Same interpolation baseline, but with 90% of the entries removed, to see
# how the error changes under a much higher missing rate.
res = []
for i in range(5):
    missing_mask, missing_data = generate_missing_data(data, missing_rate=0.9)
    # .bfill() is the supported spelling; fillna(method='bfill') is
    # deprecated in recent pandas releases and emits a FutureWarning.
    linear_interpolation = missing_data.interpolate().bfill()
    # Score only the entries that were artificially removed.
    results = mean_squared_error(data.values[missing_mask], linear_interpolation.values[missing_mask])
    res.append(results)

print(f'Results: {np.mean(res)} +- {np.std(res)}')

Results: 0.030396513839068613 +- 0.0007567891366223787


# Gain: about 0.03 MAE (note: the baseline cells above report MSE, not MAE)

In [14]:
# Hard-coded scores (presumably pasted from a separate run - confirm the
# source), summarised as mean +- standard deviation.
results_gain = [
    0.047141119837760925, 0.03490561991930008, 0.03099825792014599,
    0.02529190666973591, 0.03605275973677635,
]

gain_mean = np.mean(results_gain)
gain_std = np.std(results_gain)
print(f'Results: {gain_mean} +- {gain_std}')

Results: 0.034877932816743853 +- 0.007193198389792043


In [15]:
# NOTE(review): `+-` is not a plus/minus operator. Python parses this line as
# a + (-b), so the cell evaluates to the first number minus the second.
# Presumably a pasted "mean +- std" result rather than an intended
# computation - confirm, and move it to a markdown cell if so.
0.00284798499196764 +- 0.0007088827267968981

0.0021391022651707416