In [1]:
from scipy.io import loadmat
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from tqdm.auto import tqdm 


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format = 'retina'

In [2]:
# Load the previously saved PM25 table from a pickle file.
# NOTE(review): absolute, user-specific path - the notebook will not run on another machine;
# consider a configurable data directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
PM25 = pd.read_pickle("/Users/iditbela/Documents/Broday/saved_data_from_notebooks/PM25")

In [3]:
# Half-hourly timestamps from 2016-01-01 00:00 to 2018-12-31 23:00.
# The range ends at 23:00 rather than 23:30 (one step short) because, per the original note,
# the last sample is always NaN.
times = pd.date_range(start='2016-01-01 00:00:00', end='2018-12-31 23:00:00', freq='30Min')

In [4]:
# Number of PM25 rows in excess of len(times); used below as the positional row at which
# the analysis window starts (PM25[start_year:]).
# NOTE(review): despite its name this is a row offset, not a calendar year.
start_year = PM25.shape[0]-times.shape[0]

In [5]:
# Drop the final row of PM25 and the final timestamp, because the last sample is always NaN.
# NOTE(review): not idempotent - every re-run of this cell trims one more element from both objects.
PM25 = PM25[:-1]
times = times[:-1]

In [6]:
# A station is kept only if the fraction of non-missing values in its time series exceeds this value.
threshold = 0.6

In [7]:
# Reduced PM25: restrict to the analysis window, then keep only the columns whose
# share of non-null samples is above `threshold`.
r_PM25 = PM25[start_year:]
non_null_share = r_PM25.notnull().sum(axis=0) / r_PM25.shape[0]
idx = non_null_share > threshold
r_PM25 = r_PM25.loc[:, idx]

In [8]:
# Renumber the rows 0..n-1 and discard the old index.
# Rebinding (instead of inplace=True followed by dropping an 'index' column) avoids mutating a
# frame obtained by slicing, and does not depend on the name pandas gives the inserted column
# (it is not 'index' when the index is named or when an 'index' column already exists).
r_PM25 = r_PM25.reset_index(drop=True)

In [9]:
# Copy of r_PM25 extended with calendar features derived from `times`
# (day of week, month and hour of each row's timestamp).
timestamps = pd.to_datetime(times)
r_PM25_withDays = r_PM25.assign(**{
    'week day': timestamps.dayofweek,
    'month': timestamps.month,
    'hour': timestamps.hour,
})

In [10]:
# Number of retained columns (stations), followed by the fraction (0-1, not percent)
# of missing values in each of them.
r_PM25.shape[1]
1-r_PM25.notnull().sum(axis = 0)/r_PM25.shape[0]

39

AFULA               0.029825
ANTOKOLSKY          0.061381
EHAD_HAAM           0.316922
HOLON               0.079402
IRONID              0.053264
KVISH4              0.399346
RAKEVET_HASHALOM    0.116070
REMEZ               0.107858
RISHON_LEZION       0.046763
YEFET_YAFO          0.205661
AHUZA_G             0.021461
ATZMAUT_B           0.076645
KIRYAT_ATA          0.346671
KIRYAT_BINYAMIN     0.051667
KIRYAT_TIVON        0.043607
NAVE_SHANAAN        0.031118
NESHER              0.147398
EFRATA              0.030130
NAVE_ILAN           0.297419
ASHDOD_IGUD         0.085903
ASHKELON_SOUTH      0.139490
BNEI_DAROM          0.351139
DALYA               0.151903
GEDERA              0.101072
GVARAAM             0.096643
KIRYAT_MALAHI       0.039596
NIR_ISRAEL          0.121317
ORT                 0.325039
SDEROT              0.061628
SDE_YOAV            0.067179
YAHALOM             0.366631
BEER_SHEVA          0.101167
EAST_NEGEV          0.112573
KFAR_MASARIK        0.093164
PARDES_HANA   

In [28]:
# Number of rows, number of columns, and total number of cells in r_PM25.
r_PM25.shape[0]
r_PM25.shape[1]
r_PM25.shape[1]*r_PM25.shape[0]

52606

39

2051634

# Functions

In [12]:
def KFold_cross_validation(PM25,k):
    """Yield one ``(X_missing, y_missing)`` pair per fold of a cell-wise K-fold split.

    The positions of all non-null cells of ``PM25`` are shuffled and split into
    ``k`` folds with ``KFold(n_splits=k, random_state=0, shuffle=True)``. For
    every fold, the cells in the test part are hidden from the training frame
    and are the only cells present in the test frame.

    Parameters
    ----------
    PM25 : pandas.DataFrame
        Numeric table whose non-null cells are split into folds. Cells that
        are already null are never held out. The frame is not modified.
    k : int
        Number of folds.

    Yields
    ------
    X_missing : pandas.DataFrame
        Copy of ``PM25`` in which the fold's test cells are set to NaN.
    y_missing : pandas.DataFrame
        Frame with the same index/columns as ``PM25`` that is NaN everywhere
        except at the fold's test cells, which hold the original values.
    """
    kf = KFold(n_splits=k, random_state=0, shuffle=True)
    # (row, column) positions of every observed cell; these are the "samples" being split.
    not_nan_idx = np.argwhere(PM25.notnull().values)
    # Work on a float copy so NaN can be written and the caller's frame stays untouched.
    values = PM25.values.astype(float)

    for _, test_index in kf.split(not_nan_idx):
        rows, cols = not_nan_idx[test_index].T

        # Test frame: only the held-out cells carry a value.
        y_values = np.full(values.shape, np.nan)
        y_values[rows, cols] = values[rows, cols]

        # Training frame: the original data with the held-out cells hidden.
        X_values = values.copy()
        X_values[rows, cols] = np.nan

        X_missing = pd.DataFrame(X_values, index=PM25.index, columns=PM25.columns)
        y_missing = pd.DataFrame(y_values, index=PM25.index, columns=PM25.columns)
        yield X_missing, y_missing


In [78]:
# Shuffled 2-fold splitter with a fixed seed, and the (row, column) positions of every
# non-null cell of r_PM25; the positions are displayed for inspection.
kf = KFold(n_splits=2, random_state=0, shuffle=True)
not_nan_idx = np.argwhere(r_PM25.notnull().values)
not_nan_idx

array([[    0,     0],
       [    0,     1],
       [    0,     3],
       ...,
       [52605,    34],
       [52605,    35],
       [52605,    36]])

In [70]:
# Plan: run K-fold cross validation over the positions of all non-NaN cells (10 folds intended).
# NOTE(review): `kf` is built with n_splits=2 in the cell that defines it, not 10.
#   - test  = y_missing: copy r_PM25, set everything to NaN, then assign the r_PM25 values at the test positions.
#   - train = X_missing: copy r_PM25 and set the test positions to NaN.

for train_index, test_index in kf.split(not_nan_idx):
    print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

TRAIN: [      1       3       4 ... 1754019 1754021 1754024] TEST: [      0       2       6 ... 1754023 1754025 1754026]
TRAIN: [      0       2       6 ... 1754023 1754025 1754026] TEST: [      1       3       4 ... 1754019 1754021 1754024]


In [66]:
# (row, column) positions of the training cells.
# NOTE(review): `train_index` is whatever the last iteration of the fold loop left behind.
not_nan_idx[train_index]

array([[    0,     0],
       [    0,     3],
       [    0,     8],
       ...,
       [52605,    33],
       [52605,    35],
       [52605,    36]])

In [43]:
# Start the test frame as a copy of r_PM25 with every cell overwritten by NaN.
y_missing = r_PM25.copy()
y_missing.iloc[:] = np.nan

In [49]:
# Sanity check: after the blanking step every displayed cell should be NaN.
y_missing.head()

Unnamed: 0,AFULA,ANTOKOLSKY,EHAD_HAAM,HOLON,IRONID,KVISH4,RAKEVET_HASHALOM,REMEZ,RISHON_LEZION,YEFET_YAFO,...,SDE_YOAV,YAHALOM,BEER_SHEVA,EAST_NEGEV,KFAR_MASARIK,PARDES_HANA,RAANANA,SHFEYA,ASHALIM,NEOT_HAKIKAR
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [59]:
# NOTE(review): scratch cell - the lambda below is neither bound to a name nor called,
# and its parameter `map` shadows the builtin of the same name.
import operator

lambda map: operator.setitem(map, 'x', 'y')

In [79]:
# Gather the values of the held-out (test) cells with a single vectorised lookup instead of
# one Python-level .iloc call per cell.
# `test_index` is taken from the most recent fold produced by kf.split(not_nan_idx).
test_rows, test_cols = not_nan_idx[test_index].T
value_array = pd.Series(r_PM25.values[test_rows, test_cols])

SyntaxError: lambda cannot contain assignment (<ipython-input-79-c2c1d2191c86>, line 2)

In [68]:
value_array

0          4.8
1          6.3
2          6.6
3         11.0
4         10.5
5          1.4
6          3.8
7          6.0
8          6.8
9          3.8
10         5.9
11         1.5
12        25.2
13        15.6
14        10.3
15         0.7
16         6.7
17         3.5
18        10.1
19        16.4
20         2.6
21         9.2
22         5.8
23         3.8
24        10.5
25         9.8
26        10.9
27        11.7
28         7.3
29        11.1
          ... 
876983    74.0
876984    22.5
876985    29.0
876986    33.4
876987    32.5
876988    24.3
876989     6.9
876990     4.7
876991    20.0
876992    14.9
876993    22.7
876994    20.4
876995    28.9
876996    13.7
876997    16.1
876998    14.1
876999    20.5
877000    33.0
877001    29.5
877002    29.2
877003    18.4
877004    26.0
877005    16.4
877006    22.9
877007    21.4
877008    27.3
877009    15.8
877010    47.7
877011    25.4
877012    33.4
Length: 877013, dtype: float64

In [24]:
# r_PM25.notnull().lookup(*zip(*tuple(map(tuple, not_nan_idx[test_index]))))

KeyError: 'One or more column labels was not found'

# (1) IterativeImputer with BayesianRidge and ExtraTreesRegressor

## A - without days 

## B - without days, with normal data

## C - with days

# (2) KNN

# (3) ARIMA/LSTM/Prophet for 1-3 missing time-steps (short intervals)

In [None]:
# ARIMA might be problematic since I would need to re-tune its parameters all the time. Does LSTM
# require fewer parameters? In addition, I could predict the short intervals first and then continue
# to predict with the LSTM, with the long intervals marked as -1, as described on Machine Learning
# Mastery.

# (4) Running models 1-2 again with the short-interval imputed values

In [None]:
# others
# https://impyute.readthedocs.io/en/latest/index.html
# https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779
# statsmodels MICE
# datawig 


# (5) Statistical imputation?