In [412]:
from scipy.io import loadmat
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from tqdm.auto import tqdm 


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from scipy.spatial import distance


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format = 'retina'

In [413]:
import warnings
warnings.filterwarnings("ignore")

In [414]:
PM25 = pd.read_pickle("/Users/iditbela/Documents/Broday/saved_data_from_notebooks/PM25")

In [415]:
times = pd.date_range(start='2013-01-01 00:00:00', end='2018-12-31 23:00:00', freq='30Min') #one less because the last is always nan

In [416]:
start_year = PM25.shape[0]-times.shape[0]

In [417]:
# remove the last index as it is always nan
PM25 = PM25[:-1]
times = times[:-1]

In [418]:
threshold = 0.6 # how much non-missing values are in the time-series in order to include the station?

In [419]:
# reduced PM25 
r_PM25 = PM25[start_year:] 
idx = r_PM25.notnull().sum(axis = 0)/r_PM25.shape[0]>threshold
r_PM25 = r_PM25.loc[:, idx]

In [420]:
r_PM25.reset_index(inplace=True)
r_PM25.drop(labels = 'index',axis=1, inplace=True)

In [421]:
# r_PM25['datetime'] = pd.to_datetime(times)
r_PM25_withDays = r_PM25.copy()
r_PM25_withDays['week day'] = pd.to_datetime(times).dayofweek
r_PM25_withDays['month'] = pd.to_datetime(times).month
r_PM25_withDays['hour'] = pd.to_datetime(times).hour

In [422]:
# null percentages
r_PM25.shape[1]
1-r_PM25.notnull().sum(axis = 0)/r_PM25.shape[0]

34

AFULA              0.082660
ANTOKOLSKY         0.095611
HOLON              0.058488
IRONID             0.091893
KVISH4             0.238214
REMEZ              0.135196
YAD_LEBANIM        0.372896
YEFET_YAFO         0.183843
AHUZA_G            0.030295
ATZMAUT_B          0.107849
KIRYAT_ATA         0.275526
KIRYAT_BIALIK      0.308836
KIRYAT_BINYAMIN    0.033518
KIRYAT_TIVON       0.039347
NAVE_SHANAAN       0.041886
NESHER             0.152359
BAR_ILAN           0.229418
EFRATA             0.137630
ASHDOD_IGUD        0.078153
ASHKELON_SOUTH     0.252325
GEDERA             0.097465
GVARAAM            0.103921
KIRYAT_MALAHI      0.143573
NIR_ISRAEL         0.085788
ORT                0.233716
ROVA_TV            0.320161
SDEROT             0.072200
SDE_YOAV           0.039633
YAHALOM            0.258496
BEER_SHEVA         0.144552
EAST_NEGEV         0.273140
KFAR_MASARIK       0.233612
PARDES_HANA        0.263165
RAANANA            0.099024
dtype: float64

In [423]:
r_PM25.shape[0]
r_PM25.shape[1]
r_PM25.shape[1]*r_PM25.shape[0]

105166

34

3575644

# Functions

In [424]:
# keep all random stuff identical
rnd_state = 0

In [442]:
# out of all non-nan indexes, perform 10-fold cross validation.
# the test is y_missing. copy r_PM25, put all null inside, and assign values from r_PM25 according to test indexes. 
# the train is X_missing. copy r_PM25, and assign nans according to test indexes. 
# the splitting currently doesn't try to preserve the original relative missing intervals
# of each feature. maybe I will add it somehow later. 

def KFold_cross_validation(imp,PM25,k,withDays):
    
    if withDays:
        wd = PM25['week day']
        m = PM25['month']
        h = PM25['hour']
        PM25.drop(['week day','month','hour'],axis=1,inplace=True)
        
        
    kf = KFold(n_splits=k, random_state=rnd_state, shuffle=True)
    not_nan_idx = np.argwhere(PM25.notnull().values)
    results = []
    
    for train_index, test_index in kf.split(not_nan_idx):
        np_PM25 = PM25.values
        X_missing = PM25.copy()
        y_missing = PM25.copy()
        
        # y_missing 
        y_missing.iloc[:] = np.nan
        np_y_missing = y_missing.values
        
        # asssign values according to test indexes
        rows, cols = zip(*not_nan_idx[test_index])
        vals = np_PM25[rows, cols]
        np_y_missing[rows, cols] = vals
        # turn back to dataframe
        y_missing = pd.DataFrame(np_y_missing,columns=PM25.columns)

        # X_missing
        # assign nans according to test indexes
        np_X_missing = X_missing.values
        np_X_missing[rows, cols] = np.nan
        
        # turn back to dataframe
        X_missing = pd.DataFrame(np_X_missing,columns=PM25.columns)
        
        if withDays:
            X_missing['week day']=wd
            X_missing['hour']=h
            X_missing['month']=m
        
        # mask all missing values
        indicator = MissingIndicator(missing_values=np.nan)
        mask_missing_values_original = indicator.fit_transform(PM25)
        mask_missing_values_all = indicator.fit_transform(X_missing)

        # perform fit 
        imp.fit(X_missing)
        imputed_df = imp.transform(X_missing) # impute it
        imputed_df = pd.DataFrame(imputed_df, columns=X_missing.columns) #turn it from IterativeImputer object to a dataframe
        
        if withDays:
            imputed_df.drop(['week day','hour','month'],axis = 1, inplace=True)
                       
        # evaluate
        y_train = vals
        np_imputed_df = imputed_df.values
        y_pred = np_imputed_df[rows, cols]
        
        # assign results
        RMSE = np.sqrt(mean_squared_error(y_train, y_pred))
        MedianAE = median_absolute_error(y_train, y_pred)
        MeanAE = mean_absolute_error(y_train,y_pred)
        R2 = r2_score(y_train,y_pred)
        results.append([RMSE,MedianAE,MeanAE,R2])
               
    results = pd.DataFrame(results, columns=['RMSE','MedianAE','MeanAE','R2'])
    
    return results
        
        

# (1) IterativeImputer with BayesianRidge and ExtraTreesRegressor

In [446]:
imp_RF = IterativeImputer(max_iter=5,estimator=ExtraTreesRegressor(n_estimators=10,random_state=rnd_state),verbose=True) 
imp_BR = IterativeImputer(max_iter=10,estimator=BayesianRidge(),verbose=True) 
# try also to change the initial imputer - mean/median/constant...
# and n_estimators (number of trees in the forest...)

## A - without days 

In [429]:
results_woD_RF = KFold_cross_validation(imp_RF,r_PM25,k=10,withDays=False)
results_woD_BR = KFold_cross_validation(imp_BR,r_PM25,k=10,withDays=False)

[IterativeImputer] Completing matrix with shape (105166, 34)
[IterativeImputer] Change: 6918.020158184981, scaled tolerance: 1.6975 
[IterativeImputer] Change: 1085.8700000000001, scaled tolerance: 1.6975 
[IterativeImputer] Change: 775.2500000000001, scaled tolerance: 1.6975 
[IterativeImputer] Change: 657.5999999999998, scaled tolerance: 1.6975 
[IterativeImputer] Change: 654.4599999999998, scaled tolerance: 1.6975 
[IterativeImputer] Completing matrix with shape (105166, 34)
[IterativeImputer] Completing matrix with shape (105166, 34)
[IterativeImputer] Change: 6701.908004507962, scaled tolerance: 1.6975 
[IterativeImputer] Change: 950.15, scaled tolerance: 1.6975 
[IterativeImputer] Change: 798.5300000000001, scaled tolerance: 1.6975 
[IterativeImputer] Change: 948.0899999999999, scaled tolerance: 1.6975 
[IterativeImputer] Change: 1199.1300000000003, scaled tolerance: 1.6975 
[IterativeImputer] Completing matrix with shape (105166, 34)
[IterativeImputer] Completing matrix with sha

[IterativeImputer] Change: 5982.95403206991, scaled tolerance: 1.5118 
[IterativeImputer] Change: 881.6953328326664, scaled tolerance: 1.5118 
[IterativeImputer] Change: 439.2480515100924, scaled tolerance: 1.5118 
[IterativeImputer] Change: 260.58574628649916, scaled tolerance: 1.5118 
[IterativeImputer] Change: 332.2593840848925, scaled tolerance: 1.5118 
[IterativeImputer] Change: 441.83906438942284, scaled tolerance: 1.5118 
[IterativeImputer] Change: 366.9775863379437, scaled tolerance: 1.5118 
[IterativeImputer] Change: 154.94731992730368, scaled tolerance: 1.5118 
[IterativeImputer] Change: 37.71066236547796, scaled tolerance: 1.5118 
[IterativeImputer] Change: 35.58128835853455, scaled tolerance: 1.5118 
[IterativeImputer] Completing matrix with shape (105166, 34)
[IterativeImputer] Completing matrix with shape (105166, 34)
[IterativeImputer] Change: 6080.373215905371, scaled tolerance: 1.6975 
[IterativeImputer] Change: 847.4497371266439, scaled tolerance: 1.6975 
[IterativeIm

In [434]:
results_woD_RF
results_woD_RF.to_pickle("/Users/iditbela/Documents/Broday/saved_data_from_notebooks/results_woD_RF")

Unnamed: 0,RMSE,MedianAE,MeanAE,R2
0,9.276128,4.08,5.739155,0.840807
1,9.43285,4.06,5.744132,0.836131
2,9.288789,4.07,5.748222,0.844457
3,9.384672,4.07,5.732657,0.841935
4,9.580541,4.06,5.754636,0.836096
5,8.978527,4.06,5.70831,0.843317
6,9.436479,4.08,5.752286,0.822759
7,9.458052,4.06,5.748231,0.843511
8,9.773849,4.07,5.746742,0.83045
9,9.289947,4.06,5.728445,0.839699


In [435]:
results_woD_BR
results_woD_BR.to_pickle("/Users/iditbela/Documents/Broday/saved_data_from_notebooks/results_woD_BR")

Unnamed: 0,RMSE,MedianAE,MeanAE,R2
0,12.146577,4.757204,6.994312,0.727041
1,12.194144,4.789904,7.030386,0.726149
2,12.152302,4.783614,6.997097,0.733775
3,11.985335,4.777705,6.971577,0.742191
4,12.287126,4.785513,6.987615,0.730406
5,11.964026,4.740049,6.946813,0.721794
6,11.894747,4.788714,6.950691,0.718386
7,12.287924,4.753388,6.965011,0.735859
8,12.436219,4.793671,7.043854,0.725499
9,11.814214,4.763854,6.968005,0.740749


## B - without days, with normal data

## C - with days

In [447]:
r_PM25_withDays = r_PM25.copy()
r_PM25_withDays['week day'] = pd.to_datetime(times).dayofweek
r_PM25_withDays['month'] = pd.to_datetime(times).month
r_PM25_withDays['hour'] = pd.to_datetime(times).hour

In [None]:
results_wD_RF = KFold_cross_validation(imp_RF,r_PM25_withDays,k=10,withDays=True)
results_wD_BR = KFold_cross_validation(imp_BR,r_PM25_withDays,k=10,withDays=True)

[IterativeImputer] Completing matrix with shape (105166, 37)


In [None]:
results_wD_RF
results_wD_RF.to_pickle("/Users/iditbela/Documents/Broday/saved_data_from_notebooks/results_wD_RF")

In [None]:
results_wD_BR
results_wD_BR.to_pickle("/Users/iditbela/Documents/Broday/saved_data_from_notebooks/results_wD_BR")

## D - with days, with normal data

In [None]:
# wind/other met./other pollutants

# (2) KNN

## A - iterative imputer - VERY SLOW!

In [188]:
# imp = IterativeImputer(max_iter=1,estimator=KNeighborsRegressor(n_neighbors=20,weights='distance',n_jobs=-1),verbose=True) 

In [190]:
# results = KFold_cross_validation(imp,r_PM25,k=2,withDays=False)

## B - my KNN implemintation

In [386]:
# np_r_PM25 = r_PM25.iloc[:10,:].dropna(how='all',axis=1).values
# np_r_PM25 = r_PM25.iloc[:10,:].values
np_r_PM25 = r_PM25.values

In [387]:
# do I need to sort all rows so I start with imputing the row with the smallest number 
# of missing values? not sure it matters.

# for i,row in enumerate(np_r_PM25):  
#     # euclidean
#     distance.cdist(np_r_PM25, row.reshape(-1, 1).T, lambda u, v: np.sqrt(np.nansum((u-v)**2)))
#     print(i)
# each chunk I get, is an array of all distances of row i with all other rows.
# this is why in the first chunk 0 is first, in the second chunk 0 is second...
# Once distances are calculated, we must sort all of the records in the training 
# dataset by their distance to the new data (the desired row). 
# We can then select the top k to return as the most similar neighbors. 
    
def euclidean_distance(all_data, imputed_row):
    dist = distance.cdist(all_data, imputed_row.reshape(-1, 1).T, lambda u, v: np.sqrt(np.nansum((u-v)**2)))
    return dist
    
# Locate the most similar neighbors of a row. 
# all_data = np_r_PM25
# imputed_row = the current row you want to impute. 

def get_neighbors(all_data, imputed_row, num_neighbors):
    dist = euclidean_distance(all_data, imputed_row)
    distances = list(zip(np_r_PM25, dist))    
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors

# get_all_neighbors weigh all the neighbors. 
# so no need to sort
def get_all_neighbors(all_data, imputed_row):
    dist = euclidean_distance(all_data, imputed_row)
    distances = list(zip(np_r_PM25, dist))

    return neighbors
    
        
#     cosine doesnt work
#     distance.cdist(np.delete(np_r_PM25, i,axis=0), row.reshape(-1, 1).T, lambda u, v: np.dot(u, v)/(np.linalg.norm(u)*np.linalg.norm(v)))
#     distance.cdist(np.delete(np_r_PM25, i,axis=0), row.reshape(-1, 1).T, 'cosine') #euclidean

# dot(a, b)/(norm(a)*norm(b))

In [396]:
distances[3][1]

array([14.39201167])

In [383]:
neighbors = get_neighbors(np_r_PM25, np_r_PM25[0,:], 5)
for neighbor in neighbors:
    print(neighbor)   
    
np_r_PM25[0,:]

[ 0.7  4.8  nan  8.9  6.3  6.6  nan 11.  14.7  nan  1.7 10.5  1.4  0.2
  3.8  0.5  1.7  2.9  6.   nan  6.8  nan  nan  3.8  5.9  0.9  5.5  nan
  1.5  2.9 25.2  4.6 15.6 10.3  nan  3.7  nan  nan  nan]
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan]
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan]
[ 4.3  6.8  nan  8.1  nan  nan  nan  nan  nan  nan  4.6  nan  4.5  0.1
  4.7  0.3  2.1  3.5  4.   2.8  nan  nan  nan  1.1  5.7  0.1  3.2  nan
  nan 11.3 22.5  1.   9.   6.8  nan  1.2  nan  nan  nan]
[ 2.7  2.5 10.7  nan  4.1  nan  2.5  7.7  nan 10.2  3.6  4.8  2.8  2.3
  2.8  5.   nan  5.6  nan 13.2  1.1  5.1  6.7  nan  nan  3.5  2.8  nan
  2.4  0.9  nan  nan 22.3  4.2 10.   nan  5.  11.   5.8]


array([ 0.7,  4.8,  nan,  8.9,  6.3,  6.6,  nan, 11. , 14.7,  nan,  1.7,
       10.5,  1.4,  0.2,  3.8,  0.5,  1.7,  2.9,  6. ,  nan,  6.8,  nan,
        nan,  3.8,  5.9,  0.9,  5.5,  nan,  1.5,  2.9, 25.2,  4.6, 15.6,
       10.3,  nan,  3.7,  nan,  nan,  nan])

# (3) ARIMA/LSTM/Prophet for 1-? missing time-steps (short intervals)

# (4) The simplest interpolation for 1-? missing time-steps (short intervals)

In [None]:
# ARIMA might be problematic since I need to tune the parameters all the time. Does LSTM 
# require less parameters? in addition, I could predic short intervals but then continue 
# to predict with the LSTM with the long intervals as -1, as described in machinelearning
# mastery. 

# (5) Runing models 1-2 again with the short-interval imputed values

In [None]:
# others
# https://impyute.readthedocs.io/en/latest/index.html
# https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779
# statsmodels MICE
# datawig 


# (6) Statistical imputation?

In [None]:
# trash

s = pd.Series(tuple(map(tuple, not_nan_idx[test_index])))
vals = s.apply(lambda xy: r_PM25.iloc[xy[0],xy[1]])

In [178]:
PM25=r_PM25
k=2
withDays=False

kf = KFold(n_splits=k, random_state=0, shuffle=True)
not_nan_idx = np.argwhere(PM25.notnull().values)
results = []

In [179]:
for train_index, test_index in kf.split(not_nan_idx):
        train_index
        test_index

array([      1,       3,       4, ..., 1754019, 1754021, 1754024])

array([      0,       2,       6, ..., 1754023, 1754025, 1754026])

array([      0,       2,       6, ..., 1754023, 1754025, 1754026])

array([      1,       3,       4, ..., 1754019, 1754021, 1754024])

In [139]:
np_PM25 = PM25.values
X_missing = PM25.copy()
y_missing = PM25.copy()

# y_missing 
y_missing.iloc[:] = np.nan
np_y_missing = y_missing.values

# asssign values according to test indexes
rows, cols = zip(*not_nan_idx[test_index])
vals = np_PM25[rows, cols]
np_y_missing[rows, cols] = vals
# turn back to dataframe
y_missing = pd.DataFrame(np_y_missing,columns=PM25.columns)

# X_missing
# assign nans according to test indexes
np_X_missing = X_missing.values
np_X_missing[rows, cols] = np.nan

# turn back to dataframe
X_missing = pd.DataFrame(np_X_missing,columns=PM25.columns)

# mask all missing values
indicator = MissingIndicator(missing_values=np.nan)
mask_missing_values_original = indicator.fit_transform(PM25)
mask_missing_values_all = indicator.fit_transform(X_missing)

# perform fit 
imp.fit(X_missing)
imputed_df = imp.transform(X_missing) # impute it
imputed_df = pd.DataFrame(imputed_df, columns=X_missing.columns) #turn it from IterativeImputer object to a dataframe

[IterativeImputer] Completing matrix with shape (52606, 39)
[IterativeImputer] Change: 6112.810739330168, scaled tolerance: 1.133 



[IterativeImputer] Early stopping criterion not reached.



IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10, random_state=0),
                 max_iter=1, verbose=True)

[IterativeImputer] Completing matrix with shape (52606, 39)


In [180]:
# evaluate
# y_train = inverse_y_missing.values
y_train = vals

# y_pred = inverse_imputed_df.mask(~mask).values
np_imputed_df = imputed_df.values
y_pred = np_imputed_df[rows, cols]

In [181]:
y_train.shape
y_pred.shape

(877013,)

(877013,)

In [182]:
# assign results
RMSE = np.sqrt(mean_squared_error(y_train, y_pred))
MedianAE = median_absolute_error(y_train, y_pred)
MeanAE = mean_absolute_error(y_train,y_pred)
R2 = r2_score(y_train,y_pred)
# results.append([RMSE,MedianAE,MeanAE,R2])

In [183]:
# assign results
RMSE = np.sqrt(mean_squared_error(y_train, y_pred))
MedianAE = median_absolute_error(y_train, y_pred)
MeanAE = mean_absolute_error(y_train,y_pred)
R2 = r2_score(y_train,y_pred)
results.append([RMSE,MedianAE,MeanAE,R2])

In [184]:
results

[[12.065151202546158,
  4.540000000000001,
  6.837151125467923,
  0.5649296791889007]]