In [397]:
from scipy.io import loadmat
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from tqdm.auto import tqdm 


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from scipy.spatial import distance


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format = 'retina'

In [411]:
import warnings
warnings.filterwarnings("ignore")

In [398]:
PM25 = pd.read_pickle("/Users/iditbela/Documents/Broday/saved_data_from_notebooks/PM25")

In [399]:
times = pd.date_range(start='2016-01-01 00:00:00', end='2018-12-31 23:00:00', freq='30Min') #one less because the last is always nan

In [400]:
start_year = PM25.shape[0]-times.shape[0]

In [401]:
# remove the last index as it is always nan
PM25 = PM25[:-1]
times = times[:-1]

In [402]:
threshold = 0.6 # how much non-missing values are in the time-series in order to include the station?

In [403]:
# reduced PM25 
r_PM25 = PM25[start_year:] 
idx = r_PM25.notnull().sum(axis = 0)/r_PM25.shape[0]>threshold
r_PM25 = r_PM25.loc[:, idx]

In [404]:
r_PM25.reset_index(inplace=True)
r_PM25.drop(labels = 'index',axis=1, inplace=True)

In [405]:
# r_PM25['datetime'] = pd.to_datetime(times)
r_PM25_withDays = r_PM25.copy()
r_PM25_withDays['week day'] = pd.to_datetime(times).dayofweek
r_PM25_withDays['month'] = pd.to_datetime(times).month
r_PM25_withDays['hour'] = pd.to_datetime(times).hour

In [406]:
# null percentages
r_PM25.shape[1]
1-r_PM25.notnull().sum(axis = 0)/r_PM25.shape[0]

39

AFULA               0.029825
ANTOKOLSKY          0.061381
EHAD_HAAM           0.316922
HOLON               0.079402
IRONID              0.053264
KVISH4              0.399346
RAKEVET_HASHALOM    0.116070
REMEZ               0.107858
RISHON_LEZION       0.046763
YEFET_YAFO          0.205661
AHUZA_G             0.021461
ATZMAUT_B           0.076645
KIRYAT_ATA          0.346671
KIRYAT_BINYAMIN     0.051667
KIRYAT_TIVON        0.043607
NAVE_SHANAAN        0.031118
NESHER              0.147398
EFRATA              0.030130
NAVE_ILAN           0.297419
ASHDOD_IGUD         0.085903
ASHKELON_SOUTH      0.139490
BNEI_DAROM          0.351139
DALYA               0.151903
GEDERA              0.101072
GVARAAM             0.096643
KIRYAT_MALAHI       0.039596
NIR_ISRAEL          0.121317
ORT                 0.325039
SDEROT              0.061628
SDE_YOAV            0.067179
YAHALOM             0.366631
BEER_SHEVA          0.101167
EAST_NEGEV          0.112573
KFAR_MASARIK        0.093164
PARDES_HANA   

In [407]:
r_PM25.shape[0]
r_PM25.shape[1]
r_PM25.shape[1]*r_PM25.shape[0]

52606

39

2051634

# Functions

In [408]:
# keep all random stuff identical
rnd_state = 0

In [409]:
# out of all non-nan indexes, perform 10-fold cross validation.
# the test is y_missing. copy r_PM25, put all null inside, and assign values from r_PM25 according to test indexes. 
# the train is X_missing. copy r_PM25, and assign nans according to test indexes. 
# the splitting currently doesn't try to preserve the original relative missing intervals
# of each feature. maybe I will add it somehow later. 

def KFold_cross_validation(imp,PM25,k,withDays=False):
    kf = KFold(n_splits=k, random_state=rnd_state, shuffle=True)
    not_nan_idx = np.argwhere(PM25.notnull().values)
    results = []
    
    for train_index, test_index in kf.split(not_nan_idx):
        np_PM25 = PM25.values
        X_missing = PM25.copy()
        y_missing = PM25.copy()
        
        # y_missing 
        y_missing.iloc[:] = np.nan
        np_y_missing = y_missing.values
        
        # asssign values according to test indexes
        rows, cols = zip(*not_nan_idx[test_index])
        vals = np_PM25[rows, cols]
        np_y_missing[rows, cols] = vals
        # turn back to dataframe
        y_missing = pd.DataFrame(np_y_missing,columns=PM25.columns)

        # X_missing
        # assign nans according to test indexes
        np_X_missing = X_missing.values
        np_X_missing[rows, cols] = np.nan
        
        # turn back to dataframe
        X_missing = pd.DataFrame(np_X_missing,columns=PM25.columns)
        
        if withDays:
            X_missing['week day']=PM25['week day']
            X_missing['hour']=PM25['hour']
            X_missing['month']=PM25['month']
        
        # mask all missing values
        indicator = MissingIndicator(missing_values=np.nan)
        mask_missing_values_original = indicator.fit_transform(PM25)
        mask_missing_values_all = indicator.fit_transform(X_missing)

        # perform fit 
        imp.fit(X_missing)
        imputed_df = imp.transform(X_missing) # impute it
        imputed_df = pd.DataFrame(imputed_df, columns=X_missing.columns) #turn it from IterativeImputer object to a dataframe
        
        if withDays:
            imputed_df.drop(['week day','hour','month'],axis = 1, inplace=True)
                       
        # evaluate
        y_train = vals
        np_imputed_df = imputed_df.values
        y_pred = np_imputed_df[rows, cols]
        
        # assign results
        RMSE = np.sqrt(mean_squared_error(y_train, y_pred))
        MedianAE = median_absolute_error(y_train, y_pred)
        MeanAE = mean_absolute_error(y_train,y_pred)
        R2 = r2_score(y_train,y_pred)
        results.append([RMSE,MedianAE,MeanAE,R2])
               
    results = pd.DataFrame(results, columns=['RMSE','MedianAE','MeanAE','R2'])
    
    return results
        
        

# (1) IterativeImputer with BayesianRidge and ExtraTreesRegressor

In [410]:
imp_RF = IterativeImputer(max_iter=5,estimator=ExtraTreesRegressor(n_estimators=10,random_state=rnd_state),verbose=True) 
imp_BR = IterativeImputer(max_iter=10,estimator=BayesianRidge(),verbose=True) 
# try also to change the initial imputer - mean/median/constant...
# and n_estimators (number of trees in the forest...)

## A - without days 

In [176]:
results_woD_RF = KFold_cross_validation(imp_RF,r_PM25,k=10,withDays=False)
results_woD_BR = KFold_cross_validation(imp_BR,r_PM25,k=10,withDays=False)

[IterativeImputer] Completing matrix with shape (52606, 39)
[IterativeImputer] Change: 6296.2617190883475, scaled tolerance: 1.2255 
[IterativeImputer] Completing matrix with shape (52606, 39)



[IterativeImputer] Early stopping criterion not reached.



(877014,)
(877014,)
[IterativeImputer] Completing matrix with shape (52606, 39)
[IterativeImputer] Change: 6112.810739330168, scaled tolerance: 1.133 
[IterativeImputer] Completing matrix with shape (52606, 39)



[IterativeImputer] Early stopping criterion not reached.



(877013,)
(877013,)


In [177]:
results_woD

Unnamed: 0,RMSE,MedianAE,MeanAE,R2
0,12.112761,4.55,6.847774,0.564379
1,12.065151,4.54,6.837151,0.56493


## B - without days, with normal data

## C - with days

In [None]:
r_PM25_withDays = r_PM25.copy()
r_PM25_withDays['week day'] = pd.to_datetime(times).dayofweek
r_PM25_withDays['month'] = pd.to_datetime(times).month
r_PM25_withDays['hour'] = pd.to_datetime(times).hour

In [None]:
results_wD_RF = KFold_cross_validation(imp_RF,r_PM25_withDays,k=10,withDays=True)
results_wD_BR = KFold_cross_validation(imp_BR,r_PM25_withDays,k=10,withDays=True)

## D - with days, with normal data

In [None]:
# wind/other met./other pollutants

# (2) KNN

## A - iterative imputer - VERY SLOW!

In [188]:
imp = IterativeImputer(max_iter=1,estimator=KNeighborsRegressor(n_neighbors=20,weights='distance',n_jobs=-1),verbose=True) 

In [190]:
results = KFold_cross_validation(imp,r_PM25,k=2,withDays=False)

## B - my KNN implemintation

In [386]:
# np_r_PM25 = r_PM25.iloc[:10,:].dropna(how='all',axis=1).values
# np_r_PM25 = r_PM25.iloc[:10,:].values
np_r_PM25 = r_PM25.values

In [387]:
# do I need to sort all rows so I start with imputing the row with the smallest number 
# of missing values? not sure it matters.

# for i,row in enumerate(np_r_PM25):  
#     # euclidean
#     distance.cdist(np_r_PM25, row.reshape(-1, 1).T, lambda u, v: np.sqrt(np.nansum((u-v)**2)))
#     print(i)
# each chunk I get, is an array of all distances of row i with all other rows.
# this is why in the first chunk 0 is first, in the second chunk 0 is second...
# Once distances are calculated, we must sort all of the records in the training 
# dataset by their distance to the new data (the desired row). 
# We can then select the top k to return as the most similar neighbors. 
    
def euclidean_distance(all_data, imputed_row):
    dist = distance.cdist(all_data, imputed_row.reshape(-1, 1).T, lambda u, v: np.sqrt(np.nansum((u-v)**2)))
    return dist
    
# Locate the most similar neighbors of a row. 
# all_data = np_r_PM25
# imputed_row = the current row you want to impute. 

def get_neighbors(all_data, imputed_row, num_neighbors):
    dist = euclidean_distance(all_data, imputed_row)
    distances = list(zip(np_r_PM25, dist))    
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors

# get_all_neighbors weigh all the neighbors. 
# so no need to sort
def get_all_neighbors(all_data, imputed_row):
    dist = euclidean_distance(all_data, imputed_row)
    distances = list(zip(np_r_PM25, dist))

    return neighbors
    
        
#     cosine doesnt work
#     distance.cdist(np.delete(np_r_PM25, i,axis=0), row.reshape(-1, 1).T, lambda u, v: np.dot(u, v)/(np.linalg.norm(u)*np.linalg.norm(v)))
#     distance.cdist(np.delete(np_r_PM25, i,axis=0), row.reshape(-1, 1).T, 'cosine') #euclidean

# dot(a, b)/(norm(a)*norm(b))

In [396]:
distances[3][1]

array([14.39201167])

In [383]:
neighbors = get_neighbors(np_r_PM25, np_r_PM25[0,:], 5)
for neighbor in neighbors:
    print(neighbor)   
    
np_r_PM25[0,:]

[ 0.7  4.8  nan  8.9  6.3  6.6  nan 11.  14.7  nan  1.7 10.5  1.4  0.2
  3.8  0.5  1.7  2.9  6.   nan  6.8  nan  nan  3.8  5.9  0.9  5.5  nan
  1.5  2.9 25.2  4.6 15.6 10.3  nan  3.7  nan  nan  nan]
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan]
[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan]
[ 4.3  6.8  nan  8.1  nan  nan  nan  nan  nan  nan  4.6  nan  4.5  0.1
  4.7  0.3  2.1  3.5  4.   2.8  nan  nan  nan  1.1  5.7  0.1  3.2  nan
  nan 11.3 22.5  1.   9.   6.8  nan  1.2  nan  nan  nan]
[ 2.7  2.5 10.7  nan  4.1  nan  2.5  7.7  nan 10.2  3.6  4.8  2.8  2.3
  2.8  5.   nan  5.6  nan 13.2  1.1  5.1  6.7  nan  nan  3.5  2.8  nan
  2.4  0.9  nan  nan 22.3  4.2 10.   nan  5.  11.   5.8]


array([ 0.7,  4.8,  nan,  8.9,  6.3,  6.6,  nan, 11. , 14.7,  nan,  1.7,
       10.5,  1.4,  0.2,  3.8,  0.5,  1.7,  2.9,  6. ,  nan,  6.8,  nan,
        nan,  3.8,  5.9,  0.9,  5.5,  nan,  1.5,  2.9, 25.2,  4.6, 15.6,
       10.3,  nan,  3.7,  nan,  nan,  nan])

# (3) ARIMA/LSTM/Prophet for 1-? missing time-steps (short intervals)

# (4) The simplest interpolation for 1-? missing time-steps (short intervals)

In [None]:
# ARIMA might be problematic since I need to tune the parameters all the time. Does LSTM 
# require less parameters? in addition, I could predic short intervals but then continue 
# to predict with the LSTM with the long intervals as -1, as described in machinelearning
# mastery. 

# (5) Runing models 1-2 again with the short-interval imputed values

In [None]:
# others
# https://impyute.readthedocs.io/en/latest/index.html
# https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779
# statsmodels MICE
# datawig 


# (6) Statistical imputation?

In [None]:
# trash

s = pd.Series(tuple(map(tuple, not_nan_idx[test_index])))
vals = s.apply(lambda xy: r_PM25.iloc[xy[0],xy[1]])

In [178]:
PM25=r_PM25
k=2
withDays=False

kf = KFold(n_splits=k, random_state=0, shuffle=True)
not_nan_idx = np.argwhere(PM25.notnull().values)
results = []

In [179]:
for train_index, test_index in kf.split(not_nan_idx):
        train_index
        test_index

array([      1,       3,       4, ..., 1754019, 1754021, 1754024])

array([      0,       2,       6, ..., 1754023, 1754025, 1754026])

array([      0,       2,       6, ..., 1754023, 1754025, 1754026])

array([      1,       3,       4, ..., 1754019, 1754021, 1754024])

In [139]:
np_PM25 = PM25.values
X_missing = PM25.copy()
y_missing = PM25.copy()

# y_missing 
y_missing.iloc[:] = np.nan
np_y_missing = y_missing.values

# asssign values according to test indexes
rows, cols = zip(*not_nan_idx[test_index])
vals = np_PM25[rows, cols]
np_y_missing[rows, cols] = vals
# turn back to dataframe
y_missing = pd.DataFrame(np_y_missing,columns=PM25.columns)

# X_missing
# assign nans according to test indexes
np_X_missing = X_missing.values
np_X_missing[rows, cols] = np.nan

# turn back to dataframe
X_missing = pd.DataFrame(np_X_missing,columns=PM25.columns)

# mask all missing values
indicator = MissingIndicator(missing_values=np.nan)
mask_missing_values_original = indicator.fit_transform(PM25)
mask_missing_values_all = indicator.fit_transform(X_missing)

# perform fit 
imp.fit(X_missing)
imputed_df = imp.transform(X_missing) # impute it
imputed_df = pd.DataFrame(imputed_df, columns=X_missing.columns) #turn it from IterativeImputer object to a dataframe

[IterativeImputer] Completing matrix with shape (52606, 39)
[IterativeImputer] Change: 6112.810739330168, scaled tolerance: 1.133 



[IterativeImputer] Early stopping criterion not reached.



IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10, random_state=0),
                 max_iter=1, verbose=True)

[IterativeImputer] Completing matrix with shape (52606, 39)


In [180]:
# evaluate
# y_train = inverse_y_missing.values
y_train = vals

# y_pred = inverse_imputed_df.mask(~mask).values
np_imputed_df = imputed_df.values
y_pred = np_imputed_df[rows, cols]

In [181]:
y_train.shape
y_pred.shape

(877013,)

(877013,)

In [182]:
# assign results
RMSE = np.sqrt(mean_squared_error(y_train, y_pred))
MedianAE = median_absolute_error(y_train, y_pred)
MeanAE = mean_absolute_error(y_train,y_pred)
R2 = r2_score(y_train,y_pred)
# results.append([RMSE,MedianAE,MeanAE,R2])

In [183]:
# assign results
RMSE = np.sqrt(mean_squared_error(y_train, y_pred))
MedianAE = median_absolute_error(y_train, y_pred)
MeanAE = mean_absolute_error(y_train,y_pred)
R2 = r2_score(y_train,y_pred)
results.append([RMSE,MedianAE,MeanAE,R2])

In [184]:
results

[[12.065151202546158,
  4.540000000000001,
  6.837151125467923,
  0.5649296791889007]]