In [1]:
from scipy.io import loadmat
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
from tqdm.auto import tqdm 

import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import MissingIndicator
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_absolute_error #=mean error (Simon 2012)
from sklearn.metrics import r2_score

from scipy.spatial import distance
from sklearn.metrics.pairwise import nan_euclidean_distances

from matplotlib.lines import Line2D
from matplotlib import rcParams, cycler
from sklearn.preprocessing import PowerTransformer

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format = 'retina'

In [2]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries imported above — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Functions

In [3]:
def initialize(threshold, start_time,
               data_path="/Users/iditbela/Documents/Broday/saved_data_from_notebooks/PM25"):
    """Load the pickled PM2.5 table and keep only the well-populated columns.

    The table is cut to the rows that line up with a half-hourly time axis running
    from ``start_time`` to 2018-12-31 23:00:00, the (always-NaN) last row is dropped,
    and only columns whose fraction of non-missing values is strictly greater than
    ``threshold`` are kept.

    Parameters
    ----------
    threshold : float
        Minimum fraction of non-missing values (exclusive) a column must have
        in the selected period to be kept.
    start_time : str or datetime-like
        First timestamp of the period of interest.
    data_path : str, optional
        Location of the pickled DataFrame. Defaults to the path that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    times : pandas.DatetimeIndex
        Half-hourly timestamps, one per returned row.
    r_PM25 : pandas.DataFrame
        The reduced table with a fresh 0..n-1 integer index.

    Raises
    ------
    ValueError
        If the requested period contains more half-hour steps than the table has rows.
    """
    # NOTE(review): unpickling can execute arbitrary code — only load trusted files.
    PM25 = pd.read_pickle(data_path)

    times = pd.date_range(start=start_time, end='2018-12-31 23:00:00', freq='30Min') #one less because the last is always nan

    # Rows are aligned from the end of the table, so the offset of the first
    # wanted row is the difference in lengths (computed before trimming both).
    start_idx = PM25.shape[0] - times.shape[0]
    if start_idx < 0:
        # A negative offset would silently slice from the wrong end of the table.
        raise ValueError(
            "start_time requests %d half-hour steps but the data only has %d rows"
            % (times.shape[0], PM25.shape[0])
        )

    # remove the last index as it is always nan
    PM25 = PM25[:-1]
    times = times[:-1]

    # reduced PM25
    r_PM25 = PM25[start_idx:]
    keep = r_PM25.notnull().sum(axis=0) / r_PM25.shape[0] > threshold

    # drop=True discards the old index directly: no in-place mutation of a slice,
    # and no dependence on the old index being unnamed (column called 'index').
    r_PM25 = r_PM25.loc[:, keep].reset_index(drop=True)

    return times, r_PM25

In [None]:
# create a np_array np_y_missing, fill the np_y_missing with values where the validation
# indexes are chosen. 

def get_validation_index(PM25, interval_length, s, exclude):
    """Randomly pick a window of column ``s`` to hold out as an artificial gap.

    A window ``[start, start + interval_length)`` is eligible when all of the
    following hold:

    * at most 0.5% of its values are already missing;
    * the values immediately before it (row ``start - 1``) and immediately after
      it (row ``start + interval_length``) are observed;
    * it lies inside the column with one row to spare on each side, so both
      neighbours exist (no negative-index wrap-around, no IndexError);
    * none of its rows is listed in ``exclude``.

    One eligible start is drawn uniformly with ``np.random.choice`` (global NumPy
    random state, so ``np.random.seed`` controls reproducibility).

    Parameters
    ----------
    PM25 : pandas.DataFrame
        Table whose column ``s`` is searched.
    interval_length : int
        Window length in rows.
    s : int
        Positional index of the column.
    exclude : iterable
        Row positions that must not be part of the window. Items may be plain
        integers or iterables of integers (e.g. ``range`` objects); both are
        accepted so that a list of ranges behaves as intended.

    Returns
    -------
    tuple of (int, int)
        ``(start, stop)`` of the chosen half-open window.

    Raises
    ------
    ValueError
        If ``interval_length`` is smaller than 1, or if no eligible window
        exists (instead of retrying forever).
    """
    interval_length = int(interval_length)
    if interval_length < 1:
        raise ValueError("interval_length must be at least 1")

    is_null = PM25.iloc[:, s].isnull().values
    n_rows = is_null.shape[0]

    # Flatten `exclude`: an int never compares equal to a range object, so a
    # list of ranges tested with `i not in exclude` would exclude nothing.
    flat_exclude = []
    for item in exclude:
        if isinstance(item, (int, np.integer)):
            flat_exclude.append(item)
        else:
            flat_exclude.extend(item)
    is_excluded = np.zeros(n_rows, dtype=bool)
    if flat_exclude:
        positions = np.asarray(flat_exclude, dtype=np.int64)
        positions = positions[(positions >= 0) & (positions < n_rows)]
        is_excluded[positions] = True

    # Starts for which both neighbours (start-1 and stop) are valid positions.
    first_start = 1
    last_start = n_rows - interval_length - 1
    if last_start < first_start:
        raise ValueError(
            "column %s has %d rows: too short for an interval of %d rows plus one "
            "observed neighbour on each side" % (s, n_rows, interval_length)
        )
    starts = np.arange(first_start, last_start + 1)
    stops = starts + interval_length

    # Prefix sums give the number of missing / excluded rows of every window at once.
    null_cum = np.concatenate(([0], np.cumsum(is_null)))
    excl_cum = np.concatenate(([0], np.cumsum(is_excluded)))
    nulls_in_window = null_cum[stops] - null_cum[starts]
    excluded_in_window = excl_cum[stops] - excl_cum[starts]

    # I allow some nan values (0.5%) to be inside the validation interval.
    eligible = (
        (nulls_in_window <= interval_length / 100 / 2)
        & (excluded_in_window == 0)
        & ~is_null[starts - 1]
        & ~is_null[stops]
    )
    candidates = starts[eligible]
    if candidates.size == 0:
        raise ValueError(
            "no eligible validation interval of %d rows in column %s"
            % (interval_length, s)
        )

    idx_start = int(np.random.choice(candidates))
    return (idx_start, idx_start + interval_length)

In [None]:
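# a function that builds the validation mask: an array shaped like the data in which
# the artificially held-out intervals are labelled by interval class (0 = not held out)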


def return_X_y_for_validation(PM25, IL, n_skip_last=33,
                              counts=(1, 6, 30, 60, 360, 720, 1440), seed=None):
    """Build a mask marking artificial validation gaps in each processed column.

    For every processed column, ``counts[k]`` windows of ``IL[k]`` hours are
    drawn with ``get_validation_index`` and labelled ``k + 1`` in the mask
    (with the defaults: 1 = 720 h, 2 = 120 h, 3 = 24 h, 4 = 12 h, 5 = 2 h,
    6 = 1 h, 7 = 0.5 h). Hours are converted to rows by multiplying by 2,
    i.e. the data are taken to have two rows per hour.

    Parameters
    ----------
    PM25 : pandas.DataFrame
        Data table; only its shape, dtype and missing-value pattern are used.
    IL : sequence of numbers
        Interval lengths in hours, one per interval class.
    n_skip_last : int, optional
        Number of trailing columns that receive no validation gaps. The
        default (33) reproduces the previously hard-coded behavior.
        NOTE(review): the reason for skipping 33 columns is not visible here.
    counts : sequence of int, optional
        How many windows to draw for each entry of ``IL``. The default
        reproduces the previously hard-coded 1/6/30/60/360/720/1440.
    seed : int or None, optional
        If given, seeds NumPy's global random state first so the mask is
        reproducible. ``None`` leaves the random state untouched.

    Returns
    -------
    numpy.ndarray
        Array with the shape and dtype of ``PM25.values``; 0 where nothing is
        held out, otherwise the 1-based interval-class label.

    Raises
    ------
    ValueError
        If ``IL`` and ``counts`` differ in length.
    """
    if len(IL) != len(counts):
        raise ValueError(
            "IL has %d interval lengths but counts has %d entries" % (len(IL), len(counts))
        )
    if seed is not None:
        np.random.seed(seed)

    np_r_PM25_y_mask = np.zeros_like(PM25.values)
    n_stations = max(np_r_PM25_y_mask.shape[1] - n_skip_last, 0)

    for s in range(n_stations):

        # Row positions already used by a gap in this column, kept as a flat set
        # of ints so that membership tests actually match (a list of range objects
        # never equals an int) and stay O(1).
        total_exclude = set()

        for label, (hours, n_gaps) in enumerate(zip(IL, counts), start=1):
            interval_length = int(hours * 2)  # hours -> half-hour rows
            for _ in tqdm(range(n_gaps), desc="column %d, %s h gaps" % (s, hours)):
                (a, b) = get_validation_index(PM25, interval_length, s, total_exclude)
                total_exclude.update(range(a, b))
                np_r_PM25_y_mask[a:b, s] = label

    return np_r_PM25_y_mask
    




# a function that imputes (and saves imputation results)

# a function that computes validation results

In [None]:
threshold = 0.6 # minimum fraction of non-missing values a station's time series must exceed to be included
# First timestamp of the analysed period; the period always ends at the end of 2018.
start_time = '2013-01-01 00:00:00'
# times: half-hourly timestamps; r_PM25: the reduced table restricted to the kept stations.
times, r_PM25 = initialize(threshold, start_time)

In [None]:
# Lengths, in hours, of the artificial gaps to create (longest first); each is
# doubled inside return_X_y_for_validation to get a length in half-hour rows.
IL = [720,120,24,12,2,1,0.5]   
# Alternative set of interval lengths (kept for reference):
# IL = [1200, 100, 50, 10, 5, 1, 0.5]

# Mask shaped like r_PM25: 0 = not held out, 1..7 = held out, numbered by position in IL.
np_r_PM25_y_mask = return_X_y_for_validation(r_PM25,IL)

In [None]:
# m (number of iterations) should be approximately equal to the average missing percentage,
# which is about 15% in my case. It is OK to start with 5-10 though.
# Drop the 5 rows in which all columns are empty!!!

In [4]:
# Ways to check randomness - https://www.theanalysisfactor.com/missing-data-mechanism/
# Mark missing values as 1 and non-missing values as 0, then run a t-test or chi-square
# test between the missingness indicator of one variable (column) and all other
# variables (columns). This tells me whether the "missing" and "non-missing" groups look
# like they are drawn from the same population. I want the p-value to be non-significant:
# that means there is no evidence that the groups differ, which is consistent with MCAR.
# A significant p-value means missingness is related to the other variable, so I would
# suspect that it is not completely random.
# MAYBE THE 0-1 IS TO TEST THE ORDER OF THE VALUES? IF IN RANDOM. = TEST FOR MCAR
# Another option (I think a test for MAR): if I suspect that it is not completely random
# (due to malfunctioning at a certain
# time maybe), I can try to train a logistic regression to see if time of day can
# predict the missing / non-missing classification.

In [None]:
# check the distribution of the imputed values Vs. the dist. of the non-missing values.


'''The idea is that good imputations have a distribution 
similar to the observed data. In other words, the imputations
could have been real values had they been observed. 
Except under MCAR, the distributions do not need to be identical, 
since strong MAR mechanisms may induce systematic differences between 
the two distributions. However, any dramatic differences between the 
imputed and observed data should certainly alert us to the possibility
that something is wrong.'''

In [None]:
# https://stefvanbuuren.name/fimd/sec-nutshell.html
# http://www.stat.columbia.edu/~gelman/arm/missing.pdf