# Check if the value after the missing interval is unusually high using a global normal distribution

In [None]:
# Settings for the global normal-distribution check.
# Prominence handed to the peak detection; None makes the threshold helper fit on all values instead of on detected peaks.
PROMINENCE = None
# Minimal spacing between detected peaks (read as a global by get_thresholds_global).
DISTANCE = 2
# Probability mass of the normal interval whose bounds are used as thresholds.
THRESHOLD = 0.99
# NOTE(review): MODEL is not referenced in the cells shown here and NormalDistribution must come from an earlier cell — confirm it is still needed.
MODEL = NormalDistribution

In [None]:
def get_thresholds_global(row, prominence = 0.5, likelihood_threshold = 0.99, distance = None):
    """
    Fit a normal distribution on one complete profile and return value thresholds.

    Args:
        row: pandas Series with the values of a single profile. Missing values
            are replaced by 0 before anything else happens.
        prominence: prominence passed to ``scipy.signal.find_peaks``. When it is
            None no peak detection is done and the distribution is fit on all
            (zero-filled) values.
        likelihood_threshold: probability mass of the central normal interval
            whose bounds become the thresholds.
        distance: minimal spacing between detected peaks, passed to
            ``find_peaks``. None (the default) falls back on the notebook-level
            ``DISTANCE`` constant, which is what this function always used.

    Returns:
        Tuple ``(min_thres, max_thres)``. ``max_thres`` is the upper bound of the
        interval fit on the (positive) peaks. ``min_thres`` is the lower bound of
        the interval fit on the negative peaks, or NaN when the profile never
        goes below zero.
    """
    X = row.fillna(0).to_numpy()
    has_negative_values = row.min() < 0

    if prominence is None:
        # No peak detection: upper and lower bound come from one and the same fit.
        lower, max_thres = norm.interval(likelihood_threshold, *norm.fit(X))
        min_thres = lower if has_negative_values else np.nan
        return min_thres, max_thres

    if distance is None:
        # Resolved only when peaks are actually detected, so the fallback on the
        # notebook constant is not needed on the prominence=None path.
        distance = DISTANCE

    peaks, _ = find_peaks(X, prominence=prominence, distance=distance)
    _, max_thres = norm.interval(likelihood_threshold, *norm.fit(X[peaks]))

    min_thres = np.nan
    if has_negative_values:
        # This profile has injection, so negative peaks are possible. They are
        # the peaks of the mirrored signal; the fit has to use those positions
        # (the previous code detected them but then fit on the positive peaks).
        # The same distance constraint is applied as for the positive peaks.
        inverse_peaks, _ = find_peaks(-X, prominence=prominence, distance=distance)
        min_thres, _ = norm.interval(likelihood_threshold, *norm.fit(X[inverse_peaks]))
    return min_thres, max_thres

def get_global_gauss_peaks(profile_df, prominence = 0.5, likelihood_threshold = 0.99):
    """
    Flag the intervals whose first value after the gap lies outside the global thresholds.

    The thresholds are computed once per profile with ``get_thresholds_global``
    and joined onto the interval table.

    NOTE(review): ``profile_df`` is not used. The function reads the notebook
    globals ``data16_df`` (profiles) and ``nan_intervals`` (intervals) instead;
    confirm which of the two the parameter was meant to replace.

    Args:
        profile_df: currently ignored (see note above).
        prominence: forwarded to ``get_thresholds_global``.
        likelihood_threshold: forwarded to ``get_thresholds_global``.

    Returns:
        Boolean Series, indexed like ``nan_intervals``, that is True where
        ``first_value_after_end`` is below ``gauss_min_threshold`` or above
        ``gauss_max_threshold``.
    """
    thresholds = data16_df.apply(
        get_thresholds_global,
        axis=1,
        result_type='expand',
        prominence=prominence,
        likelihood_threshold=likelihood_threshold,
    ).rename(columns={0: 'gauss_min_threshold', 1: 'gauss_max_threshold'})
    thresholds = nan_intervals.join(thresholds)
    # Cells holding the literal 'end' become NaN so the comparisons below are False for them.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    is_gauss_peak = thresholds.replace({'end': np.nan}).eval(
        '(first_value_after_end < gauss_min_threshold) | (first_value_after_end > gauss_max_threshold)'
    )
    return is_gauss_peak


In [None]:
# Flag the NaN intervals using thresholds fit on each complete profile.
global_gauss_peaks = get_global_gauss_peaks(nan_intervals,PROMINENCE, THRESHOLD)
# Show how many intervals were flagged and how many were not.
global_gauss_peaks.value_counts().to_frame('count')

In [None]:
# Confusion matrix between an earlier gauss-peak result and the global gauss peaks.
# NOTE(review): `is_gauss_peak` is only a local variable inside the helper functions shown in this part of the notebook — confirm an earlier cell defines it at notebook level.
pd.crosstab(is_gauss_peak, global_gauss_peaks, rownames = ['gauss peaks'], colnames =['global_peaks'])

## Visualise some detected/non-detected peaks

In [None]:
# Position (within the list of detected profiles) of the profile to plot.
IDX = 1
# Interval table extended with the global gauss flag as an extra column.
vis_df = pd.concat([nan_intervals, global_gauss_peaks.to_frame('global_gauss_peak')], axis = 1)
# Unique values of the first index level for flagged / non-flagged intervals
# (the first level is unpacked as the meter ID by the threshold helpers).
detected_peak_ids = vis_df[vis_df.global_gauss_peak].index.get_level_values(0).unique()
non_detected_peak_ids = vis_df[~ vis_df.global_gauss_peak].index.get_level_values(0).unique()
# profile_to_show = 'Sl2clpSwmYpN1Q' # a profile with a clear peak that is not discovered using connection capacity
# Swap in non_detected_peak_ids here to inspect a profile without a flagged interval.
profile_to_show = detected_peak_ids[IDX]
print(profile_to_show)
# Plot the 2016 profile with its intervals coloured by the global gauss flag.
plot_profile_with_intervals(profile_to_show, 2016, data = vis_df, period_type_column = 'global_gauss_peak').properties(height = 400)

### Compare with the connection capacity peaks

In [None]:
# Confusion matrix: connection power peaks (rows, computed in an earlier cell) versus global gauss peaks (columns).
pd.crosstab(connection_power_peaks, global_gauss_peaks, rownames = ['connection power peaks'], colnames =['global_peaks'])

So this is good: almost all the connection_power_peaks are detected!

# Check if the value after the missing interval is unusually high using a local normal distribution

In [None]:
# Settings for the local normal-distribution check (same values as for the global check).
# Prominence handed to the peak detection; None makes the threshold helper fit on all values instead of on detected peaks.
PROMINENCE = None
# NOTE(review): get_thresholds_local does not pass a distance to find_peaks, so DISTANCE is not used by the local check in the cells shown here.
DISTANCE = 2
# Probability mass of the normal interval whose bounds are used as thresholds.
THRESHOLD = 0.99

In [None]:
def get_thresholds_local(row, window = '15D', prominence = 0.5, likelihood_threshold = 0.99, include_start = True):
    """
    Fit a normal distribution on the data around one interval and return value thresholds.

    Args:
        row: one row of the interval table. Its name must be the 4-tuple
            ``(meterID, year, start, end)`` and it must hold the columns
            ``start_time`` and ``end_time``.
        window: anything ``pd.Timedelta`` accepts; how far the data window
            reaches beyond the interval.
        prominence: prominence passed to ``scipy.signal.find_peaks``. When it is
            None no peak detection is done and all observed values in the window
            are used.
        likelihood_threshold: probability mass of the central normal interval
            whose bounds become the thresholds.
        include_start: when True the window is
            ``[start_time - window, end_time + window]``, otherwise
            ``[end_time, end_time + window]``.

    Returns:
        Tuple ``(min_thres, max_thres)``. ``max_thres`` is the upper bound of the
        interval fit on the (positive) peaks. ``min_thres`` is the lower bound of
        the interval fit on the negative peaks, or NaN when the window holds no
        negative value.
    """
    # Only the profile key is needed; unpacking all four levels keeps the
    # original requirement that the index has exactly four levels.
    meterID, year, _start, _end = row.name
    start_time, end_time = row['start_time'], row['end_time']
    delta = pd.Timedelta(window)
    if include_start:
        interval_start, interval_end = start_time - delta, end_time + delta
    else:
        interval_start, interval_end = end_time, end_time + delta

    # The value at end_time itself is left out of the window, presumably because
    # it is the value that gets tested against these thresholds — TODO confirm.
    profile = data16_df.loc[(meterID, year), interval_start:interval_end].drop(end_time, axis=0, errors='ignore')
    has_negative_values = profile.min() < 0

    if prominence is None:
        # No peak detection: both bounds come from one fit on the observed values.
        observed = profile.dropna().to_numpy()
        lower, max_thres = norm.interval(likelihood_threshold, *norm.fit(observed))
        min_thres = lower if has_negative_values else np.nan
        return min_thres, max_thres

    # Peak detection cannot handle NaN, so gaps are zero-filled for detection only;
    # the fit itself uses the original values with missing ones dropped.
    X = profile.fillna(0).to_numpy()
    peaks, _ = find_peaks(X, prominence=prominence)
    peak_values = profile.iloc[peaks].dropna().to_numpy()
    _, max_thres = norm.interval(likelihood_threshold, *norm.fit(peak_values))

    min_thres = np.nan
    if has_negative_values:
        # This profile has injection, so negative peaks are possible. They are
        # the peaks of the mirrored signal; the fit has to use those positions
        # (the previous code detected them but then fit on the positive peaks,
        # and on zero-filled instead of observed values).
        inverse_peaks, _ = find_peaks(-X, prominence=prominence)
        inverse_peak_values = profile.iloc[inverse_peaks].dropna().to_numpy()
        min_thres, _ = norm.interval(likelihood_threshold, *norm.fit(inverse_peak_values))
    return min_thres, max_thres

def get_local_gauss_peaks(interval_df, window = '15D', prominence = 0.5, likelihood_threshold = 0.99, include_start = True):
    """
    Flag the intervals whose first value after the gap lies outside the local thresholds.

    For every row of ``interval_df`` the thresholds are computed with
    ``get_thresholds_local`` on a window around that interval.

    Args:
        interval_df: interval table; needs the column ``first_value_after_end``
            plus whatever ``get_thresholds_local`` reads from each row.
        window: forwarded to ``get_thresholds_local``.
        prominence: forwarded to ``get_thresholds_local``.
        likelihood_threshold: forwarded to ``get_thresholds_local``.
        include_start: forwarded to ``get_thresholds_local``.

    Returns:
        Boolean Series, indexed like ``interval_df``, that is True where
        ``first_value_after_end`` is below ``gauss_min_threshold`` or above
        ``gauss_max_threshold``.
    """
    # progress_apply is only available after tqdm has been registered with pandas.
    thresholds = interval_df.progress_apply(
        get_thresholds_local,
        axis=1,
        result_type='expand',
        window=window,
        prominence=prominence,
        likelihood_threshold=likelihood_threshold,
        include_start=include_start,
    ).rename(columns={0: 'gauss_min_threshold', 1: 'gauss_max_threshold'})
    thresholds = pd.concat([interval_df, thresholds], axis=1)
    # Cells holding the literal 'end' become NaN so the comparisons below are False for them.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    is_gauss_peak = thresholds.replace({'end': np.nan}).eval(
        '(first_value_after_end < gauss_min_threshold) | (first_value_after_end > gauss_max_threshold)'
    )
    return is_gauss_peak


In [None]:
# Flag the NaN intervals using thresholds fit on a 5-day window on both sides of each interval.
local_gauss_peaks = get_local_gauss_peaks(nan_intervals, '5D', PROMINENCE, THRESHOLD)
# Show how many intervals were flagged and how many were not.
local_gauss_peaks.value_counts().to_frame('count')

### Confusion matrix between the local gauss peaks and connection power peaks

In [None]:
# Confusion matrix: connection power peaks (rows, computed in an earlier cell) versus local gauss peaks (columns).
pd.crosstab(connection_power_peaks, local_gauss_peaks, rownames = ['connection_power'], colnames =['local_peaks'])


So they mostly seem to agree! Some intervals are a global peak but not a local peak (which is weird). Other intervals are a local peak but not a global peak (this is possible).

## Showing some profiles

In [None]:
# Position (within the list of detected profiles) of the profile to plot.
IDX = 2
# Interval table extended with the local gauss flag as the column 'detected'.
vis_df = pd.concat([nan_intervals, local_gauss_peaks.to_frame('detected')], axis = 1)
# Unique values of the first index level for flagged / non-flagged intervals
# (the first level is unpacked as the meter ID by the threshold helpers).
detected_peak_ids = vis_df[vis_df.detected].index.get_level_values(0).unique()
non_detected_peak_ids = vis_df[~ vis_df.detected].index.get_level_values(0).unique()
# Swap in non_detected_peak_ids here to inspect a profile without a flagged interval.
profile_to_show = detected_peak_ids[IDX]
print(profile_to_show)
# Plot the 2016 profile with its intervals coloured by the 'detected' flag.
plot_profile_with_intervals(profile_to_show, 2016, data = vis_df, period_type_column = 'detected').properties(height = 400)

# Check if the value after the missing interval is unusually high using a very local normal distribution

In [None]:
# Same check as the local one, but fit only on the 2.5 hours that follow each interval
# (include_start = False) and without peak detection (prominence None).
# The window uses the lower-case 'h' unit: the upper-case 'H' alias is deprecated in recent pandas.
very_local_gauss_peaks = get_local_gauss_peaks(nan_intervals, '2h 30min', None, 0.99, include_start = False)
# Show how many intervals were flagged and how many were not.
very_local_gauss_peaks.value_counts().to_frame('count')

### Confusion matrix between the local gauss peaks and very local gauss peaks

In [None]:
# Confusion matrix: very local gauss peaks (rows) versus local gauss peaks (columns).
pd.crosstab(very_local_gauss_peaks, local_gauss_peaks, rownames = ['very_local'], colnames =['local_peaks'])


### Show some profiles

In [None]:
# Position (within the list of detected profiles) of the profile to plot.
IDX = 2
# Interval table extended with the very local gauss flag as the column 'detected'.
vis_df = pd.concat([nan_intervals, very_local_gauss_peaks.to_frame('detected')], axis = 1)
# Unique values of the first index level for flagged / non-flagged intervals
# (the first level is unpacked as the meter ID by the threshold helpers).
detected_peak_ids = vis_df[vis_df.detected].index.get_level_values(0).unique()
non_detected_peak_ids = vis_df[~ vis_df.detected].index.get_level_values(0).unique()
# Swap in non_detected_peak_ids here to inspect a profile without a flagged interval.
profile_to_show = detected_peak_ids[IDX]
print(profile_to_show)
# Plot the 2016 profile with its intervals coloured by the 'detected' flag.
plot_profile_with_intervals(profile_to_show, 2016, data = vis_df, period_type_column = 'detected').properties(height = 400)

# Helper method to detect peaks in the data

In [None]:
# For every profile that has at least one missing value: forward-fill along the profile,
# detect the peaks and keep their positions (first element of the find_peaks result)
# in a one-column frame.
# .ffill(axis=1) replaces fillna(method='ffill', axis=1), whose `method` argument is deprecated in recent pandas.
peaks16_df = (
    data16_df[data16_df.isna().any(axis = 1)]
    .ffill(axis = 1)
    .apply(lambda o: find_peaks(o, prominence = PROMINENCE)[0], axis = 1)
    .to_frame('peaks')
)
peaks16_df

# Check if the value after a missing interval is a peak 

In [None]:
def get_peaks_after_interval(interval_df, peaks_df):
    """
    Tell for every interval whether its ``end`` is one of the peaks of its profile.

    Args:
        interval_df: frame whose row labels are 4-tuples
            ``(meterID, year, start, end)``.
        peaks_df: frame indexed by ``(meterID, year)`` with a column ``peaks``
            holding the collection of peaks of that profile.

    Returns:
        The result of applying the membership test to every row of
        ``interval_df`` (one boolean per interval).
    """
    def _end_is_listed_peak(interval_row):
        # Only the profile key and the end of the interval matter here.
        meter_id, profile_year, _start, interval_end = interval_row.name
        profile_peaks = peaks_df.loc[(meter_id, profile_year), 'peaks']
        return interval_end in profile_peaks

    return interval_df.apply(_end_is_listed_peak, axis=1)

In [None]:
# Flag the intervals whose end is listed among the detected peaks of the profile.
after_interval_peaks = get_peaks_after_interval(nan_intervals, peaks16_df)
# Show how many intervals were flagged and how many were not.
after_interval_peaks.value_counts().to_frame('count')

In [None]:
# Confusion matrix: gauss peaks (rows) versus peaks right after an interval (columns).
# NOTE(review): `gauss_peaks` is not defined in the cells shown here — confirm which of the gauss results (global, local, very local) it refers to.
pd.crosstab(gauss_peaks, after_interval_peaks, rownames = ['detected gauss peaks'], colnames =['peaks'])