In [None]:
import math
import json
import os
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta, time
from research.utils.data_access_utils import RDSAccessUtils
from research.weight_estimation.keypoint_utils.optics import euclidean_distance, pixel2world, depth_from_disp, convert_to_world_point
import matplotlib.pyplot as plt
from matplotlib.dates import AutoDateFormatter, AutoDateLocator

# Build the data-warehouse accessor from the JSON credentials file whose path is given
# by the DATA_WAREHOUSE_SQL_CREDENTIALS environment variable (KeyError if it is unset).
# The `with` block closes the file handle deterministically instead of leaving it to
# the garbage collector, and avoids keeping the parsed credentials in a notebook global.
with open(os.environ['DATA_WAREHOUSE_SQL_CREDENTIALS']) as credentials_file:
    rds_access_utils = RDSAccessUtils(json.load(credentials_file))

In [None]:
# Read the duplicate-annotation export and order the rows by their `captured_at` value
# (sorted as read from the CSV, i.e. before any datetime parsing).
duplicates = (
    pd.read_csv('aplavika_p95_duplicate.csv')
    .sort_values('captured_at')
    .copy(deep=True)
)

# Index the frame by the parsed capture time; the `captured_at` column itself is kept
# unchanged because later cells compare it against date strings.
duplicates.index = pd.to_datetime(duplicates['captured_at'])

# Calendar date (as a string) and hour of day derived from the datetime index.
dates = duplicates.index.date.astype(str)
duplicates = duplicates.assign(date=dates, hour=duplicates.index.hour)

In [None]:
# Preview the first rows of the indexed frame, including the derived `date` and `hour`
# columns added in the loading cell.
duplicates.head()

In [None]:
# Compare the hour-of-day distribution of rows flagged as duplicates against all
# remaining rows.
mask = (duplicates['is_duplicate'] == 1)

# One bin per clock hour, with the same edges for both histograms. Passing `bins=24`
# would instead spread 24 bins over each subset's own min..max hour, so the two
# histograms could end up with different, non-integer edges whenever a subset does not
# cover the full 0-23 range, and the bars would not be comparable one-to-one.
hour_bins = np.arange(25)

plt.figure(figsize=(20, 10))
plt.hist(duplicates.loc[mask, 'hour'], bins=hour_bins, alpha=0.5, color='blue',
         density=True, label='is_duplicate == 1')
plt.hist(duplicates.loc[~mask, 'hour'], bins=hour_bins, alpha=0.5, color='red',
         density=True, label='is_duplicate != 1')
plt.xlabel('hour')
plt.ylabel('density')
plt.legend();

In [None]:
# Row counts of the two samples compared in this notebook.
# NOTE(review): within the visible notebook `dist1` and `dist2` are only assigned in a
# later cell, so running the notebook top-to-bottom on a fresh kernel raises NameError
# here. Run this after the cell that defines them, or move it below that cell.
len(dist1), len(dist2)

In [None]:
from scipy.stats import t, norm

# Row selections (boolean Series aligned with `duplicates`):
#   mask0 - rows whose `captured_at` string sorts inside the analysis window
#   mask1 - mask0 plus optional extra filters (none enabled at the moment)
#   mask2 - mask1 restricted to rows with is_duplicate == 0
# A narrower window tried earlier: '2020-07-22' < captured_at < '2020-07-25'.
mask0 = (duplicates['captured_at'] > '2020-01-01') & (duplicates['captured_at'] < '2021-01-01')
# Extra filters tried earlier: |theta| < 10 and |phi| < 10, or 5 < hour < 17.
mask1 = mask0
mask2 = mask1 & (duplicates['is_duplicate'] == 0)

# Series.sum() counts the True entries without a Python-level loop over every row.
print(mask0.sum(), mask1.sum(), mask2.sum())

dist1 = duplicates[mask1]  # every selected row
dist2 = duplicates[mask2]  # selected rows with flagged duplicates removed

# Normal fits of the estimated weights for both samples.
mean, std = norm.fit(dist1['estimated_weight_g'])
mean2, std2 = norm.fit(dist2['estimated_weight_g'])

print(len(dist1), len(dist2))

# Figure 1: raw counts per bin for both samples.
fig, ax_counts = plt.subplots(figsize=(20, 10))
ax_counts.hist(dist1['estimated_weight_g'], color='blue', alpha=0.5, bins=30, label='all rows')
ax_counts.hist(dist2['estimated_weight_g'], color='red', alpha=0.5, bins=30, label='is_duplicate == 0')
ax_counts.set_xlabel('estimated_weight_g')
ax_counts.set_ylabel('count')

xmin, xmax = ax_counts.get_xlim()
x = np.linspace(xmin, xmax, 1000)

y = norm.pdf(x, mean, std)
y2 = norm.pdf(x, mean2, std2)

# The histograms above show counts, whereas the fitted pdfs integrate to 1. On a shared
# y-axis the curves would be flattened onto the baseline, so they get their own axis.
ax_pdf = ax_counts.twinx()
ax_pdf.plot(x, y, label='normal fit (all rows)')

# Extrapolated curve: the change de-duplication makes to the fitted density is
# amplified tenfold, and the x grid is shifted by five times the change in sample mean.
new_x = x - 5 * (np.mean(dist1['estimated_weight_g']) - np.mean(dist2['estimated_weight_g']))
ax_pdf.plot(new_x, y + 10 * (y - y2), color='red', linestyle='-', label='extrapolated fit')
ax_pdf.set_ylabel('density')

# Single legend covering the artists of both y-axes.
count_handles, count_labels = ax_counts.get_legend_handles_labels()
pdf_handles, pdf_labels = ax_pdf.get_legend_handles_labels()
ax_pdf.legend(count_handles + pdf_handles, count_labels + pdf_labels)

# Figure 2: normalised histogram of the full sample on its own.
plt.figure(figsize=(20, 10))
plt.hist(dist1['estimated_weight_g'], color='blue', alpha=0.5, density=True, bins=30)
plt.xlabel('estimated_weight_g')
plt.ylabel('density');

In [None]:
# Bucketed version of the de-duplication extrapolation: build 1 kg weight histograms
# for the full and the de-duplicated sample, amplify their difference, and shift the
# bar positions by a multiple of the change in mean weight.

loss_factor = 0  # previously tried: 0.1753

buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

# Scaled weights, computed once and reused below.
weights_all = dist1['estimated_weight_g'] * (1 - loss_factor)
weights_dedup = dist2['estimated_weight_g'] * (1 - loss_factor)

# np.histogram yields the same densities as plt.hist without drawing invisible
# (alpha=0) bars into the figure just to obtain the numbers.
density, bins = np.histogram(weights_all, bins=buckets, density=True)
density2, bins = np.histogram(weights_dedup, bins=bins, density=True)

print(density)

bin_width = bins[1] - bins[0]

# Representative x position of each bucket: the mean weight of the samples inside it.
_bins_adj = []
for start_bin, end_bin in zip(bins[:-1], bins[1:]):
    in_bin = weights_all[(weights_all > start_bin) & (weights_all <= end_bin)]
    if len(in_bin):
        _bins_adj.append(in_bin.mean())
    else:
        # The mean of an empty selection is NaN, which would propagate into
        # `new_bins_adj` and into any weighted sum over it; use the bucket midpoint.
        _bins_adj.append((start_bin + end_bin) / 2)

bins_adj = np.array(_bins_adj)

factor = 10            # amplification applied to the density difference
factor2 = factor * 0.5  # multiple of the mean shift applied to the bar positions

# Amplify the per-bucket change caused by de-duplication; densities cannot be negative.
new_density = np.clip(density + factor * (density - density2), 0, None)

new_bins_adj = bins_adj - factor2 * (weights_all.mean() - weights_dedup.mean())

plt.figure(figsize=(20, 10))
plt.bar(new_bins_adj, new_density, color = 'blue', alpha = 0.5, width = bin_width)
plt.xlabel('estimated_weight_g (shifted)')
plt.ylabel('density');

In [None]:
# Per-bucket density change caused by de-duplication, expressed relative to the total
# of the extrapolated densities and drawn at the shifted bar positions.
normalised_diff = (density - density2) / np.sum(new_density)

plt.figure(figsize=(20, 10))
plt.bar(new_bins_adj, normalised_diff, width=bin_width, color='blue', alpha=0.5)

In [None]:
# Share of fish per 1 kg bucket for the full sample, the de-duplicated sample and the
# extrapolated distribution, next to the hard-coded `gt_pcts` reference shares.
buckets = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
x_buckets = np.array(buckets[1:])  # bars are positioned at each bucket's upper edge
gt_pcts = [0, 0, .0036, .1060, .3990, .3576, .1147, .0180, .0011, 0]

d1 = dist1['estimated_weight_g'] * (1 - loss_factor)
d2 = dist2['estimated_weight_g'] * (1 - loss_factor)
new_density_adj = new_density / np.sum(new_density)  # extrapolated densities as shares

new_pcts = []
pcts1 = []
pcts2 = []

# Loop temporaries get their own names so the row-selection masks `mask1` / `mask2`
# from the fitting cell are not overwritten by per-bucket masks.
for lower, upper in zip(buckets[:-1], buckets[1:]):
    in_bucket1 = (d1 > lower) & (d1 <= upper)
    in_bucket2 = (d2 > lower) & (d2 <= upper)
    # Extrapolated bars are assigned to the bucket their shifted position falls in.
    centre_in_bucket = (new_bins_adj > lower) & (new_bins_adj <= upper)

    new_pcts.append(np.sum(new_density_adj[centre_in_bucket]))
    pcts1.append(np.sum(in_bucket1) / len(in_bucket1))
    pcts2.append(np.sum(in_bucket2) / len(in_bucket2))

pcts1 = np.array(pcts1)
pcts2 = np.array(pcts2)

gt_avg = 4944.34

# Mean weight of the extrapolated distribution. A bucket without samples has a NaN
# position in `new_bins_adj`; nansum skips those terms instead of turning the whole
# result into NaN.
result = np.nansum(new_bins_adj * new_density_adj)
print(result, gt_avg)
print((result - gt_avg) / gt_avg)

plt.figure(figsize=(20, 10))
# The orange and purple series originally shared the label 'Dedup diff'; the orange one
# (the extrapolated shares) is relabelled so the legend is unambiguous.
plt.bar(x_buckets - 300, new_pcts, color = 'orange', width = 150, label = 'Dedup extrapolated')
plt.bar(x_buckets - 150, pcts1, color = 'red', width = 150, label = 'Original')
plt.bar(x_buckets + 150, pcts2, color = 'blue', width = 150, label = 'Dedup')
plt.bar(x_buckets + 300, 10 * (pcts1 - pcts2), color = 'purple', width = 150, label = 'Dedup diff')
plt.bar(x_buckets, gt_pcts, color = 'green', width = 150, label = 'Ground truth')
plt.xlabel('bucket upper edge (g)')
plt.ylabel('share')
plt.legend();

In [None]:
# Per-bucket share of the full sample compared with the reference shares, followed by
# the relative error of the sample mean against `gt_avg`.
d1 = dist1['estimated_weight_g'] * (1 - loss_factor)
d2 = dist2['estimated_weight_g'] * (1 - loss_factor)

for i in range(len(buckets) - 1):
    in_bucket = (d1 > buckets[i]) & (d1 <= buckets[i + 1])
    share = np.sum(in_bucket) / len(in_bucket)
    # The original read `pcts[i]`, but no `pcts` is defined anywhere in the visible
    # notebook (NameError). NOTE(review): assumed to mean the reference shares
    # `gt_pcts` from the bucket-share cell - confirm.
    gt_share = gt_pcts[i]

    # bucket lower edge: difference, sample share vs reference share
    print('%i: %0.3f, %0.3f vs %0.3f' % (buckets[i], share - gt_share, share, gt_share))

gt_avg = 4944.34

result = np.mean(d1)
print(result, gt_avg)
print((result - gt_avg) / gt_avg)

In [None]:
# Per-bucket share of the de-duplicated sample (`d2`, defined in an earlier cell)
# compared with the reference shares, followed by the relative error of its mean
# against `gt_avg`.
for i in range(len(buckets) - 1):
    in_bucket = (d2 > buckets[i]) & (d2 <= buckets[i + 1])
    share = np.sum(in_bucket) / len(in_bucket)
    # The original read `pcts[i]`, but no `pcts` is defined anywhere in the visible
    # notebook (NameError). NOTE(review): assumed to mean the reference shares
    # `gt_pcts` from the bucket-share cell - confirm.
    gt_share = gt_pcts[i]

    # bucket lower edge: difference, sample share vs reference share
    print('%i: %0.3f, %0.3f vs %0.3f' % (buckets[i], share - gt_share, share, gt_share))

gt_avg = 4944.34

result = np.mean(d2)
print(result, gt_avg)
print((result - gt_avg) / gt_avg)