In [1]:
from google.cloud import storage
import pandas as pd
import os
from datetime import datetime


def list_buckets():
    """Print the name of every bucket returned by the default storage client."""
    client = storage.Client()
    for bucket in client.list_buckets():
        print(bucket.name)
        
def set_gcs_connection(base_dir='/Users/temporaryadmin/',
                       credentials_path='./cred/credentials.json'):
    """Point the Google client libraries at a service-account key file.

    Changes the working directory to ``base_dir`` and then stores
    ``credentials_path`` in the ``GOOGLE_APPLICATION_CREDENTIALS`` environment
    variable. The defaults reproduce the previously hard-coded values, so
    calling the function without arguments behaves as before.

    Parameters:
        base_dir (str): directory to ``chdir`` into before setting the variable.
        credentials_path (str): value stored in the environment variable; a
            relative path is resolved against ``base_dir`` by whoever reads it.

    Note:
        No network connection is opened here; the printed message only means
        the environment was configured. Any failure (for example a missing
        ``base_dir``) is printed rather than raised.
    """
    try:
        os.chdir(base_dir)
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        print("Connection Established")
    except Exception as ex:
        print(ex)

def get_current_time():
    """Return the local time as ``'<year>-<month>-<day>_<hour>-<minute>'``.

    The components are not zero-padded (e.g. ``2021-3-7_9-5``).
    """
    now = datetime.now()
    date_part = f'{now.year}-{now.month}-{now.day}'
    clock_part = f'{now.hour}-{now.minute}'
    return f'{date_part}_{clock_part}'

In [2]:
import pandas as pd
import seaborn as sns

# from google.cloud import bigquery
# from google.colab import auth
# auth.authenticate_user()
# print('Authenticated')

import matplotlib.pyplot as plt
import seaborn as sns
import json
from itertools import chain
sns.set(
    {
      'figure.figsize' : (15,8)
        }
    )


from sklearn.cluster import KMeans
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score,davies_bouldin_score,calinski_harabasz_score
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import cdist
from scipy.stats import gamma

from shapely.ops import unary_union, cascaded_union
from shapely.geometry import MultiPoint, Polygon, mapping, Point

from collections import Counter
import itertools
from itertools import islice
import numpy as np

import warnings
warnings.filterwarnings('ignore')

PROJECT_ID='dh-darkstores-live'

# def query_to_df(query):
#   df = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')
#   return df 

In [3]:
!pwd

/Users/temporaryadmin/dh-darkstores-storetrek/experiments


In [4]:

def read_json(path_to_file):
    """Load a JSON document with pandas and return it as a nested dict.

    The result has the shape produced by ``DataFrame.to_dict()``:
    ``{column: {index: value}}``.
    """
    return pd.read_json(path_to_file).to_dict()

def get_drive_modes(json_):
    """Collect the distinct driving modes encoded in the mapping's keys.

    The mode is whatever follows the last underscore of each key.
    """
    return {key_detail.split('_')[-1] for key_detail in json_.keys()}

def label_drive_modes(driving_modes):
    """Assign a running integer label (starting at 0) to each driving mode.

    Labels follow the iteration order of ``driving_modes``.
    """
    return {mode: label for label, mode in enumerate(driving_modes)}

def invert_drive_mode_labels(driving_mode_maps):
    """Swap keys and values of a mode -> label mapping (giving label -> mode)."""
    inverted = dict()
    for mode, label in driving_mode_maps.items():
        inverted[label] = mode
    return inverted

def convert_json_to_dataframe(json_):
    """Turn the polygon mapping into one DataFrame row per key.

    Each key is expected to look like ``'(<lat>, <long>)_<time>_<mode>'`` and
    maps to the polygon for that store / drive time / drive mode.

    The rows are built directly from the parsed key parts. Earlier versions
    rendered every row to Python source text and ran ``eval`` on it, which
    failed for a single entry (the text evaluated to one flat tuple), failed
    for values whose ``repr`` is not valid Python (e.g. ``nan``), and executed
    file contents as code.

    Parameters:
        json_ (dict): mapping of encoded key -> polygon object.

    Returns:
        DataFrame with columns ``latitude``, ``longitude``, ``driving_time``,
        ``driving_mode`` and ``polygon``. An empty mapping yields an empty
        frame with those columns.

    Raises:
        ValueError: if a key does not split into exactly three ``_`` separated
            parts, or its numeric parts cannot be parsed.
    """
    columns = ['latitude', 'longitude', 'driving_time', 'driving_mode', 'polygon']
    records = []
    for warehouse, polygon in json_.items():
        location, drive_time, drive_mode = warehouse.split('_')
        location = location.replace('(', '').replace(')', '')
        latitude, longitude = (float(part) for part in location.split(','))
        # Drive times are normally whole minutes; fall back to float otherwise.
        try:
            minutes = int(drive_time)
        except ValueError:
            minutes = float(drive_time)
        records.append((latitude, longitude, minutes, drive_mode, polygon))

    return pd.DataFrame(records, columns=columns)

def points_inside_polygon(geo_json_poly, list_of_points=None):
    """Flag which points fall inside the outer ring of a GeoJSON polygon.

    Parameters:
        geo_json_poly (dict): GeoJSON-like polygon; only the first ring in
            ``geo_json_poly['coordinates']`` is used.
        list_of_points (list): coordinate tuples, in the same axis order as
            the polygon vertices. ``None`` or an empty list gives ``[]``.

    Returns:
        list of ``(is_inside, point)`` tuples in input order. A new list is
        returned; the caller's list is left untouched (it used to be
        overwritten in place, forcing callers to rebuild or copy it).
    """
    if list_of_points is None or len(list_of_points) == 0:
        return []

    polygon = Polygon([tuple(vertex) for vertex in geo_json_poly['coordinates'][0]])
    return [(polygon.contains(Point(point)), point) for point in list_of_points]


def generate_features_per_store(weight_parameters_in_polygon):
    """Aggregate the per-location rows of each store into list-valued columns.

    Adds a ``lat_long`` column of ``(lat, long)`` tuples to the input frame
    (in place, as before) and then groups by ``pred_code``.

    Parameters:
        weight_parameters_in_polygon (DataFrame): needs the columns ``lat``,
            ``long``, ``pred_code``, ``num_customers``, ``num_orders``,
            ``num_items`` and ``gmv``.

    Returns:
        DataFrame indexed by ``pred_code`` whose columns ``lat_long``,
        ``num_customers``, ``num_orders``, ``num_items`` and ``gmv`` each hold
        the list of values belonging to that store.
    """
    feature_columns = ['lat_long', 'num_customers', 'num_orders', 'num_items', 'gmv']

    # zip works for zero rows too, whereas a row-wise apply on an empty frame
    # hands back a DataFrame that cannot be assigned to a single column.
    weight_parameters_in_polygon['lat_long'] = list(
        zip(weight_parameters_in_polygon['lat'], weight_parameters_in_polygon['long'])
    )

    # Group once and reuse the grouping for every feature column.
    grouped = weight_parameters_in_polygon.groupby('pred_code')
    aggregated = pd.DataFrame(
        {column: grouped[column].apply(list) for column in feature_columns}
    )
    return aggregated.rename_axis('pred_code')

def update_store_locations(lat,long,weight_parameters_in_polygon):
    """Write the store coordinates into ``pred_lat`` / ``pred_long`` on every row.

    The frame is modified in place and also returned.
    """
    for column, coordinate in (('pred_lat', lat), ('pred_long', long)):
        weight_parameters_in_polygon[column] = coordinate
    return weight_parameters_in_polygon

def get_covered_locations_per_store(covered_polygons_df,queried_order_info,pred_code):
    """Attach the order information to the locations covered by one store.

    Side effects on ``covered_polygons_df``: a ``pred_code`` column is added
    and the frame is re-indexed by ``(lat, long)`` in place.

    Returns:
        DataFrame: the covered locations left-joined with
        ``queried_order_info`` on ``(lat, long)``, with the index reset back
        into columns.
    """
    location_key = ['lat', 'long']

    # pred_code is the key used later to aggregate the weight parameters.
    covered_polygons_df['pred_code'] = pred_code
    covered_polygons_df.set_index(location_key, inplace=True)

    orders_by_location = queried_order_info.set_index(location_key)
    joined = covered_polygons_df.join(orders_by_location)
    return joined.reset_index()

def get_covered_locations(row,list_of_locations):
    """Return the locations that lie inside the polygon of one store row.

    Parameters:
        row: record providing ``polygon``, ``driving_time``, ``latitude`` and
            ``longitude``.
        list_of_locations (list): candidate points passed on to
            ``points_inside_polygon``.

    Returns:
        DataFrame with columns ``long``, ``lat``, ``driving_time``,
        ``pred_lat`` and ``pred_long`` for the points flagged as inside.
    """
    containment = pd.DataFrame(
        points_inside_polygon(row['polygon'], list_of_locations)
    ).copy()
    # Column 0 is the inside/outside flag, column 1 the point itself.
    inside = containment[containment[0]]

    covered_polygons_df = pd.DataFrame(inside[1].to_list(), columns=['long', 'lat'])
    store_attributes = (
        ('driving_time', 'driving_time'),
        ('pred_lat', 'latitude'),
        ('pred_long', 'longitude'),
    )
    for target_column, source_field in store_attributes:
        covered_polygons_df[target_column] = row[source_field]

    return covered_polygons_df

def get_weighted_features_per_store(weight_parameters_in_polygon,queried_order_info,pred_code,row):
    """Build the list-aggregated feature frame for one store polygon.

    ``queried_order_info`` and ``pred_code`` are accepted but not used here.

    Returns:
        DataFrame from ``generate_features_per_store`` extended with the
        store's ``pred_lat``, ``pred_long`` and ``driving_time``.
    """
    store_lat, store_long = row[['latitude', 'longitude']]

    features = generate_features_per_store(weight_parameters_in_polygon)
    features['pred_lat'] = store_lat
    features['pred_long'] = store_long
    features['driving_time'] = row['driving_time']
    return features

def generate_source_kpi(queried_order_info,polygons_based_on_driving):
    """Collect, per store polygon, the covered order locations and their features.

    Parameters:
        queried_order_info (DataFrame): order data with at least ``lat`` and
            ``long`` plus the weight columns used downstream.
        polygons_based_on_driving (DataFrame): one row per store polygon; the
            row index is used as ``pred_code``.

    Returns:
        tuple ``(input_format_for_kpi, locations_matching_constraints)``:
        the list-aggregated features per polygon (index reset into a column)
        and the row-per-location join of covered locations with the orders.
    """
    # The candidate points do not depend on the polygon, so build them once.
    all_locations = list(map(tuple, queried_order_info[['long', 'lat']].values))

    kpi_frames = []
    constraint_frames = []
    for pred_code, row in polygons_based_on_driving.iterrows():
        # Hand over a fresh list so the helper can never alter the shared one.
        covered_polygons_df = get_covered_locations(row, list(all_locations))
        covered_locations_per_store = get_covered_locations_per_store(
            covered_polygons_df, queried_order_info, pred_code
        )
        # Snapshot before feature generation adds its helper column in place.
        constraint_frames.append(covered_locations_per_store.copy())
        kpi_frames.append(
            get_weighted_features_per_store(
                covered_locations_per_store, queried_order_info, pred_code, row
            )
        )

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    input_format_for_kpi = pd.concat(kpi_frames) if kpi_frames else pd.DataFrame()
    locations_matching_constraints = (
        pd.concat(constraint_frames) if constraint_frames else pd.DataFrame()
    )

    input_format_for_kpi = input_format_for_kpi.reset_index()
    return input_format_for_kpi, locations_matching_constraints

def split_every(split_size, items):
    """Yield consecutive lists of up to ``split_size`` elements from ``items``.

    The final list may be shorter; nothing is yielded for empty input.
    """
    source = iter(items)
    # iter(callable, sentinel) stops at the first empty chunk.
    for chunk in iter(lambda: list(islice(source, split_size)), []):
        yield chunk



def evaluate_locations(model,warehouses,metric):
    """Place predicted cluster centres next to the actual warehouses.

    Parameters:
        model: passed to ``get_cluster_centres`` (defined outside this view).
        warehouses (DataFrame): actual locations with ``latitude`` and
            ``longitude`` columns.
        metric: two-item sequence; ``metric[0]`` labels the distance column
            and ``metric[1]`` the time column.

    Returns:
        DataFrame with ``actual_lat`` / ``actual_long``, ``pred_lat`` /
        ``pred_long`` and the two "off by" columns.
    """
    predicted_centres = pd.DataFrame(get_cluster_centres(model))
    column_names = {
        'latitude': 'actual_lat',
        'longitude': 'actual_long',
        0: 'pred_lat',
        1: 'pred_long',
    }
    # Rows are paired by index position; unmatched rows are left as NaN.
    eval_locations = pd.concat([warehouses, predicted_centres], axis=1).rename(columns=column_names)

    distance_column = f'off by Distance({metric[0]})'
    eval_locations[distance_column] = get_eucledian_distance(eval_locations, metric[0])

    time_column = f'off by Time({metric[1]})'
    eval_locations[time_column] = get_time(eval_locations, f'off by Distance({metric[0]})', metric[1])
    return eval_locations

def calculate_overlapping_per_store(locations_matching_constraints):
    """Compute, per store, the share of its locations also covered by another store.

    Parameters:
        locations_matching_constraints (DataFrame): one row per covered
            location and store, with ``lat``, ``long`` and ``pred_code``.

    Returns:
        DataFrame with ``pred_label`` (the store code) and
        ``overlap coverage %`` (0 for stores without shared locations).
    """
    # Store codes seen at each distinct location.
    codes_per_location = (
        locations_matching_constraints.groupby(['lat', 'long'])['pred_code'].apply(list)
    )
    shared_codes = [codes for codes in codes_per_location if len(codes) > 1]
    overlap_counts = pd.DataFrame(
        Counter(itertools.chain.from_iterable(shared_codes)),
        index=['overlap_count'],
    ).T

    # Number of covered locations per store; the column keeps the name 'pred_code'.
    per_store = (
        locations_matching_constraints.groupby('pred_code')['pred_code'].count().to_frame()
    )
    # Blank index name so resetting the index does not clash with that column.
    per_store.index.name = ''
    per_store = per_store.join(overlap_counts)

    coverage = (per_store['overlap_count'] / per_store['pred_code']) * 100
    per_store['overlap coverage %'] = coverage.fillna(0)

    per_store = per_store.reset_index().rename(columns={'': 'pred_label'})
    return per_store[['pred_label', 'overlap coverage %']]



def get_latlong(locs):
    """Split ``(latitude, longitude)`` pairs into two parallel lists of floats.

    Uses the builtin ``float``: ``np.float`` was only an alias for it and is
    no longer available in NumPy 1.24+.

    Parameters:
        locs (iterable): two-item ``(latitude, longitude)`` entries.

    Returns:
        tuple ``(lat, long)`` of lists, both empty for empty input.
    """
    lat = []
    long = []
    for latitude, longitude in locs:
        lat.append(float(latitude))
        long.append(float(longitude))
    return lat, long



def generate_feature_metrics(covered_per_actual_store):
    """Add location counts and list totals to the coverage frame.

    Writes ``number_of_locations_covered`` (length of ``filtered_loc``) plus
    ``total_customers_list_coverage`` and ``total_orders_list_coverage`` (sums
    of the respective list columns). The frame is modified in place and
    returned.
    """
    location_column, *summed_columns = ['filtered_loc', 'customers_list', 'orders_list']

    def total_or_zero(values):
        # Empty lists count as zero.
        return sum(values) if len(values) != 0 else 0

    covered_per_actual_store['number_of_locations_covered'] = (
        covered_per_actual_store[location_column].map(len)
    )
    for summed_column in summed_columns:
        covered_per_actual_store[f'total_{summed_column}_coverage'] = (
            covered_per_actual_store[summed_column].map(total_or_zero)
        )

    return covered_per_actual_store

def make_coverage_data(covered_per_actual_store):
    """Derive the de-duplicated coverage columns for each store row.

    Adds ``filtered_loc`` (from ``get_filtered_locations``), the split-out
    ``latlong_list`` / ``customers_list`` / ``orders_list`` columns and
    ``number_of_locations_covered``. ``latlong_list`` ends up as the
    ``(lat, long)`` result of ``get_latlong``. The frame is modified in place
    and returned.
    """
    frame = covered_per_actual_store
    frame['filtered_loc'] = get_filtered_locations(frame)

    def feature_at(position):
        # Pick one element of the (locations, customers, orders) triple per row.
        return frame['filtered_loc'].apply(
            lambda entries: split_list_into_features(entries)[position]
        )

    frame['latlong_list'] = feature_at(0)
    frame['customers_list'] = feature_at(1)
    frame['orders_list'] = feature_at(2)

    frame['number_of_locations_covered'] = frame['filtered_loc'].apply(lambda entries: len(entries))

    frame['latlong_list'] = frame['latlong_list'].apply(lambda pairs: get_latlong(pairs))

    return frame



def get_filtered_locations(covered_per_actual_store):
    """Keep, per row, only the locations not already seen in an earlier row.

    Rows are processed in frame order. Each kept entry is a
    ``(location, num_customers, num_orders)`` tuple.

    Returns:
        list with one list of kept entries per input row.
    """
    seen_locations = set()
    kept_per_row = []

    row_values = zip(
        covered_per_actual_store['lat_long'],
        covered_per_actual_store['num_customers'],
        covered_per_actual_store['num_orders'],
    )
    for locations, customers, orders in row_values:
        kept_per_row.append([
            (location, customer_count, order_count)
            for location, customer_count, order_count in zip(locations, customers, orders)
            if location not in seen_locations
        ])
        # Mark this row's locations only after it has been filtered itself.
        seen_locations.update(locations)

    return kept_per_row

def split_list_into_features(rows):
    """Unzip ``(location, customers, orders)`` entries into three lists.

    Returns:
        tuple ``(locations, customers, orders)``; three empty lists when
        ``rows`` is empty.
    """
    triples = [(entry[0], entry[1], entry[2]) for entry in rows]
    locations = [triple[0] for triple in triples]
    customers = [triple[1] for triple in triples]
    orders = [triple[2] for triple in triples]
    return locations, customers, orders
  
def get_all_baseline_metrics(queried_order_info,metrics):
    """Compute the network-wide totals that coverage figures are compared against.

    Parameters:
        queried_order_info (DataFrame): needs ``lat``, ``long``,
            ``num_customers`` and ``num_orders``.
        metrics (list): names for the totals, in the order
            (locations, customers, orders).

    Returns:
        dict mapping each metric name to its total. The location total is the
        number of non-null ``lat`` values.
    """
    # .iloc keeps the lookup positional; plain [0] on a label-indexed Series
    # is deprecated in recent pandas.
    total_num_locations = queried_order_info[['lat', 'long']].count().iloc[0]
    total_num_customers = queried_order_info['num_customers'].sum()
    total_num_orders = queried_order_info['num_orders'].sum()
    return dict(zip(metrics, [total_num_locations, total_num_customers, total_num_orders]))

def generate_constraint_based_info(queried_order_info, polygons_based_on_driving):
    """
    Returns the list-aggregated weight parameters of the order locations
    covered by each store polygon.

            Parameters:
                   queried_order_info(DataFrame): queried order from Big Query
                   polygons_based_on_driving(DataFrame) : 1 - T minutes Polygon for each store location based on
                                                          user inputs

            Returns:
                    input_format_for_kpi(DataFrame): Order Locations within Time Constrained Polygon boundaries with
                                                     list aggregated weight parameters -> num_customers,num_items,gmv..

    """
    all_locations = list(map(tuple, queried_order_info[['long', 'lat']].values))

    kpi_frames = []
    for pred_code, row in polygons_based_on_driving.iterrows():
        # Each polygon gets its own copy of the candidate points.
        list_of_locations = all_locations.copy()

        covered_polygons_df = get_covered_locations(row, list_of_locations)
        covered_locations_per_store = get_covered_locations_per_store(covered_polygons_df, queried_order_info,
                                                                      pred_code)

        kpi_frames.append(
            get_weighted_features_per_store(covered_locations_per_store, queried_order_info, pred_code, row)
        )

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    input_format_for_kpi = pd.concat(kpi_frames) if kpi_frames else pd.DataFrame()
    return input_format_for_kpi.reset_index()

In [44]:
set_gcs_connection()

Connection Established


In [6]:
bucket_name = 'qc-store-growth-expansion-stg'

In [46]:
folder_name = 'existing_dmart_bucket'
polygon_file = 'dmart-polygon-FP_SG-drive-time-4-15.json'

In [47]:
path_to_file = f'gs://{bucket_name}/{folder_name}/{polygon_file}'
path_to_file

'gs://qc-store-growth-expansion-stg/existing_dmart_bucket/dmart-polygon-FP_SG-drive-time-4-15.json'

In [48]:
# Load the polygon file and derive the driving-mode lookups from its keys.
json_ = read_json(path_to_file)
driving_modes = get_drive_modes(json_)
driving_mode_maps = label_drive_modes(driving_modes)
drive_maps = invert_drive_mode_labels(driving_mode_maps)

# One row per (store location, driving time, driving mode) polygon.
warehouse_polygon_df = convert_json_to_dataframe(json_)
warehouse_polygon_df.head()

Unnamed: 0,latitude,longitude,driving_time,driving_mode,polygon
0,1.326209,103.853011,5,driving,"{'coordinates': [[[103.862011, 1.333937], [103..."
1,1.326209,103.853011,5,cycling,"{'coordinates': [[[103.851011, 1.328516], [103..."
2,1.326209,103.853011,7,driving,"{'coordinates': [[[103.860011, 1.3454359999999..."
3,1.326209,103.853011,7,cycling,"{'coordinates': [[[103.850011, 1.329475], [103..."
4,1.326209,103.853011,10,driving,"{'coordinates': [[[103.858011, 1.378981], [103..."


In [49]:
# Run parameters derived from the polygon data and the polygon file name.
# The file name is split on '-': the third part is used as the city id and the
# last part as the driving time.
INPUT_PARAMETERS = {
    'Number_of_stores': warehouse_polygon_df[['latitude','longitude']].drop_duplicates().shape[0],
    'Driving_time' : int(polygon_file.split('.')[0].split('-')[-1]),
    # driving_modes is a set, so positional indexing into list(driving_modes)
    # changes between kernel sessions. Sorting makes the choice reproducible;
    # with the modes {'cycling', 'driving'} this selects 'driving'.
    'Driving_mode' : sorted(driving_modes)[-1],
    'City_ID': polygon_file.split('.')[0].split('-')[2]
}

In [54]:
# Unpack the run parameters into the globals used by later cells.
drive_mode_selected = INPUT_PARAMETERS['Driving_mode']
# NOTE(review): the driving time is pinned to 8 here instead of being read
# from INPUT_PARAMETERS['Driving_time'] — confirm this override is intended.
drive_time_selected = 8#INPUT_PARAMETERS['Driving_time']
region = INPUT_PARAMETERS['City_ID']
# Also read as a global inside calculate_metrics_at_store_and_network_level.
no_of_clusters = INPUT_PARAMETERS['Number_of_stores']

In [55]:
# Location of the order data inside the bucket.
domain = 'DS_exploration'
project_name = 'optimal_location_finder'
folder = 'evaluation'
file_name = 'cluster_data_created_only_restaurant_11_11_2021.csv'
# NOTE: replaces the `region` value taken from INPUT_PARAMETERS['City_ID'] earlier.
region = 'FP_SG'

# NOTE: path_to_file is reused; before this cell it pointed at the polygon JSON.
path_to_file = f'gs://{bucket_name}/{domain}/{project_name}/{folder}/{region}/{file_name}'
path_to_file

'gs://qc-store-growth-expansion-stg/DS_exploration/optimal_location_finder/evaluation/FP_SG/cluster_data_created_only_restaurant_11_11_2021.csv'

In [56]:

# Load the order data (one row per lat/long with its weight columns).
queried_order_info = pd.read_csv(path_to_file)
print(queried_order_info.shape)
queried_order_info.head()

(174397, 8)


Unnamed: 0.1,Unnamed: 0,store_type_l2,lat,long,num_customers,num_orders,num_items,gmv
0,0,restaurants,1.3098,103.836,2,362,389,7560.56
1,1,restaurants,1.3122,103.896,37,2346,2700,62872.6
2,2,restaurants,1.4318,103.7751,8,299,326,5378.24
3,3,restaurants,1.3767,103.8824,38,1335,1464,29663.68
4,4,restaurants,1.3011,103.8526,14,237,281,3794.57


In [57]:



# Keep the polygons of the selected driving mode whose driving time is above 2
# and at most drive_time_selected.
# NOTE(review): the lower bound 2 is a bare literal — consider naming it.
polygons_based_on_driving = warehouse_polygon_df[
                                                 (warehouse_polygon_df['driving_mode'] == drive_mode_selected) 
                                                 & (warehouse_polygon_df['driving_time'] <= drive_time_selected) 
                                                    & (warehouse_polygon_df['driving_time'] > 2) 
]
drive_mode_selected,drive_time_selected

('driving', 8)

In [63]:
warehouse_polygon_df['coordinates_length'] = warehouse_polygon_df['polygon'].apply(lambda row : len(row['coordinates']))

In [64]:
warehouse_polygon_df[warehouse_polygon_df['coordinates_length']==0]

Unnamed: 0,latitude,longitude,driving_time,driving_mode,polygon,coordinates_length


In [128]:
%%time
source_for_kpi_metrics,locations_matching_constraints = generate_source_kpi(queried_order_info,polygons_based_on_driving)


CPU times: user 1min 27s, sys: 411 ms, total: 1min 27s
Wall time: 1min 27s


In [129]:
# Compare the lat_long lists of one store at driving_time 5 between the two
# result frames.
# NOTE(review): pred_lat, pred_long and check_alternate_approach are not
# defined in any visible cell, so this fails on a fresh kernel — confirm
# where they are meant to come from.
source_for_kpi_metrics[
    (source_for_kpi_metrics['pred_lat'] == pred_lat) & 
    (source_for_kpi_metrics['pred_long'] == pred_long) &
    (source_for_kpi_metrics['driving_time'] == 5)
]['lat_long'].values == check_alternate_approach[
    (check_alternate_approach['pred_lat'] == pred_lat) & 
    (check_alternate_approach['pred_long'] == pred_long) &
    (check_alternate_approach['driving_time'] == 5)
]['lat_long'].values

array([ True])

In [148]:
def get_weighted_driving_time(source_for_kpi_metrics):
    """Add a customer-weighted average driving time to every store / drive-time row.

    For each distinct ``(pred_lat, pred_long)`` store, locations already seen
    in an earlier row of that store are dropped (``get_filtered_locations``),
    the remaining customers are summed into ``total_customers``, and
    ``avg_driving_time`` becomes the ``total_customers``-weighted mean of
    ``driving_time`` over the store's rows up to that driving time.

    Parameters:
        source_for_kpi_metrics (DataFrame): needs ``pred_lat``, ``pred_long``,
            ``driving_time``, ``lat_long``, ``num_customers`` and
            ``num_orders``. Presumably the rows of a store are ordered by
            increasing driving time — verify against the caller.

    Returns:
        DataFrame: the input rows, grouped store by store, with
        ``filtered_loc``, ``total_customers`` and ``avg_driving_time`` added.
        The input frame itself is not modified.
    """
    store_frames = []
    store_keys = source_for_kpi_metrics[['pred_lat', 'pred_long']].drop_duplicates()
    for _, row in store_keys.iterrows():
        same_store = (
            (source_for_kpi_metrics['pred_lat'] == row['pred_lat'])
            & (source_for_kpi_metrics['pred_long'] == row['pred_long'])
        )
        # Work on an explicit copy so the new columns never touch the input.
        one_store = source_for_kpi_metrics[same_store].copy()
        one_store['filtered_loc'] = get_filtered_locations(one_store)
        one_store['total_customers'] = one_store['filtered_loc'].apply(
            lambda rows: sum(split_list_into_features(rows)[1])
        )

        avg_driving_time = dict()
        for driving_time in one_store['driving_time'].to_list():
            selected_polygons = one_store[one_store['driving_time'] <= driving_time]
            weighted_total = (selected_polygons['driving_time'] * selected_polygons['total_customers']).sum()
            avg_driving_time[driving_time] = weighted_total / selected_polygons['total_customers'].sum()

        # Map by driving time so repeated driving times cannot cause a length
        # mismatch between the dict and the rows.
        one_store['avg_driving_time'] = one_store['driving_time'].map(avg_driving_time)

        store_frames.append(one_store)

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    return pd.concat(store_frames) if store_frames else pd.DataFrame()

# def get_driving_time_fast(source_for_kpi_metrics):
#     all_stores = pd.DataFrame()
#     for _,row in source_for_kpi_metrics[['pred_lat','pred_long']].drop_duplicates().iterrows():
#         one_store = source_for_kpi_metrics[(source_for_kpi_metrics['pred_lat']==row['pred_lat']) & (source_for_kpi_metrics['pred_long']==row['pred_long'])]
#         one_store['total_customers_covered'] = one_store['num_customers'].apply(lambda row: sum(row))

#         timeline= one_store['driving_time'].to_list()
#         avg_driving_time = dict()
#         for driving_time in timeline:
            
#             selected_polygons = one_store[one_store['driving_time'] <= driving_time]
#             avg_driving_time[driving_time] = (selected_polygons['driving_time'] * selected_polygons['total_customers_covered']).sum()/selected_polygons['total_customers_covered'].sum()

#         one_store['avg_driving_time'] = avg_driving_time.values()

#         all_stores = all_stores.append(one_store)
#     return all_stores

def check_store_level_metrics(driving_time,business_KPIs):
    """Expand the serialized store-level metrics of one driving time into a DataFrame.

    Parameters:
        driving_time: index label of the row to read from ``business_KPIs``.
        business_KPIs (DataFrame): has a ``store_level_metrics`` column
            holding comma-separated dict literals (one dict per store).

    Returns:
        DataFrame with one row per store.

    Raises:
        ValueError / SyntaxError: if the stored text is not made of plain
            Python literals.
    """
    import ast

    serialized = business_KPIs.loc[driving_time]['store_level_metrics']
    # literal_eval only accepts literals, so stored text can never run code.
    parsed = ast.literal_eval(serialized)
    # A single store parses to one dict instead of a tuple of dicts.
    if isinstance(parsed, dict):
        parsed = [parsed]
    return pd.DataFrame(list(parsed))

In [131]:

# Names of the coverage metrics reported per driving time.
metrics = ['number_of_locations_covered','total_customers_list_coverage','total_orders_list_coverage'] 

# Network-wide totals; `value` is read as a global inside
# calculate_metrics_at_store_and_network_level.
value = get_all_baseline_metrics(queried_order_info,metrics)

# Work on a copy so source_for_kpi_metrics stays as generated.
covered_per_actual_store = source_for_kpi_metrics.copy()


In [136]:
def calculate_metrics_at_store_and_network_level(covered_per_actual_store,locations_matching_constraints,metrics):
    """Compute coverage metrics per store and for the whole network, per driving time.

    Reads two notebook-level globals: ``no_of_clusters`` (used as key / index
    label) and ``value`` (baseline totals keyed by metric name).

    Parameters:
        covered_per_actual_store (DataFrame): list-aggregated features per
            store polygon, including ``driving_time``.
        locations_matching_constraints (DataFrame): row-per-location coverage
            with ``lat``, ``long``, ``pred_code`` and ``driving_time``.
        metrics (list): metric column names to total for each driving time.

    Returns:
        tuple ``(all_locations_for_all_time, metric_df)``: the store-level
        rows for every driving time (index reset), and one row of network
        metrics per driving time indexed by ``driving_time``.
    """
    metric_frames = []
    location_frames = []
    store_level_metrics = dict()
    timeline = covered_per_actual_store['driving_time'].drop_duplicates().to_list()
    for driving_time in timeline:
        output = dict()
        print(driving_time)
        constraints_for_a_time = locations_matching_constraints[locations_matching_constraints['driving_time'] == driving_time]

        # Explicit copy: the helpers below add columns to this frame in place.
        all_locations_for_a_time = covered_per_actual_store[covered_per_actual_store['driving_time'] == driving_time].copy()
        all_locations_for_a_time = make_coverage_data(all_locations_for_a_time)
        all_locations_for_a_time = generate_feature_metrics(all_locations_for_a_time)

        overlapped_per_store_for_a_time = calculate_overlapping_per_store(constraints_for_a_time)
        all_locations_for_a_time.reset_index(inplace = True)
        # NOTE(review): this join pairs rows by position (both frames carry a
        # fresh 0..n-1 index), not by store code — confirm the orders match.
        all_locations_for_a_time = all_locations_for_a_time.join(overlapped_per_store_for_a_time)

        customers_per_store = all_locations_for_a_time['num_customers'].apply(lambda row:sum(row)).to_frame()
        driving_time_per_store = all_locations_for_a_time['driving_time'].apply(lambda row: np.mean(row)).to_frame()
        store_level_metrics[(no_of_clusters,driving_time)] = customers_per_store.join(
            driving_time_per_store
        ).join(
            all_locations_for_a_time[['pred_lat','pred_long','overlap coverage %']]
        ).T.to_dict()

        for metric_ in metrics:
            sum_pred = all_locations_for_a_time[metric_].sum()
            output[f'{metric_}[pred]'] = sum_pred
            output[f'{metric_}[pred] %'] = sum_pred/value[metric_] * 100

        output.update(
          {

              # Share of baseline locations that more than one store covers.
              'overlap %[pred]' : (sum(constraints_for_a_time.groupby(['lat','long'])['pred_code'].count()>1)/value['number_of_locations_covered'])*100,

              'driving_time' : driving_time,

              # One dict literal per store, comma-joined (read back by check_store_level_metrics).
              'store_level_metrics' : ','.join(['{' + f"\
                                                  'pred_lat' : {row['pred_lat']},\
                                                  'pred_long' : {row['pred_long']},\
                                                  'customer_penetration' : {row['num_customers']},\
                                                  'customer_penetration %' : {(row['num_customers']/output['number_of_locations_covered[pred]'])*100},\
                                                  'overlap coverage %' : {row['overlap coverage %']}\
                                                  " + '}' for _,row in pd.DataFrame(store_level_metrics[no_of_clusters,driving_time]).T.iterrows()])

          }
        )

        location_frames.append(all_locations_for_a_time)
        metric_frames.append(pd.DataFrame(output,index=[no_of_clusters]))

    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    all_locations_for_all_time = pd.concat(location_frames) if location_frames else pd.DataFrame()
    metric_df = pd.concat(metric_frames) if metric_frames else pd.DataFrame()

    all_locations_for_all_time.reset_index(inplace = True)
    metric_df.set_index('driving_time',inplace = True)
    return all_locations_for_all_time,metric_df

In [152]:
def get_avg_drivetime_network():
    """Placeholder without an implementation; always returns ``None``."""
    return None

def calculate_avg_driving_time_at_network_level(all_locations_for_all_time):
    """Compute a customer-weighted drive time for the whole network per driving-time threshold.

    The input frame is passed through ``get_weighted_driving_time`` and reduced to the
    store coordinates, ``driving_time``, ``total_customers_list_coverage`` and
    ``avg_driving_time`` columns. For every distinct ``driving_time`` value ``t`` the rows
    with ``driving_time <= t`` are selected and

        sum(driving_time * total_customers_list_coverage) / sum(total_customers_list_coverage)

    is stored under ``t``.

    Returns
    -------
    tuple
        ``(network_df, per_store_df)`` where ``network_df`` is indexed by driving time
        with a single ``avg_drive_time`` column, and ``per_store_df`` is the reduced
        per-store frame described above.
    """
    wanted_columns = ['pred_lat','pred_long','driving_time','total_customers_list_coverage','avg_driving_time']
    average_driving_time_for_all_stores_all_time = get_weighted_driving_time(all_locations_for_all_time)[wanted_columns]

    drive_times = average_driving_time_for_all_stores_all_time['driving_time']
    network_driving_average_at_different_times = {}
    # Thresholds are visited in order of first appearance, which fixes the row order of the result.
    for threshold in drive_times.drop_duplicates().to_list():
        within_threshold = average_driving_time_for_all_stores_all_time[drive_times <= threshold]
        weights = within_threshold['total_customers_list_coverage']
        # NOTE(review): the weighted quantity is the nominal `driving_time`, not the
        # per-store `avg_driving_time` column - confirm this is the intended definition.
        weighted_total = (within_threshold['driving_time'] * weights).sum()
        network_driving_average_at_different_times[threshold] = weighted_total / weights.sum()

    # One-row frame transposed so that driving times become the index.
    network_driving_average_at_different_times_df = (
        pd.DataFrame(network_driving_average_at_different_times, index=[0])
        .T
        .rename(columns={0: 'avg_drive_time'})
    )
    return network_driving_average_at_different_times_df, average_driving_time_for_all_stores_all_time

In [153]:
# Build the per-store frame across all driving times and the network-level metrics table.
all_locations_for_all_time,metric_df = calculate_metrics_at_store_and_network_level(covered_per_actual_store,locations_matching_constraints,metrics)

5
7


In [154]:
# Display the per-store frame returned by the metrics calculation.
all_locations_for_all_time

Unnamed: 0,level_0,index,pred_code,lat_long,num_customers,num_orders,num_items,gmv,pred_lat,pred_long,driving_time,filtered_loc,latlong_list,customers_list,orders_list,number_of_locations_covered,total_customers_list_coverage,total_orders_list_coverage,pred_label,overlap coverage %
0,0,0,0,"[(1.3273, 103.848), (1.3237, 103.85), (1.3232,...","[6, 14, 28, 19, 47, 1, 4, 60, 23, 1, 65, 10, 1...","[166, 654, 1292, 704, 3802, 507, 626, 2630, 12...","[208, 728, 1460, 762, 4370, 575, 717, 3226, 15...","[2948.35, 12571.91, 42854.38, 23314.07, 112898...",1.326209,103.853011,5,"[((1.3273, 103.848), 6, 166), ((1.3237, 103.85...","([1.3273, 1.3237, 1.3232, 1.3242, 1.3251, 1.31...","[6, 14, 28, 19, 47, 1, 4, 60, 23, 1, 65, 10, 1...","[166, 654, 1292, 704, 3802, 507, 626, 2630, 12...",1865,8163,313621,0,0.0
1,1,2,8,"[(1.3261, 103.9265), (1.3205, 103.9152), (1.32...","[34, 3, 42, 12, 53, 8, 67, 42, 27, 11, 79, 2, ...","[2244, 221, 2029, 1417, 3443, 167, 4187, 1501,...","[2501, 369, 2538, 1658, 3719, 239, 4699, 1731,...","[54270.29, 11294.83, 50145.29, 39257.85, 77243...",1.322911,103.920874,5,"[((1.3261, 103.9265), 34, 2244), ((1.3205, 103...","([1.3261, 1.3205, 1.3289, 1.328, 1.3276, 1.325...","[34, 3, 42, 12, 53, 8, 67, 42, 27, 11, 79, 2, ...","[2244, 221, 2029, 1417, 3443, 167, 4187, 1501,...",1157,3568,143824,8,0.0
2,2,4,16,"[(1.4067, 103.9115), (1.4045, 103.9051), (1.40...","[45, 1, 37, 33, 20, 22, 2, 1, 2, 1, 41, 1, 26,...","[2065, 77, 1553, 1440, 793, 618, 116, 237, 84,...","[2381, 77, 1706, 1746, 929, 687, 129, 296, 84,...","[43634.23, 992.59, 40016.77, 33398.96, 15207.3...",1.420482,103.912009,5,"[((1.4067, 103.9115), 45, 2065), ((1.4045, 103...","([1.4067, 1.4045, 1.4038, 1.4018, 1.4058, 1.40...","[45, 1, 37, 33, 20, 22, 2, 1, 2, 1, 41, 1, 26,...","[2065, 77, 1553, 1440, 793, 618, 116, 237, 84,...",1004,4018,174055,16,0.0
3,3,6,24,"[(1.2938, 103.8293), (1.2959, 103.8299), (1.28...","[42, 33, 44, 1, 29, 5, 4, 96, 171, 1, 60, 3, 8...","[1513, 1274, 2023, 67, 1677, 216, 116, 4733, 5...","[1689, 1479, 2288, 80, 1936, 223, 140, 5563, 6...","[53567.32, 39304.95, 52423.2, 1567.71, 43755.2...",1.28537,103.834724,5,"[((1.2938, 103.8293), 42, 1513), ((1.2959, 103...","([1.2938, 1.2959, 1.2869, 1.2849, 1.2929, 1.28...","[42, 33, 44, 1, 29, 5, 4, 96, 171, 1, 60, 3, 8...","[1513, 1274, 2023, 67, 1677, 216, 116, 4733, 5...",5067,22718,845949,24,7.223209
4,4,8,32,"[(1.3632, 103.8581), (1.3639, 103.852), (1.366...","[32, 39, 1, 22, 28, 35, 25, 25, 36, 65, 21, 5,...","[1222, 1334, 125, 1302, 867, 1075, 1180, 615, ...","[1504, 1531, 140, 1396, 1005, 1178, 1515, 742,...","[31304.66, 29490.47, 2097.35, 38608.37, 17565....",1.360945,103.857244,5,"[((1.3632, 103.8581), 32, 1222), ((1.3639, 103...","([1.3632, 1.3639, 1.3663, 1.3676, 1.3659, 1.36...","[32, 39, 1, 22, 28, 35, 25, 25, 36, 65, 21, 5,...","[1222, 1334, 125, 1302, 867, 1075, 1180, 615, ...",944,3688,115112,32,0.105932
5,5,10,40,"[(1.4107, 103.7577), (1.4042, 103.757), (1.392...","[1, 2, 1, 5, 6, 8, 3, 9, 7, 9, 4, 24, 2, 1, 2,...","[92, 63, 427, 53, 177, 53, 153, 155, 325, 309,...","[125, 81, 614, 59, 221, 80, 163, 232, 468, 430...","[8088.04, 1385.87, 21389.02, 1023.92, 2720.75,...",1.406025,103.758371,5,"[((1.4107, 103.7577), 1, 92), ((1.4042, 103.75...","([1.4107, 1.4042, 1.3924, 1.4061, 1.3912, 1.40...","[1, 2, 1, 5, 6, 8, 3, 9, 7, 9, 4, 24, 2, 1, 2,...","[92, 63, 427, 53, 177, 53, 153, 155, 325, 309,...",218,434,12503,40,0.0
6,6,12,48,"[(1.2931, 103.8283), (1.2907, 103.8023), (1.29...","[2, 25, 2, 29, 36, 6, 30, 20, 1, 27, 9, 16, 1,...","[120, 1272, 566, 2625, 1553, 199, 1848, 854, 3...","[125, 1329, 602, 3354, 2129, 251, 1979, 1135, ...","[2895.66, 26284.06, 14941.92, 79576.18, 44363....",1.290559,103.814227,5,"[((1.2931, 103.8283), 2, 120), ((1.2907, 103.8...","([1.2931, 1.2907, 1.2935, 1.2952, 1.2915, 1.29...","[2, 25, 2, 29, 36, 6, 30, 20, 1, 27, 9, 16, 17...","[120, 1272, 566, 2625, 1553, 199, 1848, 854, 3...",3753,14835,620662,48,8.885652
7,7,14,56,"[(1.3624, 103.9372), (1.3625, 103.9359), (1.36...","[1, 1, 40, 14, 3, 1, 2, 2, 22, 20, 2, 1, 24, 1...","[188, 87, 1121, 222, 299, 49, 81, 103, 1101, 3...","[188, 87, 1625, 267, 350, 56, 103, 107, 1238, ...","[2873.32, 1719.3, 34147.91, 6391.03, 12887.71,...",1.372423,103.933873,5,"[((1.3624, 103.9372), 1, 188), ((1.3625, 103.9...","([1.3624, 1.3625, 1.36, 1.3717, 1.3557, 1.3593...","[1, 1, 40, 14, 3, 1, 2, 2, 22, 20, 2, 1, 24, 1...","[188, 87, 1121, 222, 299, 49, 81, 103, 1101, 3...",883,2888,87700,56,0.0
8,8,16,64,"[(1.446, 103.8352), (1.446, 103.8344), (1.4466...","[1, 7, 2, 1, 5, 5, 43, 46, 1, 35, 31, 11, 36, ...","[81, 235, 82, 72, 300, 341, 1126, 737, 58, 211...","[148, 275, 90, 105, 311, 405, 1227, 867, 58, 2...","[1292.99, 4874.32, 1278.76, 1194.23, 4773.66, ...",1.437959,103.844939,5,"[((1.446, 103.8352), 1, 81), ((1.446, 103.8344...","([1.446, 1.446, 1.4466, 1.4332, 1.4352, 1.4361...","[1, 7, 2, 1, 5, 5, 43, 46, 1, 35, 31, 11, 36, ...","[81, 235, 82, 72, 300, 341, 1126, 737, 58, 211...",1525,6261,234941,64,0.0
9,9,18,72,"[(1.3366, 103.6957), (1.3393, 103.6961), (1.33...","[8, 31, 24, 45, 14, 49, 31, 1, 50, 6, 11, 5, 4...","[225, 1126, 1522, 872, 514, 2709, 1119, 62, 20...","[252, 1238, 1632, 1037, 573, 3595, 1449, 84, 2...","[4376.37, 34296.89, 29898.31, 20510.8, 14557.9...",1.334231,103.699479,5,"[((1.3366, 103.6957), 8, 225), ((1.3393, 103.6...","([1.3366, 1.3393, 1.3356, 1.3362, 1.3373, 1.33...","[8, 31, 24, 45, 14, 49, 31, 1, 50, 6, 11, 5, 4...","[225, 1126, 1522, 872, 514, 2709, 1119, 62, 20...",1117,3318,128837,72,0.0


In [138]:
# Display the network-level metrics (one row per driving time).
metric_df

Unnamed: 0_level_0,number_of_locations_covered[pred],number_of_locations_covered[pred] %,total_customers_list_coverage[pred],total_customers_list_coverage[pred] %,total_orders_list_coverage[pred],total_orders_list_coverage[pred] %,overlap %[pred],store_level_metrics
driving_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,32478,18.623027,132509,19.474732,5266074,19.927034,0.210439,{ ...
7,71736,41.133735,291458,42.835328,11580857,43.822424,3.715087,{ ...


In [155]:
# all_locations_for_all_time.reset_index(inplace = True)
# Compute the network-level weighted drive time and keep the per-store frame for later cells.
network_driving_average_at_different_times_df,average_driving_time_for_all_stores_all_time= calculate_avg_driving_time_at_network_level(all_locations_for_all_time)


In [147]:
# Display the per-store average driving time at each driving-time threshold.
average_driving_time_for_all_stores_all_time

Unnamed: 0,pred_lat,pred_long,driving_time,total_customers_list_coverage,avg_driving_time
0,1.326209,103.853011,5,8163,5.0
15,1.326209,103.853011,7,21419,6.239838
1,1.322911,103.920874,5,3568,5.0
16,1.322911,103.920874,7,9017,6.208606
2,1.420482,103.912009,5,4018,5.0
17,1.420482,103.912009,7,16428,6.513294
3,1.28537,103.834724,5,22718,5.0
18,1.28537,103.834724,7,43712,5.986912
4,1.360945,103.857244,5,3688,5.0
19,1.360945,103.857244,7,15949,6.540006


In [30]:
# Attach the network-level average drive time to the metrics table.
# calculate_metrics_at_store_and_network_level already sets 'driving_time' as the index of
# metric_df, so calling set_index again would raise KeyError; only index when the column exists.
metric_by_drive_time = metric_df.set_index('driving_time') if 'driving_time' in metric_df.columns else metric_df
# Drop any previously joined copy of the incoming columns so re-running this cell
# does not fail on overlapping column names.
metric_df = (
    metric_by_drive_time
    .drop(columns=list(network_driving_average_at_different_times_df.columns), errors='ignore')
    .join(network_driving_average_at_different_times_df)
)

In [None]:
# %%time
    
# metric_df = pd.DataFrame()
# store_level_metrics = dict()
# # fixing time and calculating metrics for all locations at store level and network level
# # def generate_metrics()
# timeline = covered_per_actual_store['driving_time'].drop_duplicates().to_list()
# all_locations_for_all_time = pd.DataFrame()
# for driving_time in timeline:
#     output = dict()
#     print(driving_time)
#     all_locations_for_a_time = covered_per_actual_store[covered_per_actual_store['driving_time'] == driving_time]
#     all_locations_for_a_time = make_coverage_data(all_locations_for_a_time)


#     all_locations_for_a_time = generate_feature_metrics(all_locations_for_a_time)
#     overlapped_per_store_for_a_time = calculate_overlapping_per_store(locations_matching_constraints[locations_matching_constraints['driving_time'] == driving_time])
#     all_locations_for_a_time.reset_index(inplace = True)
#     all_locations_for_a_time = all_locations_for_a_time.join(overlapped_per_store_for_a_time) 

#     store_level_metrics[(no_of_clusters,driving_time)]= all_locations_for_a_time['num_customers'].apply(lambda row:sum(row)).to_frame(\
#                                                 ).join(
#                                                     all_locations_for_a_time['driving_time'].apply(lambda row: np.mean(row)).to_frame()
#                                                 ).join(
#                                                     all_locations_for_a_time[['pred_lat','pred_long','overlap coverage %']]
#                                                 ).T.to_dict()
#     for metric_ in metrics:
#         sum_pred = all_locations_for_a_time[metric_].sum() 
#         output[f'{metric_}[pred]'] = sum_pred
#         output[f'{metric_}[pred] %'] = sum_pred/value[metric_] * 100

#     output.update(
#       {

#           'overlap %[pred]' : (sum(locations_matching_constraints[locations_matching_constraints['driving_time'] == driving_time].groupby(['lat','long'])['pred_code'].count()>1)/value['number_of_locations_covered'])*100,

#           'driving_time' : driving_time,

#           'store_level_metrics' : ','.join(['{' + f"\
#                                               'pred_lat' : {row['pred_lat']},\
#                                               'pred_long' : {row['pred_long']},\
#                                               'customer_penetration' : {row['num_customers']},\
#                                               'customer_penetration %' : {(row['num_customers']/output['number_of_locations_covered[pred]'])*100},\
#                                               'overlap coverage %' : {row['overlap coverage %']}\
#                                               " + '}' for _,row in pd.DataFrame(store_level_metrics[no_of_clusters,driving_time]).T.iterrows()])

#       }
#     )

#     all_locations_for_all_time = all_locations_for_all_time.append(all_locations_for_a_time)
#     metric_df = metric_df.append(pd.concat([
#             pd.DataFrame(output,index=[no_of_clusters])
#     ],
#     axis=1))

# ## calculate and update time now
# all_locations_for_all_time.reset_index(inplace = True)
# average_driving_time_for_all_stores_all_time = get_driving_time(all_locations_for_all_time)[['pred_lat','pred_long','driving_time','total_customers_list_coverage','avg_driving_time']]

# network_driving_average_at_different_times = dict()
# for driving_time in timeline:
#     average_driving_time_for_all_stores_a_time = average_driving_time_for_all_stores_all_time[average_driving_time_for_all_stores_all_time['driving_time'] <= driving_time]
#     network_driving_average_at_different_times[driving_time] = (average_driving_time_for_all_stores_a_time['driving_time'] * average_driving_time_for_all_stores_a_time['total_customers_list_coverage']).sum()/\
#     average_driving_time_for_all_stores_a_time['total_customers_list_coverage'].sum()



In [None]:
# metric_df = metric_df.set_index('driving_time').join(
# pd.DataFrame(network_driving_average_at_different_times,index=[0]).T.rename(columns={0:'avg_drive_time'})
#     )

In [31]:
# Preview the metrics table after the avg_drive_time join.
metric_df.head()

Unnamed: 0_level_0,number_of_locations_covered[pred],number_of_locations_covered[pred] %,total_customers_list_coverage[pred],total_customers_list_coverage[pred] %,total_orders_list_coverage[pred],total_orders_list_coverage[pred] %,overlap %[pred],store_level_metrics,avg_drive_time
driving_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,8568,4.912929,34521,5.073521,1344492,5.087611,0.0,{ ...,3.0
4,18607,10.669335,76522,11.246372,3028453,11.459787,0.008601,{ ...,3.68912
5,32478,18.623027,132509,19.474732,5266074,19.927034,0.210439,{ ...,4.402329
6,49150,28.182824,200777,29.508021,7971831,30.165726,1.346927,{ ...,5.124262
7,71736,41.133735,291458,42.835328,11580857,43.822424,3.715087,{ ...,5.867274


In [None]:
#updating the store level metrics with the average delivery time

# store_level_new_metrics = dict()
# for driving_time in timeline:
#   # store_level_metrics_for_given_time = pd.DataFrame(eval(metric_df.loc[driving_time]['store_level_metrics']),columns = ['pred_lat','pred_long','customer_penetration','customer_penetration %','overlap coverage %']).set_index(['pred_lat','pred_long'])
#     store_level_metrics_for_given_time = pd.DataFrame(eval(metric_df.loc[driving_time]['store_level_metrics'])).set_index(['pred_lat','pred_long'])
#     store_level_updated_metrics = average_driving_time_for_all_stores_all_time[average_driving_time_for_all_stores_all_time['driving_time']==driving_time][['pred_lat','pred_long','avg_driving_time']].set_index(['pred_lat','pred_long']).join(store_level_metrics_for_given_time)
#     store_level_new_metrics[driving_time] = ','.join(['{' + f"\
#     'pred_lat' : {row['pred_lat']},\
#     'pred_long' : {row['pred_long']},\
#     'customer_penetration' : {row['customer_penetration']},\
#     'customer_penetration %' : {row['customer_penetration %']},\
#     'overlap coverage %' : {row['overlap coverage %']},\
#     'avg_driving_time' : {row['avg_driving_time']}\
#     " + '}' for _,row in store_level_updated_metrics.reset_index().iterrows()])

In [32]:
def calculate_avg_driving_time_at_store_level(metric_df,average_driving_time_for_all_stores_all_time):
    """Rebuild each driving time's serialized store metrics with ``avg_driving_time`` added.

    Parameters
    ----------
    metric_df : pandas.DataFrame
        Indexed by driving time, with a ``store_level_metrics`` column holding a
        comma-separated string of dict literals (one per store). Each dict carries the
        keys ``pred_lat``, ``pred_long``, ``customer_penetration``,
        ``customer_penetration %`` and ``overlap coverage %``.
    average_driving_time_for_all_stores_all_time : pandas.DataFrame
        One row per store and driving time, with at least the columns ``pred_lat``,
        ``pred_long``, ``driving_time`` and ``avg_driving_time``.

    Returns
    -------
    dict
        Maps every distinct driving time to a string in the same serialized layout,
        one dict literal per store, now also containing ``avg_driving_time``.

    Raises
    ------
    ValueError, SyntaxError
        If a serialized entry is not made of plain Python literals.
    """
    # Local import so the function does not depend on an extra notebook-level import.
    import ast

    # The serialized layout separates fields with eight spaces; it is kept unchanged
    # so downstream consumers see the exact same strings as before.
    pad = ' ' * 8
    store_level_new_metrics = dict()
    timeline = average_driving_time_for_all_stores_all_time['driving_time'].drop_duplicates().to_list()
    for driving_time in timeline:
        serialized_metrics = metric_df.loc[driving_time]['store_level_metrics']
        # literal_eval only accepts literals, so arbitrary code in the string is never executed.
        # Wrapping in brackets always yields a list of records; a bare eval returned a single
        # dict when only one store was present, which broke the DataFrame construction.
        store_records = ast.literal_eval('[' + serialized_metrics + ']')
        store_level_metrics_for_given_time = pd.DataFrame(store_records).set_index(['pred_lat','pred_long'])

        avg_time_for_given_time = average_driving_time_for_all_stores_all_time[
            average_driving_time_for_all_stores_all_time['driving_time'] == driving_time
        ][['pred_lat','pred_long','avg_driving_time']].set_index(['pred_lat','pred_long'])
        # Left join keyed on the store coordinates: every store with an average time is kept.
        store_level_updated_metrics = avg_time_for_given_time.join(store_level_metrics_for_given_time)

        store_level_new_metrics[driving_time] = ','.join([
            '{'
            + f"{pad}'pred_lat' : {row['pred_lat']},"
            + f"{pad}'pred_long' : {row['pred_long']},"
            + f"{pad}'customer_penetration' : {row['customer_penetration']},"
            + f"{pad}'customer_penetration %' : {row['customer_penetration %']},"
            + f"{pad}'overlap coverage %' : {row['overlap coverage %']},"
            + f"{pad}'avg_driving_time' : {row['avg_driving_time']}{pad}"
            + '}'
            for _, row in store_level_updated_metrics.reset_index().iterrows()
        ])
    return store_level_new_metrics

In [33]:
# Rebuild the serialized store-level metrics with avg_driving_time, keyed by driving time.
store_level_new_metrics = calculate_avg_driving_time_at_store_level(metric_df,average_driving_time_for_all_stores_all_time)

In [34]:
# Remove the old serialized store metrics so the enriched version can take their place.
# errors='ignore' keeps the cell re-runnable when the column is already gone, without a
# catch-all except that would also hide unrelated failures.
metric_df = metric_df.drop(columns='store_level_metrics', errors='ignore')

In [35]:
# Turn the {driving_time: serialized metrics} dict into a one-column frame indexed by driving time.
store_level_metrics_df = pd.DataFrame(store_level_new_metrics,index=['store_level_metrics']).T
# Drop any existing column of the same name first so the join cannot fail on overlapping
# columns when the cell is re-run; real errors now surface instead of being printed and ignored.
metric_df = metric_df.drop(columns='store_level_metrics', errors='ignore').join(store_level_metrics_df)

In [36]:
# Display the metrics table with the refreshed store_level_metrics column.
metric_df

Unnamed: 0_level_0,number_of_locations_covered[pred],number_of_locations_covered[pred] %,total_customers_list_coverage[pred],total_customers_list_coverage[pred] %,total_orders_list_coverage[pred],total_orders_list_coverage[pred] %,overlap %[pred],avg_drive_time,store_level_metrics
driving_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,8568,4.912929,34521,5.073521,1344492,5.087611,0.0,3.0,"{ 'pred_lat' : 1.32620892, 'pred..."
4,18607,10.669335,76522,11.246372,3028453,11.459787,0.008601,3.68912,"{ 'pred_lat' : 1.32620892, 'pred..."
5,32478,18.623027,132509,19.474732,5266074,19.927034,0.210439,4.402329,"{ 'pred_lat' : 1.32620892, 'pred..."
6,49150,28.182824,200777,29.508021,7971831,30.165726,1.346927,5.124262,"{ 'pred_lat' : 1.32620892, 'pred..."
7,71736,41.133735,291458,42.835328,11580857,43.822424,3.715087,5.867274,"{ 'pred_lat' : 1.32620892, 'pred..."
8,94632,54.262401,378717,55.659708,14909627,56.418623,7.040259,6.591991,"{ 'pred_lat' : 1.32620892, 'pred..."
9,117621,67.444394,467769,68.747603,18351236,69.441808,13.984186,7.303873,"{ 'pred_lat' : 1.32620892, 'pred..."
10,135621,77.765673,536740,78.88421,20953463,79.288739,27.276845,7.986794,"{ 'pred_lat' : 1.32620892, 'pred..."
11,153695,88.129383,607735,89.318284,23688007,89.636362,40.660677,8.658375,"{ 'pred_lat' : 1.32620892, 'pred..."
12,166198,95.298658,651237,95.711735,25334068,95.865123,55.78307,9.302602,"{ 'pred_lat' : 1.32620892, 'pred..."


In [37]:
# Mapping from the internal metric column names in metric_df to their business-facing names.
rename_columns = {
    'total_customers_list_coverage[pred]': 'customer_penetration_absolute',
    'total_customers_list_coverage[pred] %': 'customer_penetration %',
    'overlap %[pred]': 'overlap %',
    'avg_drive_time': 'avg_drive_time',
    'store_level_metrics': 'store_level_metrics',
}

# Dicts preserve insertion order, so both lists stay aligned pairwise with the mapping.
METRICS_TO_LOOK = list(rename_columns.keys())
BUSINESS_METRICS = list(rename_columns.values())

rename_columns

{'total_customers_list_coverage[pred]': 'customer_penetration_absolute',
 'total_customers_list_coverage[pred] %': 'customer_penetration %',
 'overlap %[pred]': 'overlap %',
 'avg_drive_time': 'avg_drive_time',
 'store_level_metrics': 'store_level_metrics'}

In [None]:
# Rename the internal metric columns to their business-facing names (mutates metric_df in place).
metric_df.rename(columns = rename_columns, inplace = True)

In [None]:
# Display the metrics table after renaming.
metric_df

In [None]:
# Display the list of business-facing metric names.
BUSINESS_METRICS

In [None]:
# Keep only the business-facing KPI columns, in BUSINESS_METRICS order, and display them.
business_KPIs = metric_df[BUSINESS_METRICS]
business_KPIs

In [None]:
# Display INPUT_PARAMETERS (not defined in this part of the notebook).
INPUT_PARAMETERS

In [None]:
# Show the KPI row for index label 45 as a frame.
# NOTE(review): 45 is hard-coded here while the following cell uses drive_time_selected -
# confirm that 45 is present in the index and whether this should use the same variable.
business_KPIs.loc[45].to_frame()

In [None]:
# Pass the selected drive time and its KPI row (transposed back to a one-row frame)
# to check_store_level_metrics, which is defined elsewhere in the notebook.
check_store_level_metrics(drive_time_selected,business_KPIs.loc[drive_time_selected].to_frame().T)

### Saving the metrics