### Instructions

1) Ingest the geo maps data into BigQuery

2) View the main cities from which users access the ticket sales page

3) Calculate the average distance from each GA session to all concert halls: [Barcelona, Madrid, Bilbao, Murcia, Valencia]

4) Train the model without and with the new feature, then compare the results

In [None]:
# input tables:
#   ga_events_data
#   geo_maps
#   concert_venues

# Produce a query with the following outputs:
#
#    event_city,
#    ROUND(AVG(distance_km), 2) AS avg_distance_km,
#    ROUND(MIN(distance_km), 2) AS min_distance_km,
#    ROUND(MAX(distance_km), 2) AS max_distance_km,

# consider the distances from each source city to all the cities where the concert venues are located (concert_venues table)

# cities in geo_maps might be duplicated by name
# hint: add a heuristic criterion (such as size) to rank them first, e.g. ST_NUMPOINTS(geometry) - the more points, the larger the city

# there might be same-named cities outside of Spain (like Valencia in Venezuela), so bear in mind that the distance to a venue should not exceed some reasonable threshold, e.g. 1000 km. Use that threshold to filter those records out.


In [None]:
-- Distance statistics (in km) from every city seen in the web-analytics events
-- to the cities that host concert venues.
--
-- Output: one row per event city with the average, minimum and maximum
-- distance to the venue cities, plus the number of (city, venue) pairs used.
-- Event cities whose nearest venue is 1000 km or more away are dropped; this
-- removes same-named cities that were resolved to a geometry far from the venues.
WITH
  -- Rank the candidate geometries of each city name. Names are compared
  -- case-insensitively so that spellings differing only by case compete in the
  -- same partition. ROW_NUMBER() is used instead of RANK() so that a tie on
  -- ST_NUMPOINTS can never yield two "rank 1" rows for one name, which would
  -- duplicate every downstream distance row. ST_AREA is a secondary
  -- tie-breaker in the same "bigger geometry wins" spirit.
  ranked_cities AS (
  SELECT
    LOWER(NAME) AS city_name,
    geometry,
    ROW_NUMBER() OVER (
      PARTITION BY LOWER(NAME)
      ORDER BY ST_NUMPOINTS(geometry) DESC, ST_AREA(geometry) DESC
    ) AS size_rank
  FROM
    `<project_id>.city_maps.geo_maps`),
  -- Exactly one geometry per (lower-cased) city name.
  main_cities AS (
  SELECT
    city_name,
    geometry
  FROM
    ranked_cities
  WHERE
    size_rank = 1),
  -- Distinct lower-cased city names present in the events data.
  event_cities AS (
  SELECT
    DISTINCT LOWER(city) AS city
  FROM
    `<project_id>.web_analytics_eu.indie_label_events_data`),
  -- Venue locations, with lower-cased city names.
  concert_venues AS (
  SELECT
    LOWER(city_name) AS city,
    geometry
  FROM
    `<project_id>.city_maps.concert_venues`),
  -- Every (event city, venue) pair with its distance. ST_DISTANCE returns
  -- metres, hence the division by 1000. The value is deliberately left
  -- unrounded so the aggregates below are computed on exact distances.
  -- INNER JOIN: event cities without a known geometry cannot produce a
  -- distance and would be discarded by the NULL filter anyway.
  distance_from_venues AS (
  SELECT
    e.city AS event_city,
    v.city AS venue_city,
    ST_DISTANCE(m.geometry, v.geometry) / 1000 AS distance_km
  FROM
    event_cities AS e
  INNER JOIN
    main_cities AS m
  ON
    e.city = m.city_name
  CROSS JOIN
    concert_venues AS v),
  -- Per-city aggregates, rounded once at the end.
  distance_statistics AS (
  SELECT
    event_city,
    ROUND(AVG(distance_km), 2) AS avg_distance_km,
    ROUND(MIN(distance_km), 2) AS min_distance_km,
    ROUND(MAX(distance_km), 2) AS max_distance_km,
    COUNT(*) AS num_connections
  FROM
    distance_from_venues
  WHERE
    -- ST_DISTANCE yields NULL for NULL or empty geographies.
    distance_km IS NOT NULL
  GROUP BY
    event_city)
SELECT
  event_city,
  avg_distance_km,
  min_distance_km,
  max_distance_km,
  num_connections
FROM
  distance_statistics
WHERE
  -- Plausibility threshold in km (see header comment).
  min_distance_km < 1000
ORDER BY
  avg_distance_km ASC,
  event_city ASC;