In [0]:
-- Demo table of city names keyed by an integer id, with the region each city
-- belongs to. Re-running this cell drops and rebuilds the table from scratch.
CREATE OR REPLACE TABLE city_data (
  city_id   INT,
  city_name STRING,
  region    STRING
);

-- Seed rows. Several names appear more than once, some byte-identical
-- ('Los Angeles') and some differing only in case, accents or spelling
-- ('New York'/'New york', 'São Paulo'/'Sao Paulo', 'Tokyo'/'Tokio').
INSERT INTO city_data (city_id, city_name, region)
VALUES
  -- USA
  (1,  'New York',      'USA'),
  (2,  'New york',      'USA'),
  (3,  'Los Angeles',   'USA'),
  (4,  'Los Angeles',   'USA'),
  (5,  'San Francisco', 'USA'),
  (6,  'San Fransisco', 'USA'),
  -- Brazil
  (7,  'São Paulo',     'Brazil'),
  (8,  'Sao Paulo',     'Brazil'),
  -- Germany
  (9,  'München',       'Germany'),
  (10, 'Munchen',       'Germany'),
  (11, 'Berlin',        'Germany'),
  (12, 'Düsseldorf',    'Germany'),
  -- Japan
  (13, 'Tokyo',         'Japan'),
  (14, 'Tokio',         'Japan');


In [0]:
-- Scalar Python UDF: transliterates a city name to ASCII with Unidecode and
-- wraps it in a one-key JSON object, e.g. 'München' -> {"city": "Munchen"}.
--
-- Parameters:
--   city_name  the name to transliterate; may be NULL.
-- Returns:
--   A JSON string of the form {"city": "<ascii name>"}, or NULL when
--   city_name is NULL.
CREATE OR REPLACE FUNCTION json_city_name(city_name STRING)
RETURNS STRING
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["simplejson==3.19.*", "Unidecode==1.3.*"]',
  environment_version = 'None'
)
AS $$
  import simplejson as json
  from unidecode import unidecode

  # city_name is a nullable column; unidecode() cannot handle None, so pass a
  # missing value through as NULL instead of failing the whole query.
  # NOTE(review): assumes SQL NULL reaches the UDF as Python None -- confirm.
  if city_name is None:
    return None

  return json.dumps({"city": unidecode(city_name)})
$$;


In [0]:
-- Show every stored city name next to its ASCII-transliterated JSON form.
SELECT
  c.city_name,
  json_city_name(c.city_name) AS city_json
FROM city_data AS c;

In [0]:
-- Python UDF over an array of city names: transliterates each name to ASCII
-- with Unidecode, counts how often each resulting name occurs, and returns
-- the name -> count mapping as a JSON object pretty-printed with a 2-space
-- indent.
CREATE OR REPLACE FUNCTION json_all_cities(city_names ARRAY<STRING>)
RETURNS STRING
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["simplejson==3.19.*", "Unidecode==1.3.*"]',
  environment_version = 'None'
)
AS $$
  import pandas as pd
  import simplejson as json
  from unidecode import unidecode

  # Transliterate first so accent variants of a name share one key.
  ascii_names = pd.Series(list(map(unidecode, city_names)))

  # value_counts() tallies occurrences per distinct name; to_dict() yields a
  # plain {name: count} mapping that can be serialized directly.
  tally = ascii_names.value_counts().to_dict()
  return json.dumps(tally, indent=2)
$$;


In [0]:
-- One row per region: gather the region's city names into an array and
-- summarize them as a JSON object of transliterated-name counts.
SELECT
  c.region,
  json_all_cities(collect_list(c.city_name)) AS cities_json
FROM city_data AS c
GROUP BY c.region;

In [0]:
-- Python UDF over an array of city names: reports what percentage of the
-- entries share their ASCII-transliterated name with at least one other
-- entry in the same array.
--
-- Parameters:
--   city_names  array of names; each is transliterated with Unidecode
--               before comparison (comparison stays case-sensitive).
-- Returns:
--   A single-element array holding the percentage (0-100) rounded to two
--   decimals; [0.0] for an empty array.
CREATE OR REPLACE FUNCTION duplicate_percentage(city_names ARRAY<STRING>)
RETURNS ARRAY<FLOAT>
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["Unidecode==1.3.*"]',
  environment_version = 'None'
)
AS $$
  import pandas as pd
  from unidecode import unidecode

  # Nothing to compare: answer directly instead of building an empty Series
  # and dividing by zero.
  if len(city_names) == 0:
    return [0.0]

  norm = [unidecode(x) for x in city_names]
  value_counts = pd.Series(norm).value_counts()

  # Sum the sizes of every group that occurs more than once, i.e. the number
  # of entries that have at least one twin.
  duplicates = value_counts[value_counts > 1].sum()

  # The pandas sum is a numpy scalar; cast to a native Python float so the
  # returned list matches what city_entropy hands back.
  return [float(round(100 * duplicates / len(norm), 2))]
$$;

In [0]:
-- One row per region: percentage of that region's city entries whose
-- transliterated name occurs more than once (returned as a 1-element array).
SELECT
  c.region,
  duplicate_percentage(collect_list(c.city_name)) AS dup_pct
FROM city_data AS c
GROUP BY c.region;

In [0]:
-- Python UDF over an array of city names: Shannon entropy, in bits, of the
-- distribution of ASCII-transliterated names in the array.
--
-- Parameters:
--   city_names  array of names; each is transliterated with Unidecode
--               before counting (counting stays case-sensitive).
-- Returns:
--   A single-element array holding the entropy rounded to three decimals.
--   [0.0] when the array is empty or contains only one distinct name.
CREATE OR REPLACE FUNCTION city_entropy(city_names ARRAY<STRING>)
RETURNS ARRAY<FLOAT>
LANGUAGE PYTHON
ENVIRONMENT (
  dependencies = '["Unidecode==1.3.*"]',
  environment_version = 'None'
)
AS $$
  import pandas as pd
  import numpy as np
  from unidecode import unidecode

  # An empty array has no distribution to measure; report zero entropy
  # without going through pandas.
  if len(city_names) == 0:
    return [0.0]

  norm = [unidecode(x) for x in city_names]

  # Relative frequency of each distinct transliterated name.
  probs = pd.Series(norm).value_counts(normalize=True)
  entropy = -np.sum(probs * np.log2(probs))

  # Negating a zero sum produces IEEE negative zero (-0.0), which would show
  # up as "-0.0" in query results; adding 0.0 normalizes it to plain 0.0.
  return [round(float(entropy), 3) + 0.0]
$$;


In [0]:
-- One row per region: Shannon entropy (bits) of that region's transliterated
-- city-name distribution (returned as a 1-element array).
SELECT
  c.region,
  city_entropy(collect_list(c.city_name)) AS entropy
FROM city_data AS c
GROUP BY c.region;