OSM-POI: Include brand property [DRAFT] #69

Draft — wants to merge 43 commits into master

Commits (43)
86ef2c5
add name-suggestion-index download
IritaSee Jan 3, 2022
1c62f71
add repo checking
IritaSee Jan 3, 2022
b74d373
[undone] iterating over json
IritaSee Jan 3, 2022
d6fb766
add brand_name_downloader
IritaSee Jan 5, 2022
ac14e90
add name downloader to main, add operator naming
IritaSee Jan 5, 2022
f5d401f
rename to more suitable function names
IritaSee Jan 6, 2022
600c3ef
add staticmethod for download_names
IritaSee Jan 6, 2022
017b99d
add staticmethod to download_names
IritaSee Jan 6, 2022
96ae427
fix variable names to add context
IritaSee Jan 6, 2022
69ac660
fix variable typo
IritaSee Jan 7, 2022
d2c2460
add debug venv folder to ignore
IritaSee Jan 7, 2022
f173ec5
change - to None as default value
IritaSee Jan 7, 2022
2ff61be
add operator:wikidata
IritaSee Jan 7, 2022
3c93476
add func to match brands and operators, then add to spark
IritaSee Jan 7, 2022
2f3896f
fix algorithm
IritaSee Jan 8, 2022
a572b30
remove unused code
IritaSee Jan 8, 2022
d2467a7
update fuzzywuzzy to thefuzz in osm-poi related
IritaSee Jan 8, 2022
e74f64b
fix nan processing
IritaSee Jan 8, 2022
1290708
fix: change search to brand and operator
IritaSee Jan 10, 2022
6230a6b
recreate matching function
IritaSee Jan 11, 2022
6519b76
apply withColumn in main matching function
IritaSee Jan 11, 2022
e78b8f7
fix typo
IritaSee Jan 12, 2022
3e16563
remove is_operator
IritaSee Jan 13, 2022
027f2d6
join operator and brand name matching function
IritaSee Jan 13, 2022
73f719e
remove duplicate name/operator
IritaSee Jan 13, 2022
310e06c
rework function to simply match names and input
IritaSee Jan 13, 2022
a91e266
fix error
IritaSee Jan 13, 2022
32e767f
Merge branch 'master' into feature/include-brand-property
IritaSee Jan 15, 2022
848f984
add brand_matched operator_matched name_matched
IritaSee Jan 20, 2022
d101e2d
fix run_cli convert add extra cd
IritaSee Jan 22, 2022
724ac03
readjust downloader to new temp folder
IritaSee Jan 22, 2022
c2bb65c
add default statement
IritaSee Jan 24, 2022
898945b
update temp dir
IritaSee Jan 24, 2022
f08ea40
add downloading message
IritaSee Jan 24, 2022
1c153fb
add empty as return
IritaSee Jan 27, 2022
1c4b9dc
fix misleading var name
IritaSee Jan 27, 2022
2489e94
revert irrelevant change to this branch
IritaSee Jan 27, 2022
c01806f
code cleanup
IritaSee Jan 27, 2022
a149f1b
delete reference repo, rename reference file
IritaSee Jan 27, 2022
f4e651b
add id sorting for reference file, change print to log
IritaSee Jan 27, 2022
fae5761
remove reference repo, ignore reference file
IritaSee Jan 27, 2022
70f7255
Merge branch 'master' into feature/include-brand-property
Feb 2, 2022
0661a62
Resolve formatting and linting errors; Remove name matching UDF from …
Feb 3, 2022
2 changes: 1 addition & 1 deletion .gitignore
@@ -16,5 +16,5 @@ env
!.env.local
__pycache__


debug
kuwala/scripts/windows/
3 changes: 2 additions & 1 deletion kuwala/common/python_utils/src/FileSelector.py
@@ -3,7 +3,6 @@
from time import sleep
import urllib.error

from fuzzywuzzy import fuzz
from hdx.data.dataset import Dataset
from hdx.data.organization import Organization
from hdx.hdx_configuration import Configuration
@@ -12,6 +11,7 @@
from pyquery import PyQuery
import questionary
import requests.exceptions
from thefuzz import fuzz

CONTINENTS = [
{"code": "af", "name": "Africa", "geofabrik": "africa"},
@@ -27,6 +27,7 @@

def select_local_country(directory):
continents = os.listdir(directory)
continents.remove("brand_names.csv")
continent_names = list(
map(
lambda c: pcc.convert_continent_code_to_continent_name(c.upper()),
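
The new continents.remove("brand_names.csv") line appears to be needed because download_names() (added further down in this PR) writes brand_names.csv into the same osm_files directory that select_local_country lists. A more defensive variant is sketched below — illustration only, not part of the PR; list_continent_dirs is a hypothetical helper that keeps only real directories:

import os

# Hypothetical alternative (not in this PR): keep only entries that are actual
# directories, so stray files such as brand_names.csv never reach the
# continent-name conversion.
def list_continent_dirs(directory):
    return [
        entry
        for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    ]
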
2 changes: 1 addition & 1 deletion kuwala/common/python_utils/src/spark_udfs.py
@@ -1,6 +1,5 @@
import json

from fuzzywuzzy import fuzz
import h3
from pyspark.sql.functions import udf
from pyspark.sql.types import (
@@ -13,6 +12,7 @@
StructType,
)
from shapely.geometry import shape
from thefuzz import fuzz

DEFAULT_RESOLUTION = 11

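
The only change in this file (and in both requirements.txt files below) is the swap from fuzzywuzzy to thefuzz, the renamed continuation of the same project; the fuzz module exposes the same scorers, so existing calls keep working. A minimal sketch — the strings are illustrative, not taken from this PR:

from thefuzz import fuzz

# The familiar fuzzywuzzy scorers are available under the same names.
print(fuzz.ratio("Starbucks", "Starbucks Coffee"))            # plain similarity
print(fuzz.token_set_ratio("Starbucks", "Starbucks Coffee"))  # ignores extra tokens
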
2 changes: 1 addition & 1 deletion kuwala/core/cli/requirements.txt
@@ -21,7 +21,7 @@ docopt==0.6.2
email-validator==1.1.3
et-xmlfile==1.1.0
exchangerates==0.3.4
fuzzywuzzy==0.18.0
thefuzz==0.19.0
greenlet==1.1.1
hdx-python-api==5.2.4
hdx-python-country==2.9.5
2 changes: 1 addition & 1 deletion kuwala/core/database/importer/sql/create_tables.sql
@@ -148,7 +148,7 @@ CREATE TABLE IF NOT EXISTS google_osm_poi_matching (
CONSTRAINT fk_google_osm_poi_matching_osm_id FOREIGN KEY(osm_type, osm_id) REFERENCES osm_poi(osm_type, osm_id)
);

-- Creation of google_osm_poi_matching table
-- Creation of google_custom_poi_matching table

CREATE TABLE IF NOT EXISTS google_custom_poi_matching (
custom_id text NOT NULL PRIMARY KEY,
@@ -11,7 +11,7 @@ def import_population_density(
database_properties,
continent,
country,
population_density_date,
population_density_date="",
):
start_time = time.time()

2 changes: 1 addition & 1 deletion kuwala/pipelines/osm-poi/requirements.txt
@@ -21,7 +21,7 @@ docopt==0.6.2
email-validator==1.1.3
et-xmlfile==1.1.0
exchangerates==0.3.4
fuzzywuzzy==0.18.0
thefuzz==0.19.0
greenlet==1.1.1
h3==3.7.3
hdx-python-api==5.2.4
60 changes: 59 additions & 1 deletion kuwala/pipelines/osm-poi/src/Downloader.py
@@ -1,12 +1,18 @@
import json
import logging as log
import os
import shutil
import urllib.request as req
import zipfile

import pandas as pd
from python_utils.src.FileDownloader import download_file
from python_utils.src.FileSelector import select_osm_file


class Downloader:
@staticmethod
def start(args):
def download_pbf(args):
file = None

if args.url is None:
@@ -27,3 +33,55 @@ def start(args):
file_path += "/pbf/geo_fabrik.osm.pbf"

download_file(url=args.url or file["url"], path=file_path)

@staticmethod
def download_names():
temp_files_dir = "../../../tmp/kuwala/osm_files/"
# Instead of cloning the repository (which would require an extra dependency),
# we download the whole repo as a zip archive and extract it.
if not os.path.exists(temp_files_dir + "name-suggestion-index-main"):
log.info("Downloading brand and operator name reference...")
download_link = "https://github.com/osmlab/name-suggestion-index/archive/refs/heads/main.zip"
req.urlretrieve(download_link, temp_files_dir + "main.zip")
with zipfile.ZipFile(temp_files_dir + "main.zip", "r") as zip_ref:
zip_ref.extractall(temp_files_dir)
os.remove(temp_files_dir + "main.zip")

file_paths = [
temp_files_dir + "name-suggestion-index-main/data/brands",
temp_files_dir + "name-suggestion-index-main/data/operators",
]
data = {"id": [], "display_name": [], "wiki_data": []}
log.info("Composing brand and operator name list...")
for file_path in file_paths:
for folder in os.listdir(file_path):
if os.path.isdir(os.path.join(file_path, folder)):
for file in os.listdir(os.path.join(file_path, folder)):
with open(os.path.join(file_path, folder, file)) as f:
file_content = json.load(f)
for item in file_content["items"]:
wiki_data = id = display_name = None
if "id" in item.keys():
id = dict(item)["id"]
if "displayName" in item.keys():
display_name = dict(item)["displayName"]
if "tags" in item.keys():
if "brand:wikidata" in list(item["tags"].keys()):
wiki_data = dict(item["tags"].items())[
"brand:wikidata"
]
elif "operator:wikidata" in list(item["tags"].keys()):
wiki_data = dict(item["tags"].items())[
"operator:wikidata"
]
if str(id) != "nan":
data["id"].append(id)
data["display_name"].append(display_name)
data["wiki_data"].append(wiki_data)

shutil.rmtree(temp_files_dir + "name-suggestion-index-main")
df = pd.DataFrame(data)
df = df.drop_duplicates(subset=["display_name", "wiki_data"])  # assign back; not in-place
df = df.sort_values("id")
df.to_csv(temp_files_dir + "brand_names.csv", index=False)
log.info("Done!")
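
A quick way to sanity-check the reference file written above — a sketch only; the path and column names come from download_names(), the rest is illustrative:

import pandas as pd

# brand_names.csv is produced by Downloader.download_names() with the columns
# id, display_name and wiki_data.
brands = pd.read_csv("../../../tmp/kuwala/osm_files/brand_names.csv")
print(brands.head())
print(f"{len(brands)} brand/operator entries")
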
3 changes: 2 additions & 1 deletion kuwala/pipelines/osm-poi/src/main.py
@@ -34,6 +34,7 @@
action = "download" if option == choices[0] else "process"

if action == "download":
Downloader.start(args)
Downloader.download_pbf(args)
Downloader.download_names()
else:
Processor.start(args)