# Extracting and Loading OpenStreetMap Data

In [1]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib
import sys
from pyspark.sql import SparkSession, functions, types


# Create the Spark session (or reuse one that is already running) under the
# application name 'Preprocess data'; the load cells read through `spark`.
spark = (
    SparkSession.builder
    .appName('Preprocess data')
    .getOrCreate()
)

# Set the Spark context's log level to WARN.
spark.sparkContext.setLogLevel('WARN')


In [2]:
# Schema passed to every spark.read.json call in this notebook.
# Columns: coordinates (lat/lon), edit timestamp, amenity type, optional
# name, and a string -> string map of tags.
# NOTE: the printSchema() output of the amenity load reports every column as
# "nullable = true", so the nullable=False flags declared here are evidently
# not enforced by the JSON reader.
amenity_schema = (
    types.StructType()
    .add('lat', types.DoubleType(), nullable=False)
    .add('lon', types.DoubleType(), nullable=False)
    .add('timestamp', types.TimestampType(), nullable=False)
    .add('amenity', types.StringType(), nullable=False)
    .add('name', types.StringType(), nullable=True)
    .add('tags', types.MapType(types.StringType(), types.StringType()), nullable=True)
)

### Amenity Data

In [3]:
# Read the gzipped Vancouver amenities extract with the shared schema.
data_amenities = (
    spark.read
    .schema(amenity_schema)
    .json('../Datasets/amenities-vancouver.json.gz')
)

# Print the column layout as loaded, i.e. before any column is dropped.
data_amenities.printSchema()

# Only `timestamp` is removed here. Wikimedia/WikiData tags stay in the
# `tags` map, and `name` is left unfiltered so a later step can attempt to
# combine these rows with WikiData.
data_amenities = data_amenities.drop('timestamp')

# Export the trimmed frame as JSON (replacing any earlier export), then
# preview the first rows.
data_amenities.write.mode('overwrite').json('amenities-cleaned')
data_amenities.show()

root
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- amenity: string (nullable = true)
 |-- name: string (nullable = true)
 |-- tags: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+------------+----------------+--------------------+--------------------+
|       lat|         lon|         amenity|                name|                tags|
+----------+------------+----------------+--------------------+--------------------+
| 49.260812| -123.125736|            cafe|           Starbucks|[brand:wikidata -...|
| 49.260953| -123.125704|       fast_food|          Salad Loop|[opening_hours ->...|
|49.3734231|-123.2918935|         toilets|                null|                  []|
|49.2498481| -122.959708|             bbq|                null|                  []|
|49.3708976|-123.2804478|place_of_worship|St. Monica's Angl...|[addr:housenumber...|
| 49.219983| -122.9

### Non-Amenity Data

In [4]:
# Read the gzipped non-amenity extract with the shared schema, keeping only
# lat, lon, name and tags (timestamp and amenity are dropped).
data_non_amenities = (
    spark.read
    .schema(amenity_schema)
    .json('../Datasets/nonAmenities-vancouver.json.gz')
    .drop('timestamp', 'amenity')
)

# Export as JSON (replacing any earlier export), then preview the first rows.
data_non_amenities.write.mode('overwrite').json('non-amenities')
data_non_amenities.show()


+----------+------------+--------------------+--------------------+
|       lat|         lon|                name|                tags|
+----------+------------+--------------------+--------------------+
|49.2792968|-122.9203518|Simon Fraser Univ...|[tourism -> infor...|
|49.1537742|-122.5255942|   Eagle Acres Dairy|[tourism -> attra...|
|49.2791033|-123.1236705|HI Vancouver Central|[guest_house -> h...|
|49.1490843|-122.9354361|                null|[tourism -> infor...|
|49.1866222|-122.9793512|                null|[tourism -> infor...|
|49.1850046|-122.9885831|                null|[tourism -> infor...|
|49.1918063|-122.9987891|                null|[tourism -> infor...|
|49.1976296|-123.0122037|                null|[tourism -> infor...|
|49.1933186|-123.0003269|                null|[tourism -> picni...|
|49.3239711| -123.102426|       Sleep Country|[brand:wikidata -...|
|49.1664654|-122.4501176|                null|[tourism -> infor...|
| 49.166774|-122.4512603|                null|[t

### Transportation Data

In [5]:
# Read the gzipped transportation extract with the shared schema, keep only
# the entries whose `tags` map has a non-null 'wikidata' value, and drop the
# timestamp and amenity columns.
data_transportation = (
    spark.read
    .schema(amenity_schema)
    .json('../Datasets/transportVancouver.json.gz')
    .where(functions.col('tags')['wikidata'].isNotNull())
    .drop('timestamp', 'amenity')
)

# Export as JSON (replacing any earlier export), then preview the first rows.
data_transportation.write.mode('overwrite').json('transportation')
data_transportation.show()

+----------+------------+--------------------+--------------------+
|       lat|         lon|                name|                tags|
+----------+------------+--------------------+--------------------+
|49.2781599|-122.8477927|        Moody Centre|[wheelchair -> ye...|
|49.2731524|-123.1004436|Main Street–Scien...|[wheelchair -> ye...|
|49.2820152|-123.1189358|Vancouver City Ce...|[subway -> yes, p...|
|49.2666472|-123.1154245|     Olympic Village|[subway -> yes, p...|
|49.2260351|-123.1164945| Langara-49th Avenue|[subway -> yes, p...|
|49.2096173|-123.1169376|        Marine Drive|[subway -> yes, p...|
|49.2803394|-122.7940816|             Lincoln|[wheelchair -> ye...|
|49.2607932|-123.0328813|              Rupert|[subway -> yes, p...|
|49.2124293|-122.6062266|          Port Haney|[alt_name -> Port...|
|49.1991139| -122.850605|             Gateway|[wheelchair -> ye...|
|49.1895143|-122.8478763|      Surrey Central|[wheelchair -> ye...|
|49.1827755|-122.8447317|         King George|