# Data from HOTOSM

## Data source is from https://data.humdata.org/dataset/hotosm_hti_roads

In [15]:
# Start by importing the kml module
from fastkml import kml

In [19]:
# Read file into string and convert to UTF-8 (Python3 style)
with open("hotosm_hti_roads_lines.kml", 'rt') as myfile:
    doc=myfile.read().encode("utf-8")

In [38]:
# Create the KML object to store the parsed result
k = kml.KML()

In [39]:
# Read in the KML string
k.from_string(doc)

In [40]:
# Next we perform some simple sanity checks

# Check that the number of features is correct
# This corresponds to the single ``Document``

features = list(k.features())

In [41]:
len(features)

1

In [42]:
# Check that we can access the features as a generator
# (The two Placemarks of the Document)
features[0].features()

<generator object _Container.features at 0x7f9eac1745f0>

In [43]:
f2 = list(features[0].features())
len(f2)

1

In [44]:
f2

[<fastkml.kml.Folder at 0x7f9ee6914220>]

In [45]:
# Check specifics of the first Placemark in the Document
f2[0]

<fastkml.kml.Folder at 0x7f9ee6914220>

In [46]:
f2[0].description

In [47]:
f2[0].name

'Roads'

In [48]:
# Verify that we can print back out the KML object as a string
print(k.to_string(prettyprint=True))

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [51]:
import geopandas as gpd
import fiona

gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
df_kml = gpd.read_file('hotosm_hti_roads_lines.kml', driver='KML')

  for feature in features_lst:


In [58]:
df_kml.head(20)

Unnamed: 0,Name,Description,geometry
0,Boulevard Harry Truman,,"LINESTRING (-72.34976 18.53712, -72.34973 18.5..."
1,Boulevard Harry Truman,,"LINESTRING (-72.34980 18.53691, -72.34976 18.5..."
2,Route Nationale # 2,,"LINESTRING (-72.41565 18.53889, -72.41503 18.5..."
3,Boulevard des Industries,,"LINESTRING (-72.30498 18.57513, -72.30525 18.5..."
4,Boulevard des Industries,,"LINESTRING (-72.31261 18.58026, -72.31262 18.5..."
5,,,"LINESTRING (-72.11753 18.64945, -72.11733 18.6..."
6,Rue Toussaint Louverture,,"LINESTRING (-72.09070 18.65815, -72.09076 18.6..."
7,Rue des Miracles,,"LINESTRING (-72.34478 18.54902, -72.34463 18.5..."
8,Rue du Magasin de l'Etat,,"LINESTRING (-72.34347 18.55451, -72.34359 18.5..."
9,Rue du Centre,,"LINESTRING (-72.34402 18.54090, -72.34429 18.5..."


In [52]:
import osmnx as ox
import networkx as nx
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from pyproj import CRS
%matplotlib inline
import seaborn as sns
import folium

In [61]:
def basic_eda(df, datafile_name="Datafile name"):
    print(f"\n**************************",datafile_name,"******************************\n")
    print(f"Performing basic EDA on {datafile_name} dataset\n")
    print(f"* The dataset has {df.shape[0]} rows and {df.shape[1]} columns.")
    print("\n* First 10 rows of the dataset:\n")
    display(df.head(10))


In [63]:
basic_eda(df_kml, "Haiti Roads Data from HOTOSM KML Dataset")


************************** Haiti Roads Data from HOTOSM KML Dataset ******************************

Performing basic EDA on Haiti Roads Data from HOTOSM KML Dataset dataset

* The dataset has 112875 rows and 3 columns.

* First 10 rows of the dataset:



Unnamed: 0,Name,Description,geometry
0,Boulevard Harry Truman,,"LINESTRING (-72.34976 18.53712, -72.34973 18.5..."
1,Boulevard Harry Truman,,"LINESTRING (-72.34980 18.53691, -72.34976 18.5..."
2,Route Nationale # 2,,"LINESTRING (-72.41565 18.53889, -72.41503 18.5..."
3,Boulevard des Industries,,"LINESTRING (-72.30498 18.57513, -72.30525 18.5..."
4,Boulevard des Industries,,"LINESTRING (-72.31261 18.58026, -72.31262 18.5..."
5,,,"LINESTRING (-72.11753 18.64945, -72.11733 18.6..."
6,Rue Toussaint Louverture,,"LINESTRING (-72.09070 18.65815, -72.09076 18.6..."
7,Rue des Miracles,,"LINESTRING (-72.34478 18.54902, -72.34463 18.5..."
8,Rue du Magasin de l'Etat,,"LINESTRING (-72.34347 18.55451, -72.34359 18.5..."
9,Rue du Centre,,"LINESTRING (-72.34402 18.54090, -72.34429 18.5..."


In [64]:
percent = (df_kml.isnull().sum()/df_kml.isnull().count()*100).sort_values(ascending = False).round(2)
sum_missing = df_kml.isna().sum().sort_values(ascending = False)
missing_data  = pd.concat([percent, sum_missing], axis=1, keys=['Percent', "Missing Count"])
missing_data.head(20)

Unnamed: 0,Percent,Missing Count
Name,0.0,0
Description,0.0,0
geometry,0.0,0
