# In [5]:
# Imports
import pandas as pd
from pykml import parser
import os
from os import path

# Functions
def _child_text(element, *tags):
    """Return the raw text of the child reached by following ``tags``.

    Args:
        element: Parsed KML element to start from.
        *tags: Child tag names to follow in order, e.g. ``'Point',
            'coordinates'``.

    Returns:
        The ``.text`` of the final child, or ``None`` when any child along
        the way does not exist (or the final child has no text).
    """
    node = element
    for tag in tags:
        # Attribute access on a parsed element raises AttributeError for a
        # missing child; getattr's default turns that into None.
        node = getattr(node, tag, None)
        if node is None:
            return None
    return node.text


def extract_all_spots(folder='files'):
    """Collect every point placemark from the ``.kml`` files in ``folder``.

    For each placemark the name, description and point coordinates are
    recorded as raw text (not stripped or otherwise normalised); a field
    that is absent is recorded as ``None``. Placemarks that contain a
    ``LineString`` are skipped.

    Placemarks are read from ``Document.Folder.Placemark``.
    NOTE(review): with attribute-style access this appears to cover only the
    first ``Folder`` of each document -- confirm the input files never hold
    several folders.

    Args:
        folder: Directory to scan for ``.kml`` files. Defaults to ``'files'``.

    Returns:
        A DataFrame with the columns ``name``, ``description`` and
        ``coordinates``, one row per retained placemark. The columns are
        present even when no placemark was found.

    Raises:
        FileNotFoundError: If ``folder`` does not exist.
        AttributeError: If a file has no ``Document.Folder.Placemark``.
    """
    columns = ['name', 'description', 'coordinates']
    places = []

    # os.listdir order is arbitrary; sorting keeps the row order stable.
    for file_name in sorted(os.listdir(folder)):
        if not file_name.endswith('.kml'):
            continue

        # Binary mode leaves decoding to the XML parser, which follows the
        # encoding declared in the file rather than the locale default.
        with open(path.join(folder, file_name), 'rb') as f:
            doc = parser.parse(f).getroot()

        for place in doc.Document.Folder.Placemark:
            # Check for presence rather than truth-testing the element:
            # an element's truth value is not "does it exist".
            if hasattr(place, 'LineString'):
                continue
            places.append({
                'name': _child_text(place, 'name'),
                'description': _child_text(place, 'description'),
                'coordinates': _child_text(place, 'Point', 'coordinates'),
            })

    # Passing the columns explicitly keeps the schema for an empty result.
    return pd.DataFrame(places, columns=columns)

def _coordinate_part(coordinates, index):
    """Return the ``index``-th comma-separated field of ``coordinates``.

    Surrounding whitespace is stripped. Returns ``None`` when the field is
    absent or empty.
    """
    parts = coordinates.split(',')
    if index >= len(parts):
        return None
    return parts[index].strip() or None


def data_cleaning(df):
    """Turn the raw placemark table into clean name/description/lat/lon rows.

    Steps:
      * values are converted to strings, while missing values stay missing
        (a plain ``astype(str)`` would turn them into the text ``'None'``,
        which defeats the ``fillna``/``dropna`` steps below);
      * ``coordinates`` is split on commas: field 0 becomes ``longitude`` and
        field 1 becomes ``latitude``, both stripped of surrounding whitespace;
      * every double quote is removed from ``name``;
      * a missing ``description`` becomes ``'missing'``;
      * the ``coordinates`` column is dropped, then any row that still has a
        missing value (name, latitude or longitude) is dropped.

    Args:
        df: DataFrame with ``name``, ``description`` and ``coordinates``
            columns. It is not modified.

    Returns:
        A new DataFrame with the columns ``name``, ``description``,
        ``latitude`` and ``longitude`` (all strings). The original index
        labels of the kept rows are preserved.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # na_action='ignore' keeps None/NaN as missing instead of stringifying.
    df['name'] = df['name'].map(
        lambda value: str(value).replace('"', ''), na_action='ignore')
    df['description'] = (
        df['description'].map(str, na_action='ignore').fillna('missing'))

    # Strip whitespace rather than slicing off a fixed number of characters,
    # so the result does not depend on how the source text was indented.
    coordinates = df['coordinates'].map(str, na_action='ignore')
    df['latitude'] = coordinates.map(
        lambda value: _coordinate_part(value, 1), na_action='ignore')
    df['longitude'] = coordinates.map(
        lambda value: _coordinate_part(value, 0), na_action='ignore')

    df = df.drop(columns=['coordinates'])

    # Rows without a name or without usable coordinates are discarded.
    return df.dropna()

# Main: extract the raw spots, clean them, and write the result to disk
df = data_cleaning(extract_all_spots())
df.to_csv('spots.csv', index=False)

# Inspect: report how many spots were kept
print(f"Spots: {len(df)}")

# Output: Spots: 74


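# Notebook output residue (apparently the tail of a truncated warning that quoted this source line): if place.LineString: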
