# 3.2

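## 3.2.a Simple Geohash Index

Load the OpenFlights routes dataset, compute a precision-5 geohash for each route's destination airport, and save the annotated routes as JSON.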

In [1]:
import os
import sys
import gzip
import json
from pathlib import Path
import csv

import pandas as pd
import s3fs
import pyarrow as pa
from pyarrow.json import read_json
import pyarrow.parquet as pq
import fastavro
import pygeohash as pgh
import snappy
import jsonschema
from jsonschema.exceptions import ValidationError


# Endpoint of the S3-compatible object store the source data is read from.
endpoint_url = 'https://storage.budsc.midwest-datascience.com'

# Working directories are resolved relative to wherever the notebook runs
# and are created up front so later cells can write into them.
current_dir = Path.cwd()
schema_dir = current_dir / 'schemas'
results_dir = current_dir / 'results'
for output_dir in (schema_dir, results_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

def read_jsonl_data(src_data_path='data/processed/openflights/routes.jsonl.gz'):
    """Read a gzip-compressed JSON Lines object from the object store.

    Args:
        src_data_path: Key of the ``.jsonl.gz`` object to read. Defaults to
            the OpenFlights routes file.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    with s3.open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            # Iterate the stream instead of readlines() so the raw lines are
            # not all held in memory alongside the parsed records, and skip
            # blank lines (e.g. a trailing newline) that json.loads rejects.
            records = [json.loads(line) for line in f if line.strip()]

    return records

# Download and parse the routes file once; the next cell builds the DataFrame from `records`.
records = read_jsonl_data()

In [2]:


# Flatten the nested route records into one column per leaf field; nested
# keys come out dot-joined (e.g. 'airline.name'). The two destination
# coordinate columns are renamed to identifier-safe names.
coordinate_column_names = {
    'dst_airport.latitude': 'dst_airport_latitude',
    'dst_airport.longitude': 'dst_airport_longitude',
}
df = pd.json_normalize(records).rename(columns=coordinate_column_names)
df.head()


Unnamed: 0,codeshare,equipment,airline.airline_id,airline.name,airline.alias,airline.iata,airline.icao,airline.callsign,airline.country,airline.active,...,dst_airport_latitude,dst_airport_longitude,dst_airport.altitude,dst_airport.timezone,dst_airport.dst,dst_airport.tz_id,dst_airport.type,dst_airport.source,dst_airport,src_airport
0,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,55.606201,49.278702,411.0,3.0,N,Europe/Moscow,airport,OurAirports,,
1,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,55.606201,49.278702,411.0,3.0,N,Europe/Moscow,airport,OurAirports,,
2,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,44.225101,43.081902,1054.0,3.0,N,Europe/Moscow,airport,OurAirports,,
3,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,55.606201,49.278702,411.0,3.0,N,Europe/Moscow,airport,OurAirports,,
4,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,55.0126,82.650703,365.0,7.0,N,Asia/Krasnoyarsk,airport,OurAirports,,


In [10]:
# Geohash length used for the index (5 characters per cell).
GEOHASH_PRECISION = 5

dst_airport_latitude = df['dst_airport_latitude']
dst_airport_longitude = df['dst_airport_longitude']

# Encode each route's destination coordinates as a geohash.
# Some routes have no destination-airport record, which leaves their
# coordinates as NaN; encoding NaN cannot yield a meaningful cell, so those
# rows get None instead of a bogus hash.
# Zipping the two columns avoids the per-row Series construction of
# df.apply(axis=1).
df['geohash'] = [
    pgh.encode(lat, lon, precision=GEOHASH_PRECISION)
    if pd.notna(lat) and pd.notna(lon)
    else None
    for lat, lon in zip(dst_airport_latitude, dst_airport_longitude)
]
df.head(5)

Unnamed: 0,codeshare,equipment,airline.airline_id,airline.name,airline.alias,airline.iata,airline.icao,airline.callsign,airline.country,airline.active,...,dst_airport_longitude,dst_airport.altitude,dst_airport.timezone,dst_airport.dst,dst_airport.tz_id,dst_airport.type,dst_airport.source,dst_airport,src_airport,geohash
0,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,49.278702,411.0,3.0,N,Europe/Moscow,airport,OurAirports,,,v1gh3
1,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,49.278702,411.0,3.0,N,Europe/Moscow,airport,OurAirports,,,v1gh3
2,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,43.081902,1054.0,3.0,N,Europe/Moscow,airport,OurAirports,,,szyes
3,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,49.278702,411.0,3.0,N,Europe/Moscow,airport,OurAirports,,,v1gh3
4,False,[CR2],410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,...,82.650703,365.0,7.0,N,Asia/Krasnoyarsk,airport,OurAirports,,,vcfbb


In [11]:
# Display the geohash column on its own; pandas abbreviates the middle of a long Series.
df['geohash']

0        v1gh3
1        v1gh3
2        szyes
3        v1gh3
4        vcfbb
         ...  
67658    r1f90
67659    txsuy
67660    ucfgn
67661    tx5z0
67662    txsuy
Name: geohash, Length: 67663, dtype: object

In [4]:
# Save the geohash-annotated routes under results/geoindex/.
# The path is built from results_dir rather than a hard-coded home directory,
# and with pathlib rather than a literal containing '\', which on POSIX is
# part of the file name, not a directory separator.
geoindex_dir = results_dir.joinpath('geoindex')
geoindex_dir.mkdir(parents=True, exist_ok=True)
df.to_json(geoindex_dir.joinpath('geoindex.json'))
