In [1]:
import pandas as pd
import flask
from fastparquet import ParquetFile
import json

In [2]:
# Load the airports reference table and coerce every column to text in a single
# chained expression, so the following steps start from uniform object dtype.
# NOTE(review): astype(str) also stringifies missing values, which would explain
# why .info() reports 322 non-null for every column -- confirm this is intended.
airports_csv = pd.read_csv('data/airports.csv', sep=",").astype(str)

In [3]:
# Inspect column names, dtypes and non-null counts of the frame loaded from CSV.
airports_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   IATA_CODE  322 non-null    object
 1   AIRPORT    322 non-null    object
 2   CITY       322 non-null    object
 3   STATE      322 non-null    object
 4   COUNTRY    322 non-null    object
 5   LATITUDE   322 non-null    object
 6   LONGITUDE  322 non-null    object
dtypes: object(7)
memory usage: 17.7+ KB


In [4]:
# Persist the all-string frame as parquet using the fastparquet engine.
# The target path has no file extension; the read-back step uses the same name.
airports_csv.to_parquet('data/airports_parquet', engine='fastparquet')

In [5]:
# Read the parquet file back into a DataFrame through fastparquet's ParquetFile,
# then preview the first rows as a quick round-trip sanity check.
airports_parquet = ParquetFile("data/airports_parquet").to_pandas()
airports_parquet.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [6]:
# Check row count, column names and dtypes of the frame read back from parquet.
airports_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   IATA_CODE  322 non-null    object
 1   AIRPORT    322 non-null    object
 2   CITY       322 non-null    object
 3   STATE      322 non-null    object
 4   COUNTRY    322 non-null    object
 5   LATITUDE   322 non-null    object
 6   LONGITUDE  322 non-null    object
dtypes: object(7)
memory usage: 17.7+ KB


In [13]:
def get_schema(schema_name):
    """Load a JSON schema file and return its parsed content.

    Parameters
    ----------
    schema_name : str or path-like
        Path to the JSON file to read.

    Returns
    -------
    object
        Whatever the JSON document decodes to (a dict for a JSON object).

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. FileNotFoundError).
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # JSON text is UTF-8; pin the encoding instead of relying on the platform
    # default, which differs between operating systems and locales.
    with open(schema_name, 'r', encoding='utf-8') as schema_file:
        return json.load(schema_file)

In [41]:
def get_schema2(schema_name):
    """Load a JSON schema file, reporting malformed content with a clear error.

    Parameters
    ----------
    schema_name : str or path-like
        Path to the JSON file to read.

    Returns
    -------
    object
        Whatever the JSON document decodes to (a dict for a JSON object).

    Raises
    ------
    ValueError
        If the file content cannot be decoded or parsed as JSON. The original
        error is attached as ``__cause__``.
    OSError
        If the file cannot be opened (e.g. FileNotFoundError); not wrapped.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(schema_name, 'r', encoding='utf-8') as schema_file:
            schema = json.load(schema_file)
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses. Re-raise as ValueError (still caught by `except Exception`)
        # and chain explicitly so the parse position stays in the traceback.
        raise ValueError(f"Error loading schema: {e}") from e
    return schema

In [43]:
# Load the column -> dtype mapping that the later astype() cast applies.
# The path is relative, so it resolves against the current working directory.
schema = get_schema("airports_schema.json")

In [44]:
# Build a separate frame with lower-cased column labels. rename() returns a new
# DataFrame, so airports_parquet keeps its original upper-case labels.
airports_transformed = airports_parquet.rename(columns=str.lower)
airports_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   iata_code  322 non-null    object
 1   airport    322 non-null    object
 2   city       322 non-null    object
 3   state      322 non-null    object
 4   country    322 non-null    object
 5   latitude   322 non-null    object
 6   longitude  322 non-null    object
dtypes: object(7)
memory usage: 17.7+ KB


In [45]:
# Cast each column to the dtype named in `schema` (column name -> dtype string).
# NOTE(review): latitude/longitude drop to 319 non-null after this cast --
# presumably "nan" strings from the earlier astype(str) parse back to NaN; confirm.
airports_transformed = airports_transformed.astype(schema)
airports_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 322 entries, 0 to 321
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   iata_code  322 non-null    object 
 1   airport    322 non-null    object 
 2   city       322 non-null    object 
 3   state      322 non-null    object 
 4   country    322 non-null    object 
 5   latitude   319 non-null    float64
 6   longitude  319 non-null    float64
dtypes: float64(2), object(5)
memory usage: 17.7+ KB


In [38]:
# Display the loaded column -> dtype mapping for reference.
schema

{'iata_code': 'str',
 'airport': 'str',
 'city': 'str',
 'state': 'str',
 'country': 'str',
 'latitude': 'float',
 'longitude': 'float'}