In [179]:
import pandas as pd
import requests
import io
from os.path import exists
import os

# Data Keys
# Names of the datasets this notebook knows about.
keys = ['AIRPORT_DATA', 'AMTRAK']
# Identity mapping (each name maps to itself), so datasets are referenced as KEYS['NAME'].
KEYS = {key: key for key in keys}

# DataLoader Class
A generic class that loads data from a URL source and optionally renames and/or drops columns.
It handles only loading and basic processing; everything else should be done on the resulting DataFrame itself.

In [163]:
# TODO: add filter kwarg
class DataLoader():
    """Load a tabular dataset from a URL and cache it locally as a CSV file.

    The data is exposed as a pandas DataFrame through ``retrieve()``, which
    prefers the in-memory frame, then the cached file at
    ``data/<data_name>.csv``, and only then downloads from ``source_url``.

    Args:
        data_name: Upper-case identifier; also used as the cache file name.
        source_type: Format of the remote payload, one of ALLOWED_FILETYPES.
        source_url: URL to download the data from (required, non-empty).
        **kwargs:
            column_remapping: dict of {original_name: new_name} passed to
                ``DataFrame.rename``. Defaults to ``{}``.
            columns_to_keep: iterable of column names to keep in addition to
                the remapped ones. ``None`` (default) keeps every column;
                an empty iterable keeps only the remapped columns.

    Raises:
        ValueError: if ``source_url`` is empty, ``source_type`` is not
            allowed, or ``data_name`` is not upper-case.
    """
    KWARGS = [
        'column_remapping',
        'columns_to_keep'
    ]
    ALLOWED_FILETYPES = ['csv', 'json']
    # Seconds passed as `timeout` to requests.get so a stalled server cannot
    # hang the notebook forever. Override on a subclass/instance if needed.
    REQUEST_TIMEOUT = 30

    def __init__(self, data_name, source_type, source_url = '', **kwargs):
        if source_url == '':
            raise ValueError('Must provide source_url or source_filepath for data')

        if source_type not in DataLoader.ALLOWED_FILETYPES:
            raise ValueError(f"{source_type} not in allowed types")

        if data_name != data_name.upper():
            raise ValueError('data_name must be uppercase')

        self.data_name = data_name
        self.source_type = source_type
        self.source_url = source_url
        self.df = None
        self.filepath = f"data/{self.data_name}.csv"

        self.column_remapping = kwargs.get('column_remapping', {})
        self.columns_to_keep = kwargs.get('columns_to_keep', None)

        # Make sure the cache directory exists. An already-existing directory
        # is fine; any other OS error is surfaced instead of being swallowed.
        os.makedirs(os.path.dirname(self.filepath), exist_ok=True)

    def fetch_from_source(self):
        """Download the data from ``source_url`` and store it in ``self.df``.

        Raises:
            RuntimeError: if the HTTP response is not OK (the reason and body
                are printed first to help debugging).
            ValueError: if ``source_type`` is not a supported format.
        """
        response = requests.get(self.source_url, timeout=self.REQUEST_TIMEOUT)
        if not response.ok:
            print(response.reason)
            print(response.text)
            raise RuntimeError(f"Error loading {self.data_name}")

        data = response.content.decode('utf8')
        if self.source_type == 'csv':
            self.df = pd.read_csv(io.StringIO(data))
        elif self.source_type == 'json':
            self.df = pd.read_json(io.StringIO(data))
        else:
            raise ValueError("Unsupported data type")

    def load_from_file(self):
        """Read the cached CSV at ``self.filepath`` into ``self.df``."""
        self.df = pd.read_csv(self.filepath)

    def _check_file_existance(self):
        """Return True if the cache file at ``self.filepath`` exists."""
        return exists(self.filepath)

    def save_to_file(self):
        """Write ``self.df`` to the cache file as CSV.

        Raises:
            RuntimeError: if no data has been loaded yet.
        """
        if self.df is None:
            raise RuntimeError("No data available to save")
        # index=False: otherwise the row index is written as an extra column
        # and comes back as 'Unnamed: 0' when the cache is re-read.
        self.df.to_csv(self.filepath, index=False)

    def _transform_source(self):
        """Rename and prune the columns of ``self.df`` per the user's settings."""
        # rename columns based on user-provided remapping
        renamed = self.df.rename(columns=self.column_remapping)
        # reset_index returns a new frame; the result must be kept
        renamed = renamed.reset_index(drop=True)

        # drop all columns not explicitly included in user-provided column_remapping + columns_to_keep
        # tip: pass [] to columns_to_keep to drop everything not remapped
        if self.columns_to_keep is not None:
            # set union accepts any iterable (list, tuple, set, ...)
            columns_to_keep = set(self.columns_to_keep) | set(self.column_remapping.values())
            cols_to_drop = [col for col in renamed.columns if col not in columns_to_keep]
            renamed = renamed.drop(columns=cols_to_drop)

        self.df = renamed

    def retrieve(self):
        """Return the DataFrame from the most readily available source.

        Order of preference: in-memory df -> local cache file -> remote
        source. A remote fetch is transformed and written to the cache.
        """
        if self.df is not None:
            return self.df

        if self._check_file_existance():
            self.load_from_file()
            return self.df

        self.fetch_from_source()
        self._transform_source()
        self.save_to_file()
        return self.df

In [180]:
class Utils():
    """Namespace of small string helpers.

    All helpers are static, so they can be called on the class
    (``Utils.lower('A')``), on an instance, or passed directly to ``map``.
    The parameter is named ``str`` (shadowing the builtin inside each
    helper) to keep the existing call signature unchanged.
    """

    @staticmethod
    def lower(str):
        """Return ``str`` converted to lowercase."""
        return str.lower()

    @staticmethod
    def upper(str):
        """Return ``str`` converted to uppercase."""
        return str.upper()

    @staticmethod
    def camel_to_snake(str):
        """Convert a CamelCase name to snake_case, e.g. 'ZipCode' -> 'zip_code'.

        Every uppercase character is lowercased and prefixed with '_', so
        consecutive capitals are split individually ('ID' -> 'i_d').
        Leading underscores are stripped from the result.
        """
        return ''.join(['_' + s.lower() if s.isupper() else s for s in str]).lstrip('_')


In [176]:
# Source columns that are kept and simply renamed to their lowercase form.
keys_to_remap = [
    'ARPT_NAME',
    'ARPT_ID',
    'ICAO_ID',
    'DIST_CITY_TO_AIRPORT',
    'FACILITY_USE_CODE',
    'STATE_CODE',
    'CITY',
]
airport_loader = DataLoader(
    KEYS['AIRPORT_DATA'],
    'csv',
    'https://opendata.arcgis.com/api/v3/datasets/e2e88905639b415abe621a6a861b4eca_0/downloads/data?format=csv&spatialRefId=3857&where=1%3D1',
    column_remapping={
        **{key: Utils.lower(key) for key in keys_to_remap},
        # Columns whose new name is not just the lowercase original.
        'LAT_DECIMAL': 'lat',
        'LNG_DECIMAL': 'lng',
    },
    # Empty list: keep only the remapped columns.
    columns_to_keep=[],
)

In [182]:
# Source columns that are kept and renamed from CamelCase to snake_case.
keys_to_remap = [
    'StnType',
    'ZipCode',
    'State',
    'City',
]
amtrak_loader = DataLoader(
    KEYS['AMTRAK'],
    'csv',
    'https://opendata.arcgis.com/api/v3/datasets/4cf728602fa3428ba0a08d30efbb5f45_0/downloads/data?format=csv&spatialRefId=3857&where=1%3D1',
    column_remapping={
        **{key: Utils.camel_to_snake(key) for key in keys_to_remap},
        # Columns that get a hand-picked name instead of a mechanical conversion.
        'Address1': 'address',
        'StationNam': 'station_name',
    },
    # Empty list: keep only the remapped columns.
    columns_to_keep=[],
)

In [183]:
# Get the Amtrak data (in-memory frame, cached file, or a fresh download).
df = amtrak_loader.retrieve()
# Plain loop instead of a list comprehension: print() is called only for its
# side effect, so there is no reason to build a throwaway list of None.
for col in df.columns:
    print(col)
df


stn_type
zip_code
state
city
address
station_name


Unnamed: 0,stn_type,zip_code,state,city,address,station_name
0,BUS,48801,MI,Alma,1105 Willow Run Drive,"Alma, MI"
1,BUS,12211,NY,Albany,737 Albany Shaker Road,"Albany, NY"
2,BUS,54421,WI,Colby,1210 North Division St.,"Abbotsford-Colby, WI"
3,TRAIN,21001,MD,Aberdeen,18 East Bel Air Avenue,"Aberdeen, MD"
4,TRAIN,87102,NM,Albuquerque,320 1st Street SW,"Albuquerque, NM"
...,...,...,...,...,...,...
1083,BUS,95389,CA,Yosemite National Park,9006 Yosemite Lodge Drive,"Yosemite National Park, CA"
1084,BUS,95389,CA,Yosemite National Park,Tioga Pass Road,"Yosemite National Park, CA"
1085,BUS,95389,CA,Yosemite National Park,9035 Village Drive,"Yosemite National Park, CA"
1086,BUS,95389,CA,Yosemite National Park,Old Tioga Road at Highway 120 East,"Yosemite National Park, CA"
