In [None]:
from src.data import RawDataset, Dataset
from src.utils import list_dir, head_file, load_json, save_json
from src import workflow

import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names, so use whichever name this installation offers.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Create Raw Dataset

In [None]:
raw_ds = RawDataset("fremont_bike")

In [None]:
raw_ds.add_url(url="https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD", file_name="fremont.csv")

In [None]:
license_txt = "Public Domain"

readme_txt = """
Fremont Bridge Hourly Bicycle Counts by Month October 2012 to present

The Dataset is provided by the Seattle Department of Transportation Open Data initiative and is available from:
https://data.seattle.gov/Transportation/Fremont-Bridge-Hourly-Bicycle-Counts-by-Month-Octo/65db-xm6k

Description
-----------
The Fremont Bridge Bicycle Counter records the number of bikes that cross the bridge using the 
pedestrian/bicycle pathways. Inductive loops on the east and west pathways count the passing of bicycles 
regardless of travel direction. The data consists of a date/time field: Date, east pathway count field: 
Fremont Bridge NB, and west pathway count field: Fremont Bridge SB. The count fields represent the 
total bicycles detected during the specified one hour period. Direction of travel is not specified, 
but in general most traffic in the Fremont Bridge NB field is travelling northbound and most traffic in
the Fremont Bridge SB field is travelling southbound.

Data Format
-----------
The Dataset consists of a csv file with three columns:
  Date, Fremont Bridge East Sidewalk, Fremont Bridge West Sidewalk
Data consist of counts (in each direction) grouped by hour.

"""


In [None]:
raw_ds.add_metadata(contents=license_txt, kind='LICENSE')
raw_ds.add_metadata(contents=readme_txt, kind='DESCR')

In [None]:
raw_ds.fetch()

In [None]:
# What do we have?
unpack_dir = raw_ds.unpack()
print(unpack_dir)
list_dir(unpack_dir)

In [None]:
# Quick check of data format
print(head_file(unpack_dir / 'fremont.csv'))

## Process data

In [None]:
import pandas as pd

In [None]:
# Following jvdp's example code, which basically does this:
data = pd.read_csv(unpack_dir / 'fremont.csv', index_col='Date')

try:
    data.index = pd.to_datetime(data.index, format='%m/%d/%Y %I:%M:%S %p')
except (TypeError, ValueError):
    # Timestamps that do not match the explicit format raise ValueError, so
    # fall back to pandas' own format inference in that case as well.
    data.index = pd.to_datetime(data.index)
# NOTE(review): the README text above lists the East sidewalk column first;
# confirm the CSV column order really is West, East before trusting these labels.
data.columns = ['West', 'East']
data['Total'] = data['West'] + data['East']
data.head()

In [None]:
# Bind the array to its own name: overwriting `data` would turn the DataFrame
# into an ndarray and make this cell fail when it is run a second time.
data_values = data.reset_index().values
data_values

In [None]:
#%%file -a ../src/data/localdata.py
#__all__ += ['process_fremont_bike']
def process_fremont_bike(dataset_name="fremont_bike", metadata=None):
    """Process Seattle DoT's Fremont Bridge Hourly Bicycle Counts.

    Reads ``fremont.csv`` from ``interim_data_path / dataset_name``, turns the
    ``Date`` column into a datetime index, relabels the two count columns as
    ``West`` and ``East`` and adds their sum as a ``Total`` column.

    Parameters
    ----------
    dataset_name : str
        Sub-directory of ``interim_data_path`` that holds the CSV; also
        returned unchanged in the result.
    metadata : dict, optional
        Returned unchanged in the result. Defaults to a new empty dict.

    Returns
    -------
    dict
        Keys ``dataset_name``, ``metadata``, ``data`` (the processed
        pandas.DataFrame) and ``target`` (always None).
    """
    if metadata is None:
        metadata = {}
    # NOTE(review): interim_data_path is not defined in this notebook; presumably
    # it comes from the module this cell is meant to be appended to -- confirm.
    data = pd.read_csv(interim_data_path / dataset_name / 'fremont.csv', index_col='Date')

    try:
        data.index = pd.to_datetime(data.index, format='%m/%d/%Y %I:%M:%S %p')
    except (TypeError, ValueError):
        # Timestamps that do not match the explicit format raise ValueError, so
        # fall back to pandas' own format inference in that case as well.
        data.index = pd.to_datetime(data.index)
    # NOTE(review): the dataset description in this notebook lists the East
    # sidewalk column first; confirm the CSV column order matches these labels.
    data.columns = ['West', 'East']
    data['Total'] = data['West'] + data['East']

    return {
        "dataset_name": dataset_name,
        "metadata": metadata,
        "data": data,
        "target": None,
    }

In [None]:
from src.data import process_fremont_bike

In [None]:
raw_ds.load_function = process_fremont_bike

### Load the Raw Dataset

In [None]:
# Save the raw dataset
from src.data import Dataset
#add_raw_dataset(raw_ds)

workflow.add_raw_dataset(raw_ds)
workflow.available_raw_datasets()

In [None]:
#from src.paths import src_module_dir
#dataset_list_fq = src_module_dir / 'data' / 'datasets.json'
#dataset_list = load_json(dataset_list_fq)

# workflow.create_dataset(raw_dataset_name=raw_ds.name) 
workflow.available_datasets()

In [None]:
workflow.get_transformer_list()
#dataset_list.append({"output_dataset":'fremont_bike', 'raw_dataset_name':'fremont_bike'})

In [None]:
workflow.add_transformer(from_raw='fremont_bike')
workflow.get_transformer_list()

In [None]:
workflow.del_transformer(-1)
workflow.get_transformer_list()

In [None]:
logger.setLevel(logging.INFO)
workflow.apply_transforms()

In [None]:
!cd .. &&  make transform_data # same thing, but from the command line

In [None]:
'fremont_bike' in workflow.available_datasets()

## Work with the Dataset

In [None]:
ds = Dataset.load('fremont_bike')

In [None]:
ds.data.shape

In [None]:
type(ds.data)

In [None]:
ds.data.head()

In [None]:
### Create a transformer to pivot the data
#ds.data.pivot_table()
ds.data.index.time.shape

In [None]:
def index_to_date_time(dset, suffix='dt'):
    """Transformer: split a datetime index into ``Time`` and ``Date`` columns.

    The input dataset's frame is left untouched. The returned Dataset is named
    ``<dset.name>_<suffix>``, carries ``dset.metadata`` and holds a copy of the
    data with the two new columns appended and the datetime index dropped.
    """
    stamps = dset.data.index
    frame = (
        dset.data
        .assign(Time=stamps.time, Date=stamps.date)
        .reset_index(drop=True)
    )
    return Dataset(dataset_name=f"{dset.name}_{suffix}", metadata=dset.metadata, data=frame)

In [None]:
ds2 = index_to_date_time(ds)

In [None]:
# A transformer takes a dataset and returns a dataset
def pivot(dset, **pivot_opts):
    """Transformer: pivot data stored as a pandas DataFrame.

    Parameters
    ----------
    dset : Dataset
        Dataset whose ``data`` attribute is the DataFrame to pivot.
    **pivot_opts
        Keyword arguments passed straight to pandas.DataFrame.pivot_table.

    Returns
    -------
    Dataset
        New dataset named ``<dset.name>_pivoted`` holding the pivoted frame,
        with the options used recorded under ``metadata['pivot_opts']``.
    """
    pivoted = dset.data.pivot_table(**pivot_opts)
    # Use the same `dataset_name` keyword as the other transformer in this notebook.
    ds_pivot = Dataset(dataset_name=f"{dset.name}_pivoted", metadata=dset.metadata, data=pivoted, target=None)
    # NOTE(review): if Dataset keeps the metadata object it is given, this also
    # adds 'pivot_opts' to the input dataset's metadata -- confirm that is intended.
    ds_pivot.metadata['pivot_opts'] = pivot_opts

    return ds_pivot


In [None]:
# Build the pivoted dataset here so that `dsp` exists when the notebook is run
# top to bottom on a fresh kernel (it was otherwise first assigned much later).
dsp = pivot(ds2, values='Total', index='Time', columns='Date')
dsp.data.shape

In [None]:
dsp.data.plot(legend=False, alpha=0.01);

In [None]:
from src.data.transformers import available_transformers

In [None]:
available_transformers()

In [None]:
from src.paths import src_module_dir

transform_pipeline = [
    ('index_to_date_time',{}),
    ('pivot', {'values':'Total', 'index':'Time', 'columns':'Date'})
]

workflow.add_transformer(from_raw="fremont_bike", output_dataset="fremont_bike_pivot", transformations=transform_pipeline)
workflow.add_transformer(input_dataset="fremont_bike", output_dataset="fremont_bike_pivot2", transformations=transform_pipeline)


In [None]:
workflow.get_transformer_list()

In [None]:
workflow.apply_transforms()
#!cd .. && make process_data

In [None]:
'fremont_bike_pivot' in workflow.available_datasets()

## Two different pipelines should give us the same dataset
One was built directly from the raw dataset; the other was built from the dataset that was itself generated from that raw dataset. Their data hashes should match.

In [None]:
dsp = Dataset.load('fremont_bike_pivot')
dsp.DATA_HASH

In [None]:
dsp2 = Dataset.load('fremont_bike_pivot2')
# Both pipelines are meant to produce the same data, so check it rather than
# comparing the two displayed hashes by eye.
assert dsp2.DATA_HASH == dsp.DATA_HASH, "fremont_bike_pivot and fremont_bike_pivot2 differ"
dsp2.DATA_HASH

In [None]:
dsp.data.plot(legend=False, alpha=0.01);

In [None]:
dsp.data.shape