## Setup

In [7]:
import pandas as pd
from sodapy import Socrata
import numpy as np
from tqdm import tqdm

# No application token, username or password is supplied, so this
# unauthenticated client can only read public data sets.
client = Socrata("data.seattle.gov", app_token=None)

# block_data: ground-truth SDOT block/date pairs (the 'Element Key' and
# 'Date' columns drive the API queries below).
# sdot_raw: raw occupancy records for those pairs. NOTE: this file is the
# one written by the `occ.to_csv("data/sdot_12_raw.csv", ...)` cell further
# down, so on a fresh checkout the query/filter/save cells have to run
# before this read can succeed.
block_data = pd.read_csv('data/sdot_12.csv')
sdot_raw = pd.read_csv('data/sdot_12_raw.csv')



In [3]:
def run_query(client, data_id, element_key, date, start_time="08:00:00", end_time="19:59:00"):
    """Fetch the occupancy records of one element key for one day's time window.

    Parameters
    ----------
    client : object
        Anything exposing ``get(data_id, query=...)``; the notebook passes a
        ``sodapy.Socrata`` client.
    data_id : str
        Identifier of the dataset to query.
    element_key : int or str
        Value compared against ``sourceelementkey``.
    date : str or date-like
        Day to query. ``str(date)`` is concatenated with ``"T<time>"``, so it
        is expected to render as a ``YYYY-MM-DD`` date.
    start_time, end_time : str, optional
        ``HH:MM:SS`` bounds of the ``between`` clause. The defaults reproduce
        the original hard-coded 08:00:00 to 19:59:00 window.

    Returns
    -------
    Whatever ``client.get`` returns for the query.
    """
    # `!s` forces str() on the inputs, matching the explicit str() calls the
    # query was originally built with.
    query = (
        "select occupancydatetime, paidoccupancy, sourceelementkey, parkingspacecount"
        f" where sourceelementkey = {element_key!s}"
        f" and occupancydatetime between '{date!s}T{start_time!s}'"
        f" and '{date!s}T{end_time!s}'"
    )
    return client.get(data_id, query=query)

## Query data from Seattle Open Data API

In [3]:
# One API call per (element key, date) row of block_data; tqdm wraps the key
# column so the progress bar advances once per request.
query_results = [
    run_query(client, "bwk6-iycu", key, day)
    for key, day in zip(tqdm(block_data['Element Key']), block_data['Date'])
]

100%|███████████████████████████████████████████████████████████████| 803/803 [04:57<00:00,  2.70it/s]


## Filter out block–date pairs that do not cover the full 08:00–19:59 period

In [4]:
# Keep only the query results holding exactly 720 records, i.e. 12 hours x 60
# minutes for the 08:00-19:59 window requested by run_query (presumably one
# record per minute -- confirm against the dataset).
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
full_day_frames = [
    pd.DataFrame.from_records(group)
    for group in query_results
    if len(group) == 720
]
# pd.concat raises on an empty list, so fall back to an empty frame when no
# block/date pair is complete.
occ = pd.concat(full_day_frames) if full_day_frames else pd.DataFrame()

In [5]:
# Parse the API timestamp strings (e.g. "2019-03-04T08:00:00.000") into
# datetimes. The time is spelled out as %H:%M:%S because %X means "the
# locale's time representation" and may not match under a non-English
# LC_TIME; %f accepts the fractional-seconds suffix instead of requiring the
# literal ".000".
occ["occupancydatetime"] = pd.to_datetime(occ["occupancydatetime"], format="%Y-%m-%dT%H:%M:%S.%f")

In [6]:
# Persist the filtered raw records (without the index column); this is the
# file the setup cell loads back as `sdot_raw`.
occ.to_csv(path_or_buf="data/sdot_12_raw.csv", index=False)

## Get each block's parking capacity

In [4]:
# Distinct (sourceelementkey, parkingspacecount) pairs found in the raw
# records, i.e. the space count reported for each element key.
cap = sdot_raw[["sourceelementkey", "parkingspacecount"]].drop_duplicates()
cap.head()

Unnamed: 0,sourceelementkey,parkingspacecount
0,1013,4
1440,1021,9
2160,1022,9
2880,1037,12
3600,1277,9


In [249]:
# Optionally persist the capacity table to CSV (disabled; uncomment the line below to write it)
# cap.to_csv("data/sdot_12_cap.csv",index=False)

In [10]:
# Convert the timestamps read back from CSV ("YYYY-MM-DD HH:MM:SS") into
# datetimes. %H:%M:%S is written out explicitly because %X is the locale's
# time representation and is not guaranteed to match under a non-English
# LC_TIME setting.
sdot_raw["occupancydatetime"] = pd.to_datetime(sdot_raw["occupancydatetime"], format="%Y-%m-%d %H:%M:%S")

In [11]:
# Naive estimate: for every (element key, hour of day, space count) group,
# take the most frequent paidoccupancy value. Series.mode() returns the modes
# in sorted order, so [0] picks the smallest one when several values tie.
naive_occ = (
    sdot_raw
    .groupby(["sourceelementkey", sdot_raw["occupancydatetime"].dt.hour, "parkingspacecount"])["paidoccupancy"]
    .agg(lambda occupancy: occupancy.mode()[0])
)

In [15]:
# Flatten the grouped result into a table, turn the modal occupancy into a
# fraction of the block's space count, drop the space count, and label the
# hour column (the group key derived from occupancydatetime) as "hour".
naive_df = (
    pd.DataFrame(naive_occ)
    .reset_index()
    .assign(
        paidoccupancy=lambda frame: frame["paidoccupancy"].astype(int)
        / frame["parkingspacecount"].astype(int)
    )
    .drop("parkingspacecount", axis="columns")
    .rename(columns={"occupancydatetime": "hour"})
)
naive_df.head(50)

Unnamed: 0,sourceelementkey,hour,paidoccupancy
0,1013,8,0.25
1,1013,9,0.75
2,1013,10,0.5
3,1013,11,0.5
4,1013,12,0.25
5,1013,13,0.25
6,1013,14,0.75
7,1013,15,0.75
8,1013,16,1.5
9,1013,17,1.0


In [16]:
# Spot-check the hourly estimates for a single element key.
naive_df[naive_df["sourceelementkey"] == 1021]

Unnamed: 0,sourceelementkey,hour,paidoccupancy
12,1021,8,0.555556
13,1021,9,0.555556
14,1021,10,0.222222
15,1021,11,0.222222
16,1021,12,0.444444
17,1021,13,0.888889
18,1021,14,0.888889
19,1021,15,0.666667
20,1021,16,0.888889
21,1021,17,0.888889


In [17]:
# Save the hourly naive occupancy table, omitting the index column.
naive_df.to_csv(path_or_buf="data/sdot_12_naive.csv", index=False)