In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, date, time

In [2]:
# Folder (relative to the working directory) whose CSV files are read by the loading cell.
directory = './data/ground_truth/sdot/2022_01'

In [3]:
# Placeholder for the combined rows; the CSV-loading cell that follows fills it.
occ = pd.DataFrame()

In [4]:
# Load every CSV in `directory` and stack the rows into a single frame, `occ`.
# Files are visited in sorted order so that the result -- and the reference
# column layout, which is taken from the first file read -- does not depend on
# the arbitrary ordering returned by os.listdir().
frames = []
skipped_files = []
expected_columns = None
for filename in sorted(os.listdir(directory)):
    f = os.path.join(directory, filename)
    # Ignore hidden entries and anything that is not a regular file.
    if filename.startswith('.') or not os.path.isfile(f):
        continue
    df = pd.read_csv(f)
    columns = df.columns.values.tolist()
    if expected_columns is None:
        expected_columns = columns
    if columns == expected_columns:
        frames.append(df)
    else:
        # Different column layout: leave the file out, but remember its name
        # so the omission is reported instead of passing silently.
        skipped_files.append(filename)

# Concatenate once rather than growing the frame inside the loop, and rebuild
# `occ` from scratch so that re-running this cell does not duplicate rows.
occ = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) with a different column layout: {skipped_files}")
    

In [5]:
def convert_time(timestr):
    """Parse an hour label such as '9AM' into a ``datetime.time``.

    Parameters
    ----------
    timestr : str
        Hour written in 12-hour ``%I%p`` form, e.g. ``'9AM'`` or ``'12PM'``.

    Returns
    -------
    datetime.time or float
        The parsed time, or ``np.nan`` when the value is not a string
        (for example a missing cell) or does not match the format.
    """
    try:
        return datetime.strptime(timestr, '%I%p').time()
    except (TypeError, ValueError):
        # TypeError: non-string input such as None/NaN; ValueError: wrong format.
        # Other exceptions (e.g. KeyboardInterrupt) are intentionally not swallowed.
        return np.nan
    
def convert_date(datestr):
    """Parse a ``month/day/year`` string into a ``datetime.date``.

    Parameters
    ----------
    datestr : str
        Date written in ``%m/%d/%Y`` form, e.g. ``'10/27/2021'``.

    Returns
    -------
    datetime.date or float
        The parsed date, or ``np.nan`` when the value is not a string
        (for example a missing cell) or does not match the format.
    """
    try:
        return datetime.strptime(datestr, '%m/%d/%Y').date()
    except (TypeError, ValueError):
        # TypeError: non-string input such as None/NaN; ValueError: wrong format.
        # Other exceptions (e.g. KeyboardInterrupt) are intentionally not swallowed.
        return np.nan

In [6]:
# Clean the combined rows in one chain, so `occ` is only rebound when every step succeeds:
#   1. drop the columns that are not used afterwards (raises KeyError if any is missing),
#   2. parse the 'Time' and 'Date' strings column-wise with Series.map instead of a
#      row-wise DataFrame.apply, which builds a Series per row and misbehaves on empty frames,
#   3. drop rows lacking a usable Time, Date or Element Key (the convert_* helpers
#      return NaN for values they cannot parse),
#   4. store 'Element Key' as an integer,
#   5. sort by Element Key, then Date, then Time.
occ = (
    occ
    .drop(columns=['Unnamed: 0','Route','Label','Load Zone','RPZ for that Block','ADA','Rideshare','ILLEGAL','Notes','Pictures'])
    .assign(
        Time=lambda d: d['Time'].map(convert_time),
        Date=lambda d: d['Date'].map(convert_date),
    )
    .dropna(subset=['Time','Date','Element Key'])
    .astype({'Element Key': 'int64'})
    .sort_values(by=['Element Key', 'Date', 'Time'])
)

In [7]:
# Preview the first rows of `occ` after the cleaning step.
occ.head()

Unnamed: 0,Time,Date,Element Key,Location,Side of Street,PAID
17549,09:00:00,2021-10-27,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,5
17550,10:00:00,2021-10-27,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,5
17551,11:00:00,2021-10-27,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,5
17552,12:00:00,2021-10-27,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,5
17553,13:00:00,2021-10-27,1001,1ST AVE BETWEEN CHERRY ST AND COLUMBIA ST,SW,5


In [8]:
# Bucket the rows by ('Element Key', 'Date') and keep only the rows that
# belong to buckets containing exactly 12 rows.
# NOTE(review): 12 presumably means one row per surveyed hour of the day -- confirm.
def _has_twelve_rows(bucket):
    """Return True when the group frame holds exactly 12 rows."""
    return bucket.shape[0] == 12


grouped = occ.groupby(by=['Element Key', 'Date'])
groups = grouped.filter(_has_twelve_rows)

In [16]:
# Replace the existing index with a fresh 0..n-1 range; drop=True discards the
# old index instead of keeping it as a column.
groups = groups.reset_index(drop=True)
# Display up to the first 50 rows.
groups.head(50)

Unnamed: 0,Time,Date,Element Key,Location,Side of Street,PAID
0,08:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,2
1,09:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,0
2,10:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,1
3,11:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,0
4,12:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,1
5,13:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,1
6,14:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,3
7,15:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,4
8,16:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,5
9,17:00:00,2022-04-29,1013,1ST AVE BETWEEN SENECA ST AND UNIVERSITY ST,SW,4


In [10]:
# Display the last rows of `groups`.
groups.tail()

Unnamed: 0,Time,Date,Element Key,Location,Side of Street,PAID
9631,15:00:00,2022-04-27,136322,NE PACIFIC ST BETWEEN BROOKLYN AVE NE AND UNIV...,N,5
9632,16:00:00,2022-04-27,136322,NE PACIFIC ST BETWEEN BROOKLYN AVE NE AND UNIV...,N,3
9633,17:00:00,2022-04-27,136322,NE PACIFIC ST BETWEEN BROOKLYN AVE NE AND UNIV...,N,1
9634,18:00:00,2022-04-27,136322,NE PACIFIC ST BETWEEN BROOKLYN AVE NE AND UNIV...,N,0
9635,19:00:00,2022-04-27,136322,NE PACIFIC ST BETWEEN BROOKLYN AVE NE AND UNIV...,N,0


In [11]:
# Reduce `groups` to one row per (Date, Element Key) pair: remove the
# per-observation columns, then keep the first row of every pair.
per_row_columns = ["Time", "Location", "Side of Street", "PAID"]
block_data = (
    groups
    .drop(columns=per_row_columns)
    .drop_duplicates(subset=["Date", "Element Key"], keep="first")
)

In [12]:
# Display up to the first 1000 rows of `block_data`.
block_data.head(1000)

Unnamed: 0,Date,Element Key
0,2022-04-29,1013
12,2022-08-22,1013
24,2022-04-29,1014
36,2022-08-22,1014
48,2022-05-03,1017
...,...,...
9576,2022-08-22,123943
9588,2022-05-05,131238
9600,2022-04-29,136041
9612,2022-08-22,136041


In [14]:
# Write `block_data` to a CSV file (path is relative to the working directory);
# index=False leaves the DataFrame index out of the file.
block_data.to_csv("data/sdot_12.csv",index=False)