In [1]:
import pandas as pd
import numpy as np
import plotly.express as px


In [2]:
def generate_query(endpoint_url, query, limit):
    """Build a request URL that passes ``query`` through the ``$query`` parameter.

    The query text and a trailing ``limit`` clause are joined into a single
    statement and percent-encoded, so characters that carry meaning inside a
    URL (``&``, ``#``, ``+``, ``%``, quotes, comparison operators, ...) are
    transmitted as data instead of being interpreted as URL syntax.

    Parameters
    ----------
    endpoint_url : str
        Base URL of the resource being queried.
    query : str
        Query text; may span several lines (newlines are treated as spaces).
    limit : int or str
        Value appended as ``limit <limit>`` at the end of the statement.

    Returns
    -------
    str
        ``"<endpoint_url>?$query=<percent-encoded statement>"``.
    """
    # Function-scope import keeps the cell self-contained (standard library only).
    from urllib.parse import quote

    # Newlines from triple-quoted queries become ordinary spaces, then the
    # limit clause is appended as the last clause of the statement.
    statement = f"{query} limit {limit}".replace("\n", " ")

    # Whitespace in the endpoint itself is still escaped, as before.
    base_url = endpoint_url.replace(" ", "%20").replace("\n", "%20")

    # safe="" encodes every reserved character; spaces become %20.
    return f"{base_url}?$query={quote(statement, safe='')}"

In [45]:
# Resource on data.cincinnati-oh.gov that the query below is sent to.
endpoint_url = "https://data.cincinnati-oh.gov/resource/4cjh-bm8b.json"

# Count requests per raw address string for three service codes, most
# frequent addresses first.
# NOTE(review): the window runs from 2022-03-01 up to (not including)
# 2023-04-01, i.e. 13 calendar months -- confirm this matches the
# "Past Year" wording used in the chart title further down.
query = """select address, count(*) as n
           where service_code IN ('"TLGR-PRV"','"LITR-PRV"','"DUMP-PVS"')
           and requested_date>='2022-03-01'
           and requested_date <'2023-04-01'
           group by address
           order by n desc
           """

# The limit is appended to the query by generate_query.
query_url = generate_query(endpoint_url, query, 1000000)

In [46]:
# Fetch the JSON returned for query_url and load it into a DataFrame
# (this performs a network request every time the cell runs).
requests_by_address = pd.read_json(query_url)

In [47]:
# Derive a simplified address used for grouping:
#   1. drop the literal ", CINC" marker,
#   2. keep only the text before the first hyphen,
#   3. trim surrounding whitespace,
#   4. remove double-quote characters.
# NOTE(review): step 2 also truncates any address that itself contains a
# hyphen -- confirm that is acceptable for this data.
requests_by_address['clean_address'] = (
    requests_by_address['address']
    .str.replace(', CINC', '', regex=False)
    .str.split('-').str[0]
    .str.strip()
    .str.replace('"', '')
)

In [48]:
# Collapse rows that share a clean_address into one row and order the result
# from most to fewest requests.
# Only the count column `n` is summed: aggregating every column would also
# "sum" the string `address` column on pandas >= 2.0 (string concatenation),
# and selecting `n` keeps this cell safe to re-run after later cells have
# added more columns to the frame.
requests_by_address = (
    requests_by_address
    .groupby('clean_address', as_index=False)['n']
    .sum()
    .sort_values(by='n', ascending=False)
)

In [49]:
# Spot check: show every grouped address whose text contains "WOODBINE".
requests_by_address[requests_by_address['clean_address'].str.contains("WOODBINE")]

Unnamed: 0,clean_address,n
6480,8338 WOODBINE AV,7
6525,8452 WOODBINE AV,6
6909,MILLSDALE ST & WOODBINE AV,4
6447,8248 WOODBINE AV,2
6510,8402 WOODBINE AV,2
7015,WOODBINE AV & MILLSDALE ST,2
6873,HUNSFORD ST & WOODBINE AV,1
7014,WOODBINE AV & HUNSFORD ST,1
6485,8342 WOODBINE AV,1
6486,8346 WOODBINE AV,1


In [50]:
# Total number of requests across all grouped addresses.
total_requests = requests_by_address['n'].sum()

In [51]:
# Share of all requests contributed by each address (fraction between 0 and 1).
requests_by_address['pct_requests'] = requests_by_address['n'].div(total_requests)

In [52]:
# Zero-based position of each row in the frame's current order
# (the frame was sorted by n, descending, in an earlier cell).
requests_by_address['row_number'] = pd.RangeIndex(len(requests_by_address))

In [53]:
# Number of grouped addresses (rows) in the frame.
total_rows = requests_by_address.shape[0]

In [54]:
# Map each row's 1-based position to a percentile bucket, rounding up to the
# next whole percent.
requests_by_address['row_percentile'] = np.ceil(
    requests_by_address['row_number'].add(1).div(total_rows).mul(100)
)

In [55]:
# Sum the request share held by the addresses in each percentile bucket.
agg_requests = (
    requests_by_address
    .groupby('row_percentile', as_index=False)
    .agg({'pct_requests': 'sum'})
)

In [56]:
# Running total of the request share, expressed in whole percent (rounded up).
# The running total is rounded to 6 decimals before the ceiling so that
# floating-point noise (e.g. 100.00000000000001) cannot push a value up to the
# next whole percent, which would otherwise let the curve end at 101.
agg_requests['cumsum'] = np.ceil(
    agg_requests['pct_requests'].cumsum().mul(100).round(6)
)

In [57]:
# One-cell DataFrame holding the value 0; `(0)` is a plain int, not a tuple.
# NOTE(review): first_row is not referenced by any other cell visible in this
# notebook -- confirm whether it is still needed.
first_row = pd.DataFrame(np.array([(0)]))

In [58]:
# Append one extra row whose columns are all 0, using the label
# len(agg_requests). Presumably this is the (0, 0) starting point of the
# curve plotted below -- confirm.
# NOTE(review): not idempotent -- every re-run of this cell adds another
# all-zero row.
agg_requests.loc[len(agg_requests)] = 0

In [59]:
# Order by percentile so the row appended with row_percentile 0 comes first.
agg_requests = agg_requests.sort_values('row_percentile')

In [60]:
def get_percentile_requests(agg_requests, percentile):
    """Return the ``cumsum`` values of the rows at a given ``row_percentile``.

    Parameters
    ----------
    agg_requests : pandas.DataFrame
        Frame with at least the columns ``row_percentile`` and ``cumsum``.
    percentile : number
        Value compared for equality against ``row_percentile``.

    Returns
    -------
    pandas.Series
        The ``cumsum`` column of the matching rows (empty if none match).
        Note that this is a Series, not a scalar.
    """
    condition = f"row_percentile == {percentile}"
    selected = agg_requests.query(condition)
    return selected['cumsum']

# Cumulative request share (as Series) at the 1st and 20th address percentiles.
percentile_1 = get_percentile_requests(agg_requests=agg_requests, percentile=1)
percentile_20 = get_percentile_requests(agg_requests=agg_requests, percentile=20)


In [61]:
# End points of a y = x reference line: (0, 0) and (1, 1).
# Each row is one point. The previous array [[0, 1], [0, 1]] produced the
# point (0, 1) twice, which does not describe a line at all.
yx_line = pd.DataFrame({'x': [0, 1], 'y': [0, 1]})

In [65]:
# Concentration curve: cumulative share of requests against the share of
# unique addresses, both in percent.
fig = px.line(
    agg_requests,
    x='row_percentile',
    y='cumsum',
    title="Graph 1. Concentration of Requests by Address in the Past Year (Ending March 2023)<br><sup>Interpreted as X% of Addresses Make Up Y% of Requests",
    labels={"row_percentile": "% of Unique Addresses",
            "cumsum": "% of Total Requests"},
    template='plotly_white',
)

# Reference diagonal from (0, 0) to (100, 100).
fig.update_layout(shapes=[dict(type='line', x0=0, y0=0, x1=100, y1=100)])

fig.show()

In [71]:
# Summary statistics for the addresses that fall in percentile bucket 1.
requests_by_address[requests_by_address['row_percentile'] == 1].describe()

Unnamed: 0,n,pct_requests,row_number,row_percentile
count,67.0,67.0,67.0,67.0
mean,9.731343,0.000969,33.0,1.0
std,6.202595,0.000617,19.485037,0.0
min,6.0,0.000597,0.0,1.0
25%,7.0,0.000697,16.5,1.0
50%,8.0,0.000796,33.0,1.0
75%,10.0,0.000995,49.5,1.0
max,53.0,0.005275,66.0,1.0


In [74]:
# (Leftover interactive help lookup for px.box removed; it is IPython-only
# syntax and not part of the analysis.)

In [78]:
# Box plot of the request counts for the addresses in percentile bucket 1.
fig2 = px.box(
    requests_by_address[requests_by_address['row_percentile'] == 1],
    y="n",
)
fig2.show()

In [24]:
# Resource on data.cincinnati-oh.gov that the query below is sent to.
endpoint_url = "https://data.cincinnati-oh.gov/resource/4cjh-bm8b.json"

# One row per request: address, coordinates, request date and its year/month
# parts, for the same three service codes, 2020-01-01 through 2023-02-28.
query = """select address, longitude, latitude, requested_date, 
           date_extract_y(requested_date) as year,
           date_extract_m(requested_date) as month
           where service_code IN ('"TLGR-PRV"','"LITR-PRV"','"DUMP-PVS"')
           and requested_date>='2020-01-01'
           and requested_date<='2023-02-28'
           """

# The limit is appended to the query by generate_query.
query_url = generate_query(endpoint_url, query, 1000000)

In [25]:
# Fetch the JSON returned for query_url and load it into a DataFrame
# (this performs a network request every time the cell runs).
long_lat = pd.read_json(query_url)

In [26]:
# Display the downloaded frame as the cell output.
long_lat

Unnamed: 0,address,longitude,latitude,requested_date,year,month
0,"""2562 MONTANA AV - GJ0906536800 """,-84.580468,39.155308,2021-12-06T00:00:00.000,2021,12
1,"""2630 KIPLING AV, CINC - GJ0910741898""",-84.578407,39.201201,2021-12-06T00:00:00.000,2021,12
2,"""2500 FERGUSON RD, CINC - GJ0743734248""",-84.600079,39.132725,2021-12-06T00:00:00.000,2021,12
3,"""2334 ROHS ST, CINC - GJ1427533700""",-84.521198,39.126141,2021-12-06T00:00:00.000,2021,12
4,"""2823 HIGHLAND AV, CINC - GJ1582034394""",-84.503126,39.132005,2021-12-06T00:00:00.000,2021,12
...,...,...,...,...,...,...
26572,"""2571 LIDDELL ST, CINC - GJ1206234216""",-84.546622,39.131319,2023-02-28T00:00:00.000,2023,2
26573,"""4840 SPRING GROVE AV, CINC - GJ1497938343""",-84.511592,39.167769,2023-02-28T00:00:00.000,2023,2
26574,"""4006 PADDOCK RD, CINC - GJ1775737191""",-84.479843,39.156702,2023-02-28T00:00:00.000,2023,2
26575,"""2388 HARRISON AV, CINC - GJ0926934997""",-84.578667,39.139032,2023-02-28T00:00:00.000,2023,2


In [27]:
# Write the frame to a CSV file in the current working directory.
# With pandas' default index=True the row index is written as the first column.
long_lat.to_csv("./long_lat_requests.csv")

In [27]:
# Resource on data.cincinnati-oh.gov that the query below is sent to.
endpoint_url = "https://data.cincinnati-oh.gov/resource/4cjh-bm8b.json"

# Number of distinct service codes requested from 2020-01-01 through 2023-02-28.
query = """select count(distinct service_code) as n_services
           where requested_date>='2020-01-01'
           and requested_date<='2023-02-28'
           """

# The limit is appended to the query by generate_query.
query_url = generate_query(endpoint_url, query, 1000000)

In [29]:
# Run the query behind query_url and display the result as the cell output
# (the frame is not stored in a variable).
pd.read_json(query_url)

Unnamed: 0,n_services
0,533


In [79]:
# Resource on data.cincinnati-oh.gov that the query below is sent to.
endpoint_url = "https://data.cincinnati-oh.gov/resource/4cjh-bm8b.json"

# One row per request: service code, coordinates, request date and its
# year/month parts, for three service codes, 2020-01-01 through 2023-02-28.
query = """select service_code, longitude, latitude, requested_date, 
           date_extract_y(requested_date) as year,
           date_extract_m(requested_date) as month
           where service_code IN ('"TLGR-PRV"','"LITR-PRV"','"DUMP-PVS"')
           and requested_date>='2020-01-01'
           and requested_date<='2023-02-28'
           """

# The limit is appended to the query by generate_query.
query_url = generate_query(endpoint_url, query, 1000000)