In [2]:
from sodapy import Socrata
from plotly import graph_objects as go
from datetime import datetime, date, time, timedelta
from datetime import datetime
# from highcharts import Highchart
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
import plotly.express as px
import datetime
import numpy as np
import utils
import plotly
from ipywidgets import widgets
import ipywidgets
from ipywidgets import interact
%matplotlib inline

from matplotlib.dates import DateFormatter
pd.options.display.max_colwidth = 200
import warnings
warnings.filterwarnings('ignore')

In [3]:
## Build client
# Credentials and the app token are read from the environment so that no secret
# is stored in (or leaked through) the notebook. Set SOCRATA_USERNAME,
# SOCRATA_PASSWORD and SOCRATA_APP_TOKEN before starting the kernel.
# NOTE(review): the password and token that were previously committed in this
# cell should be considered compromised and rotated.
import os

_missing_settings = [
    setting
    for setting in ("SOCRATA_USERNAME", "SOCRATA_PASSWORD", "SOCRATA_APP_TOKEN")
    if not os.environ.get(setting)
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

usrname = os.environ["SOCRATA_USERNAME"]
pword = os.environ["SOCRATA_PASSWORD"]
client = Socrata(
    "data.cityofnewyork.us",
    os.environ["SOCRATA_APP_TOKEN"],
    username=usrname,
    password=pword,
)

## Make API calls
# Each call fetches one dataset by its identifier; `limit` caps the number of
# rows returned.
auth_json_downloads = client.get("beb8-u7wp", limit=2000000)
auth_json_dailyviews = client.get("qiif-mxcx", limit=50000)
auth_json_authentications = client.get("8aih-pmax", limit=20000)
auth_json_datasetfacts = client.get("gzid-z3nh", limit=500000)
auth_json_public_inventory = client.get("5tqd-u88y", limit=5000)
auth_json_api_calls = client.get("u7pi-wq4y", limit=2000)
auth_json_the_plan = client.get("qj2z-ibhs", limit=5000)


## Get the private asset inventory
## Only necessary because the asset-type column exists only in the private
## asset inventory and not in the public one.
auth_json_asset_inventory = client.get("r8cp-r4rc", limit=40000)

In [34]:
## Convert json files into pandas dataframes
# Every API response is passed through the DataFrame constructor, in the same
# order as before, and unpacked into its own name.
(dv, dauth, dfacts, ddownloads, public, private, the_plan) = map(
    pd.DataFrame,
    (
        auth_json_dailyviews,
        auth_json_authentications,
        auth_json_datasetfacts,
        auth_json_downloads,
        auth_json_public_inventory,
        auth_json_asset_inventory,
        auth_json_the_plan,
    ),
)

In [35]:
# Create merged_filter, the dataframe that has only assets defined as datasets
filter_list = ['dataset', 'filter', 'gis map']

## Cleaning dates entered in the wrong format
# All affected rows are corrected in a single label-based assignment.
bad_index_list = list(public.query('date_made_public == "August 9, 2-019"').index)
public.loc[bad_index_list, 'date_made_public'] = '8/9/2019'

# need to merge in private so that we can get the asset type
dupes = private[['u_id', 'agency', 'type', 'name', 'parent_uid', 'derived_view']]

public = utils.date_transformation(public, 'date_made_public')
upUntilfy20 = utils.apply_date_mask(public, 'date_made_public', fy=False)

# Left merge keeps every public asset; columns present in both frames get the
# _x (public) / _y (private) suffixes.
merged_total = pd.merge(public, dupes, how='left', on='u_id')

# .copy() makes merged_filter an independent frame, so later column assignments
# neither trigger SettingWithCopyWarning nor depend on view/copy semantics.
merged_filter = merged_total[merged_total['type'].isin(filter_list)].copy()


# Show the derived views (the original note counted 87 of them).
# NOTE(review): the original comment said including them "would greatly
# increase" the dataset total - confirm the intended conclusion.
merged_total[['name_x', 'derived_view']].query('derived_view == "true"')

Unnamed: 0,name_x,derived_view
2,311 Service Requests from 2010 to Present,true
7,311 Call Center Inquiry,true
11,Customer Service Module,true
28,Voting/Poll Sites Map,true
31,Board of Standards and Appeals (BSA) Decisions Map,true
...,...,...
2637,TLC Driver Exam Testing Center Locations (Map),true
2639,2016 Green Taxi Trip Data,true
2642,2016 Yellow Taxi Trip Data,true
2652,2012 Yellow Taxi Trip Data,true


In [36]:
# Parse the last-update timestamps on the public inventory.
public['last_update_date_data'] = pd.to_datetime(public['last_update_date_data'])

# Parse merged_filter's OWN column rather than copying public's: merged_filter
# carries the index produced by the merge, so assigning a Series from `public`
# aligns on index labels and can attach timestamps to the wrong rows (or NaT)
# whenever the two indexes differ.
# utc=True normalises both naive and offset-aware inputs to UTC, after which
# tz_convert(None) drops the timezone and leaves naive UTC timestamps.
merged_filter = merged_filter.assign(
    last_update_date_data=pd.to_datetime(
        merged_filter['last_update_date_data'], utc=True
    ).dt.tz_convert(None)
)

In [37]:
# merged_filter is the asset inventory restricted to assets defined as datasets;
# only the columns needed for the freshness calculation are carried forward.
freshness_columns = [
    'name_x', 'agency_x', 'u_id', 'update_frequency',
    'date_made_public', 'last_update_date_data', 'automation',
]

# Update frequencies for which freshness cannot be determined
untracked_frequencies = ['Historical Data', 'As needed']

freshness_df = (
    merged_filter[freshness_columns]
    .loc[lambda frame: ~frame['update_frequency'].isin(untracked_frequencies)]
    .reset_index()  # the previous index is kept as an 'index' column
    .rename(columns={'agency_x': 'agency'})
)

# Removes automated datasets
# freshness_df = freshness_df[freshness_df['automation'] != "Yes"]

In [38]:
FREQ_DICTIONARY = {
    
    'Daily' : [np.timedelta64(1, 'D')],
    'Weekly' : [np.timedelta64(1, 'W')],
    'Biweekly ': [np.timedelta64(4, 'D')],
    'Monthly': [np.timedelta64(1,'M')],
    'Quarterly':[np.timedelta64(3, 'M')],
    '2 to 4 times per year' : [np.timedelta64(6, 'M')],
    'Triannually': [np.timedelta64(4, 'M')],
    'Biannually ': [np.timedelta64(6, 'M')],
    'Annually': [np.timedelta64(1, 'Y')],
    'Weekdays': [np.timedelta64(2, 'D')],
    'Hourly': [np.timedelta64(1, 'h')],
    'Several times per day': [np.timedelta64(1, 'D')]
}

def calculate_freshness(df, date_col, fresh_col):
    '''
    Returns the asset inventory dataframe with an additional true/false column that
    indicates whether or not a dataset is fresh. Fresh is defined as having a 
    last_update_date_data timestamp within the stated update frequency
    
    Inputs:
        df: a dataframe, one that has filtered inapplicable update frequencies
            and all automated datasets
        date_col: last updated timestamp
        fresh_col: name of the binary fresh or stale column
    Returns:
        dataframe indicating if each dataset is stale or fresh
    '''
    
    df[fresh_col] = np.nan
    
    for frequency in df.update_frequency.unique():
                    
        
        temp = df.query('update_frequency == "{}"'.format(frequency))  
        today = datetime.datetime.today()
        idx = temp.index
        
#         if frequency == 'Weekdays':
#             np.busday_count(today, temp['last_update_date_data'])
            
        df.loc[idx, fresh_col] = (today - temp['last_update_date_data']) < FREQ_DICTIONARY[frequency][0]
    
    return df

In [39]:
# Percent of datasets fresh or stale
fresh_df = calculate_freshness(freshness_df, 'last_update_date_data', 'fresh')
fresh_df['agency'] = fresh_df['agency'].astype(str)

# get index value for all automated datasets
is_automated = fresh_df['automation'] == 'Yes'
auto_idx = np.where(is_automated)

# update fresh_df to be true for all automated datasets
# A single .loc assignment replaces the chained `fresh_df.fresh.iloc[...] = True`,
# which may write to a temporary object instead of fresh_df.
fresh_df.loc[is_automated, 'fresh'] = True

# Count of fresh vs stale datasets and each group's share of the total
pct_fresh = fresh_df.groupby('fresh').size().reset_index().rename(columns={0: 'count'})
pct_fresh['pct'] = pct_fresh['count'] / pct_fresh['count'].sum()
print(pct_fresh)
print('\n\n')

# Stale datasets by update_frequency
uf = (
    fresh_df.query('fresh == False')
    .groupby('update_frequency')
    .size()
    .reset_index()
    .rename(columns={0: 'Count'})
    .sort_values(by='Count')
)

# 324 are annual, which could mean they're mistagged and should be historical

colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']


labels = uf['update_frequency']
values = uf['Count']

# One pie slice per update frequency, sized by its number of stale datasets
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])

fig.update_traces(hoverinfo='label+value', textinfo='percent', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))

fig.update_layout(
     title=go.layout.Title(
        text="Stale Datasets According to their Update Frequency",
        xref="paper",
        x=.5)
     )

fig.show()

   fresh  count       pct
0  False    561  0.440345
1   True    713  0.559655





In [40]:
# Sentinel option meaning "no filter"
ALL = 'ALL'

def unique_sorted_values_plus_ALL(array):
    '''
    Returns the distinct values of `array` in ascending order as a list,
    with the ALL sentinel placed in front.

    Inputs:
        array: an object exposing .unique() (e.g. a pandas Series)
    Returns:
        list starting with ALL followed by the sorted unique values
    '''
    distinct_values = array.unique().tolist()
    return [ALL] + sorted(distinct_values)

In [41]:
# dropdown_agency = widgets.Dropdown(options = unique_sorted_values_plus_ALL(fresh_df.agency))

# output_agency = widgets.Output()

# def dropdown_agency_eventhandler(change):
#     output_agency.clear_output()
#     with output_agency:
#         if (change.new == ALL):
#             display(fresh_df_)
#         else:
#             display(fresh_df[fresh_df.agency == change.new])

# dropdown_agency.observe(dropdown_agency_eventhandler, names='value')

# display(dropdown_agency, embed=True)

# display(output_agency, embed=True)

In [42]:
## Testing out interact

@interact
def show_agency_and_staleness(column=list(fresh_df.agency.unique())):
    '''
    Prints the selected agency and returns the rows of fresh_df belonging to it.

    Inputs:
        column: the agency name to filter on; the list default supplies the
            available choices to `interact`
    Returns:
        the subset of fresh_df whose `agency` equals `column`
    '''
    print(column)
    # Boolean mask rather than a string-built query, so agency names that
    # contain quote characters cannot break the filter expression.
    return fresh_df[fresh_df['agency'] == column]



interactive(children=(Dropdown(description='column', options=('311', 'Board of Standards and Appeals (BSA)', '…

In [30]:
# .. ipywidgets-setup::
# The import and widget definitions are restored so this cell no longer relies
# on names left over in the kernel from an earlier, since-commented-out run.
from ipywidgets import VBox, jsdlink, IntSlider, Button

# .. ipywidgets-display::

s1, s2 = IntSlider(max=200, value=100), IntSlider(value=40)
b = Button(icon='legal')
# Link s1's value to s2's max
jsdlink((s1, 'value'), (s2, 'max'))
VBox([s1, s2, b])

VBox(children=(IntSlider(value=100, max=200), IntSlider(value=40), Button(icon='legal', style=ButtonStyle())))

In [25]:
# from ipywidgets import IntSlider
# from ipywidgets.embed import embed_minimal_html

# drop = widgets.Dropdown(
#         options=list(fresh_df.agency.unique()),
#         value='311',
#         description='Agency:'
#     )

# # slider = show_agency_and_staleness(column=list(fresh_df.agency.unique()))
# # embed_minimal_html('export.html', views=[slider], title='Widgets export')