In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import cm
import requests
from random import shuffle
from pandas.io.json import json_normalize

In [None]:
# This data comes from the City of New York's open-data site and covers permitted events.
# For more info: https://data.cityofnewyork.us/City-Government/NYC-Permitted-Event-Information/tvpp-9vvx

In [None]:
# Decided to go with the api rather than csv since it's updated daily

In [None]:
# NYC Open Data JSON endpoint for the permitted-events dataset.
json_path = 'https://data.cityofnewyork.us/resource/8end-qv57.json'
# Read the endpoint's JSON payload into a DataFrame, then preview the first five rows.
df = pd.read_json(path_or_buf=json_path)
print(df.head(5))

In [None]:
# How many permits are there for upcoming events by borough:

In [None]:
# Number of permit rows per borough.
events_per_borough = df.groupby('event_borough').size()
events_per_borough

In [None]:
# Number of permit rows per (borough, event type) pair.
borough_and_type = ['event_borough', 'event_type']
events_per_borough_and_type = df.groupby(borough_and_type).size()
events_per_borough_and_type

In [None]:
# Graphing number of upcoming events by type and borough

In [None]:
# Count events per (borough, type) and pivot the event types into columns so
# that each borough becomes one stacked bar.
events_by_borough_type = df.groupby(['event_borough', 'event_type']).size().unstack()
bar = events_by_borough_type.plot(kind='bar',
                                  stacked=True,
                                  fontsize='x-large',
                                  colormap='gist_ncar')
# Pin the legend's upper-left corner to the axes' upper-right corner.
bar.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=20)
# Shrink the axes to 75% of their size to leave room for the legend.
box = bar.get_position()
bar.set_position([box.x0, box.y0, box.width * .75, box.height * .75])
bar.set_xlabel('NYC Borough')
bar.set_title('Types of Events by NYC Borough', fontsize=50, fontweight='bold')
# plt.show() does not take an Axes; its only parameter is the `block` flag.
plt.show()

In [None]:
# Confirm the start/end columns hold datetime values by printing the type of
# one sample entry (label 3) from each.
for column in ('start_date_time', 'end_date_time'):
    print(type(df[column][3]))

In [None]:
# On which day of the week are there the most upcoming events?

In [None]:
# Weekday name (e.g. 'Thursday') of each event's start time.
# `.dt.weekday_name` no longer exists in pandas; `.dt.day_name()` replaces it.
df['start_weekday'] = df['start_date_time'].dt.day_name()

In [None]:
# Preview the first five start weekdays.
print(df['start_weekday'].head(5))

In [None]:
# Weekday name (e.g. 'Thursday') of each event's end time.
# `.dt.weekday_name` no longer exists in pandas; `.dt.day_name()` replaces it.
df['end_weekday'] = df['end_date_time'].dt.day_name()

In [None]:
# Preview the first five end weekdays.
print(df['end_weekday'].head(5))

In [None]:
# Count event ids per start weekday, then list the weekdays from busiest to quietest.
wk0 = df.groupby('start_weekday')[['event_id']].count()
wk0.sort_values(by='event_id', ascending=False)

In [None]:
# Most upcoming events are on Thursday and the fewest are on Monday
# Now we can take a look at day of the week events by borough and type

In [None]:
# Count event ids for every (start weekday, borough, event type) combination.
wk1_keys = ['start_weekday', 'event_borough', 'event_type']
wk1 = df.groupby(wk1_keys)[['event_id']].count()

In [None]:
# Display the per-weekday / borough / type counts.
wk1

In [None]:
# Display wk1 again. NOTE(review): same expression as the previous cell; looks redundant.
wk1

In [None]:
# Order the weekday / borough / type counts from largest to smallest.
sort1 = wk1.sort_values(by=['event_id'], ascending=[False])

In [None]:
# Display the counts sorted in descending order.
sort1

In [None]:
# The most upcoming permitted events of one type in one borough are 49 special events in Manhattan on Thursdays and Sundays
# Followed by 47 permitted construction events on Thursdays in Manhattan

In [None]:
# Now I'm wondering what is a Special Event though...

In [None]:
# Keep only the rows whose event type is exactly 'Special Event'.
is_special_event = df['event_type'].eq('Special Event')
sp = df.loc[is_special_event]

In [None]:
# Display the Special Event rows.
sp

In [None]:
# Looking at the event names in the filtered df snapshot, some of these "Special Events" are parades, some farmer's markets
# How many permits have been obtained with an event type mislabeled as "Special Event"?

In [None]:
# Number of permit rows per event type.
events_per_type = df.groupby('event_type').size()
events_per_type

In [None]:
# Number of permit rows per (agency, event type) pair.
agency_and_type = ['event_agency', 'event_type']
events_per_agency_and_type = df.groupby(agency_and_type).size()
events_per_agency_and_type

In [None]:
# Looks like the Parks Department is the only one to label things as "Special Events" 
# We can take a closer look at the Special Events dataframe now: sp

In [None]:
# Keywords in an event name that suggest a more specific event type.
not_special_pattern = 'theater|construction|athletic|parade|block|farmer|plaza|production|sale|street'
# Build the mask from sp itself so it lines up with the rows being filtered.
# case=False replaces the re.IGNORECASE flag (the notebook never imports `re`),
# and na=False treats rows with a missing name as non-matches.
name_suggests_other_type = sp['event_name'].str.contains(not_special_pattern, case=False, na=False)
sp_not_special = sp.loc[name_suggests_other_type]

In [None]:
# Number of Special Event rows whose name matched one of the keywords.
mislabeled_count = sp_not_special['event_id'].count()
mislabeled_count

In [None]:
# 84 mislabeled Special Events: let's fix that

In [None]:
# Display df to inspect its current contents.
df

In [None]:
def new_event_type(row):
    """Return a more specific event type for a row labelled as a Special Event.

    Rows whose ``event_type`` contains "Special" are re-labelled by the first
    keyword (case-sensitive substring) found in ``event_name``. The rules are
    checked in the order listed below. Every other row keeps its type.

    Parameters
    ----------
    row : object with ``event_type`` and ``event_name`` attributes
        Typically one DataFrame row passed in by ``df.apply(..., axis=1)``.

    Returns
    -------
    The revised type label, or ``row.event_type`` unchanged when no rule
    applies or when either field is not a string.
    """
    event_type = row.event_type
    event_name = row.event_name

    # A missing value (e.g. NaN) is not a string, and a substring test on it
    # would raise TypeError, so such rows are left untouched.
    if not isinstance(event_type, str) or not isinstance(event_name, str):
        return event_type
    if "Special" not in event_type:
        return event_type

    # (keyword in event_name, revised type); order matters, first match wins.
    keyword_rules = (
        ("Farmer", "Farmers Market"),
        ("Green", "Farmers Market"),
        ("arade", "Parade"),
        ("Run", "Athletic Race / Tour"),
        ("Athletic", "Athletic Race / Tour"),
        ("Block", "Block Party"),
        ("Film", "Production Event"),
        ("Sale", "Sidewalk Sale"),
        ("Festival", "Street Festival"),
        # extra categories that make "Special Event" more specific
        ("Holiday", "Holiday Celebration"),
        ("class", "Educational Event"),
        ("Party", "Party"),
        ("Protest", "Protest"),
        ("Winter", "Holiday Celebration"),
    )
    for keyword, revised_type in keyword_rules:
        if keyword in event_name:
            return revised_type
    return event_type
# Compute the revised type for every row and store it as a new column.
revised_types = df.apply(new_event_type, axis='columns')
df.loc[:, 'new_event_type'] = revised_types

In [None]:
# Display df to inspect its current contents.
df

In [None]:
# While we are cleaning, let's remove those commas in the police_precinct column

In [None]:
# Strip every comma from the police_precinct strings.
df['police_precinct'] = df['police_precinct'].str.replace(',', '')

In [None]:
# Display df to inspect its current contents.
df

In [None]:
# Now that we have a new_event_type column that gives some more specification to the Special Events,
# we can make a revised events-by-type graph

In [None]:
# NOTE: make pivot & then to percentage and then graph
# NOTE2: after that see if can plot on map using addresses, if not maybe latitude and long of precincts?

In [None]:
# Display df to inspect its current contents.
df

In [None]:
# Count event ids for every (revised event type, borough) combination.
updated_type_keys = ['new_event_type', 'event_borough']
updated_type = df.groupby(updated_type_keys)[['event_id']].count()

In [None]:
# Display the counts per revised event type and borough.
updated_type

In [None]:
# Share of all events (a fraction between 0 and 1) that falls in each
# (revised event type, borough) group.
# The group keys must be a list: current pandas treats a tuple as one single
# key and fails to find a column with that name.
events_per_group = df.groupby(['new_event_type', 'event_borough']).size()
df2 = pd.DataFrame({'Percentage_total_events': events_per_group / len(df)})

In [None]:
# Display each group's share of all events.
df2

In [None]:
# Show the event_id count column on its own.
updated_type['event_id']

In [None]:
def _largest_17(column):
    """Return the 17 largest values of a column, in descending order."""
    return column.sort_values(ascending=False).head(17)


# Apply the top-17 selection to each column of df2.
res = df2.apply(_largest_17)

In [None]:
# Plain-text colour list from xkcd; the later cells pull '#rrggbb' codes out of it.
color_url = 'https://xkcd.com/color/rgb.txt'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(color_url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page further down.
r.raise_for_status()


In [None]:
# Split the repr of the whole response body on backslashes, so that every
# escaped tab (\t) or newline (\n) starts a new fragment such as "t#acc2d9".
# The body is taken in one piece on purpose: iterating the Response directly
# yields fixed-size byte chunks, which can cut a colour code in half at a
# chunk edge.
ll = str(r.content).split('\\')

In [None]:
# A fragment containing 't#' is an escaped tab followed by a colour code;
# the text after the first 't' is the code itself.
color_list1 = [str(fragment).split('t')[1] for fragment in ll if 't#' in fragment]
color_list1


In [None]:
# shuffle() reorders a list in place and returns None, so assigning its result
# left color_list2 empty. Shuffle a copy instead, which also keeps color_list1
# in its original order.
# Only well-formed '#rrggbb' codes are kept, so a malformed fragment can never
# reach the plotting call.
_hex_digits = set('0123456789abcdefABCDEF')
color_list2 = [c for c in color_list1
               if len(c) == 7 and c[0] == '#' and set(c[1:]) <= _hex_digits]
shuffle(color_list2)

In [None]:
# Ten largest (revised event type, borough) shares of all events.
df3 = df2.apply(lambda x: x.sort_values(ascending=False).head(10))
# plt.pie needs 1-D data, so use the single share column rather than the frame.
shares = df3['Percentage_total_events']
labels = df3.index
sizes = shares.values * 100  # fractions -> percentages for the legend text

# Set the size before drawing: the rcParams default only affects figures
# created afterwards, so the figure is also created explicitly at that size.
fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 25
fig_size[1] = 15
plt.figure(figsize=(25, 15))

pie1 = plt.pie(shares, shadow=True, startangle=10, colors=color_list2)
plt.axis('equal')
plt.title('Distribution of top 10 events by revised type and NYC borough', fontsize=40, fontweight='bold')
# `prop` already sets the legend font size, so the conflicting fontsize=7 is dropped.
plt.legend(loc='lower right',
           bbox_to_anchor=(1.12, -0.1),
           labels=['%s, %1.1f %%' % (l, s) for l, s in zip(labels, sizes)],
           prop={'size': 30})
plt.show()

In [None]:
# Display df to inspect its current contents.
df

In [None]:
# Let's see if we can map the locations of these events using the police_precinct locations
# But first we need to get the locations of the police_precincts
# Save the GeoJSON returned by http://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/nypp/FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=geojson
# into the file nyc_police.geojson

In [None]:
from geojson import Point, Polygon
import geopandas as gpd

In [None]:
# Sample geometries; neither is used again in the visible cells.
# NOTE(review): Point and Polygon are imported from the geojson package, but
# these calls look like shapely-style signatures (Point(x, y)). Confirm they
# build the intended geometries.
p1 = Point(0,0)
polygon = Polygon([(0,0),(1,1),(1,0)])
# NOTE(review): absolute, machine-specific path; the cell only runs where this file exists.
geo = '/Users/isobel/Desktop/nyc_police.geojson'
# Read the precinct file with geopandas and preview its first five rows.
police_file = gpd.read_file(geo)
print(police_file.head(5))

In [None]:
# Serialise the precinct data via to_json(); presumably this yields a GeoJSON string (TODO confirm).
file2 = police_file.to_json()