# Holly's Data Science Capstone Projects

## Part 1: Install Packages

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## Part 2: Import Data 

### Dataset 1: Locations of community centres in Edinburgh

In [4]:
df=pd.read_csv("https://data.edinburghopendata.info/dataset/2a8a1335-06be-4766-96f8-07638610f1bf/resource/36ead7e6-8234-4317-a73a-5575c1da64f8/download/directoryexport61.csv")
df = df[['Toilet','Location']]
df.rename(columns={'Location':'Toilet_Location'},inplace=True)
df.head()

Unnamed: 0,Toilet,Toilet_Location
0,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895"
1,Bath Street Public Toilet,"55.952976,-3.114088"
2,Bingham Community Centre,"55.9403279768,-3.12461868252"
3,Blackhall Library,"55.9616673126,-3.2611578066"
4,Braid Hills Golf Course,"55.9163063539,-3.20571551039"


### Dataset 2: Locations of Play Areas

In [5]:
df2=pd.read_csv("https://data.edinburghopendata.info/dataset/a22a4332-b73f-4477-834c-76d22ed8f993/resource/0e3b0a25-5bae-45b0-beb4-e63178305741/download/directoryexport60.csv")
df2 = df2[['Site','Location map']]
df2.rename(columns={'Location map':'PlayArea_Location'},inplace=True)
df2.head()

Unnamed: 0,Site,PlayArea_Location
0,Admirality Street,"55.97615152741767,-3.1789112091064453"
1,Allison Park,"55.95463010760622,-3.410053253173828"
2,Ardshiel Avenue (Torrence Park),"55.953523,-3.29278"
3,Bailie Place (Rear Nos. 1-5),"55.943291,-3.111828"
4,Balgreen Park,"55.94174996445649,-3.2571029663085938"


## Process first two datasets

### We only want to keep the play areas which have nearby toilets.  To identify these, I will do a many-to-many merge, drop the pairs with too large a distance between, and then keep only the unique pplay areas remaining.

In [6]:
# First, let's drop any na data
df=df.dropna()
df.reset_index(drop=True)
df2=df2.dropna()
df2.reset_index(drop=True)

# Next, let's check the size of each dataframe so we can verify the merge worked well
print('The first dataset has {} rows'.format(df.shape[0]))
print('The second dataset has {} rows'.format(df2.shape[0]))
print('The merged dataset should have {} rows'.format(df.shape[0]*df2.shape[0]))

The first dataset has 97 rows
The second dataset has 154 rows
The merged dataset should have 14938 rows


In [7]:
## We have to create a temporary variable with the value 1 for all rows, 
## as merge won't let you join without a join key
df['tmp'] = 1
df2['tmp'] = 1
df=pd.merge(df,df2, on=['tmp'])
df=df.drop('tmp', axis=1)
print('The merged dataset has {} rows'.format(df.shape[0]))
df.head()

The merged dataset has 14938 rows


Unnamed: 0,Toilet,Toilet_Location,Site,PlayArea_Location
0,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Admirality Street,"55.97615152741767,-3.1789112091064453"
1,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Allison Park,"55.95463010760622,-3.410053253173828"
2,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Ardshiel Avenue (Torrence Park),"55.953523,-3.29278"
3,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Bailie Place (Rear Nos. 1-5),"55.943291,-3.111828"
4,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Balgreen Park,"55.94174996445649,-3.2571029663085938"


### Great! Next we need to work out the distance between the toilets and the parks

In [8]:
import geopy.distance
df['nearest_toilet_km'] = round(df.apply(lambda row: geopy.distance.distance(row['Toilet_Location'], 
                                                                         row['PlayArea_Location']).km, 
                                     axis=1),1)
# Let's drop those with distance more than 1km
df=df[df['nearest_toilet_km']<=1]

df.head()

Unnamed: 0,Toilet,Toilet_Location,Site,PlayArea_Location,nearest_toilet_km
9,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Boswall/Royston Mains Gardens,"55.976224,-3.231857",0.5
41,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",East Pilton Park,"55.973702148062415,-3.2271480560302734",0.3
56,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Granton Crescent,"55.979609210714784,-3.2244014739990234",0.9
57,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Granton Mains East,"55.976127514647395,-3.2420825958251953",0.9
146,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",West Pilton Park,"55.9707722945514,-3.246030807495117",1.0


In [9]:
# Make a df of the playareas, keeping only unique sites
df_playareas=df[['Site','PlayArea_Location','nearest_toilet_km']]
df_playareas.rename(columns={'Site':'Location'},inplace=True)
df_playareas.sort_values(by=['Location','nearest_toilet_km'], ascending=True, axis=0, inplace=True)
df_playareas['Location_Type']='Play Area'
df_playareas = df_playareas.groupby('Location', axis=0).first()
df_playareas.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0_level_0,PlayArea_Location,nearest_toilet_km,Location_Type
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Admirality Street,"55.97615152741767,-3.1789112091064453",0.3,Play Area
Allison Park,"55.95463010760622,-3.410053253173828",0.5,Play Area
Ardshiel Avenue (Torrence Park),"55.953523,-3.29278",0.3,Play Area
Bailie Place (Rear Nos. 1-5),"55.943291,-3.111828",0.5,Play Area
Balgreen Park,"55.94174996445649,-3.2571029663085938",0.4,Play Area


In [10]:
# Split the coordinates into latitude and longitude, as the next function will need
df_playareas[['Latitude','Longitude']]=df_playareas['PlayArea_Location'].str.split(',', expand=True)
df_playareas=df_playareas.drop('PlayArea_Location', axis=1)
df_playareas.head()

Unnamed: 0_level_0,nearest_toilet_km,Location_Type,Latitude,Longitude
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Admirality Street,0.3,Play Area,55.97615152741767,-3.1789112091064453
Allison Park,0.5,Play Area,55.95463010760622,-3.410053253173828
Ardshiel Avenue (Torrence Park),0.3,Play Area,55.953523,-3.29278
Bailie Place (Rear Nos. 1-5),0.5,Play Area,55.943291,-3.111828
Balgreen Park,0.4,Play Area,55.94174996445649,-3.2571029663085938


### Now I want to tidy up the toilets list, so that I can append it later

In [11]:
df_toilets=df[['Toilet','Toilet_Location']]
df_toilets[['Latitude','Longitude']]=df_toilets['Toilet_Location'].str.split(',', expand=True)
df_toilets=df_toilets.drop('Toilet_Location', axis=1)
df_toilets.rename(columns={'Toilet':'Location'},inplace=True)
df_toilets['Location_Type']='Toilet'
df_toilets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Location,Latitude,Longitude,Location_Type
9,Ainslie Park Leisure Centre,55.972014831,-3.2304253895,Toilet
41,Ainslie Park Leisure Centre,55.972014831,-3.2304253895,Toilet
56,Ainslie Park Leisure Centre,55.972014831,-3.2304253895,Toilet
57,Ainslie Park Leisure Centre,55.972014831,-3.2304253895,Toilet
146,Ainslie Park Leisure Centre,55.972014831,-3.2304253895,Toilet


### Dataset 3: Locations of Nearby Venues

In [20]:
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id=PBIUIPDGY42UF5KTZOLDQI1DDLGCSYDTTC54CN5QVSI4KV0L&client_secret=KZ4S0TCURB2O2MP4AOQTC3CBXKTE54KG1VCVKPHHS5EH4IYB&v=20180605&ll={},{}&radius=1000&limit=50'.format(
            lat, lng)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        nearby_venues.columns = ['Location', 
                  'Latitude', 
                  'Longitude', 
                  'Category']
    return(nearby_venues)

In [24]:
lat=df_playareas['Latitude'][0]
lng=df_playareas['Longitude'][0]
url = 'https://api.foursquare.com/v2/venues/explore?&client_id=PBIUIPDGY42UF5KTZOLDQI1DDLGCSYDTTC54CN5QVSI4KV0L&client_secret=KZ4S0TCURB2O2MP4AOQTC3CBXKTE54KG1VCVKPHHS5EH4IYB&v=20180605&ll={},{}&radius=1000&limit=50'.format(
            lat, lng)
requests.get(url).json()

{'meta': {'code': 429,
  'errorType': 'quota_exceeded',
  'errorDetail': 'Quota exceeded',
  'requestId': '5ef51a5c2f5bc4555b811d70'},
 'response': {}}

In [21]:
venues = getNearbyVenues(names=df_playareas.index,
                                   latitudes=df_playareas['Latitude'],
                                   longitudes=df_playareas['Longitude'])
venues.head()

KeyError: 'groups'

In [None]:
# We are only interested in coffee shops and cafes
playarea_venues=playarea_venues[playarea_venues['Category'].isin(["Café","Coffee Shop"])]
playarea_venues=playarea_venues.drop('Category', axis=1)
playarea_venues['Location_Type']=;Coffee Shop'
playarea_venues.head(10)

In [None]:
# Link them together
df_merged=pd.append(dfx_slice,playarea_venues)
df_merged=df_merged[['Play_Area','nearest_toilet_km','Venue','coffee_km']]
df_merged.head()