# Holly's Data Science Capstone Projects

## Part 1: Install Packages

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## Part 2: Import Data 

### Dataset 1: Locations of community centres in Edinburgh

In [2]:
df=pd.read_csv("https://data.edinburghopendata.info/dataset/2a8a1335-06be-4766-96f8-07638610f1bf/resource/36ead7e6-8234-4317-a73a-5575c1da64f8/download/directoryexport61.csv")
df = df[['Toilet','Location']]
df.head()

Unnamed: 0,Toilet,Location
0,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895"
1,Bath Street Public Toilet,"55.952976,-3.114088"
2,Bingham Community Centre,"55.9403279768,-3.12461868252"
3,Blackhall Library,"55.9616673126,-3.2611578066"
4,Braid Hills Golf Course,"55.9163063539,-3.20571551039"


### Dataset 2: Locations of Play Areas

In [3]:
df2=pd.read_csv("https://data.edinburghopendata.info/dataset/a22a4332-b73f-4477-834c-76d22ed8f993/resource/0e3b0a25-5bae-45b0-beb4-e63178305741/download/directoryexport60.csv")
df2 = df2[['Site','Location map']]
df2.head()

Unnamed: 0,Site,Location map
0,Admirality Street,"55.97615152741767,-3.1789112091064453"
1,Allison Park,"55.95463010760622,-3.410053253173828"
2,Ardshiel Avenue (Torrence Park),"55.953523,-3.29278"
3,Bailie Place (Rear Nos. 1-5),"55.943291,-3.111828"
4,Balgreen Park,"55.94174996445649,-3.2571029663085938"


## Process first two datasets

### We only want to keep the play areas which have nearby toilets.  To identify these, I will do a many-to-many merge, drop the pairs with too large a distance between, and then keep only the unique pplay areas remaining.

In [4]:
# First, let's drop any na data
df=df.dropna()
df.reset_index(drop=True)
df2=df2.dropna()
df2.reset_index(drop=True)

# Next, let's check the size of each dataframe so we can verify the merge worked well
print('The first dataset has {} rows'.format(df.shape[0]))
print('The second dataset has {} rows'.format(df2.shape[0]))
print('The merged dataset should have {} rows'.format(df.shape[0]*df2.shape[0]))

The first dataset has 97 rows
The second dataset has 154 rows
The merged dataset should have 14938 rows


In [5]:
## We have to create a temporary variable with the value 1 for all rows, 
## as merge won't let you join without a join key
df['tmp'] = 1
df2['tmp'] = 1
dfx=pd.merge(df,df2, on=['tmp'])
dfx=dfx.drop('tmp', axis=1)
print('The merged dataset has {} rows'.format(dfx.shape[0]))
dfx.head()

The merged dataset has 14938 rows


Unnamed: 0,Toilet,Location,Site,Location map
0,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Admirality Street,"55.97615152741767,-3.1789112091064453"
1,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Allison Park,"55.95463010760622,-3.410053253173828"
2,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Ardshiel Avenue (Torrence Park),"55.953523,-3.29278"
3,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Bailie Place (Rear Nos. 1-5),"55.943291,-3.111828"
4,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Balgreen Park,"55.94174996445649,-3.2571029663085938"


### Great! Next we need to work out the distance between the toilets and the parks

In [6]:
import geopy.distance
dfx['distance'] = round(dfx.apply(lambda row: geopy.distance.distance(row['Location'], 
                                                                         row['Location map']).km, 
                                     axis=1),1)
dfx.head()

Unnamed: 0,Toilet,Location,Site,Location map,distance
0,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Admirality Street,"55.97615152741767,-3.1789112091064453",3.2
1,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Allison Park,"55.95463010760622,-3.410053253173828",11.4
2,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Ardshiel Avenue (Torrence Park),"55.953523,-3.29278",4.4
3,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Bailie Place (Rear Nos. 1-5),"55.943291,-3.111828",8.1
4,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Balgreen Park,"55.94174996445649,-3.2571029663085938",3.8


In [7]:
# Let's drop those with distance more than 2km
dfx=dfx[dfx['distance']<=2]
dfx.head()

Unnamed: 0,Toilet,Location,Site,Location map,distance
9,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Boswall/Royston Mains Gardens,"55.976224,-3.231857",0.5
41,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",East Pilton Park,"55.973702148062415,-3.2271480560302734",0.3
42,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Easter Drylaw Park,"55.96270204269681,-3.243541717529297",1.3
56,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Granton Crescent,"55.979609210714784,-3.2244014739990234",0.9
57,Ainslie Park Leisure Centre,"55.972014831,-3.2304253895",Granton Mains East,"55.976127514647395,-3.2420825958251953",0.9


In [8]:
# Keep only unique sites
dfx=dfx[['Site','Location map','distance']]
dfx.sort_values(by=['Site','distance'], ascending=True, axis=0, inplace=True)
dfx.head()

Unnamed: 0,Site,Location map,distance
12936,Admirality Street,"55.97615152741767,-3.1789112091064453",0.3
7238,Admirality Street,"55.97615152741767,-3.1789112091064453",0.6
13090,Admirality Street,"55.97615152741767,-3.1789112091064453",1.9
6777,Allison Park,"55.95463010760622,-3.410053253173828",0.5
6931,Allison Park,"55.95463010760622,-3.410053253173828",0.5


In [9]:
dfx_slice = dfx.groupby('Site', axis=0).head(1)

# rename 'distance' to 'nearest_toilet_km'
dfx_slice.rename(columns={'distance':'nearest_toilet_km',
                          'Site':'Play Area',
                         'Location map':'coords'},inplace=True)

dfx_slice.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Play Area,coords,nearest_toilet_km
12936,Admirality Street,"55.97615152741767,-3.1789112091064453",0.3
6777,Allison Park,"55.95463010760622,-3.410053253173828",0.5
3390,Ardshiel Avenue (Torrence Park),"55.953523,-3.29278",0.3
7395,Bailie Place (Rear Nos. 1-5),"55.943291,-3.111828",0.5
1236,Balgreen Park,"55.94174996445649,-3.2571029663085938",0.4


### Dataset 3: Locations of XXX

In [10]:
CLIENT_ID = 'PBIUIPDGY42UF5KTZOLDQI1DDLGCSYDTTC54CN5QVSI4KV0L' # your Foursquare ID
CLIENT_SECRET = 'KZ4S0TCURB2O2MP4AOQTC3CBXKTE54KG1VCVKPHHS5EH4IYB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius=5
LIMIT=100

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [15]:
dfx_slice[['Latitude','Longitude']]=dfx_slice['coords'].str.split(',', expand=True)
dfx_slice.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,Play Area,coords,nearest_toilet_km,Latitude,Longitude
12936,Admirality Street,"55.97615152741767,-3.1789112091064453",0.3,55.97615152741767,-3.1789112091064453
6777,Allison Park,"55.95463010760622,-3.410053253173828",0.5,55.95463010760622,-3.410053253173828
3390,Ardshiel Avenue (Torrence Park),"55.953523,-3.29278",0.3,55.953523,-3.29278
7395,Bailie Place (Rear Nos. 1-5),"55.943291,-3.111828",0.5,55.943291,-3.111828
1236,Balgreen Park,"55.94174996445649,-3.2571029663085938",0.4,55.94174996445649,-3.2571029663085938


In [16]:
playarea_venues = getNearbyVenues(names=dfx_slice['Play Area'],
                                   latitudes=dfx_slice['Latitude'],
                                   longitudes=dfx_slice['Longitude']
                                  )
print("complete!")

Admirality Street
Allison Park 
Ardshiel Avenue (Torrence Park)
Bailie Place (Rear Nos. 1-5)
Balgreen Park
Barony Playarea 
Bingham Park
Blackford Pond
Bloomiehall Park
Boswall/Royston Mains Gardens
Broomhouse Centre
Broomhouse Grove
Broughton Road
Brown Street, Pleasance
Buckstone Circle
Burdiehouse Burn Valley Park 
Calder Park play area
Campbell Park
Carlowrie Crescent
Clermiston Park
Clovenstone Gardens
Clovenstone Park (Block 40)
Colinton Mains Park
Craigentinny Ball Court
Craigevar Square
Craigmillar Castle Park
Craigpark Crescent play area
Cramond Walled Garden
Dalmeny Street park
Davidson's Mains Park
Dolphins Gardens West
Drum Park
Dumbeg Park, off Harvesters Way
Dumbiedykes
Dumbryden Gardens (No. 1-17)
Dumbryden Gardens (No. 46)
Dumbryden Gardens (Nos. 19-36)
Dumbryden Gardens (Nos. 66 front)
Dumbryden Grove
Dundas Avenue
East Pilton Park
Easter Drylaw Park
Echline Avenue
Fairmilehead Park
Falcon Road/Park
Fauldburn Park
Ferniehill Community Park
Figgate Park
Forth Terrace
Ga

KeyboardInterrupt: 