# Holly's Data Science Capstone Projects

## Part 1: Install Packages

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

## Part 2: Import Data 

### Dataset 1: Locations of community centres in Edinburgh

In [None]:
# Download the first open-data CSV and keep only the two columns used later.
# NOTE(review): the heading calls this dataset "community centres", but the
# columns kept here are 'Toilet' and 'Location' -- confirm the heading.
df = pd.read_csv(
    "https://data.edinburghopendata.info/dataset/2a8a1335-06be-4766-96f8-07638610f1bf/resource/36ead7e6-8234-4317-a73a-5575c1da64f8/download/directoryexport61.csv"
)[['Toilet', 'Location']]
df.head()

### Dataset 2: Locations of Play Areas

In [None]:
# Download the play-area CSV and keep only the site name and its coordinates.
df2 = pd.read_csv(
    "https://data.edinburghopendata.info/dataset/a22a4332-b73f-4477-834c-76d22ed8f993/resource/0e3b0a25-5bae-45b0-beb4-e63178305741/download/directoryexport60.csv"
)[['Site', 'Location map']]
df2.head()

## Process first two datasets

### We only want to keep the play areas that have toilets nearby. To identify these, I will do a many-to-many merge (pairing every toilet with every play area), drop the pairs that are too far apart, and then keep only the unique play areas that remain.

In [None]:
# First, let's drop any na data.
# reset_index returns a new frame rather than modifying in place, so the
# result has to be assigned back for the index to actually be renumbered.
df = df.dropna().reset_index(drop=True)
df2 = df2.dropna().reset_index(drop=True)

# Next, let's check the size of each dataframe so we can verify the merge worked well
print('The first dataset has {} rows'.format(df.shape[0]))
print('The second dataset has {} rows'.format(df2.shape[0]))
print('The merged dataset should have {} rows'.format(df.shape[0]*df2.shape[0]))

In [None]:
## Cross join: pair every row of df with every row of df2.
## merge won't let you join without a join key, so a constant helper column
## 'tmp' is used. It is added with assign(), which works on temporary copies,
## so df and df2 themselves are left untouched (no leftover 'tmp' column and
## no SettingWithCopyWarning from writing onto a derived frame).
dfx = pd.merge(df.assign(tmp=1), df2.assign(tmp=1), on=['tmp'])
dfx = dfx.drop('tmp', axis=1)
print('The merged dataset has {} rows'.format(dfx.shape[0]))
dfx.head()

### Great! Next we need to work out the distance between the toilets and the parks

In [None]:
import geopy.distance

# For every toilet / play-area pair, compute the distance in kilometres
# between the two coordinate strings and store it rounded to one decimal.
dfx['distance'] = dfx.apply(
    lambda pair: geopy.distance.distance(pair['Location'], pair['Location map']).km,
    axis=1,
).round(1)
dfx.head()

In [None]:
# Keep only the pairs that are at most 2 km apart
dfx = dfx.loc[dfx['distance'].le(2)]
dfx.head()

In [None]:
# Keep just the play-area columns and order rows by site, then by distance,
# so that each site's closest toilet comes first (de-duplication happens later).
dfx = dfx[['Site', 'Location map', 'distance']].sort_values(by=['Site', 'distance'])
dfx.head()

In [None]:
# rename 'distance' to 'nearest_toilet_km' and 'Site' to 'PlayArea'
dfx = dfx.rename(columns={'distance': 'nearest_toilet_km',
                          'Site': 'PlayArea'})

# keep the first record (distance to closest toilet) per play area;
# the result is indexed by 'PlayArea'
dfx_slice = dfx.groupby('PlayArea').first()

# Split the "lat,lng" string into latitude and longitude, as the next function
# will need. n=1 guarantees at most two parts, and the parts are converted to
# floats so that stray whitespace around the comma does not leak into the API
# URL or the later distance calculations.
dfx_slice[['Latitude', 'Longitude']] = (
    dfx_slice['Location map'].str.split(',', n=1, expand=True).astype(float)
)
dfx_slice.head()

### Dataset 3: Locations of Nearby Venues

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=50, timeout=30):
    """Collect Foursquare venues found around each named location.

    For every (name, latitude, longitude) triple the Foursquare
    ``venues/explore`` endpoint is queried, and one output row is produced
    per venue returned.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Consumed in lockstep with ``zip``; iteration stops at the shortest.
    radius : int, optional
        Search radius sent to the API (metres per the Foursquare
        documentation). Defaults to 500.
    limit : int, optional
        Maximum number of venues requested per location. Defaults to 50.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``Play_Area``, ``Latitude``, ``Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude``, ``Venue Category``. The
        frame is empty (but still has these columns) when no locations are
        given or no venues are found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Play_Area',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    # NOTE(review): the API credentials are hard-coded in this URL. They should
    # be rotated and loaded from configuration / environment variables instead
    # of living in source control.
    url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id=PBIUIPDGY42UF5KTZOLDQI1DDLGCSYDTTC54CN5QVSI4KV0L&client_secret=KZ4S0TCURB2O2MP4AOQTC3CBXKTE54KG1VCVKPHHS5EH4IYB&v=20180605&ll={},{}&radius={}&limit={}'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        response = requests.get(url_template.format(lat, lng, radius, limit),
                                timeout=timeout)
        # Fail loudly on quota / auth errors instead of a confusing KeyError below.
        response.raise_for_status()

        groups = response.json()["response"]['groups']
        items = groups[0]['items'] if groups else []
        for item in items:
            venue = item['venue']
            # Some venues carry no category; record None rather than crash.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Build the frame once, after the loop; passing columns= keeps the schema
    # stable even when there are no rows at all.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Look up venues around every remaining play area; dfx_slice is indexed by
# play-area name, so its index supplies the names.
playarea_venues = getNearbyVenues(
    dfx_slice.index,
    dfx_slice['Latitude'],
    dfx_slice['Longitude'],
)
playarea_venues.head()

In [None]:
# Restrict the venues to cafés and coffee shops
playarea_venues = playarea_venues.loc[
    playarea_venues['Venue Category'].isin(["Café", "Coffee Shop"])
]

In [None]:
# Preview the first five café / coffee-shop rows
playarea_venues.head(n=5)

In [None]:
# And we only want those within a kilometre of the park.
# The distance (km, one decimal) is computed per play-area / venue row.
# assign() returns a new frame, so the column is not written onto the
# boolean-filtered frame from the previous step (avoids SettingWithCopyWarning).
# NOTE(review): getNearbyVenues already queries with radius=500, so this 1 km
# cut-off may not remove anything -- confirm the intended threshold.
playarea_venues = playarea_venues.assign(
    nearest_coffee_km=playarea_venues.apply(
        lambda row: geopy.distance.distance(
            (row['Latitude'], row['Longitude']),
            (row['Venue Latitude'], row['Venue Longitude']),
        ).km,
        axis=1,
    ).round(1)
)
playarea_venues = playarea_venues[playarea_venues['nearest_coffee_km'] <= 1]
playarea_venues = playarea_venues[['Play_Area', 'Venue', 'nearest_coffee_km']]
playarea_venues.head(10)