# Capstone Project: 
# Starting a new Grocery Store in Dallas, Texas


## 1. Import libraries

In [None]:
! pip install geocoder

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder # to get coordinates

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

## 2. Scrape neighborhood data from Wikipedia into a DataFrame

In [None]:
# send the GET request; a timeout keeps the cell from hanging on a stalled
# connection and raise_for_status() stops us from parsing an HTTP error page
response = requests.get(
    "https://en.wikipedia.org/wiki/Category:Neighborhoods_in_Dallas",
    timeout=30,
)
response.raise_for_status()
data = response.text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# locate the category listing; only the first "mw-category" div is used
category_divs = soup.find_all("div", class_="mw-category")
if not category_divs:
    raise ValueError('No "mw-category" section found in the Wikipedia page')

# collect the text of every list item in that div (find_all replaces the
# deprecated findAll alias)
neighborhoodList = [row.text for row in category_divs[0].find_all("li")]

# create a new DataFrame from the list
dfw_df = pd.DataFrame({"Neighborhood": neighborhoodList})

dfw_df.head()

In [None]:
# show the (rows, columns) shape of the neighborhood DataFrame
dfw_df.shape

## 3. Get the latitude and longitude of the Neighborhoods

In [None]:
# define a function to get coordinates
def get_latlng(neighborhood, max_attempts=10):
    """Return the geocoded coordinates of a Dallas neighborhood.

    The ArcGIS geocoder is queried with "<neighborhood>, Dallas, Texas" and
    the lookup is repeated while it yields no coordinates. The number of
    lookups is capped so that a name the service cannot resolve no longer
    makes the loop spin forever.

    Args:
        neighborhood: Neighborhood name to geocode.
        max_attempts: Maximum number of geocoder calls before giving up.

    Returns:
        The ``latlng`` value reported by the geocoder for the first lookup
        that produced coordinates.

    Raises:
        RuntimeError: If no coordinates were obtained within
            ``max_attempts`` calls.
    """
    query = '{}, Dallas, Texas'.format(neighborhood)
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
    raise RuntimeError(
        'Could not geocode {!r} after {} attempts'.format(query, max_attempts)
    )

In [None]:
# geocode every neighborhood name in order and keep the results in a list
neighborhood_names = dfw_df["Neighborhood"].tolist()
coords = list(map(get_latlng, neighborhood_names))

In [None]:
# inspect the raw coordinate pairs returned by the geocoder
coords

In [None]:
# temporary two-column frame so the coordinate pairs can be copied by column name
coordinate_columns = ['Latitude', 'Longitude']
df_coords = pd.DataFrame(data=coords, columns=coordinate_columns)

In [None]:
# copy each coordinate column from the temporary frame into the neighborhood frame
for coordinate_column in ('Latitude', 'Longitude'):
    dfw_df[coordinate_column] = df_coords[coordinate_column]

In [None]:
# check the neighborhoods and the coordinates:
# print the (rows, columns) shape, then display the whole DataFrame
print(dfw_df.shape)
dfw_df

In [None]:
# save the DataFrame as CSV file in the working directory;
# index=False leaves the row index out of the file
dfw_df.to_csv("dfw_df.csv", index=False)

## 4. Create a map of Dallas with neighborhoods

In [None]:
# get the coordinates of Dallas
address = 'Dallas, Texas'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() yields None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the attribute reads below
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Dallas, Texas {}, {}.'.format(latitude, longitude))

In [84]:
# build the base map centred on the latitude/longitude computed earlier
map_center = [latitude, longitude]
map_dfw = folium.Map(location=map_center, zoom_start=10.4)

# draw one blue circle per neighborhood, with the neighborhood name as popup
marker_rows = zip(dfw_df['Latitude'], dfw_df['Longitude'], dfw_df['Neighborhood'])
for lat, lng, neighborhood in marker_rows:
    popup_label = folium.Popup('{}'.format(neighborhood), parse_html=True)
    circle = folium.CircleMarker(
        [lat, lng],
        radius=7,
        popup=popup_label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    circle.add_to(map_dfw)

# last expression of the cell: render the map
map_dfw

In [None]:
# save the map as a standalone HTML file in the working directory
map_dfw.save('map_dfw.html')

## 5. Use Foursquare API to explore the neighborhoods

In [None]:
import os

# define Foursquare Credentials and Version
# NOTE(review): credentials should not live in source control. The
# FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables take
# precedence; the literals remain only as a backward-compatible fallback and
# should be rotated and removed.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '3TJILUL33Y122CMLX3WN4H245RQZHXPF13MRJ5UWKTG4VS1Z') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'RS2ZHMTYLMYOA40SHDRXXXXD521CGDDA1ZW1M3ZT3P15VLB5') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# do not echo the full secret into the notebook output; show a masked hint only
print('CLIENT_SECRET:' + CLIENT_SECRET[:4] + '*' * (len(CLIENT_SECRET) - 4))

### Let's get the top 1000 venues in a radius of 20k meters

In [None]:
radius = 20000
LIMIT = 1000

# endpoint queried once per neighborhood
url = "https://api.foursquare.com/v2/venues/explore"

venues = []

for lat, long, neighborhood in zip(dfw_df['Latitude'], dfw_df['Longitude'], dfw_df['Neighborhood']):

    # build the query parameters; passing them via ``params`` lets requests
    # take care of URL encoding instead of formatting the string by hand
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, long),
        'radius': radius,
        'limit': LIMIT,
    }

    # make the GET request; the timeout avoids hanging forever and
    # raise_for_status() surfaces HTTP errors before the JSON is indexed
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # return only relevant information for each nearby venue
    for venue in results:
        details = venue['venue']
        categories = details['categories']
        if not categories:
            # a venue without any category cannot take part in the
            # category analysis and used to raise an IndexError here
            continue
        venues.append((
            neighborhood,
            lat,
            long,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            categories[0]['name']))

In [None]:
# column names, in the same order as the tuples stored in ``venues``
venue_columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

# convert the venues list into a new DataFrame; handing the names to the
# constructor also works when ``venues`` is empty, whereas assigning
# ``.columns`` afterwards raises a length-mismatch error in that case
venues_df = pd.DataFrame(venues, columns=venue_columns)

print(venues_df.shape)
venues_df.head()

In [None]:
# count the non-null entries of every column per neighborhood
venues_df.groupby(["Neighborhood"]).count()

In [None]:
# report how many distinct venue categories were returned
category_count = len(venues_df['VenueCategory'].unique())
print('There are {} uniques categories.'.format(category_count))

In [None]:
# show the first 50 distinct venue categories
venues_df['VenueCategory'].unique()[:50]

In [None]:
# check if the results contain a venue category literally named "Neighborhood"
# NOTE(review): this comment used to say "Shopping Mall", which the code never
# tested; presumably the check guards against a category name clashing with
# the neighborhood column of the one-hot table - confirm the intent
"Neighborhood" in venues_df['VenueCategory'].unique()

## 6. Analyze Each Neighborhood

In [None]:
# one indicator column per venue category (no prefix on the column names)
category_frame = venues_df[['VenueCategory']]
dfw_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# re-attach the neighborhood of each venue as the last column
dfw_onehot['Neighborhoods'] = venues_df['Neighborhood']

# rotate the column order so that the newly added last column comes first
all_columns = list(dfw_onehot.columns)
fixed_columns = all_columns[-1:] + all_columns[:-1]
dfw_onehot = dfw_onehot[fixed_columns]

print(dfw_onehot.shape)
dfw_onehot.head()

In [None]:
# average the indicator columns per neighborhood, i.e. the share of each
# category among that neighborhood's venues, with one row per neighborhood
neighborhood_groups = dfw_onehot.groupby(["Neighborhoods"])
dfw_grouped = neighborhood_groups.mean().reset_index()

print(dfw_grouped.shape)
dfw_grouped

### Create a new DataFrame for Grocery Store data only 

In [None]:
# keep only the neighborhood name and its "Grocery Store" column;
# this raises a KeyError if no venue had the category "Grocery Store"
dfw_gs = dfw_grouped[["Neighborhoods","Grocery Store"]]

In [None]:
# display the grocery-store table
dfw_gs

## 7. Cluster Neighborhoods with k-means algorithm

In [None]:
# set number of clusters
kclusters = 3

# drop the name column so only the numeric feature remains; the keyword form
# is required because the positional ``axis`` argument of DataFrame.drop was
# deprecated in pandas 1.3 and removed in pandas 2.0
dfw_clustering = dfw_gs.drop(columns=["Neighborhoods"])

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(dfw_clustering)

# check cluster labels generated for the first 10 rows in the dataframe
kmeans.labels_[0:10]

In [None]:
# create a new dataframe, copied from dfw_gs, that will also hold the cluster label of each row
dfw_merged = dfw_gs.copy()

# add clustering labels (taken from the fitted kmeans model)
dfw_merged["Cluster Labels"] = kmeans.labels_

In [None]:
# use the singular column name so it matches the key column of dfw_df
dfw_merged = dfw_merged.rename(columns={"Neighborhoods": "Neighborhood"})
dfw_merged

In [None]:
# look up each neighborhood's remaining dfw_df columns by name and attach them
neighborhood_lookup = dfw_df.set_index("Neighborhood")
dfw_merged = dfw_merged.join(neighborhood_lookup, on="Neighborhood")

print(dfw_merged.shape)
dfw_merged

In [None]:
# report the shape, then order the rows by their cluster label
print(dfw_merged.shape)
dfw_merged = dfw_merged.sort_values(["Cluster Labels"])
dfw_merged

## Visualize the clusters by neighborhoods

In [85]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10.4)

# set color scheme for the clusters: one evenly spaced rainbow color per
# cluster (the previous helper arrays were only ever used for their length,
# which equals kclusters)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(dfw_merged['Latitude'], dfw_merged['Longitude'], dfw_merged['Neighborhood'], dfw_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 sends label 0 to the last palette entry through negative
    # indexing; kept as is so the marker colors stay the same as before
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
# show only the neighborhoods assigned to cluster label 1
in_cluster_one = dfw_merged['Cluster Labels'] == 1
dfw_merged.loc[in_cluster_one]