# Introduction

The legalization of marijuana in California has given many people the opportunity to turn what was previously just a hobby into a career they can be passionate about; as a result, we've seen the marijuana industry expand exponentially since legalization. Take San Francisco, for example: in a city touted as one of the most progressive areas in one of the most liberal states, it's difficult to go out for a stroll without encountering at least one legal dispensary advertising its services. With all this competition in a relatively new field, is it still practical for aspiring entrepreneurs to invest their time in this industry? In this project, I will try to answer that question by analyzing the density of marijuana dispensaries relative to other venues in 19 different San Francisco neighborhoods.

## Necessary Data

This project requires two sources of data:

1. The list of San Francisco neighborhoods will be sourced by web scraping Wikipedia.
2. Venue (geographical) data will be sourced via the Foursquare API.

In [1]:
# Importing Packages
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
from urllib.request import urlopen
print(requests.__version__)

# importing necessary libraries
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

2.21.0
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.

Libraries imported.


In [5]:
# Download the Wikipedia category page listing San Francisco neighborhoods.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# fails loudly on an HTTP error instead of passing an error page to the parser.
response = requests.get("https://en.wikipedia.org/wiki/Category:Neighborhoods_in_San_Francisco", timeout=30)
response.raise_for_status()
url = response.text  # NOTE: despite its name, `url` holds the page HTML

In [7]:
# Parse the downloaded HTML with Python's built-in HTML parser.
soup = bs(markup=url, features='html.parser')

In [46]:
# Every list item inside the first "mw-category" container of the page.
category_items = soup.find_all("div", class_="mw-category")[0].find_all("li")

# The slice drops the first three characters of each item's text.
# NOTE(review): presumably a leading marker in the scraped text -- confirm.
neighborhoodList = [item.text[3:] for item in category_items]

In [40]:
# Keep only the part of each scraped name that precedes the first comma and
# store it as a one-column frame (the column is labelled 0 at this point).
scraped_names = pd.DataFrame({"Neighborhood": neighborhoodList})["Neighborhood"]
name_parts = scraped_names.str.split(",", expand=True)
sf_df = name_parts[0].to_frame()

Unnamed: 0,0
0,Barbary Coast
1,Castro District
2,Chinatown
3,Civic Center
4,Financial District


In [41]:
# Give the single column (currently labelled 0) a descriptive name.
sf_df = sf_df.rename(columns={0: 'Neighborhood'})

In [43]:
def get_latlng(neighborhood, max_attempts=10):
    """Return the coordinates of a San Francisco neighborhood.

    The name is geocoded with ``geocoder.arcgis`` using the query
    ``'<neighborhood>, San Francisco, California'``.

    Args:
        neighborhood: Name of the neighborhood to look up.
        max_attempts: Maximum number of geocoding requests to make. The number
            of attempts is bounded so that a name the service cannot resolve
            raises an error instead of looping forever.

    Returns:
        The ``latlng`` value of the first lookup that yields coordinates.

    Raises:
        ValueError: If none of the attempts returned coordinates.
    """
    query = '{}, San Francisco, California'.format(neighborhood)
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
    raise ValueError(
        'Could not geocode {!r} after {} attempts'.format(query, max_attempts))

In [51]:
import geocoder

# Geocode every neighborhood name, in the order they appear in sf_df.
coords = list(map(get_latlng, sf_df["Neighborhood"]))
coords

[[37.78184003332553, -122.40764003332545],
 [37.75849000000005, -122.43476999999996],
 [37.795390000000054, -122.40811999999994],
 [37.77863000000008, -122.41682999999995],
 [37.795780000000036, -122.40047999999996],
 [37.80655000000007, -122.40624999999994],
 [37.769930000000045, -122.44691999999998],
 [37.759690000000035, -122.41805],
 [37.793360000000064, -122.41787],
 [37.799090000000035, -122.40844999999996],
 [37.75660000000005, -122.39913999999999],
 [37.78066000000007, -122.47088999999994],
 [37.80203000000006, -122.41962999999998],
 [37.777570000000026, -122.40434999999997],
 [37.753480000000025, -122.49414999999999],
 [37.78513000000004, -122.41450999999995],
 [37.82489000000004, -122.37086999999997],
 [37.78782000000007, -122.40747999999996],
 [37.780950000000075, -122.43221999999997]]

In [52]:
# Tabulate the geocoded pairs so they can be attached to sf_df by row index.
coordinate_columns = ['Latitude', 'Longitude']
df_coords = pd.DataFrame(data=coords, columns=coordinate_columns)

In [53]:
# Attach the coordinates to the neighborhood table (aligned on the row index).
sf_df = sf_df.assign(
    Latitude=df_coords['Latitude'],
    Longitude=df_coords['Longitude'],
)
sf_df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Barbary Coast,37.78184,-122.40764
1,Castro District,37.75849,-122.43477
2,Chinatown,37.79539,-122.40812
3,Civic Center,37.77863,-122.41683
4,Financial District,37.79578,-122.40048


In [54]:
# Sanity check: print the (rows, columns) dimensions of the neighborhood table.
print(sf_df.shape)

(19, 3)


In [55]:
## Foursquare Credentials
import os

# Secrets must never be hard-coded (or printed) in a shared notebook: anyone
# who can read the file can use them. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET in the environment before starting the kernel.
# NOTE(review): the key pair previously committed here should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Report only whether each credential is present, never its value.
print('credentials:')
print('CLIENT_ID: ' + ('[redacted]' if CLIENT_ID else '[missing]'))
print('CLIENT_SECRET:' + ('[redacted]' if CLIENT_SECRET else '[missing]'))

credentials:
CLIENT_ID: [redacted]
CLIENT_SECRET:[redacted]


## Creating map of San Francisco

In [56]:
address = 'San Francisco, California'

# Look up the city's coordinates; they are used to centre the maps below.
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of San Fransico, California {}, {}.'.format(latitude, longitude))

The geograpical coordinate of San Fransico, California 37.7792808, -122.4192363.


In [58]:
# Base map centred on the city coordinates looked up earlier.
map_sf = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per neighborhood; its popup shows the neighborhood name.
marker_rows = zip(sf_df['Latitude'], sf_df['Longitude'], sf_df['Neighborhood'])
for lat, lng, neighborhood in marker_rows:
    popup = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_sf)

map_sf

## Using the Foursquare API to explore the neighborhoods

In [59]:
radius = 2000
LIMIT = 100

venues = []

for lat, lng, neighborhood in zip(sf_df['Latitude'], sf_df['Longitude'], sf_df['Neighborhood']):

    # Let requests build and escape the query string instead of formatting
    # the credentials and coordinates into the URL by hand.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lng),
        'radius': radius,
        'limit': LIMIT,
    }

    # make the GET request; the timeout prevents an indefinite hang and
    # raise_for_status() surfaces HTTP errors (bad credentials, quota) instead
    # of a confusing KeyError while unpacking the payload below
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params=params,
        timeout=30)
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # return only relevant information for each nearby venue
    for venue in results:
        details = venue['venue']
        categories = details['categories']
        if not categories:
            # A venue without any category cannot contribute to the
            # per-category analysis; skip it instead of raising IndexError.
            continue
        venues.append((
            neighborhood,
            lat,
            lng,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            categories[0]['name']))

In [60]:
# define the column names
venue_columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

# Naming the columns at construction also works when `venues` is empty;
# assigning seven names to an empty, column-less frame raises ValueError.
venues_df = pd.DataFrame(venues, columns=venue_columns)

print(venues_df.shape)
venues_df.head()

(1829, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Barbary Coast,37.78184,-122.40764,Barbary Coast Collective,37.781867,-122.407637,Marijuana Dispensary
1,Barbary Coast,37.78184,-122.40764,Frena Bakery and Cafe,37.7805,-122.40825,Bakery
2,Barbary Coast,37.78184,-122.40764,Mint Plaza,37.782731,-122.40786,Plaza
3,Barbary Coast,37.78184,-122.40764,Blue Bottle Coffee,37.782584,-122.407743,Coffee Shop
4,Barbary Coast,37.78184,-122.40764,Hashiri,37.782994,-122.407833,Japanese Restaurant


## How many venues of each category were returned for each neighborhood?

In [66]:
# Count the non-null entries of every remaining column for each
# (neighborhood, category) pair, then preview the first few groups.
venues_by_category = venues_df.groupby(['Neighborhood', 'VenueCategory'])
venues_by_category.count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude
Neighborhood,VenueCategory,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Barbary Coast,Accessories Store,1,1,1,1,1
Barbary Coast,American Restaurant,2,2,2,2,2
Barbary Coast,Art Gallery,1,1,1,1,1
Barbary Coast,Art Museum,3,3,3,3,3
Barbary Coast,Bakery,2,2,2,2,2


In [67]:
# Number of distinct venue categories across all neighborhoods.
distinct_categories = venues_df['VenueCategory'].unique()
print('There are {} uniques categories.'.format(len(distinct_categories)))


There are 215 uniques categories.


## Analyze each neighborhood

In [68]:
# one hot encoding
sf_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sf_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [sf_onehot.columns[-1]] + list(sf_onehot.columns[:-1])
sf_onehot = sf_onehot[fixed_columns]

print(sf_onehot.shape)
sf_onehot.head()

(1829, 216)


Unnamed: 0,Neighborhoods,Accessories Store,Adult Boutique,American Restaurant,Antique Shop,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,Art Museum,...,Tunnel,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Waterfall,Wine Bar,Wine Shop,Winery,Wings Joint,Yoga Studio
0,Barbary Coast,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Barbary Coast,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Barbary Coast,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Barbary Coast,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Barbary Coast,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# Average the indicator columns per neighborhood: each value becomes the share
# of that neighborhood's returned venues that fall in the category.
category_share = sf_onehot.groupby(["Neighborhoods"]).mean()
sf_grouped = category_share.reset_index()

print(sf_grouped.shape)
sf_grouped

(19, 216)


Unnamed: 0,Neighborhoods,Accessories Store,Adult Boutique,American Restaurant,Antique Shop,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,Art Museum,...,Tunnel,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Waterfall,Wine Bar,Wine Shop,Winery,Wings Joint,Yoga Studio
0,Barbary Coast,0.01,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.03,...,0.0,0.0,0.0,0.01,0.0,0.0,0.02,0.0,0.0,0.01
1,Castro District,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.03
2,Chinatown,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.04,0.01,0.0,0.01,0.01
3,Civic Center,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.01,...,0.0,0.0,0.0,0.0,0.0,0.03,0.01,0.0,0.0,0.04
4,Financial District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.03,0.01,0.0,0.0,0.02
5,Fisherman's Wharf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.01
6,Haight-Ashbury,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.03
7,Mission District,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.05
8,Nob Hill,0.01,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.01,0.04
9,North Beach,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.01,0.01


In [74]:
# Keep only the neighborhood name and its marijuana-dispensary share.
dispensary_columns = ["Neighborhoods", "Marijuana Dispensary"]
sf_MD = sf_grouped[dispensary_columns]
sf_MD

Unnamed: 0,Neighborhoods,Marijuana Dispensary
0,Barbary Coast,0.03
1,Castro District,0.01
2,Chinatown,0.0
3,Civic Center,0.03
4,Financial District,0.0
5,Fisherman's Wharf,0.0
6,Haight-Ashbury,0.0
7,Mission District,0.0
8,Nob Hill,0.0
9,North Beach,0.0


In [75]:

# set number of clusters
k = 3

# Cluster on the dispensary share alone. The axis is passed by keyword:
# giving it positionally, as in drop([...], 1), is deprecated and rejected
# by pandas 2.x.
sf_clustering = sf_MD.drop(["Neighborhoods"], axis=1)

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=k, random_state=0).fit(sf_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 2, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int32)

In [76]:
# Copy the dispensary table and attach each neighborhood's k-means label.
sf_merged = sf_MD.assign(**{"Cluster Labels": kmeans.labels_})
sf_merged.head()

Unnamed: 0,Neighborhoods,Marijuana Dispensary,Cluster Labels
0,Barbary Coast,0.03,0
1,Castro District,0.01,2
2,Chinatown,0.0,1
3,Civic Center,0.03,0
4,Financial District,0.0,1


In [79]:
# Look up each neighborhood's coordinates in sf_df and append them as columns.
coordinates_by_name = sf_df.set_index("Neighborhood")
sf_merged = sf_merged.join(coordinates_by_name, on="Neighborhoods")

print(sf_merged.shape)
sf_merged.head() # check the last columns!

(19, 5)


Unnamed: 0,Neighborhoods,Marijuana Dispensary,Cluster Labels,Latitude,Longitude
0,Barbary Coast,0.03,0,37.78184,-122.40764
1,Castro District,0.01,2,37.75849,-122.43477
2,Chinatown,0.0,1,37.79539,-122.40812
3,Civic Center,0.03,0,37.77863,-122.41683
4,Financial District,0.0,1,37.79578,-122.40048


In [80]:
print(sf_merged.shape)

# Order the rows by cluster label so that each cluster's members are adjacent.
sf_merged = sf_merged.sort_values(by=["Cluster Labels"])
sf_merged

(19, 5)


Unnamed: 0,Neighborhoods,Marijuana Dispensary,Cluster Labels,Latitude,Longitude
0,Barbary Coast,0.03,0,37.78184,-122.40764
15,Tenderloin,0.03,0,37.78513,-122.41451
3,Civic Center,0.03,0,37.77863,-122.41683
13,South of Market,0.04,0,37.77757,-122.40435
16,Treasure Island,0.0,1,37.82489,-122.37087
14,Sunset District,0.0,1,37.75348,-122.49415
12,Russian Hill,0.0,1,37.80203,-122.41963
10,Potrero Hill,0.0,1,37.7566,-122.39914
9,North Beach,0.0,1,37.79909,-122.40845
7,Mission District,0.0,1,37.75969,-122.41805


## Visualizing the clusters

In [82]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: k evenly spaced colours from the
# rainbow colormap, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(sf_merged['Latitude'], sf_merged['Longitude'], sf_merged['Neighborhoods'], sf_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # Cluster c is drawn with colour (c - 1) mod k. The modulo spells out the
    # wraparound that the bare negative index rainbow[cluster-1] relied on
    # for cluster 0, so every cluster keeps the colour it had before.
    cluster_color = rainbow[(cluster - 1) % k]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Examining the clusters

In [84]:
# Neighborhoods assigned to cluster 0.
in_cluster_0 = sf_merged['Cluster Labels'] == 0
sf_merged.loc[in_cluster_0]

Unnamed: 0,Neighborhoods,Marijuana Dispensary,Cluster Labels,Latitude,Longitude
0,Barbary Coast,0.03,0,37.78184,-122.40764
15,Tenderloin,0.03,0,37.78513,-122.41451
3,Civic Center,0.03,0,37.77863,-122.41683
13,South of Market,0.04,0,37.77757,-122.40435


In [85]:
# Neighborhoods assigned to cluster 1.
in_cluster_1 = sf_merged['Cluster Labels'] == 1
sf_merged.loc[in_cluster_1]

Unnamed: 0,Neighborhoods,Marijuana Dispensary,Cluster Labels,Latitude,Longitude
16,Treasure Island,0.0,1,37.82489,-122.37087
14,Sunset District,0.0,1,37.75348,-122.49415
12,Russian Hill,0.0,1,37.80203,-122.41963
10,Potrero Hill,0.0,1,37.7566,-122.39914
9,North Beach,0.0,1,37.79909,-122.40845
7,Mission District,0.0,1,37.75969,-122.41805
6,Haight-Ashbury,0.0,1,37.76993,-122.44692
5,Fisherman's Wharf,0.0,1,37.80655,-122.40625
4,Financial District,0.0,1,37.79578,-122.40048
2,Chinatown,0.0,1,37.79539,-122.40812


In [86]:
# Neighborhoods assigned to cluster 2.
in_cluster_2 = sf_merged['Cluster Labels'] == 2
sf_merged.loc[in_cluster_2]

Unnamed: 0,Neighborhoods,Marijuana Dispensary,Cluster Labels,Latitude,Longitude
17,Union Square,0.02,2,37.78782,-122.40748
11,Richmond District,0.01,2,37.78066,-122.47089
1,Castro District,0.01,2,37.75849,-122.43477
