In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import geopy
from geopy import distance

In [4]:
# Input locations, relative to the notebook's working directory.
# Raw listing of active businesses (CSV).
activeBusiness = "data/Listing_of_Active_Businesses.csv"
# Cleaned records with parsed LAT/LON columns (JSON).
activeBusinessJSON = "data/business_clean_parsed.json"

In [33]:
# Load the raw business listing. Every identifier-like column is forced to
# `str` so values such as zip codes and account numbers keep leading zeros and
# are never inferred as numbers. NAICS stays float because it contains missing
# values.
# NOTE: the dtype keys must match the CSV headers exactly. The column is
# 'BUSINESS NAME' (with a space); the previous key 'BUSINESSNAME' matched no
# column, so the requested dtype was not applied to it.
bus_df = pd.read_csv(activeBusiness, dtype={
    'LOCATION ACCOUNT #': 'str',
    'BUSINESS NAME': 'str',
    'DBA NAME': 'str',
    'STREET ADDRESS': 'str',
    'CITY': 'str',
    'ZIP CODE': 'str',
    'LOCATION DESCRIPTION': 'str',
    'MAILING ADDRESS': 'str',
    'MAILING CITY': 'str',
    'MAILING ZIP CODE': 'str',
    'NAICS': 'float',
    'PRIMARY NAICS DESCRIPTION': 'str',
    'COUNCIL DISTRICT': 'str',
    'LOCATION START DATE': 'str',
    'LOCATION END DATE': 'str',
    'LOCATION': 'str'
})

In [110]:
# Summarise the raw listing: column names, non-null counts, dtypes and memory use
bus_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535624 entries, 0 to 535623
Data columns (total 16 columns):
LOCATION ACCOUNT #           535624 non-null object
BUSINESS NAME                535624 non-null object
DBA NAME                     186259 non-null object
STREET ADDRESS               535602 non-null object
CITY                         535597 non-null object
ZIP CODE                     535624 non-null object
LOCATION DESCRIPTION         535618 non-null object
MAILING ADDRESS              246638 non-null object
MAILING CITY                 246650 non-null object
MAILING ZIP CODE             246564 non-null object
NAICS                        483574 non-null float64
PRIMARY NAICS DESCRIPTION    483574 non-null object
COUNCIL DISTRICT             535624 non-null object
LOCATION START DATE          532028 non-null object
LOCATION END DATE            0 non-null object
LOCATION                     529804 non-null object
dtypes: float64(1), object(15)
memory usage: 65.4+ MB


### Remove all rows that have no coordinate values

In [73]:
# Keep only the rows that have a LOCATION value, renumber them from 0, and
# discard the LOCATION END DATE column.
bus_coord_filtered = (
    bus_df
    .dropna(subset=['LOCATION'])
    .reset_index(drop=True)
    .drop(columns=['LOCATION END DATE'])
)
bus_coord_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529804 entries, 0 to 529803
Data columns (total 15 columns):
LOCATION ACCOUNT #           529804 non-null object
BUSINESS NAME                529804 non-null object
DBA NAME                     184379 non-null object
STREET ADDRESS               529783 non-null object
CITY                         529779 non-null object
ZIP CODE                     529804 non-null object
LOCATION DESCRIPTION         529798 non-null object
MAILING ADDRESS              244147 non-null object
MAILING CITY                 244157 non-null object
MAILING ZIP CODE             244071 non-null object
NAICS                        479310 non-null float64
PRIMARY NAICS DESCRIPTION    479310 non-null object
COUNCIL DISTRICT             529804 non-null object
LOCATION START DATE          526322 non-null object
LOCATION                     529804 non-null object
dtypes: float64(1), object(14)
memory usage: 60.6+ MB


### Parse the coordinate strings into numeric latitude and longitude columns: (lat, lon)

In [60]:
# Split each "(lat, lon)" string into its two numeric parts.
# This uses pandas' vectorised string methods on the whole column at once;
# looping with iterrows() and parsing one row at a time was far too slow for
# a frame of this size.
location_parts = (
    bus_coord_filtered['LOCATION']
    .str.strip("'()")                       # drop surrounding quotes/parentheses
    .str.replace(' ', '', regex=False)      # drop the space after the comma
    .str.split(',', expand=True)            # column 0 -> lat, column 1 -> lon
)

# Replace the raw LOCATION string column with numeric LAT / LON columns.
bus_coord_parsed = bus_coord_filtered.drop(columns=['LOCATION'])
bus_coord_parsed['LAT'] = location_parts[0].astype(float)
bus_coord_parsed['LON'] = location_parts[1].astype(float)

In [76]:
# Confirm the parsed frame: LOCATION is gone and LAT / LON are present
bus_coord_parsed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529804 entries, 0 to 529803
Data columns (total 16 columns):
LOCATION ACCOUNT #           529804 non-null object
BUSINESS NAME                529804 non-null object
DBA NAME                     184379 non-null object
STREET ADDRESS               529783 non-null object
CITY                         529779 non-null object
ZIP CODE                     529804 non-null object
LOCATION DESCRIPTION         529798 non-null object
MAILING ADDRESS              244147 non-null object
MAILING CITY                 244157 non-null object
MAILING ZIP CODE             244071 non-null object
NAICS                        479310 non-null float64
PRIMARY NAICS DESCRIPTION    479310 non-null object
COUNCIL DISTRICT             529804 non-null object
LOCATION START DATE          526322 non-null object
LAT                          529804 non-null float64
LON                          529804 non-null float64
dtypes: float64(3), object(13)
memory usage: 64.7+ MB


In [78]:
# Export data to json and csv files
# The JSON goes to `activeBusinessJSON`, the same path the notebook later
# loads it from, so a fresh run reads the file produced here rather than a
# stale or missing copy.

bus_coord_parsed.to_json(path_or_buf=activeBusinessJSON, orient='records')
bus_coord_parsed.to_csv('business_clean_parsed.csv')

### Generate a list of businesses within a specified radius of each target business, by coordinates

In [5]:
# Read the cleaned business records back in from the JSON file

with open(activeBusinessJSON) as json_file:
    data = json.load(json_file)

'\nHere is some comments are these ignored?\nWIll there be an error\n'

In [8]:
# Rank business names by how many records (locations) carry a LAT value and
# keep the ten most frequent names.

location_counts = pd.DataFrame(data).groupby('BUSINESS NAME')['LAT'].count()
t10_names = location_counts.sort_values(ascending=False).head(10).index
for name in t10_names:
    print(name)

STARBUCKS CORPORATION
REDBOX AUTOMATED RETAIL LLC
LAZ KARP ASSOCIATES LLC
ABM INDUSTRY GROUPS, LLC
COINSTAR ASSET HOLDINGS, LLC
VOLUNTEERS OF AMERICA OF LOS ANGELES
UNITED VALET PARKING INC
GARFIELD BEACH CVS LLC
T MOBILE WEST LLC
PCAM LLC


The following code generates a list of dictionaries of all businesses within a specified radius of every location of the top ten businesses listed above.

To keep the resulting output as small as possible, most of each business's information was omitted, leaving the "LOCATION ACCOUNT #" as a key that identifies the business in the main dataset when additional information is needed.

In [None]:
# Using default method of geodesic distance is ~50x slower in computation than using great-circle distance.
# Great-circle distance has up to ~0.5% error, more details in GeoPy documentation.

search_radius = 0.5  # compared against great-circle distances expressed in miles


def find_businesses_in_radius(origin, candidates, radius_miles):
    """Collect every candidate business within `radius_miles` of `origin`.

    Parameters
    ----------
    origin : dict
        Business record with 'LOCATION ACCOUNT #', 'LAT' and 'LON' keys.
    candidates : iterable of (account, name, (lat, lon))
        Pre-extracted fields of every business to test against the origin.
    radius_miles : float
        Inclusive search radius, in miles.

    Returns
    -------
    dict
        {'origin_business': <origin account #>, 'in_proximity': [...]}, where
        each entry holds the candidate's 'LOCATION ACCOUNT #', 'BUSINESS NAME'
        and 'DISTANCE' (miles). When the origin is itself among the
        candidates it appears in its own list with a distance of 0.
    """
    origin_coords = (origin['LAT'], origin['LON'])
    in_proximity = []
    for account, business_name, coords in candidates:
        # Named `miles` rather than `distance` so the `distance` module
        # imported from geopy is not overwritten by a float.
        miles = geopy.distance.great_circle(origin_coords, coords).miles
        if miles <= radius_miles:
            in_proximity.append({
                'LOCATION ACCOUNT #': account,
                'BUSINESS NAME': business_name,
                'DISTANCE': miles
            })
    return {
        'origin_business': origin['LOCATION ACCOUNT #'],
        'in_proximity': in_proximity
    }


# Extract the fields needed from each destination once, instead of rebuilding
# them for every origin/destination pair.
candidates = [
    (business['LOCATION ACCOUNT #'], business['BUSINESS NAME'], (business['LAT'], business['LON']))
    for business in data
]

# Group the origin records by business name in a single pass over `data`
# (file order is preserved within each group) instead of re-scanning the whole
# dataset once per top-10 name.
origins_by_name = {busName: [] for busName in t10_names}
for business in data:
    matching_origins = origins_by_name.get(business['BUSINESS NAME'])
    if matching_origins is not None:
        matching_origins.append(business)

top10_proximity_results = []

for busName in t10_names:
    print(busName)
    name_results = []

    for counter, busOrigin in enumerate(origins_by_name[busName], start=1):
        # Lightweight progress indicator: one line every 20 origins.
        if counter % 20 == 0:
            print(counter)
        name_results.append(find_businesses_in_radius(busOrigin, candidates, search_radius))

    top10_proximity_results.append({
        'business_name': busName,
        'results': name_results
    })

STARBUCKS CORPORATION
20


In [170]:
# Persist the proximity search results to disk as JSON

with open('data/top10_proximity_data.json', 'w') as out_file:
    out_file.write(json.dumps(top10_proximity_results))