In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas
import folium
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from geopy.exc import GeocoderNotFound
import requests
from sklearn.preprocessing import MinMaxScaler

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Population data for the state of Maine

Below is the population projection data for the state of Maine broken down by county. It shows population projects for the following years:
- 2016 (Observed)
- 2021
- 2026
- 2031
- 2036

The data is read directly from the website. Since the dataframe initially sets up a multilevel column index, we drop the top level and also rename column 0 to County since it is unnamed in the source data.

In [3]:
maine_data = pd.read_excel('https://www.maine.gov/dafs/economist/sites/maine.gov.dafs.economist/files/inline-files/MaineStateCountyPopulationProjections2036.xlsx', sheet_name='summary', header=[0, 1])
maine_data.columns = maine_data.columns.droplevel()
maine_data.rename(columns={ maine_data.columns[0]: "COUNTYNAME" }, inplace = True)
maine_data.head()

Unnamed: 0,COUNTYNAME,2016,2021,2026,2031,2036,2016-2021,2021-2026,2026-2031,2031-2036,2016-2036
0,Androscoggin,107269,107967.801439,108578.742776,108769.413854,108643.149064,0.006514,0.005659,0.001756,-0.001161,0.01281
1,Aroostook,68116,67928.772431,67735.738377,67337.098159,66857.364164,-0.002749,-0.002842,-0.005885,-0.007124,-0.018478
2,Cumberland,290905,294711.329744,298631.65098,301376.156664,302943.736659,0.013084,0.013302,0.00919,0.005201,0.041384
3,Franklin,30071,29963.778184,30029.549316,29967.173578,29772.523196,-0.003566,0.002195,-0.002077,-0.006495,-0.009926
4,Hancock,54398,54588.458399,54679.273561,54521.55833,54152.494022,0.003501,0.001664,-0.002884,-0.006769,-0.004513


# County latitude and longitude data for the state of Maine

The data below was downloaded from [weather.gov](https://www.weather.gov/gis/Counties) and contains the latitude and longtude for every county in the USA. Data is filtered to only account for the state of Maine, and the geometry column is dropped since we aren't concerned with the polygon data. These steps make it easier to work with the data.

In [4]:
gdf = geopandas.read_file('data\county_data_from_weather_gov\c_03mr20.shp')
gdf.drop(['geometry'], axis=1, inplace=True)
gdf.head()

Unnamed: 0,STATE,CWA,COUNTYNAME,FIPS,TIME_ZONE,FE_AREA,LON,LAT
0,ME,CAR,Washington,23029,E,se,-67.6361,45.0363
1,GA,CHS,McIntosh,13191,E,se,-81.2646,31.5329
2,GA,CHS,Liberty,13179,E,se,-81.2103,31.7093
3,AS,PPG,Swains Island,60040,S,,-171.0459,-11.0843
4,AS,PPG,Manu'a,60020,S,,-169.506,-14.2219


In [5]:
gdf = gdf[gdf.STATE == "ME"]
gdf.head()

Unnamed: 0,STATE,CWA,COUNTYNAME,FIPS,TIME_ZONE,FE_AREA,LON,LAT
0,ME,CAR,Washington,23029,E,se,-67.6361,45.0363
264,ME,GYX,Androscoggin,23001,E,sw,-70.2045,44.1654
282,ME,GYX,Knox,23013,E,sc,-69.173,44.15
283,ME,GYX,Sagadahoc,23023,E,sc,-69.8613,43.9705
284,ME,GYX,Lincoln,23015,E,sc,-69.5431,44.0752


# Merge the dataframes

We need to combine the data into a single dataframe for ease of working with it. Since both dataframes have a COUNTYNAME column that should include unique values we will use that column as the merge point.

In [6]:
data_merged = maine_data.merge(gdf, on="COUNTYNAME", how = 'inner')
data_merged.head()

Unnamed: 0,COUNTYNAME,2016,2021,2026,2031,2036,2016-2021,2021-2026,2026-2031,2031-2036,2016-2036,STATE,CWA,FIPS,TIME_ZONE,FE_AREA,LON,LAT
0,Androscoggin,107269,107967.801439,108578.742776,108769.413854,108643.149064,0.006514,0.005659,0.001756,-0.001161,0.01281,ME,GYX,23001,E,sw,-70.2045,44.1654
1,Aroostook,68116,67928.772431,67735.738377,67337.098159,66857.364164,-0.002749,-0.002842,-0.005885,-0.007124,-0.018478,ME,CAR,23003,E,nn,-68.5988,46.6588
2,Cumberland,290905,294711.329744,298631.65098,301376.156664,302943.736659,0.013084,0.013302,0.00919,0.005201,0.041384,ME,GYX,23005,E,sw,-70.4096,43.8487
3,Franklin,30071,29963.778184,30029.549316,29967.173578,29772.523196,-0.003566,0.002195,-0.002077,-0.006495,-0.009926,ME,GYX,23007,E,ww,-70.444,44.974
4,Hancock,54398,54588.458399,54679.273561,54521.55833,54152.494022,0.003501,0.001664,-0.002884,-0.006769,-0.004513,ME,CAR,23009,E,se,-68.3531,44.6749


# Create map of Maine

We start by using geolocator to find the coordinates for the center of Maine to create our base map. Then we use the merged data to find the coordinates of the center of each county and we put a marker there with the county name and we make the bubble relfect a scaled representation of the change in population projected from 2016 through 2036.

This provides us our first analytic. The bubble doesn't show growth(loss) only a representation of the projected change in population. The label includes the county name and the information on projected population change.

In [7]:
state = 'Maine'
#we will use this location as center of our map
geolocater = Nominatim(user_agent="dummy-exp")
center = geolocater.geocode(state)
lat = center.latitude
lon = center.longitude
print('The geograpical coordinate of the center of {} is {}, {}.'.format(state, lat, lon))

The geograpical coordinate of the center of Maine is 45.709097, -68.8590201.


In [8]:
map_maine=folium.Map(location=[lat,lon],zoom_start=6)
for lat, long, area, delta_pop in zip(data_merged['LAT'], data_merged['LON'], data_merged['COUNTYNAME'], data_merged['2016-2036']):
    label = folium.Popup('{} County\nPopulation change through 2036: {:.2f}%'.format(area, delta_pop*100), 
                         parse_html=True, 
                         max_width=325, 
                         min_width=325)
    folium.CircleMarker(
        [lat, long],
        radius=abs(delta_pop) * 250 / 2,
        popup=label,
        color='blue',
        fill=False,
        fill_color='blue',
        fill_opacity=1,
        parse_html=False).add_to(map_maine)
    
map_maine

# Maine COVID-19 data

Existing data for analysis:
- Change in projected polulation from 2016 through 2036

Let's add a new data point by gathering the COVID-19 data for each county, updated daily at 5PM EDT, and since we are sourcing it from the web we should have the most updated data daily.

In [9]:
maine_covid = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vRPtRRaID4XRBSnrzGomnTtUUkq5qsq5zj8fGpg5xse8ytsyFUVqAKKypYybVpsU5cHgIbY3BOiynOC/pub?gid=0&single=true&output=csv')
maine_covid.rename(columns={ maine_covid.columns[0]: "COUNTYNAME" }, inplace = True)
maine_covid.head()

Unnamed: 0,COUNTYNAME,DATA_REFRESH_DT,DATA_AS_OF_DT,CASES,DEATHS,RECOVERIES,HOSPITALIZATIONS
0,Androscoggin,2020-07-15,2020-07-14,506,4,467,38
1,Aroostook,2020-07-15,2020-07-14,24,1,20,4
2,Cumberland,2020-07-15,2020-07-14,1901,67,1607,181
3,Franklin,2020-07-15,2020-07-14,41,1,37,3
4,Hancock,2020-07-15,2020-07-14,18,1,15,2


## Let's merge the COVID-19 data into our main dataframe

In [10]:
data_merged = data_merged.merge(maine_covid, on="COUNTYNAME", how = 'inner')
data_merged.head()

Unnamed: 0,COUNTYNAME,2016,2021,2026,2031,2036,2016-2021,2021-2026,2026-2031,2031-2036,2016-2036,STATE,CWA,FIPS,TIME_ZONE,FE_AREA,LON,LAT,DATA_REFRESH_DT,DATA_AS_OF_DT,CASES,DEATHS,RECOVERIES,HOSPITALIZATIONS
0,Androscoggin,107269,107967.801439,108578.742776,108769.413854,108643.149064,0.006514,0.005659,0.001756,-0.001161,0.01281,ME,GYX,23001,E,sw,-70.2045,44.1654,2020-07-15,2020-07-14,506,4,467,38
1,Aroostook,68116,67928.772431,67735.738377,67337.098159,66857.364164,-0.002749,-0.002842,-0.005885,-0.007124,-0.018478,ME,CAR,23003,E,nn,-68.5988,46.6588,2020-07-15,2020-07-14,24,1,20,4
2,Cumberland,290905,294711.329744,298631.65098,301376.156664,302943.736659,0.013084,0.013302,0.00919,0.005201,0.041384,ME,GYX,23005,E,sw,-70.4096,43.8487,2020-07-15,2020-07-14,1901,67,1607,181
3,Franklin,30071,29963.778184,30029.549316,29967.173578,29772.523196,-0.003566,0.002195,-0.002077,-0.006495,-0.009926,ME,GYX,23007,E,ww,-70.444,44.974,2020-07-15,2020-07-14,41,1,37,3
4,Hancock,54398,54588.458399,54679.273561,54521.55833,54152.494022,0.003501,0.001664,-0.002884,-0.006769,-0.004513,ME,CAR,23009,E,se,-68.3531,44.6749,2020-07-15,2020-07-14,18,1,15,2


# Foursquare data

Now that we have some interesting data regarding populaion change projections and the current state of COVID-19, we can explore some venue data from Foursquare.

Maine is heavily dependent on tourism for its economy and seeing where venues are, and how COVID-19 is affecting them currently, could lead us to help predict where we might want to setup more business opportunities to support the economy or pull the plug on businesses that might not fare well in the environment.

In [11]:
api_info = pd.read_csv(r'data\4SQ_API_KEYS.csv')
CID = api_info['CLIENT ID'].values[0]
CSEC = api_info['CLIENT SECRET'].values[0]
VER = '20200715'

In [14]:
categories_url = 'https://api.foursquare.com/v2/venues/categories?client_id={}&client_secret={}&v={}'.format(
            CID, 
            CSEC, 
            VER)
            
results = requests.get(categories_url).json()

print(results)

categories_list = []
# Let's print only the top-level categories and their IDs and also add them to categories_list

def print_categories(categories, level=0, max_level=0):    
    if level>max_level: return
    out = ''
    out += '-'*level
    for category in categories:
        print(out + category['name'] + ' (' + category['id'] + ')')
        print_categories(category['categories'], level+1, max_level)
        categories_list.append((category['name'], category['id']))
        
print_categories(results['response']['categories'], 0, 0)

{'meta': {'code': 429, 'errorType': 'quota_exceeded', 'errorDetail': 'Quota exceeded', 'requestId': '5f0f385c7dadce74d12989f6'}, 'response': {}}


KeyError: 'categories'

In [None]:
def get_venues_count(lat, long, radius, categoryId):
    explore_url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}'.format(
                CID, 
                CSEC, 
                VER,
                lat,
                long,
                radius,
                categoryId)

    # make the GET request
    return requests.get(explore_url).json()['response']['totalResults']

In [None]:
df_venues = data_merged.copy()
for c in categories_list:
    df_venues[c[0]] = 0
    
df_venues.head()

In [None]:
for i, row in df_venues.iterrows():
    for c in categories_list:        
        df_venues.loc[i, c[0]] = get_venues_count(df_venues.iloc[i].LAT, df_venues.iloc[i].LON, radius=1000, categoryId=c[1])
    print('{} County ({}, {}) data gathering complete'.format(df_venues.iloc[i].COUNTYNAME, df_venues.iloc[i].LAT, df_venues.iloc[i].LON))
    df_venues.to_csv(r'data\area_venues.csv')

In [None]:
df_venues=pd.read_csv(r'data\area_venues.csv', index_col=0)
df_venues.head()

# Exploratory Data Analysis

Lets make a box plot for Venues

Normalize the data using MinMaxScaler (scale from 0 to 1). This scales the data and provides an easy to interpret score at the same time.

In [None]:
X = df_venues.values[:,24:]
scaled_dataset = MinMaxScaler().fit_transform(X)
df_scaled = pd.DataFrame(scaled_dataset)
df_scaled.columns = [c[0] for c in categories_list]
df_scaled.head()

In [None]:
plt.figure(figsize=(14,8))
ax = sns.boxplot(data = df_scaled)
ax.set_ylabel('Count of venues (relative)', fontsize=25)
ax.tick_params(labelsize=20)
plt.xticks(rotation=45, ha='right')

plt.show()