# Urban Growth Boundary (UGB)

In [None]:
#import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import scipy.stats as stats
from scipy.stats import linregress
import gmaps

#import API keys: 
from config import api_key

# Path of the CSV file intended to hold the saved output data.
# NOTE(review): no visible cell writes to this path — confirm it is still needed.
output_file = "output_data/ugb.csv"


In [None]:
# Start of code for Question 1: housing types

In [None]:
# Load the cleaned housing-types data into a DataFrame
housing_csv = "../Data/HousingTypes_Cleaned.csv"
housing = pd.read_csv(housing_csv)

### Question 1 - Part A: Do cities with a UGB have more multi-unit structures than non-UGB cities?

In [None]:
# Sum the numeric response columns per UGB group.
# numeric_only=True keeps non-numeric columns (e.g. City/State, presumably
# text) out of the aggregation; without it, pandas >= 2.0 concatenates the
# strings and the division below raises a TypeError.
units = housing.groupby(["UGB"]).sum(numeric_only=True)

# Express every column as a percentage of the group's "Total Response"
units = units.div(units["Total Response"], axis=0) * 100

units

In [None]:
# Reshape the percentage table so it can be plotted as a summary graph.
# Columns that are not unit-count categories are removed first
non_category_cols = ["Total Response", "Median Housing Costs", "MOE Housing Costs"]
units_graph = units.drop(columns=non_category_cols)
# Swap rows and columns so each remaining category becomes a row
graph_units = units_graph.T
# Move the category names out of the index into a regular column
graph_structure = graph_units.reset_index()
graph_structure

In [None]:
# Grouped bar graph: percent of responses per unit category, non-UGB vs. UGB.
# One slot per row of graph_structure, plus the width of a single bar
pos = list(range(len(graph_structure["No"])))
width = 0.25

# X tick labels, one per row of graph_structure
x_labels = ("1 Unit", "2 Units", "3 or 4 Units", "5 to 9 Units", "10 to 19 Units", "20 to 49 Units", "50 or more Units", "Other")

fig, ax = plt.subplots(figsize=(20,10))

# Non-UGB bars sit on the slot position, UGB bars one bar-width to the right.
# Each bar set carries its own legend text as its label.
ax.bar(pos, graph_structure["No"], width, color="blue", label="Non UGB Cities")
ax.bar([p + width for p in pos], graph_structure["Yes"], width, color="red", label="UGB Cities")

# Place each tick midway between its pair of bars
ax.set_xticks([p + .5 * width for p in pos])
ax.set_xticklabels(x_labels, fontsize=16)
ax.set_xlim(min(pos)-width*2, max(pos)+width*3)

# Y axis range and label
ax.tick_params(axis='y', labelsize=14)
ax.set_ylim(0,60)
ax.set_ylabel("Percent", fontsize=16)

# Title and legend
ax.set_title("Percent of Responses for Units in Structure by UGB Cities", fontsize=20)
ax.legend(fontsize=18)

# Fit the layout first so the saved image matches what is displayed
plt.tight_layout()
plt.savefig("../Images/UnitsinStructure.png")
plt.show()

In [None]:
# Collapse the unit categories into single-unit and multi-unit counts.
# Work on a copy so the housing frame is left untouched
structure = housing.copy()

# Single unit is the "1 unit" column as-is
structure["Single Unit"] = structure["1 unit"]
# Multi unit is the sum of every category with two or more units
structure["Multi Units"] = (
    structure["2 units"]
    + structure["3 or 4 units"]
    + structure["5 to 9 units"]
    + structure["10 to 19 units"]
    + structure["20 to 49 units"]
    + structure["50 or more units"]
)

# Total over the two derived columns only
structure["Updated Total"] = structure["Single Unit"] + structure["Multi Units"]

# Keep the identifying columns and the derived counts
keep_cols = ["City", "State", "UGB", "Single Unit", "Multi Units", "Updated Total"]
structure_type = structure[keep_cols].copy()
structure_type

In [None]:
# Sum the unit counts per UGB group.
# numeric_only=True keeps the non-numeric City/State columns out of the
# aggregation; without it, pandas >= 2.0 concatenates their strings and the
# division below raises a TypeError.
type_graph = structure_type.groupby(["UGB"]).sum(numeric_only=True)

# Convert the counts to percentages of each group's "Updated Total"
type_graph = type_graph.div(type_graph["Updated Total"], axis=0) * 100

# Turn the UGB index back into a regular column
type_graph = type_graph.reset_index()
type_graph

In [None]:
# Grouped bar graph: single-unit vs. multi-unit percentages per UGB group.
# One slot per row of type_graph, plus the width of a single bar
posi = list(range(len(type_graph["Single Unit"])))
width = 0.25

# X tick labels, one per row of type_graph
xlabels = ("Non UGB Cities", "UGB Cities")

fig, ax = plt.subplots(figsize=(18,10))

# Single-unit bars sit on the slot position, multi-unit bars one bar-width
# to the right. Each bar set carries its own legend text as its label.
ax.bar(posi, type_graph["Single Unit"], width, color="blue", label="Single-Unit")
ax.bar([p + width for p in posi], type_graph["Multi Units"], width, color="red", label="Multi-Unit")

# Place each tick midway between its pair of bars
ax.set_xticks([p + .5 * width for p in posi])
ax.set_xticklabels(xlabels, fontsize=16)
ax.set_xlim(min(posi)-width*2, max(posi)+width*3)

# Y axis range and label
# NOTE(review): any bar above 60 would be clipped — confirm the range fits the data.
ax.tick_params(axis='y', labelsize=14)
ax.set_ylim(0,60)
ax.set_ylabel("Percent", fontsize=16)

# Title and legend
ax.set_title("Percent of Responses for Single-Unit and Multi-Unit Structures", fontsize=20)
ax.legend(fontsize=18)

# Fit the layout first so the saved image matches what is displayed
plt.tight_layout()
plt.savefig("../Images/SingleUnitvMultiUnit.png")
plt.show()

In [None]:
# Build the observed/expected table for the chi-squared goodness-of-fit test.
# Observed values: unit counts summed over the UGB cities
ugb_types = structure_type.loc[structure_type["UGB"] == "Yes"]
ugb_singles = ugb_types["Single Unit"].sum()
ugb_multis = ugb_types["Multi Units"].sum()
ugb_total = ugb_types["Updated Total"].sum()

# Expected rates: the non-UGB percentages, looked up by label rather than by
# row/column position so a change in column order cannot pick the wrong value
non_ugb_rates = type_graph.loc[type_graph["UGB"] == "No"].iloc[0]
single_rate = non_ugb_rates["Single Unit"] / 100
multi_rate = non_ugb_rates["Multi Units"] / 100

# Expected counts if the UGB cities followed the non-UGB rates
single_expected = ugb_total * single_rate
multi_expected = ugb_total * multi_rate

# Store the expected counts at full precision: rounding them changes the test
# statistic and can make the observed and expected totals disagree, which
# scipy.stats.chisquare rejects.
ugb_chi = pd.DataFrame({
    "Type": ["Single Unit", "Multi Unit"],
    "Observed": [ugb_singles, ugb_multis],
    "Expected": [single_expected, multi_expected]
})
# Round for display only
ugb_chi.round({"Expected": 2})

In [None]:
# Chi-squared goodness-of-fit test
# Critical value of the chi-squared distribution at q = 0.95 with 1 degree of freedom
critical_value = stats.chi2.ppf(q=0.95, df=1)
print(f"The critical value = {round(critical_value,2)}")

# Test the observed counts against the expected counts
observed_counts = ugb_chi["Observed"]
expected_counts = ugb_chi["Expected"]
stats.chisquare(f_obs=observed_counts, f_exp=expected_counts)


### Question 1 - Part B: Do cities with a UGB have higher overall cost of housing than non-UGB cities?

In [None]:
# Select the city identifiers, the UGB flag and the housing-cost columns
cost_cols = ["City", "State", "UGB", "Median Housing Costs", "MOE Housing Costs"]
costs = housing[cost_cols]
costs

In [None]:
# Split the rows into UGB and non-UGB groups
is_ugb = costs["UGB"] == "Yes"
is_non = costs["UGB"] == "No"
costs_ugb = costs.loc[is_ugb]
costs_non = costs.loc[is_non]

# Median housing costs of each group are the two samples
ugb_cost = costs_ugb["Median Housing Costs"]
non_cost = costs_non["Median Housing Costs"]

# Independent t-test that does not assume equal variances
stats.ttest_ind(ugb_cost, non_cost, equal_var=False)

In [None]:
# Box plot of median housing costs for UGB and non-UGB cities.
# Small frame indexed by the UGB flag to draw the two samples from
costs_box = costs[["UGB", "Median Housing Costs"]].copy()
costs_box.set_index("UGB", inplace=True)

# Select each group with a boolean mask so the result is always a Series;
# a plain label lookup returns a scalar when a group holds a single row,
# and list() would then fail.
ugb_box = list(costs_box.loc[costs_box.index == "Yes", "Median Housing Costs"])
non_ugb = list(costs_box.loc[costs_box.index == "No", "Median Housing Costs"])

# Samples in the order of the x tick labels below
cities_box = [ugb_box, non_ugb]

# Appearance of the outlier dots and the median line
flierprops = dict(markerfacecolor='black', marker='o', markersize=12)
medianprops = dict(linestyle='-', color="blue", linewidth=2.5)

# Draw the box plot
fig, ax = plt.subplots(figsize=(15,10))
bp = ax.boxplot(cities_box, flierprops=flierprops, medianprops=medianprops)

# Y axis range and tick size
ax.tick_params(axis='y', labelsize=14)
ax.set_ylim(500,3000)

# Title and labels
ax.set_title("Distribution of Median Housing Costs", fontsize=20)
ax.set_ylabel("Median Housing Prices ($)", fontsize=18)
ax.set_xticklabels(["UGB Cities", "Non UGB Cities"], fontsize=18)

# Fit the layout first so the saved image matches what is displayed
plt.tight_layout()
plt.savefig("../Images/MedianHousing.png")
plt.show()

In [None]:
# End of code for Question 1: housing types

In [None]:
# Start of code for Question 2: public transportation

In [None]:
# End of code for Question 2: public transportation

In [None]:
# Start of code for Question 3: population growth

# Question 3: Do UGB cities have lower population growth than the top 100 US cities? Is it lower than the average US population growth?

In [None]:
#setup dependencies
import pandas as pd
import numpy as np 
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import linregress
import gmaps
import requests
import json
from config import gkey

In [None]:
# File names of the population and growth CSVs: UGB cities first, then top 100 cities
UGB_Popcsv, UGB_Growcsv = "UGB_pop.csv", "UGB_PopGrowth.csv"
top100_Popcsv, top100_Growcsv = "Top100_CitiesPop.csv", "Pop_Growth.csv"

In [None]:
# Read the four CSVs into DataFrames:
# UGB population and growth, then top 100 population and growth
UGBpop_df, UGBgro_df = pd.read_csv(UGB_Popcsv), pd.read_csv(UGB_Growcsv)
Toppop_df, Topgro_df = pd.read_csv(top100_Popcsv), pd.read_csv(top100_Growcsv)

In [None]:
# Left-join the top 100 growth table onto the top 100 population table.
# "ID" is the single join key, so it is passed once.
top100_df = pd.merge(Toppop_df, Topgro_df, how="left", on="ID")

In [None]:
# Left-join the UGB growth table onto the UGB population table.
# "ID" is the single join key, so it is passed once.
UGB_df = pd.merge(UGBpop_df, UGBgro_df, how="left", on="ID")

In [None]:
# Tidy the merged UGB frame.
# Drop both rank columns, the duplicated city/state columns from the
# right-hand table and the two "Estimated Population" columns
UGB_df.drop(
    columns=["Rank_x", "Rank_y", "City_y", "State_y",
             "Estimated Population 2017", "Estimated Population 2018"],
    inplace=True,
)
# Strip the merge suffix from the remaining city/state columns
# (Rank_x is already dropped above, so it needs no rename)
UGB_df.rename(columns={"City_x": "City", "State_x": "State"}, inplace=True)
UGB_df

In [None]:
# Tidy the merged top 100 frame.
# Drop both rank columns, the duplicated city/state columns from the
# right-hand table and the two "Estimated Population" columns
top100_df.drop(
    columns=["Rank_x", "Rank_y", "City_y", "State_y",
             "Estimated Population 2017", "Estimated Population 2018"],
    inplace=True,
)
# Strip the merge suffix from the remaining city/state columns
# (Rank_x is already dropped above, so it needs no rename)
top100_df.rename(columns={"City_x": "City", "State_x": "State"}, inplace=True)
top100_df

In [None]:
# One-row summary statistics for the UGB cities
ugb_population = UGB_df["2018_Estimate"]
ugb_growth_pct = UGB_df["Percent Difference"]

# NOTE(review): "Pop Gorwth Standard Deviation" is misspelled; the label is
# left unchanged in case anything reads the column by name.
UGB_summ_df = pd.DataFrame({
    "Avg Population": [ugb_population.mean()],
    "Median Population": [ugb_population.median()],
    "Population Variance": [ugb_population.var()],
    "Population Standard Deviation": [ugb_population.std()],
    "Population SEM": [ugb_population.sem()],
    "Avg Pop Growth": [ugb_growth_pct.mean()],
    "Median Pop Growth": [ugb_growth_pct.median()],
    "Pop Growth Variance": [ugb_growth_pct.var()],
    "Pop Gorwth Standard Deviation": [ugb_growth_pct.std()],
    "Pop Growth SEM": [ugb_growth_pct.sem()],
})
UGB_summ_df

In [None]:
# One-row summary statistics for the top 100 cities
top_population = top100_df["2018_Estimate"]
top_growth_pct = top100_df["Percent Difference"]

# NOTE(review): "Pop Gorwth Standard Deviation" is misspelled; the label is
# left unchanged in case anything reads the column by name.
top100_summ_df = pd.DataFrame({
    "Avg Population": [top_population.mean()],
    "Median Population": [top_population.median()],
    "Population Variance": [top_population.var()],
    "Population Standard Deviation": [top_population.std()],
    "Population SEM": [top_population.sem()],
    "Avg Pop Growth": [top_growth_pct.mean()],
    "Median Pop Growth": [top_growth_pct.median()],
    "Pop Growth Variance": [top_growth_pct.var()],
    "Pop Gorwth Standard Deviation": [top_growth_pct.std()],
    "Pop Growth SEM": [top_growth_pct.sem()],
})

# NOTE(review): this second name (one "m") differs from top100_summ_df —
# looks like a typo; both names are kept defined.
top100_sum_df = pd.DataFrame(top100_summ_df)
top100_sum_df

In [None]:
# Bar chart of the "2018_Estimate" value for each UGB city
city_names = UGB_df["City"]
city_estimates = UGB_df["2018_Estimate"]
plt.bar(city_names, city_estimates, width=0.5, color='lightskyblue')

# Axis ranges
plt.xlim(-1,10)
plt.ylim(0,1050000)

# Title and axis labels
plt.title("Populations of UGB Cities")
plt.xlabel("City")
plt.ylabel("Population")
plt.xticks(rotation=45)

In [None]:
# Bar chart of average population growth: top 100 cities, UGB cities, entire US
# Total US population growth = .62%, from
# https://www.multpl.com/us-population-growth-rate/table/by-year
us_growth_pct = [.62]
top100_avg_growth = top100_summ_df["Avg Pop Growth"]
ugb_avg_growth = UGB_summ_df["Avg Pop Growth"]

plt.bar("Top 100 Cities", top100_avg_growth, color='lightskyblue')
plt.bar("UGB Cities", ugb_avg_growth, color='purple')
plt.bar("Entire US", us_growth_pct, color='crimson')

# Axis ranges
plt.xlim(-1,3)
plt.ylim(0,1)

# Title and axis labels
plt.title("Average Population Growth by Subset")
plt.xlabel("Subset")
plt.ylabel("Average Population Growth (%)")
plt.xticks(rotation=45)

In [None]:
# Apply each growth rate to the top 100 average population, so all three
# increases are computed from the same base
base_population = top100_summ_df["Avg Population"]
UGB_growth = (UGB_summ_df["Avg Pop Growth"] / 100) * base_population
top100_growth = (top100_summ_df["Avg Pop Growth"] / 100) * base_population
US_growth = .0062 * base_population

# Collect the three increases side by side
Pop_growth_df = pd.DataFrame({
    "UGB Pop Increase": UGB_growth,
    "Top 100 City Pop Increase": top100_growth,
    "USA Pop Increase": US_growth,
})
Pop_growth_df

In [None]:
# Independent t-test (equal variances not assumed) on "Percent Difference":
# UGB cities vs. top 100 cities
UGB_sample = UGB_df["Percent Difference"]
top100_sample = top100_df["Percent Difference"]
stats.ttest_ind(a=UGB_sample, b=top100_sample, equal_var=False)

In [None]:
# Look up latitude/longitude for each UGB city with the Google Geocoding API,
# as input for the heatmap of population growth and decline
target_cities = ["San Jose, California", "Seattle, Washington", "Portland, Oregon", "Miami, Florida", "Virginia Beach, Virginia",
                "Minneapolis, Minnesota", "Honolulu, Hawaii", "Lexington, Kentucky", "St. Paul, Minnesota", "Boulder, Colorado"]

# The endpoint is the same for every request, so it is defined once
base_url = "https://maps.googleapis.com/maps/api/geocode/json"

for city in target_cities:
    params = {"address": city, "key": gkey}
    # A timeout keeps a stalled request from hanging the cell indefinitely
    response = requests.get(base_url, params=params, timeout=10)

    cities_geo = response.json()

    # Report and skip a city with no match instead of failing with an
    # IndexError on results[0]
    results = cities_geo.get("results")
    if not results:
        print(f"{city}: no geocoding result (status: {cities_geo.get('status')})")
        continue

    location = results[0]["geometry"]["location"]
    lat = location["lat"]
    lng = location["lng"]

    print(f"{city}: {lat}, {lng}")

In [None]:
# Assemble the cities, their coordinates and their population growth in one
# DataFrame. Latitude and longitude are both stored as floats.
cities_df = pd.DataFrame({
    "City": ["San Jose, California", "Seattle, Washington", "Portland, Oregon", "Miami, Florida", "Virginia Beach, Virginia",
             "Minneapolis, Minnesota", "Honolulu, Hawaii", "Lexington, Kentucky", "St. Paul, Minnesota", "Boulder, Colorado"],
    "Lat": [37.3382082, 47.6062095, 45.5051064,
            25.7616798, 36.8529263, 44.977753,
            21.3069444, 38.0405837, 44.9537029, 40.0149856],
    "Lng": [-121.8863286, -122.3320708, -122.6750261, -80.1917902, -75.97798499999999, -93.2650108,
            -157.8583333, -84.5037164, -93.0899578, -105.2705456],
})

# NOTE(review): the growth values are matched to the cities by row index, so
# this assumes UGB_df lists the cities in the same order as above — confirm.
cities_df["Population Growth"] = UGB_df["Percent Difference"]
cities_df

In [None]:
# Heatmap of population growth for the UGB cities.
# gmaps must be given the API key before a figure is built, and no earlier
# cell configures it.
gmaps.configure(api_key=gkey)

locations = cities_df[["Lat", "Lng"]].astype(float)
growth = cities_df["Population Growth"].astype(float)

fig = gmaps.figure()

# The weights are the growth values shifted up by .6 — presumably so cities
# with negative growth still get a non-negative weight; TODO confirm.
heat_layer = gmaps.heatmap_layer(locations, weights=(growth + .6),
                                 dissipating=False, max_intensity=2.7,
                                 point_radius=1)

fig.add_layer(heat_layer)

fig

In [None]:
# Cities with positive growth: Seattle, Portland, Miami, Virginia Beach, Lexington, Minneapolis, St. Paul
# Cities with negative growth: San Jose, Honolulu, Boulder

In [None]:
# End of code for Question 3: population growth

# Question 4:  Do cities with a UGB have a higher population density than U.S. cities?


In [None]:
# Load the top 100 cities table, rename two columns and index it by city
populationcsv = "../Data/top_100clean.csv"
column_renames = {"Persons by km2": "Persons per km2", "2010 Census": "Census Population"}
population = pd.read_csv(populationcsv).rename(columns=column_renames)
population = population.set_index('City')
population

In [None]:
# Rows flagged "Yes" in the "UGB " column (the column name ends with a space)
is_ugb_city = population["UGB "] == "Yes"
ugb = population.loc[is_ugb_city, :]
ugb

In [None]:
# Rows flagged "No" in the "UGB " column (the column name ends with a space)
is_non_ugb_city = population["UGB "] == "No"
non_ugb = population.loc[is_non_ugb_city, :]

In [None]:
# "Persons per km2" column for all cities, the UGB cities and the non-UGB cities
density_col = "Persons per km2"
density = population[density_col]
ugb_density = ugb[density_col]
non_ugb_density = non_ugb[density_col]

In [None]:
# Mean, median, minimum and maximum density for all cities ...
density_mean, density_median = density.mean(), density.median()
min_dens, max_dens = density.min(), density.max()
# ... and for the UGB cities
ugb_density_mean, ugb_density_median = ugb_density.mean(), ugb_density.median()
min_ugb, max_ugb = ugb_density.min(), ugb_density.max()

In [None]:
# One-row table of the density statistics for all cities
# NOTE(review): "Minimun Density" is misspelled; the label is left unchanged
# in case anything reads the column by name.
density_summary = pd.DataFrame({
    'Mean Density': [density_mean],
    'Median Density': [density_median],
    "Minimun Density": [min_dens],
    "Maximum Density": [max_dens],
})
density_summary


In [None]:
# One-row table of the density statistics for the UGB cities
# NOTE(review): "UGB Minimun Density" is misspelled; the label is left
# unchanged in case anything reads the column by name.
ugb_density_summary = pd.DataFrame({
    'UGB Mean Density': [ugb_density_mean],
    'UGB Median Density': [ugb_density_median],
    "UGB Minimun Density": [min_ugb],
    "UGB Maximum Density": [max_ugb],
})
ugb_density_summary

In [None]:
# Independent t-test (equal variances not assumed) on population density:
# non-UGB cities vs. UGB cities
stats.ttest_ind(a=non_ugb_density, b=ugb_density, equal_var=False)

In [None]:
# Configure gmaps with the API key imported from config
gmaps.configure(api_key=api_key)

In [None]:
# Coordinates and densities as floats for the gmaps layers.
# The UGB subset of the population table is the `ugb` frame.
locations = population[["Latitude", "Longitude"]].astype(float)
ugb_locations = ugb[["Latitude", "Longitude"]].astype(float)
density = population["Persons per km2"].astype(float)
ugb_density = ugb["Persons per km2"].astype(float)

In [None]:
# Heatmap layer weighted by population density
heat_layer = gmaps.heatmap_layer(
    locations,
    weights=density,
    dissipating=False,
    max_intensity=5000,
    point_radius=1,
)

# Put the layer on a new figure
fig = gmaps.figure()
fig.add_layer(heat_layer)

In [None]:
# Symbol layer marking the UGB city locations
marker_fill = 'rgba(0, 150, 0, 0.4)'
marker_stroke = 'rgba(0, 0, 150, 0.4)'
ugb_layer = gmaps.symbol_layer(
    ugb_locations,
    fill_color=marker_fill,
    stroke_color=marker_stroke,
    scale=4,
)

# Put the markers on a new figure
fig = gmaps.figure()
fig.add_layer(ugb_layer)

In [None]:
# Stack the heatmap and the UGB markers on a new figure and display it
fig = gmaps.figure()
for layer in (heat_layer, ugb_layer):
    fig.add_layer(layer)

fig

In [None]:
# Box plots of population density: all cities vs. UGB cities
data = [density, ugb_density]
fig1, ax1 = plt.subplots()
ax1.set_title('Population Density for Top 100 US Cities vs UGB Cities')
ax1.set_ylabel('Persons per km2')
ax1.boxplot(data)
ax1.set_xticklabels(['Top 100 Cities', 'UGB Cities'])

# Fixed y-axis range
top = 5000
bottom = 0
ax1.set_ylim(bottom, top)

# Save before showing: a savefig call placed after plt.show() writes an
# empty image because the figure has already been displayed and released.
fig1.savefig("../Images/PopD_boxplot.png")
plt.show()

In [None]:
# Quartiles, interquartile range and 1.5 * IQR outlier bounds for the
# population density held in `density`
quartiles = density.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of population density is: {lowerq}")
print(f"The upper quartile of population density is: {upperq}")
print(f"The interquartile range of population density is: {iqr}")
print(f"The median of population density is: {quartiles[0.5]}")

# Values beyond 1.5 * IQR outside the quartiles are flagged as possible outliers
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# Quartiles, interquartile range and 1.5 * IQR outlier bounds for the
# UGB population density held in `ugb_density`
quartiles = ugb_density.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of UGB population density is: {lowerq}")
print(f"The upper quartile of UGB population density is: {upperq}")
print(f"The interquartile range of UGB population density is: {iqr}")
print(f"The median of UGB population density is: {quartiles[0.5]}")

# Values beyond 1.5 * IQR outside the quartiles are flagged as possible outliers
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# End of code for Question 4: population density