In [None]:
import pandas as pd
from pprint import pprint as pp
import matplotlib.pyplot as plt
import math as m

In [None]:
# Load the fire-incident records from CSV and preview the first rows
incidents_csv = "data/California_Fire_Incidents.csv"
raw_data = pd.read_csv(incidents_csv)
raw_data.head()

In [None]:
# Print every column label of the raw frame, one per line
for column_label in raw_data.columns:
    print(column_label)

In [None]:
# Summary statistics for every column, whatever its dtype
raw_summary = raw_data.describe(include="all")
raw_summary

In [None]:
# Summary statistics restricted to the three geographic columns
geo_columns = ["Latitude", "Longitude", "Location"]
raw_data[geo_columns].describe(include="all")

In [None]:
# Latitude clean-up.
# Valid longitude lies within [-180:180] - raw data range is [-124.196:118.908]
# Valid latitude lies within [-90:90]    - raw data range is [-120.258:5487.0000]
# (raw_data.shape was (1636, 40) when this was written)

# Keep rows whose latitude is strictly between -90 and 90.
# A missing latitude fails both comparisons, so those rows are dropped too.
latitude = raw_data['Latitude']
latitude_is_valid = latitude.gt(-90) & latitude.lt(90)
clean_data_1 = raw_data[latitude_is_valid]
clean_data_1[["Latitude", "Longitude", "Location"]].describe(include="all")

In [None]:
# Scatter of incident coordinates after the latitude clean-up.
# Note the orientation: latitude runs along the x-axis and longitude along the y-axis.
fig, ax = plt.subplots(figsize=(10,5))
ax.scatter(clean_data_1.Latitude, clean_data_1.Longitude)
ax.set_xlabel("Latitude, deg")
ax.set_ylabel("Longitude, deg")
ax.set_title("Bubble map of amount of acres burned")
ax.grid(alpha=0.3)

In [None]:
# Restrict the records to California's bounding box to remove the remaining
# non-California outliers.
# Reference extents from https://www.netstate.com/states/geography/ca_geography.html:
#      Longitude: 114° 8' W to 124° 24' W
#      Latitude: 32° 30' N to 42° N
# West longitudes are negative in decimal degrees, so 124° 24' W is -124.4.
latitude_in_state = (clean_data_1['Latitude'] > 32) & (clean_data_1['Latitude'] < 42)
clean_data_2 = clean_data_1[latitude_in_state]

# Both bounds must be tested against Longitude (the western bound previously tested
# Latitude, which is always > -124 at this point, so it filtered nothing).
# The western bound is -124.4 rather than -124 so that incidents located between
# 124° W and 124° 24' W, which are inside the reference extent, are kept.
longitude_in_state = (clean_data_2['Longitude'] < -114) & (clean_data_2['Longitude'] > -124.4)
clean_data_3 = clean_data_2[longitude_in_state]

print("Number of records after Latitude sorting\t" + str(clean_data_2.shape))
print("Number of records after Longitude sorting\t" + str(clean_data_3.shape))


In [None]:
# Scatter of incident coordinates after the California bounding-box filter.
# Note the orientation: latitude runs along the x-axis and longitude along the y-axis.
fig, ax = plt.subplots(figsize=(5,10))
ax.scatter(clean_data_3.Latitude, clean_data_3.Longitude)
ax.set_xlabel("Latitude, deg")
ax.set_ylabel("Longitude, deg")
ax.set_title("Bubble map of amount of acres burned")
ax.grid(alpha=0.3)

### Source California County information from Wikipedia 
This is done to obtain a listing of all the counties, along with their population and area data.

In [None]:
# Parse every HTML table on the Wikipedia county list into a list of DataFrames,
# then report how many tables were found
counties_url = "https://en.wikipedia.org/wiki/List_of_counties_in_California"
raw_california_counties = pd.read_html(counties_url)
len(raw_california_counties)

In [None]:
# Take the second table parsed from the page (index 1) ...
second_table = raw_california_counties[1]
# ... and list its column labels
second_table.columns

In [None]:
# Clean California counties data.
# Work on a copy: a plain assignment would only alias the parsed table, so later
# in-place edits (such as renaming the columns) would also alter raw_california_counties[1].
clean_california_counties_1 = raw_california_counties[1].copy()
clean_california_counties_1.head()

In [None]:
# Replace the table's ten column labels with short, code-friendly names (assigned in place)
county_column_names = [
    'County', 'FIPS', 'Seat', 'Date_Established', 'FormedFrom',
    'Etymology', 'Law', 'Population_Jul_2019', 'Area', 'Map',
]
clean_california_counties_1.columns = county_column_names
clean_california_counties_1.head()

In [None]:
# Keep only the columns used in the rest of the analysis.
# .copy() makes this an independent frame, so the column assignments performed on it
# later do not raise SettingWithCopyWarning or depend on copy-vs-view behaviour.
clean_california_counties_2 = clean_california_counties_1[['County', 'Date_Established','Population_Jul_2019', 'Area']].copy()
clean_california_counties_2.head()

In [None]:
# Remove the literal string " County" from the "County" column.
# assign() builds a new frame instead of writing into a column selection of
# clean_california_counties_1 (which triggers SettingWithCopyWarning);
# regex=False states explicitly that the pattern is plain text, not a regular expression.
clean_california_counties_2 = clean_california_counties_2.assign(
    County=clean_california_counties_2["County"].str.replace(" County", "", regex=False)
)
clean_california_counties_2.head()

In [None]:
# Split the Area column into two columns, one per unit.
# The text is split at the first "(" only (n=1), so the result never has more than two parts:
# part 0 is the square-mile text, part 1 the square-kilometre text.
area_parts = clean_california_counties_2["Area"].str.split("(", n=1, expand=True)

# assign() returns a new frame rather than writing into a selection of another frame
clean_california_counties_2 = clean_california_counties_2.assign(
    AreaSqMi=area_parts[0],
    AreaKm2=area_parts[1],
)
print(clean_california_counties_2.columns)

# Drop the old Area column now that both units have their own column
clean_california_counties_2 = clean_california_counties_2.drop(columns=["Area"])
clean_california_counties_2.head()

In [None]:
# Find which special characters are used for spaces in the area values.
# The first row's cells at column positions 3 and 4 are shown as raw strings; their repr
# exposes non-printing characters such as '\xa0'.
# Only the last expression of a cell is displayed, so both values are returned together
# as one tuple instead of two separate statements (the first of which would be discarded).
clean_california_counties_2.iloc[0,3], clean_california_counties_2.iloc[0,4]

In [None]:
# Strip the unit text (written with the special '\xa0' character in place of blank spaces)
# and the thousands separators, then store both area columns as 64-bit integers
for area_column, unit_text in (("AreaSqMi", "\xa0sq\xa0mi"), ("AreaKm2", "\xa0km2)")):
    without_unit = clean_california_counties_2[area_column].str.replace(unit_text, "", regex=False)
    digits_only = without_unit.str.replace(",", "", regex=False)
    clean_california_counties_2[area_column] = digits_only.astype('int64')
clean_california_counties_2.head()

In [None]:
# Summary statistics for every column of the cleaned county table
counties_summary = clean_california_counties_2.describe(include='all')
counties_summary

In [None]:
# Order counties from most to least populous and show the first five
by_population = clean_california_counties_2.sort_values(by='Population_Jul_2019', ascending=False)
by_population.head()

In [None]:
# Calculate population density per square mile.
# The column is created directly from the ratio; no placeholder assignment is needed first.
clean_california_counties_2['PopDensityPerSqMi'] = (
    clean_california_counties_2['Population_Jul_2019'] / clean_california_counties_2['AreaSqMi']
)

# Sort by population density in descending order to view the 5 most densely populated counties
clean_california_counties_2.sort_values(['PopDensityPerSqMi'], ascending=False).head()

In [None]:
# Bubble chart: county population (x) against land area (y) on log-log axes,
# with the marker area proportional to population density.
x = clean_california_counties_2.Population_Jul_2019
y = clean_california_counties_2.AreaSqMi
z = clean_california_counties_2.County
size = clean_california_counties_2.PopDensityPerSqMi/10   # marker area = density / 10

plt.figure(figsize=(15,10))
scatter = plt.scatter(x, y, s=size, alpha=0.4, c = "black")

plt.xlabel("Population @ Jul'19")
plt.ylabel("Area, sq.mi")
plt.grid(alpha=0.3)
plt.xscale("log")
plt.yscale("log")

# Produce a legend with a cross section of sizes from the scatter.
# `func` undoes the /10 scaling applied to the marker sizes, so the legend labels
# read as actual population densities and the title needs no scale factor.
handles, labels = scatter.legend_elements("sizes", num=4, func=lambda s: s * 10)
legend = plt.legend(handles, labels,
           loc = "lower left",
           title = "Population Density\nper sq.mi",
           labelspacing = 3,
           handlelength=5,
           borderpad = 2)

# Matplotlib renamed Legend.legendHandles to Legend.legend_handles; support both spellings
legend_handles = legend.legend_handles if hasattr(legend, "legend_handles") else legend.legendHandles
for handle in legend_handles:
    handle.set_color('blue')

# Label every point with its county name. zip pairs the values by position,
# so the labelling does not depend on the frame's index labels.
for txt, population, area in zip(z, x, y):
    plt.annotate(txt, (population, area))


In [None]:
# Source page: https://www.counties.org/data-and-research
# Workbook:    https://www.counties.org/sites/main/files/file-attachments/datapile_-_headline_datasets_-_current.xlsx

# Read the "County & Government" sheet of the locally saved workbook
caucus_workbook = "./data/datapile_-_headline_datasets_-_current.xlsx"
raw_california_coucus = pd.read_excel(caucus_workbook, sheet_name="County & Government")

# Keep only the County and Caucus columns
caucus_columns = ['County', 'Caucus']
clean_california_caucus = raw_california_coucus[caucus_columns]
clean_california_caucus.head()

In [None]:
# Merge the counties data with the caucus data on the county name.
# merge() performs an inner join by default, so a county missing from either table is dropped.
clean_california_counties_merged = clean_california_counties_2.merge(clean_california_caucus, on='County')
clean_california_counties_merged.head()

In [None]:
# Distinct values found in the Caucus column
clean_california_counties_merged["Caucus"].unique()

In [None]:
# Scatter of county population (x) against land area (y) on log-log axes,
# one marker shape per Caucus value, coloured by population density.

# Prep data for plotting input.
# Coordinates and labels are taken from the merged frame, the same frame the points are
# drawn from, so every annotation belongs to a plotted point.
x = clean_california_counties_merged.Population_Jul_2019
y = clean_california_counties_merged.AreaSqMi
counties = clean_california_counties_merged.County

# Define variety of markers to be used; they are reused cyclically if there are
# more Caucus values than shapes
shape = ["o", "v", "s"]

# One colour scale shared by all groups. Without it each scatter call normalises its own
# subset, so colours are not comparable between groups and the colorbar (which is built
# from the last scatter only) would not describe the other markers.
density = clean_california_counties_merged.PopDensityPerSqMi
density_min, density_max = density.min(), density.max()

# Setup figure
plt.figure(figsize=(15,10))

# Run through each county type.
# groupby(sort=False) yields the groups in order of first appearance and skips rows
# whose Caucus value is missing.
for i, (item, data) in enumerate(clean_california_counties_merged.groupby('Caucus', sort=False)):
    plt.scatter(data.Population_Jul_2019, data.AreaSqMi,
        s = 150,
        c = data.PopDensityPerSqMi,
        cmap = 'jet',
        vmin = density_min,
        vmax = density_max,
        marker = shape[i % len(shape)],
        alpha = 0.4,
        label = item
        )

# Set up, label and view colorbar
cbar = plt.colorbar()
cbar.set_label('Population density per 1 sq.mi')

#  Set up X, Y axes labels
plt.xlabel("Population @ Jul'19")
plt.ylabel("Area, sq.mi")

#  Set up legend
legend = plt.legend(loc="upper left", fontsize=10, labelspacing=1)

# Change the legend marker sizes manually to make them all the same.
# Matplotlib renamed Legend.legendHandles to Legend.legend_handles; support both spellings.
legend_handles = legend.legend_handles if hasattr(legend, "legend_handles") else legend.legendHandles
for handle in legend_handles:
    handle.set_sizes([100.0])

# view grid and set transparency to 30%
plt.grid(alpha=0.3)

# Change both x and y scales to logarithmic
plt.xscale("log")
plt.yscale("log")

# Label every point with its county name, offset slightly up and to the right.
# zip pairs the values by position, so the labelling does not depend on index labels.
for txt, population, area in zip(counties, x, y):
    plt.annotate(txt, (population*1.1, area*1.15), fontsize = 10)