# Geospatial Analysis

This analysis can give a geospatial overview of various cancer metrics across different counties. The following can be examined:

- **Average Annual Count (avganncount)**: Average annual count of cancer cases in the county.
- **Average Deaths Per Year (avgdeathsperyear)**: Average number of deaths per year in the county.
- **Incidence Rate (incidencerate)**: Number of incidents per 100k individuals in each county.
- **Target Death Rate (target_deathrate)**: Number of deaths per 100k individuals in each county.

By visualizing these on a map, you can look for geographic patterns.

## Cleaning the data

In [None]:
import pandas as pd

In [None]:
# Read the two source CSV files (paths are relative to the working directory)
HOUSEHOLD_CSV = "Resources/avg-household-size.csv"
CANCER_CSV = "Resources/cancer_reg.csv"

household_df = pd.read_csv(HOUSEHOLD_CSV)
cancer_df = pd.read_csv(CANCER_CSV)

In [None]:
# Inner-join the two tables on their shared "geography" column, so only
# geography values present in both files are kept, then print the schema
df = household_df.merge(cancer_df, on="geography", how="inner")
df.info()

In [None]:
# Clean up some of the columns.
# Fill missing values with the column mean. The result is assigned back to
# the column instead of calling fillna(inplace=True) on df[col]: that form is
# chained assignment, which raises a FutureWarning on pandas 2.x and leaves
# df unchanged when copy-on-write is enabled.
for column in ["pctprivatecoveragealone", "pctemployed16_over"]:
    df[column] = df[column].fillna(df[column].mean())

# `columns=` already names the axis, so the redundant `axis=1` is omitted.
# index_x / index_y are presumably the suffixed `index` columns produced by
# the merge -- TODO confirm against the CSV headers.
df.drop(columns=["pctsomecol18_24", "index_x", "index_y"], inplace=True)

In [None]:
# Summary statistics, transposed so each described column becomes one row
df.describe().transpose()

In [None]:
# Median age has some unusual values: list the ones above 80, then
# summarise the distribution of the values below 80
median_age = df["medianage"]
display(median_age[median_age > 80])
median_age[median_age < 80].describe()

In [None]:
# Scale median ages above 80 down by a factor of 10; all other values are
# left untouched.
# NOTE(review): dividing by 10 is a heuristic. The outliers may be recorded
# in a different unit, so confirm the right correction against the data source.
median_age = df['medianage']
df['medianage'] = median_age.mask(median_age > 80, median_age / 10)

In [None]:
# Some of the cancer data in this dataset was incorrectly imputed. Show the
# five most frequent values of each metric: an imputed constant stands out
# as a single value shared by many counties.
suspect_metrics = ("avganncount", "avgdeathsperyear", "target_deathrate", "incidencerate")
for metric in suspect_metrics:
    most_common = df[metric].value_counts().head(5)
    display(most_common)

In [None]:
# Recalculate the means without the badly imputed counties, then write the
# corrected means back into those counties.

# Placeholder value of avganncount that identifies the imputed counties.
IMPUTED_AVGANNCOUNT = 1962.667684

# Compare with a small tolerance rather than `==`: the column was parsed from
# a CSV, so exact equality with a float literal is fragile.
is_placeholder = (df["avganncount"] - IMPUTED_AVGANNCOUNT).abs() < 1e-6
counties = df.loc[is_placeholder, "geography"]

# Build the row mask once; every statement below selects on the same rows.
imputed_rows = df["geography"].isin(counties)

avganncount_mean = df.loc[~imputed_rows, "avganncount"].mean()
incidencerate_mean = df.loc[~imputed_rows, "incidencerate"].mean()

print(f"New avganncount mean: {avganncount_mean}")
print(f"New incidencerate mean: {incidencerate_mean}")

# NOTE(review): incidencerate is overwritten for the same counties on the
# assumption that both columns were imputed together -- confirm.
df.loc[imputed_rows, "avganncount"] = avganncount_mean
df.loc[imputed_rows, "incidencerate"] = incidencerate_mean

# Choropleth maps 

In [None]:
# Adapted from https://plotly.com/python/choropleth-maps/
import json
from urllib.request import urlopen

import plotly.express as px

# Download the county GeoJSON used to draw the map.
# NOTE: this rebinds `counties`, which the imputation cell uses for the Series
# of county names; re-run that cell only before this one.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

# Build the 5-character FIPS code (2-digit state + 3-digit county, both
# zero-padded) BEFORE taking the copy. Previously the copy was taken first,
# so choropleth_df had no 'fips' column and px.choropleth(locations='fips')
# failed on the first run of this cell.
df['fips'] = (
    df['statefips'].astype(int).astype(str).str.zfill(2)
    + df['countyfips'].astype(int).astype(str).str.zfill(3)
)

choropleth_df = df.copy()

fig = px.choropleth(
    choropleth_df,
    geojson=counties,
    locations='fips',
    color='target_deathrate',
    color_continuous_scale="Plasma",
    scope="usa",
    labels={'target_deathrate': 'Target Death Rate'},
)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})

fig.write_image("results/figures/county_level_cancer_mortality_rates.png")
fig.show()


# Regional Analysis

The region and division groupings below follow the official [Census Regions and Divisions of the United States](https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf) reference map (PDF); each state is identified by its FIPS code.

In [None]:
# State FIPS codes grouped by census division, one list per division.

# --- West Region ---
pacific_fips = [
    2, 6, 15, 41, 53,  # AK, CA, HI, OR, WA
]
mountain_fips = [
    4, 8, 16, 30, 32, 35, 49, 56,  # AZ, CO, ID, MT, NV, NM, UT, WY
]

# --- Midwest Region ---
west_north_central_fips = [
    19, 20, 27, 29, 31, 38, 46,  # IA, KS, MN, MO, NE, ND, SD
]
east_north_central_fips = [
    17, 18, 26, 39, 55,  # IL, IN, MI, OH, WI
]

# --- Northeast Region ---
middle_atlantic_fips = [
    34, 36, 42,  # NJ, NY, PA
]
new_england_fips = [
    9, 23, 25, 33, 44, 50,  # CT, ME, MA, NH, RI, VT
]

# --- South Region ---
west_south_central_fips = [
    5, 22, 40, 48,  # AR, LA, OK, TX
]
east_south_central_fips = [
    1, 21, 28, 47,  # AL, KY, MS, TN
]
south_atlantic_fips = [
    10, 11, 12, 13, 24, 37, 45, 51, 54,  # DE, DC, FL, GA, MD, NC, SC, VA, WV
]

In [None]:
# Combine the division lists into one list of state FIPS codes per region
west_fips = [*pacific_fips, *mountain_fips]
midwest_fips = [*west_north_central_fips, *east_north_central_fips]
northeast_fips = [*middle_atlantic_fips, *new_england_fips]
south_fips = [*west_south_central_fips, *east_south_central_fips, *south_atlantic_fips]

In [None]:
import numpy as np

# Region label -> state FIPS codes that belong to it
fips_by_region = {
    'West': west_fips,
    'Midwest': midwest_fips,
    'Northeast': northeast_fips,
    'South': south_fips,
}

# Label each row with the region whose list contains its state FIPS code;
# rows that match none of the lists are labelled 'Unknown'
df['region'] = np.select(
    [df['statefips'].isin(codes) for codes in fips_by_region.values()],
    list(fips_by_region),
    default='Unknown',
)

# Box plot of the death rate per region, leaving out rows with no region
rows_with_region = df[df['region'] != 'Unknown']
fig = px.box(
    rows_with_region,
    x='region',
    y='target_deathrate',
    color='region',
    labels={'target_deathrate': 'Target Death Rate'},
)

fig.write_image("results/figures/region_level_cancer_mortality_rates.png")
fig.show()