-------
# US EPA Air Quality
-------

### Approach
1. Keep only sensors that cover more than 84% of the dates in 2019–2020.
2. Fill in missing median_temp based on sensor latitude using Linear Regression model.


In [None]:
import plotly.offline as py 
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import panel as pn
pn.extension('tabulator')
import warnings
warnings.filterwarnings('ignore')

# choropleth map
import plotly.express as px
from urllib.request import urlopen
import json

import os
# list every file available under the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# load the preprocessed EPA sensor data from the Kaggle input directory
epa_prep = pd.read_csv('/kaggle/input/epa-cbsa-filled/epa_preprocessed.csv')
# (rows, columns) of the loaded frame
epa_prep.shape

In [None]:
# Drop the unnamed column (presumably a saved index - confirm against the CSV).
# Reassigning with errors='ignore' keeps this cell re-runnable: the in-place
# version raises KeyError once the column is already gone.
epa_prep = epa_prep.drop(columns='Unnamed: 0', errors='ignore')
# parse the date column into datetime64 values
epa_prep['date'] = pd.to_datetime(epa_prep['date'])
epa_prep.dtypes

------------------
## Dictionary
------------------

In [None]:
# sensor_id -> attribute lookups
def _sensor_lookup(column):
    """Return a {sensor_id: value} dict for `column`; for repeated sensor_ids the last row wins."""
    return pd.Series(epa_prep[column].to_numpy(), index=epa_prep['sensor_id']).to_dict()

sensor_cbsa_dict = _sensor_lookup('cbsa')
sensor_cbsa_dict0 = _sensor_lookup('cbsa0')
sensor_cbsa_dict1 = _sensor_lookup('cbsa1')
sensor_state_dict = _sensor_lookup('state')
# example dict call
sensor_cbsa_dict['01-033-1002']

------------
## Key Columns
------------

In [None]:
# columns used in the rest of the analysis; count the missing values in each
key_cols = [
    'date', 'sensor_id', 'sensor_lat', 'sensor_long', 'geometry', 'cbsa',
    'ozone', 'pm25',
    'color', 'education', 'language', 'income',
    'temp',
]
epa_prep[key_cols].isna().sum()

Great! No missing values in sensor related data.

In [None]:
# Work on an independent copy: assigning a column on a plain column selection
# of epa_prep triggers SettingWithCopyWarning (hidden here by the warnings filter).
epa = epa_prep[key_cols].copy()
# make sure the date column is datetime64
epa['date'] = pd.to_datetime(epa['date'])
epa.head(3)

In [None]:
# move the date column into the index
epa = epa.set_index('date')
epa.head(2)

In [None]:
# number of distinct values per column, largest first
unique_counts = epa.nunique()
unique_counts.sort_values(ascending=False).to_frame()

`sensor_id`, `sensor_lat` and `sensor_long` all have the same 525 unique values. This means that we can group by `sensor_id` using the median without changing the values of `sensor_lat`/`sensor_long`.

-------
# CBSA fips Dictionary
-------

In [None]:
# load the CBSA -> county FIPS crosswalk
fips_df = pd.read_csv('../input/cbsa-code/cbsa2fipsxw.csv')

code_cols = ['cbsacode','fipsstatecode','fipscountycode']
name_cols = ['cbsatitle','statename']

# keep only rows where every needed column is present
fips_df1 = fips_df[['cbsacode','cbsatitle','statename','fipsstatecode','fipscountycode']].dropna()
fips_df2 = fips_df1[code_cols].astype(int)
fips_df3 = fips_df1[name_cols]

# both halves share fips_df1's index, so an index join lines the rows up again
fips = fips_df3.join(fips_df2).reset_index(drop=True)

In [None]:
# left-pad the codes with zeros: 2 characters for the state, 3 for the county
fips['fipsstatecode'] = fips['fipsstatecode'].astype(str).str.zfill(2)
fips['fipscountycode'] = fips['fipscountycode'].astype(str).str.zfill(3)
fips['fips'] = fips['fipsstatecode'] + fips['fipscountycode']

# {cbsa : fips} lookup; when a cbsatitle appears on several rows the last row wins
cbsa_to_fips = fips.set_index('cbsatitle')['fips']
fips_dict = cbsa_to_fips.to_dict()

------
# add fips to EPA data
------

In [None]:
# look up each row's fips code from its cbsa name, then count missing values per column
epa['fips'] = epa['cbsa'].map(fips_dict)
epa.isna().sum()

In [None]:
# preview the first rows of epa
epa.head()

In [None]:
# index by sensor_id (the previous index becomes a regular column)
epa = epa.reset_index().set_index('sensor_id')
# cbsa names whose fips lookup came back empty
missing_fips_list = epa.loc[epa['fips'].isna(), 'cbsa'].unique().tolist()
missing_fips_list

In [None]:
# show the first row of epa
epa.head(1)

-----------
## County fips
-----------

In [None]:
# {county : fips} dict
# Read FIPS as text and pad it to 5 characters. Parsed as a number it loses its
# leading zeros and no longer matches the zero-padded string codes (2-digit
# state + 3-digit county) already stored in epa['fips'].
county_fips = pd.read_csv(
    '/kaggle/input/county-fips/county_fips.csv',
    usecols=['FIPS','Name','State'],
    dtype={'FIPS': str},
)
county_fips['FIPS'] = county_fips['FIPS'].str.zfill(5)
# lookup key in the form "<Name>, <State>"
county_fips['county'] = county_fips['Name'] + ', ' + county_fips['State']
county_fips_dict = county_fips.set_index('county')['FIPS'].to_dict()
county_fips

In [None]:
# index labels of the rows that still have no fips code
fips_is_missing = epa['fips'].isna().to_numpy()
missing_fips_index = epa.index[fips_is_missing]
missing_fips_index

In [None]:
# per-column count of missing values in epa
epa.isnull().sum()

In [None]:
# show the fips column on its own
epa[['fips']]

In [None]:
# rows whose index label is in missing_fips_index take their fips from the county lookup;
# every other row keeps its current value
use_county_lookup = epa.index.isin(missing_fips_index)
county_lookup = epa['cbsa'].map(county_fips_dict)
epa['fips'] = np.where(use_county_lookup, county_lookup, epa['fips'])
epa.isna().sum()

In [None]:
# index labels of the rows still without a fips code after the county lookup
still_missing = epa['fips'].isna().to_numpy()
missing_section_index = epa.index[still_missing]
missing_section_index

In [None]:
# manual fips codes for the cbsa names that no lookup table covered
section_fips_dict = {'Bishop, CA':'06798'}

# rows whose index label is in missing_section_index take the manual code;
# every other row keeps its current value
use_section_lookup = epa.index.isin(missing_section_index)
section_lookup = epa['cbsa'].map(section_fips_dict)
epa['fips'] = np.where(use_section_lookup, section_lookup, epa['fips'])
epa.isna().sum()

---------
## Unique sensor list
---------

In [None]:
# 525 unique sensors
# Per-sensor median of the numeric columns, highest income first.
# numeric_only=True is required because epa also holds text columns (cbsa,
# geometry, fips, ...): recent pandas raises TypeError on them instead of
# silently dropping them.
sensor_list_grouped = (
    epa.groupby('sensor_id')
    .median(numeric_only=True)
    .sort_values('income', ascending=False)
    .rename(columns={'temp':'median_temp'})
)

In [None]:
# number of distinct measured dates for each sensor
dates_by_sensor = epa.reset_index().groupby('sensor_id')['date']
sensor_date_counts = dates_by_sensor.nunique().to_frame()

In [None]:
# attach the per-sensor date count to the per-sensor medians (both are indexed by sensor_id)
date_counts = sensor_date_counts.rename(columns={'date':'date_counts'})
sensor_list = sensor_list_grouped.join(date_counts, how='inner')
sensor_list.sort_values('date_counts', ascending=False)

In [None]:
# box plot of how many dates each sensor measured
fig, ax = plt.subplots(figsize=(3,3))
sensor_list['date_counts'].sort_values().plot(kind='box', ax=ax)
ax.set_title('525 sensors with count of measured dates')
plt.show()

In [None]:
# Cut-off for the sensor filter: the median number of measured dates per sensor.
# It is computed from the data rather than hard-coded, so the printed coverage
# always agrees with the median-based filter applied later.
date_count_cutoff = sensor_list.date_counts.quantile(0.5)
# 2019 has 365 days and 2020 is a leap year with 366
days_in_period = 365 + 366
print(f'filtered data covers {round(date_count_cutoff/days_in_period*100,0)}% dates from 2019-2020')

In [None]:
# complete fips dict
# {index label of epa: fips}; for repeated labels the last row wins (this rebinds fips_dict)
fips_dict = epa['fips'].to_dict()

In [None]:
# give each sensor its fips code, then count missing values per column
sensor_fips = sensor_list.index.map(fips_dict)
sensor_list['fips'] = sensor_fips
sensor_list.isna().sum()

In [None]:
# add the text attributes by looking each sensor_id up in the lookup dicts
sensor_lookups = {
    'cbsa': sensor_cbsa_dict,
    'cbsa0': sensor_cbsa_dict0,
    'cbsa1': sensor_cbsa_dict1,
    'state': sensor_state_dict,
}
for column, lookup in sensor_lookups.items():
    sensor_list[column] = sensor_list.index.map(lookup)
sensor_list

-------
## Choropleth Map - All 525 Sensors
-------

In [None]:
# PM 2.5 concentration by county fips, all sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list
map_options = dict(
    geojson=counties,
    locations='fips',
    color='pm25',
    hover_name='cbsa',
    labels={'pm25':'PM 2.5 ug/m3'},
    color_continuous_scale="rdbu_r",
    range_color=(0, 13),
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(title='PM 2.5 Concentration', margin=dict(r=0, t=0, l=0, b=0))
print('PM 2.5 Concentration:')
fig.show()

In [None]:
# ozone concentration by county fips, all sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list
map_options = dict(
    geojson=counties,
    locations='fips',
    color='ozone',
    hover_name='cbsa',
    labels={'ozone':'Ozone ppm'},
    color_continuous_scale="rdbu_r",
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('Ozone Concentration:')
fig.show()

In [None]:
# people-of-color share by county fips, all sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list
map_options = dict(
    geojson=counties,
    locations='fips',
    color='color',
    hover_name='cbsa',
    labels={'color':'People of Color %'},
    color_continuous_scale="solar_r",
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('People of Color %:')
fig.show()

In [None]:
# low-income share by county fips, all sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list
map_options = dict(
    geojson=counties,
    locations='fips',
    color='income',
    hover_name='cbsa',
    labels={'income':'Low Income %'},
    color_continuous_scale="rdpu",
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('Low Income Distribution:')
fig.show()

In [None]:
# low-education share by county fips, all sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list
map_options = dict(
    geojson=counties,
    locations='fips',
    color='education',
    hover_name='cbsa',
    labels={'education':'Less than Highschool Education %'},
    color_continuous_scale="rdpu",
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('Low Education Distribution:')
fig.show()

In [None]:
# language-isolation share by county fips, all sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list
map_options = dict(
    geojson=counties,
    locations='fips',
    color='language',
    hover_name='cbsa',
    labels={'language':'Language Isolation %'},
    color_continuous_scale="rdpu",
    range_color=(0,0.3),
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('High Language Isolation Distribution:')
fig.show()

--------
## Filter Sensors (525 -> 262)
--------

In [None]:
# Keep only the sensors whose number of measured dates is above the median,
# i.e. roughly the better-covered half of the sensors.
date_count_median = sensor_list.date_counts.median()
# .copy() gives an independent frame: assigning columns on a filtered slice of
# sensor_list triggers SettingWithCopyWarning (hidden here by the warnings filter).
sensor_list_new = sensor_list[sensor_list.date_counts > date_count_median].copy()

# apply dictionary: look each sensor_id up in the lookup dicts
sensor_list_new['cbsa'] = sensor_list_new.index.map(sensor_cbsa_dict)
sensor_list_new['cbsa0'] = sensor_list_new.index.map(sensor_cbsa_dict0)
sensor_list_new['cbsa1'] = sensor_list_new.index.map(sensor_cbsa_dict1)
sensor_list_new['state'] = sensor_list_new.index.map(sensor_state_dict)

sensor_list_new.head(1)

In [None]:
# one box plot per measure, side by side, to spot outliers
boxplot_columns = ['pm25', 'ozone', 'color', 'income', 'language', 'education']
fig, axes = plt.subplots(1,6, figsize=(10,3))
for ax, column in zip(axes, boxplot_columns):
    sensor_list_new[[column]].boxplot(ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# date count stats for the filtered sensors
# The headline is built from the same statistics that are displayed below it,
# so the text cannot disagree with the table (the previous hard-coded numbers
# described the unfiltered 525 sensors).
date_stats = sensor_list_new.date_counts.describe().astype(int)
print(
    f"{date_stats['count']} sensors - max {date_stats['max']} dates, "
    f"min {date_stats['min']} dates, "
    f"75% percentile of data has {date_stats['75%']} days of data:"
)
display(date_stats.to_frame())

In [None]:
# each panel shows a violin plot with a box plot drawn on top of it;
# the panel at row 0, column 3 is left empty
fig, axes = plt.subplots(2,4, figsize=(10,6))
panel_layout = {
    (0,0): 'pm25',
    (0,1): 'ozone',
    (0,2): 'median_temp',
    (1,0): 'color',
    (1,1): 'income',
    (1,2): 'language',
    (1,3): 'education',
}
for position, column in panel_layout.items():
    ax = axes[position]
    sns.violinplot(data= sensor_list_new, y=column, ax=ax)
    sensor_list_new[[column]].boxplot(ax=ax)

plt.suptitle('Sensor Distribution')
plt.tight_layout()
plt.show()

In [None]:
# swarm plots: one point per sensor for the pollutants and the median temperature
swarm_columns = ['pm25', 'ozone', 'median_temp']
fig, axes = plt.subplots(1,3, figsize=(8,3))
for ax, column in zip(axes, swarm_columns):
    sns.swarmplot(data= sensor_list_new, y=column, ax=ax)

plt.suptitle('Sensor distribution vs Pollutants vs Median temp')
plt.tight_layout()
plt.show()

In [None]:
# swarm plots: one point per sensor for each community characteristic
community_columns = ['color', 'income', 'language', 'education']
fig, axes = plt.subplots(1,4, figsize=(10,3))
for ax, column in zip(axes, community_columns):
    sns.swarmplot(data= sensor_list_new, y=column, ax=ax)

plt.suptitle('Sensor distribution vs Community Characteristics')
plt.tight_layout()
plt.show()

#### Data Coverage:
We have a fair coverage of sensor measurement:
* wide range of people of colour
* Low income communities.

Most communities have:
* low language barrier
* low education limitation

EJ communities are mostly defined by:
* `people of color` proportion > `Low income` proportion

### Bivariate Analysis

---------
## Pair Plot
---------

In [None]:
# sns.pairplot(sensor_list_new, kind='kde')
# plt.show()

#### Observations:
- higher latitude = lower temperature (this makes sense as US is in the northern hemisphere)
- higher temp = higher PM2.5 concentration
- higher temp = more people of color

### Correlations:
* PM2.5 concentration is higher in warmer temperature (Causation).
* Because people of color tend to live in warmer area, they are more exposed to PM2.5 (Correlation).

-----------
## Heatmap
-----------

In [None]:
# Pairwise correlation of the numeric columns.
# numeric_only=True is required because sensor_list_new also carries text
# columns (fips, cbsa, cbsa0, cbsa1, state): recent pandas raises on them
# instead of silently dropping them.
corr = sensor_list_new.corr(numeric_only=True)
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.1f')
plt.title('Bivariate Analysis')
plt.show()

#### Observations:
* multicollinearity between (pm2.5, color, education, language, income, temp)

let's perform PCA to identify 

In [None]:
# split the sensors by whether median_temp is known
temp_known = sensor_list_new['median_temp'].notna()
train = sensor_list_new[temp_known]
target = sensor_list_new[~temp_known]

# train: latitude -> median_temp, both as (n, 1) column arrays
train_X = train['sensor_lat'].to_numpy().reshape(-1, 1)
train_y = train['median_temp'].to_numpy().reshape(-1, 1)

# target (to predict): latitudes of the sensors without a temperature
target_X = target['sensor_lat'].to_numpy().reshape(-1, 1)
# empty frame, indexed like target, that will receive the predictions
target_y = pd.DataFrame(data=[], index=target.index)


In [None]:
# shapes of the training and prediction inputs/outputs
train_X.shape, train_y.shape, target_X.shape, target_y.shape

-------------
## Linear Regression (fill in missing Temp)
------------

* using the strong linear relationship between temp and sensor_lat,
* let's fill in missing temp based on sensor_lat

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# regress median_temp on sensor latitude
lr = LinearRegression()
lr.fit(train_X, train_y)

# [intercept, slope] of the fitted line, collected into one flat list
fitted_params = np.concatenate([lr.intercept_.ravel(), lr.coef_.ravel()])
coeffs = list(fitted_params)

Almost perfect inverse linear relationship (slope ≈ -1) between sensor_lat and median_temp.

In [None]:
# R-squared of the fitted model on its own training data
train_r_squared = lr.score(train_X, train_y)
print(f'Train model R-squared: {train_r_squared}')

~80% of train data is explained by this linear model: **y = 52.66 -1.01 x**

In [None]:
# fill in missing values from linear regression model
# predict a median_temp for every sensor latitude in target_X
pred = lr.predict(target_X)
# store the predictions in target_y, which is indexed like target
target_y['pred_median_temp'] = pred
target_y

In [None]:
# sensors whose median_temp is already known
temp_is_known = sensor_list_new['median_temp'].notna()
df1 = sensor_list_new[temp_is_known]
df1.shape

In [None]:
# Sensors whose median_temp is missing. .copy() gives an independent frame, so
# the column replacement below does not write into a slice of sensor_list_new.
df2 = sensor_list_new[sensor_list_new['median_temp'].isna()].copy()
# Replace the empty column with the regression predictions. pred comes from
# lr.predict(target_X), whose rows were selected with the same missing-value
# mask, so the row order matches. It has shape (n, 1), hence the flatten.
df2 = df2.drop(columns='median_temp')
df2['median_temp'] = np.ravel(pred)
df2.shape

In [None]:
# merge df1, df2
# stack the rows of df1 and df2 back into one frame (this rebinds sensor_list_new)
sensor_list_new = pd.concat([df1, df2])
sensor_list_new.shape

In [None]:
# no more missing values
# per-column count of missing values in the combined frame
sensor_list_new.isnull().sum()

In [None]:
# index labels of the combined frame
sensor_list_new.index

In [None]:
# index labels of df1
df1.index

In [None]:
# Linear Regression Plot
# scatter of every sensor in sensor_list_new plus a fitted regression line
sns.lmplot(x='sensor_lat', y='median_temp', data=sensor_list_new)
# NOTE(review): the legend labels are passed by position - confirm they land on the intended artists
plt.legend(['Predicted Temp','Regression Model'])

# overlay, in a different colour, the sensors whose index label is in df1
sns.scatterplot(x='sensor_lat', y='median_temp', data=sensor_list_new[sensor_list_new.index.isin(df1.index)], color='#FA8072' ,alpha=1.0)

plt.title('Actual & Predicted Median Temp vs Sensor Latitude')
plt.show()

-------
## Bivariate Analysis
-------

In [None]:
# Pairwise correlation of the numeric columns.
# numeric_only=True is required because sensor_list_new also carries text
# columns (fips, cbsa, cbsa0, cbsa1, state): recent pandas raises on them
# instead of silently dropping them.
corr = sensor_list_new.corr(numeric_only=True)
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.1f')
plt.title('Bivariate Analysis')
plt.show()

--------------
# fips Map
--------------

In [None]:
# display the filtered sensor table
sensor_list_new

In [None]:
# (rows, columns) of the fips/color selection
sensor_list_new[['fips','color']].shape

------
## Choropleth Map - filtered Sensors
------

In [None]:
# ozone concentration by county fips, filtered sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list_new
color_max = df['ozone'].max()  # upper end of the colour range
map_options = dict(
    geojson=counties,
    locations='fips',
    color='ozone',
    hover_name='cbsa',
    labels={'ozone':'Ozone ppm'},
    color_continuous_scale="rdbu_r",
    range_color=(0,color_max),
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('Ozone Concentration:')
fig.show()

In [None]:
# PM 2.5 concentration by county fips, filtered sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list_new
color_max = df['pm25'].max()  # upper end of the colour range
map_options = dict(
    geojson=counties,
    locations='fips',
    color='pm25',
    hover_name='cbsa',
    labels={'pm25':'PM 2.5 ug/m3'},
    color_continuous_scale="rdbu_r",
    range_color=(0,color_max),
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(title='PM 2.5 Concentration', margin=dict(r=0, t=0, l=0, b=0))
print('PM 2.5 Concentration:')
fig.show()

----------------
## Identify which sensor belongs to EJ communities
----------------

In [None]:
# hover text shared by the sensor maps: "<prefix><value>" pieces joined left to right
df = sensor_list_new
text_pieces = [
    ('| State: ', 'state'),
    ('  | Color: ', 'color'),
    ('  | Income: ', 'income'),
    ('  | Education: ', 'education'),
    ('  | Ozone: ', 'ozone'),
]
first_prefix, first_column = text_pieces[0]
hover_text = first_prefix + df[first_column].astype(str)
for prefix, column in text_pieces[1:]:
    hover_text = hover_text + prefix + df[column].astype(str)
df['text'] = hover_text

In [None]:
# one square marker per sensor, coloured by the people-of-color share
df = sensor_list_new

marker_outline = dict(width=1, color='rgba(102, 102, 102)')
marker_style = dict(
    symbol = 'square',
    size = 8,
    opacity = 0.8,
    line = marker_outline,
    autocolorscale = False,
    reversescale = False,
    colorscale = 'ylgnbu',
    cmin = 0,
    cmax = df['color'].max(),
    colorbar_title="People of Color %"
)
sensor_markers = go.Scattergeo(
    lon = df['sensor_long'],
    lat = df['sensor_lat'],
    mode = 'markers',
    text = df['text'],
    marker_color = df['color'],
    marker = marker_style,
)

fig = go.Figure(data=sensor_markers)
fig.update_layout(
    geo_scope='usa',
    title = 'High People of Color Distribution<br>Sensor Locations with > 84% dates covered in 2019-2020',
)
fig.show()

In [None]:
# people-of-color share by county fips, filtered sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list_new
color_max = df['color'].max()  # upper end of the colour range
map_options = dict(
    geojson=counties,
    locations='fips',
    color='color',
    hover_name='cbsa',
    labels={'color':'People of Color %'},
    color_continuous_scale="ylgnbu",
    range_color=(0,color_max),
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('High People of Color Distribution:')
fig.show()

In [None]:
# one square marker per sensor, coloured by the low-income share
df = sensor_list_new

marker_outline = dict(width=1, color='rgba(102, 102, 102)')
marker_style = dict(
    symbol = 'square',
    size = 8,
    opacity = 0.8,
    line = marker_outline,
    autocolorscale = False,
    reversescale = False,
    colorscale = 'hot_r',
    cmin = 0,
    cmax = df['income'].max(),
    colorbar_title="Low income %"
)
sensor_markers = go.Scattergeo(
    lon = df['sensor_long'],
    lat = df['sensor_lat'],
    mode = 'markers',
    text = df['text'],
    marker_color = df['income'],
    marker = marker_style,
)

fig = go.Figure(data=sensor_markers)
fig.update_layout(
    geo_scope='usa',
    title = 'Low Income Distribution<br>Sensor Locations with > 84% dates covered in 2019-2020',
)
fig.show()

In [None]:
# low-income share by county fips, filtered sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list_new
color_max = df['income'].max()  # upper end of the colour range
map_options = dict(
    geojson=counties,
    locations='fips',
    color='income',
    hover_name='cbsa',
    labels={'income':'Low Income %'},
    color_continuous_scale="hot_r",
    range_color=(0,color_max),
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('Low Income Distribution:')
fig.show()

In [None]:
# one square marker per sensor, coloured by the low-education share
df = sensor_list_new

marker_outline = dict(width=1, color='rgba(102, 102, 102)')
marker_style = dict(
    symbol = 'square',
    size = 8,
    opacity = 0.8,
    line = marker_outline,
    autocolorscale = False,
    reversescale = False,
    colorscale = 'gray_r',
    cmin = 0,
    cmax = df['education'].max(),
    colorbar_title="Low education %"
)
sensor_markers = go.Scattergeo(
    lon = df['sensor_long'],
    lat = df['sensor_lat'],
    mode = 'markers',
    text = df['text'],
    marker_color = df['education'],
    marker = marker_style,
)

fig = go.Figure(data=sensor_markers)
fig.update_layout(
    geo_scope='usa',
    title = 'Low Education Distribution<br>Sensor Locations with > 84% dates covered in 2019-2020',
)
fig.show()

In [None]:
# low-education share by county fips, filtered sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list_new
color_max = df['education'].max()  # upper end of the colour range
map_options = dict(
    geojson=counties,
    locations='fips',
    color='education',
    hover_name='cbsa',
    labels={'education':'Less than Highschool Education %'},
    color_continuous_scale="gray_r",
    range_color=(0, color_max),
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('Low Education Distribution:')
fig.show()

In [None]:
# one square marker per sensor, coloured by the language column
df = sensor_list_new

marker_outline = dict(width=1, color='rgba(102, 102, 102)')
marker_style = dict(
    symbol = 'square',
    size = 8,
    opacity = 0.8,
    line = marker_outline,
    autocolorscale = False,
    reversescale = False,
    colorscale = 'hot_r',
    cmin = 0,
    cmax = df['language'].max(),
    colorbar_title="Low Language Literacy %"
)
sensor_markers = go.Scattergeo(
    lon = df['sensor_long'],
    lat = df['sensor_lat'],
    mode = 'markers',
    text = df['text'],
    marker_color = df['language'],
    marker = marker_style,
)

fig = go.Figure(data=sensor_markers)
fig.update_layout(
    geo_scope='usa',
    title = 'Low Language Literacy Distribution<br>Sensor Locations with > 84% dates covered in 2019-2020',
)
fig.show()

In [None]:
# language-isolation share by county fips, filtered sensors
counties_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(counties_url) as geojson_file:
    counties = json.load(geojson_file)

df = sensor_list_new
color_max = df['language'].max()  # upper end of the colour range
map_options = dict(
    geojson=counties,
    locations='fips',
    color='language',
    hover_name='cbsa',
    labels={'language':'Language Isolation %'},
    color_continuous_scale="hot_r",
    range_color=(0,color_max),
    scope="usa",
)
fig = px.choropleth(df, **map_options)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
print('High Language Isolation Distribution:')
fig.show()

#### Across US Continent:
* Environmental conditions (e.g. temperature) are highly dependent on geographic location.
* Some air pollutants are highly dependent on temperature.
* Non-EJ communities tend to form in any climate, slightly more often in colder climates (Northern US).
* EJ communities tend to form in warmer climates (Southern US).
* Because of this, EJ communities are more prone to PM2.5 exposure.
* People with low income, low education, or low language literacy tend to gather in large cities.

#### Within California/Philadelphia:
* These areas, which include large cities, have a more even distribution of people of color.