In [106]:
from dash import Dash, dcc, Output, Input  
import dash_bootstrap_components as dbc    
import plotly.express as px
import pandas as pd                       
import sqlite3
from urllib.request import urlopen
import json
import os


In [107]:
# Location of the SQLite database. Set the WATER_QUALITY_DB environment variable to
# point at another copy; the original machine-specific path stays as the default.
db = os.environ.get(
    "WATER_QUALITY_DB",
    r'/Users/jennadodge/uofo-virt-data-pt-12-2021-u-b/Water_Quality_Analysis/Database/database.sqlite3',
)

# sqlite3.connect() silently creates an empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
if not os.path.isfile(db):
    raise FileNotFoundError(f"SQLite database not found: {db}")

# Load the whole all_contaminants table. No explicit cursor is needed because
# pd.read_sql_query works directly on the connection. The finally clause closes
# the connection even when the query raises.
conn = sqlite3.connect(db)
try:
    contaminants_df = pd.read_sql_query("SELECT * FROM all_contaminants", conn)
    # fips_df = pd.read_sql_query('SELECT * from FIPS_Codes', conn)
finally:
    conn.close()

In [108]:
# Preview the first five rows of the table loaded from SQLite.
contaminants_df.head()

Unnamed: 0,State,City,Zip,Utility,People_served,Contaminant,Utility_Measurement,EWG_Health_Guideline,Legal_Limit,Units,Contaminant_Factor
0,VT,Ludlow VT,5001.0,Ludlow Village Water Department,2818,Bromodichloromethane,1.41,0.06,0.0,ppb,23.5
1,VT,Ludlow VT,5001.0,Ludlow Village Water Department,2818,Chloroform,2.26,0.4,0.0,ppb,5.65
2,VT,Ludlow VT,5001.0,Ludlow Village Water Department,2818,Dibromochloromethane,0.938,0.1,0.0,ppb,9.38
3,VT,Ludlow VT,5001.0,Ludlow Village Water Department,2818,Nitrate,0.35,0.14,10.0,ppm,2.5
4,VT,Ludlow VT,5001.0,Ludlow Village Water Department,2818,Total trihalomethanes (TTHMs)†,5.53,0.15,80.0,ppb,36.866667


In [109]:
# Inspect column dtypes; Zip is loaded as float64 here, not as text.
contaminants_df.dtypes

State                    object
City                     object
Zip                     float64
Utility                  object
People_served             int64
Contaminant              object
Utility_Measurement     float64
EWG_Health_Guideline    float64
Legal_Limit             float64
Units                    object
Contaminant_Factor      float64
dtype: object

In [110]:
# Normalise Zip to a zero-padded, 5-character string in one idempotent step.
# Zip arrives as float64 (e.g. 5001.0). Going through an integer avoids removing the
# trailing ".0" by position, which turned missing values into "0000n"
# (NaN -> "nan" -> "n") and chopped real digits whenever the slicing cell was re-run.
# Values that cannot be parsed as numbers become missing (None) instead of a bogus code.
zip_numeric = pd.to_numeric(contaminants_df["Zip"], errors="coerce")
contaminants_df["Zip"] = zip_numeric.map(
    lambda z: None if pd.isna(z) else f"{int(z):05d}"
)

In [74]:
# change dtype of Zip to string, get rid of .0 at the end, add a leading zero if needed
# contaminants_df["Zip"] = contaminants_df["Zip"].astype(str).str[:-2].apply('{:0>5}'.format)

In [113]:
# Preview the first five rows again to check the Zip column.
contaminants_df.head()


Unnamed: 0,State,City,Zip,Utility,People_served,Contaminant,Utility_Measurement,EWG_Health_Guideline,Legal_Limit,Units,Contaminant_Factor
0,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Bromodichloromethane,1.41,0.06,0.0,ppb,23.5
1,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Chloroform,2.26,0.4,0.0,ppb,5.65
2,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Dibromochloromethane,0.938,0.1,0.0,ppb,9.38
3,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Nitrate,0.35,0.14,10.0,ppm,2.5
4,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Total trihalomethanes (TTHMs)†,5.53,0.15,80.0,ppb,36.866667


In [114]:
# Number of rows per distinct Zip value, most frequent first.
contaminants_df['Zip'].value_counts()

93287    968
90630    756
64012    421
26559    381
26301    356
        ... 
49839      1
49834      1
93218      1
93254      1
18045      1
Name: Zip, Length: 2755, dtype: int64

In [115]:
# Load the zip -> county crosswalk used for the merge below.
# "zip" is forced to text so any leading zeros present in the file are kept.
file_to_read = os.path.join(
    "..", "..", "Census_Data_Cleaning", "zips_to_counties.csv"
)
zips_to_counties = pd.read_csv(file_to_read, dtype=dict(zip=str))

In [116]:
# Inspect column dtypes of the crosswalk as loaded from CSV.
zips_to_counties.dtypes

zip            object
county_fips     int64
state_id       object
dtype: object

In [117]:
# Preview the first five rows of the zip -> county crosswalk.
zips_to_counties.head()

Unnamed: 0,zip,county_fips,state_id
0,601,72001,PR
1,602,72003,PR
2,603,72005,PR
3,606,72093,PR
4,610,72011,PR


In [118]:
# Store county FIPS codes as text, left-padded with "0" to a width of five.
zips_to_counties["county_fips"] = (
    zips_to_counties["county_fips"]
    .astype(str)
    .str.rjust(5, "0")
)

In [119]:
# Re-check column dtypes after converting county_fips to text.
zips_to_counties.dtypes

zip            object
county_fips    object
state_id       object
dtype: object

In [120]:
# Attach county FIPS to every contaminant row; the left join keeps rows whose Zip
# has no match in the crosswalk (their county_fips is then missing).
# NOTE(review): if a zip appears more than once in zips_to_counties, rows are
# duplicated here -- confirm the crosswalk has one row per zip.
df = contaminants_df.merge(
    zips_to_counties,
    how="left",
    left_on="Zip",
    right_on="zip",
)

In [121]:
# (rows, columns) of the merged frame.
df.shape

(123658, 14)

In [122]:
# Number of distinct county FIPS codes in the merged data (missing values are not counted).
df["county_fips"].nunique()

901

In [123]:
# Spot-check 30 random merged rows. The seed is fixed so the same rows are shown on
# every run, keeping the displayed output reproducible after Restart & Run All.
df.sample(30, random_state=42)

Unnamed: 0,State,City,Zip,Utility,People_served,Contaminant,Utility_Measurement,EWG_Health_Guideline,Legal_Limit,Units,Contaminant_Factor,zip,county_fips,state_id
44843,CA,Palmdale CA,93551,El Dorado Mutual Water Company,648,Dichloroacetic acid,0.125,0.2,0.0,ppb,0.625,93551,6037,CA
16808,MO,Elmo,64423,Elmo PWS,165,Bromoform,0.29,0.5,0.0,ppb,0.58,64423,29147,MO
91870,KS,Bronson KS,66716,Bronson City of,311,Monobromoacetic acid*,0.0967,25.0,0.0,ppb,0.003868,66716,20011,KS
35356,CA,Chino Hills CA,91708,City of Chino Hills,75655,Perchlorate,0.111,1.0,0.0,ppb,0.111,91708,6071,CA
22935,AL,Hodges,35565,Hodges Water System,2352,Total trihalomethanes (TTHMs)†,1.55,0.15,80.0,ppb,10.333333,35565,1133,AL
123085,TN,Lynnville,37047,Lynnville Water Department,1062,Simazine*,0.03,0.1,4.0,ppb,0.3,37047,47117,TN
57539,WY,Mountain View,82930,Mountain View Town Of,1250,Bromochloroacetic acid,4.65,0.02,0.0,ppb,232.5,82930,49043,WY
33969,CA,Bellflower CA,90706,Bellflower Home Gardens Water Company,1129,Manganese*,14.4,100.0,0.0,ppb,0.144,90706,6037,CA
61520,TX,Bogata TX,75417,City of Bogata,1749,Dibromoacetic acid,2.92,0.04,0.0,ppb,73.0,75417,48387,TX
26461,MI,Glennie MI,48116,Lyon Township,9390,Total trihalomethanes (TTHMs)†,8.97,0.15,80.0,ppb,59.8,48116,26093,MI


In [124]:
# List the distinct contaminant names.
# NOTE(review): the output contains suffixed variants of the same name (e.g.
# 'Manganese' and 'Manganese*'); these are separate groups in any groupby on
# Contaminant -- confirm whether they should be normalised first.
df["Contaminant"].unique()

array(['Bromodichloromethane', 'Chloroform', 'Dibromochloromethane',
       'Nitrate', 'Total trihalomethanes (TTHMs)†',
       'Radium combined (-226 &amp; -228)', 'Bromoform', 'Manganese',
       'Barium', 'Dichloroacetic acid', 'Xylenes (total)', 'Toluene',
       'MTBE', 'Nitrate and nitrite', 'Haloacetic acids (HAA5)†',
       'Trichloroacetic acid', 'Radon', 'Perchlorate',
       'Hexachlorocyclopentadiene', 'Monochloroacetic acid',
       'Radium combined (-226 &amp; -228)*', 'Barium*', 'Chlorate*',
       'Chromium (hexavalent)*', 'Manganese*', 'Molybdenum*', 'Nitrate*',
       'Strontium*', 'Uranium', 'Vanadium*', 'Chromium (hexavalent)',
       'Chlorate', 'Strontium', 'Vanadium', 'Haloacetic acids (HAA9)†',
       'Di(2-ethylhexyl) phthalate', 'Molybdenum',
       'Haloacetic acids (HAA9)*†', '11-Dichloroethane*', '14-Dioxane*',
       'Monobromoacetic acid', 'Bromodichloromethane*', 'Chloroform*',
       'Dichloroacetic acid*', 'Haloacetic acids (HAA5)*†',
       'Total tri

In [129]:
# Inspect column dtypes of the merged frame.
df.dtypes

State                    object
City                     object
Zip                      object
Utility                  object
People_served             int64
Contaminant              object
Utility_Measurement     float64
EWG_Health_Guideline    float64
Legal_Limit             float64
Units                    object
Contaminant_Factor      float64
zip                      object
county_fips              object
state_id                 object
dtype: object

In [131]:
# List the merged frame's column names (includes both 'Zip' and the crosswalk's 'zip').
df.columns

Index(['State', 'City', 'Zip', 'Utility', 'People_served', 'Contaminant',
       'Utility_Measurement', 'EWG_Health_Guideline', 'Legal_Limit', 'Units',
       'Contaminant_Factor', 'zip', 'county_fips', 'state_id'],
      dtype='object')

In [132]:
# Columns to export: the contaminant fields plus county_fips (placed after Zip).
# The crosswalk's own 'zip' and 'state_id' columns are left out.
cols = [
    'State', 'City', 'Zip', 'county_fips',
    'Utility', 'People_served', 'Contaminant',
    'Utility_Measurement', 'EWG_Health_Guideline', 'Legal_Limit',
    'Units', 'Contaminant_Factor',
]
all_cont_df = df[cols]

In [134]:
# Preview the first five rows of the export frame.
all_cont_df.head()

Unnamed: 0,State,City,Zip,county_fips,Utility,People_served,Contaminant,Utility_Measurement,EWG_Health_Guideline,Legal_Limit,Units,Contaminant_Factor
0,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Bromodichloromethane,1.41,0.06,0.0,ppb,23.5
1,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Chloroform,2.26,0.4,0.0,ppb,5.65
2,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Dibromochloromethane,0.938,0.1,0.0,ppb,9.38
3,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Nitrate,0.35,0.14,10.0,ppm,2.5
4,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Total trihalomethanes (TTHMs)†,5.53,0.15,80.0,ppb,36.866667


In [138]:
## Export as CSV (the row index is not written)
all_cont_df.to_csv(
    path_or_buf='../all_contaminants_with_fips.csv',
    index=False,
)

In [143]:
# Reload the exported file, reading Zip and county_fips as text, then preview it.
dff = pd.read_csv(
    '../all_contaminants_with_fips.csv',
    dtype={"Zip": str, "county_fips": str},
)
dff.head()

Unnamed: 0,State,City,Zip,county_fips,Utility,People_served,Contaminant,Utility_Measurement,EWG_Health_Guideline,Legal_Limit,Units,Contaminant_Factor
0,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Bromodichloromethane,1.41,0.06,0.0,ppb,23.5
1,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Chloroform,2.26,0.4,0.0,ppb,5.65
2,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Dibromochloromethane,0.938,0.1,0.0,ppb,9.38
3,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Nitrate,0.35,0.14,10.0,ppm,2.5
4,VT,Ludlow VT,5001,50027,Ludlow Village Water Department,2818,Total trihalomethanes (TTHMs)†,5.53,0.15,80.0,ppb,36.866667


In [144]:
# Inspect column dtypes of the reloaded CSV.
dff.dtypes

State                    object
City                     object
Zip                      object
county_fips              object
Utility                  object
People_served             int64
Contaminant              object
Utility_Measurement     float64
EWG_Health_Guideline    float64
Legal_Limit             float64
Units                    object
Contaminant_Factor      float64
dtype: object

In [None]:
# Left-pad each Zip value with "0" to a width of five characters.
dff["Zip"] = dff["Zip"].map(lambda z: format(z, "0>5"))

In [24]:
# Try out the "top 10 contaminants" grouping on a single county:
# keep only the rows whose county_fips is "41005".
temp_df = df.loc[df["county_fips"].eq("41005")]

In [25]:
# Preview the first five rows of the single-county subset.
temp_df.head()

Unnamed: 0,State,City,Zip,Utility,People_served,Contaminant,Utility_Measurement,EWG_Health_Guideline,Legal_Limit,Units,Contaminant_Factor,zip,county_fips,state_id
101561,OR,Milwaukie OR,97004,City of Milwaukie,20500,Chromium (hexavalent),1.09,0.02,0.0,ppb,54.5,97004,41005,OR
101562,OR,Milwaukie OR,97004,City of Milwaukie,20500,Haloacetic acids (HAA5)†,0.233,0.1,60.0,ppb,2.33,97004,41005,OR
101563,OR,Milwaukie OR,97004,City of Milwaukie,20500,Haloacetic acids (HAA9)†,2.73,0.06,0.0,ppb,45.5,97004,41005,OR
101564,OR,Milwaukie OR,97004,City of Milwaukie,20500,Nitrate,2.2,0.14,10.0,ppm,15.714286,97004,41005,OR
101565,OR,Milwaukie OR,97004,City of Milwaukie,20500,Nitrate and nitrite,2.0,0.14,10.0,ppm,14.285714,97004,41005,OR


In [102]:
# Show up to the first 20 rows of each Contaminant group. Nothing is aggregated:
# the result is the matching original rows in their original order.
temp_df.groupby(["Contaminant"]).head(20)

Unnamed: 0,State,City,Zip,Utility,People_served,Contaminant,Utility_Measurement,EWG_Health_Guideline,Legal_Limit,Units,Contaminant_Factor,zip,county_fips,state_id
101561,OR,Milwaukie OR,97004,City of Milwaukie,20500,Chromium (hexavalent),1.09,0.02,0.0,ppb,54.5,97004,41005,OR
101562,OR,Milwaukie OR,97004,City of Milwaukie,20500,Haloacetic acids (HAA5)†,0.233,0.1,60.0,ppb,2.33,97004,41005,OR
101563,OR,Milwaukie OR,97004,City of Milwaukie,20500,Haloacetic acids (HAA9)†,2.73,0.06,0.0,ppb,45.5,97004,41005,OR
101564,OR,Milwaukie OR,97004,City of Milwaukie,20500,Nitrate,2.2,0.14,10.0,ppm,15.714286,97004,41005,OR
101565,OR,Milwaukie OR,97004,City of Milwaukie,20500,Nitrate and nitrite,2.0,0.14,10.0,ppm,14.285714,97004,41005,OR
101566,OR,Milwaukie OR,97004,City of Milwaukie,20500,Total trihalomethanes (TTHMs)†,5.03,0.15,80.0,ppb,33.533333,97004,41005,OR
101567,OR,Milwaukie OR,97004,City of Milwaukie,20500,14-Dioxane,0.0351,0.35,0.0,ppb,0.100286,97004,41005,OR
101568,OR,Milwaukie OR,97004,City of Milwaukie,20500,Barium,3.94,700.0,2000.0,ppb,0.005629,97004,41005,OR
101569,OR,Milwaukie OR,97004,City of Milwaukie,20500,Manganese,0.867,100.0,0.0,ppb,0.00867,97004,41005,OR
101570,OR,Milwaukie OR,97004,City of Milwaukie,20500,Strontium,0.0931,1500.0,0.0,ppb,6.2e-05,97004,41005,OR


In [26]:
# Count rows per contaminant for county 41005 and keep the 10 largest counts,
# sorted ascending (the count column is named 0 at this point).
# Built from `df` rather than from `temp_df` itself so that re-running this cell
# yields the same table instead of silently re-aggregating its own output.
temp_df = (
    df[df["county_fips"] == "41005"]
    .groupby(["Contaminant"])
    .size()
    .to_frame()
    .sort_values([0], ascending=True)
    .tail(10)
    .reset_index()
)

In [27]:
# Show the first ten rows of the per-contaminant counts.
temp_df.head(10)

Unnamed: 0,Contaminant,0
0,Chlorate,3
1,Nitrate,3
2,Nitrate and nitrite,3
3,Chromium (hexavalent),4
4,Haloacetic acids (HAA5)†,4
5,Haloacetic acids (HAA9)†,4
6,Manganese,4
7,Strontium,4
8,Total trihalomethanes (TTHMs)†,4
9,Vanadium,4


In [28]:
# Give the count column (named 0 by to_frame()) a readable label.
temp_df = temp_df.rename({0: 'Count of Contaminant'}, axis="columns")

In [31]:
# df = pd.read_csv('../census_contaminant_priority_by_zip.csv', dtype={"zip":str,"fips":str})

In [62]:
# Preview the first five rows of the merged frame.
df.head()

Unnamed: 0,State,City,Zip,Utility,People_served,Contaminant,Utility_Measurement,EWG_Health_Guideline,Legal_Limit,Units,Contaminant_Factor,zip,county_fips,state_id
0,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Bromodichloromethane,1.41,0.06,0.0,ppb,23.5,5001,50027,VT
1,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Chloroform,2.26,0.4,0.0,ppb,5.65,5001,50027,VT
2,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Dibromochloromethane,0.938,0.1,0.0,ppb,9.38,5001,50027,VT
3,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Nitrate,0.35,0.14,10.0,ppm,2.5,5001,50027,VT
4,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Total trihalomethanes (TTHMs)†,5.53,0.15,80.0,ppb,36.866667,5001,50027,VT


In [63]:
# Scatter-plot contaminant algorithm: start from the rows of county "41005".
temp_df = df.loc[df["county_fips"].eq("41005")]

In [64]:
# Per-contaminant totals of People_served and Contaminant_Factor for the rows in
# temp_df, largest total Contaminant_Factor first.
# The two numeric columns are selected *before* summing so the text columns
# (State, City, Utility, ...) are never passed to sum(); depending on the pandas
# version, summing them can warn, concatenate strings, or fail outright.
cont_sum_df = (
    temp_df.groupby(by=["Contaminant"])[["People_served", "Contaminant_Factor"]]
    .sum()
    .sort_values(by=["Contaminant_Factor"], ascending=False)
)

In [65]:
# Keep at most the 15 contaminants with the largest summed Contaminant_Factor.
top_15_cont_sum = cont_sum_df.iloc[:15]

In [66]:
# Display the per-contaminant totals (indexed by Contaminant).
top_15_cont_sum

Unnamed: 0_level_0,People_served,Contaminant_Factor
Contaminant,Unnamed: 1_level_1,Unnamed: 2_level_1
Haloacetic acids (HAA9)†,178243,1097.333333
Haloacetic acids (HAA5)†,178243,638.53
Total trihalomethanes (TTHMs)†,178243,526.866667
Chromium (hexavalent),178243,60.345
Nitrate,104311,19.370714
Nitrate and nitrite,104311,17.992857
Bromate,22729,12.9
Vanadium,178243,0.544143
Chlorate,157743,0.524667
Manganese,178243,0.19997


In [83]:
# scatter and hist plot algorithms
# cont_fips_df is a second name for the same DataFrame object as df (no copy is made).
cont_fips_df = df

In [84]:
# Preview the first five rows.
cont_fips_df.head()

Unnamed: 0,State,City,Zip,Utility,People_served,Contaminant,Utility_Measurement,EWG_Health_Guideline,Legal_Limit,Units,Contaminant_Factor,zip,county_fips,state_id
0,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Bromodichloromethane,1.41,0.06,0.0,ppb,23.5,5001,50027,VT
1,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Chloroform,2.26,0.4,0.0,ppb,5.65,5001,50027,VT
2,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Dibromochloromethane,0.938,0.1,0.0,ppb,9.38,5001,50027,VT
3,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Nitrate,0.35,0.14,10.0,ppm,2.5,5001,50027,VT
4,VT,Ludlow VT,5001,Ludlow Village Water Department,2818,Total trihalomethanes (TTHMs)†,5.53,0.15,80.0,ppb,36.866667,5001,50027,VT


In [85]:
# (rows, columns) of the frame.
cont_fips_df.shape

(123658, 14)

In [86]:
# Work on an independent (deep) copy so later changes to dff do not touch cont_fips_df.
dff = cont_fips_df.copy(deep=True)

In [87]:
# Narrow dff to the rows of county "41005".
dff = dff.loc[dff['county_fips'].eq('41005')]

In [93]:
# Row count per contaminant in dff, keeping the 10 largest counts in ascending
# order; the count column is named 0 until it is renamed.
dff2 = (
    dff.groupby("Contaminant")
    .size()
    .to_frame()
    .sort_values(by=[0], ascending=True)
    .tail(10)
    .reset_index()
)


In [94]:
# Give the count column (named 0 by to_frame()) a readable label.
dff2 = dff2.rename({0: 'Count of Contaminant'}, axis="columns")

In [95]:
# (rows, columns) of the top-10 count table.
dff2.shape

(10, 2)

In [96]:
# Preview the first five rows of the count table.
dff2.head()

Unnamed: 0,Contaminant,Count of Contaminant
0,Chlorate,3
1,Nitrate,3
2,Nitrate and nitrite,3
3,Chromium (hexavalent),4
4,Haloacetic acids (HAA5)†,4


In [97]:
# Per-contaminant totals of People_served and Contaminant_Factor for the rows in
# dff, largest total Contaminant_Factor first.
# The two numeric columns are selected *before* summing so the text columns
# (State, City, Utility, ...) are never passed to sum(); depending on the pandas
# version, summing them can warn, concatenate strings, or fail outright.
dff3 = (
    dff.groupby(by=["Contaminant"])[['People_served', 'Contaminant_Factor']]
    .sum()
    .sort_values(by=['Contaminant_Factor'], ascending=False)
)


In [98]:
# Keep at most the 15 contaminants with the largest summed Contaminant_Factor.
top15_c_df = dff3.iloc[:15]

In [99]:
# Display the per-contaminant totals (indexed by Contaminant).
top15_c_df

Unnamed: 0_level_0,People_served,Contaminant_Factor
Contaminant,Unnamed: 1_level_1,Unnamed: 2_level_1
Haloacetic acids (HAA9)†,178243,1097.333333
Haloacetic acids (HAA5)†,178243,638.53
Total trihalomethanes (TTHMs)†,178243,526.866667
Chromium (hexavalent),178243,60.345
Nitrate,104311,19.370714
Nitrate and nitrite,104311,17.992857
Bromate,22729,12.9
Vanadium,178243,0.544143
Chlorate,157743,0.524667
Manganese,178243,0.19997


In [100]:
# Turn the Contaminant index into a regular column.
# Derived from dff3 rather than from top15_c_df itself: re-running a self-referential
# reset_index() would insert a spurious "index" column on the second run.
top15_c_df = dff3.head(15).reset_index()

In [101]:
# Display the totals with Contaminant as an ordinary column.
top15_c_df

Unnamed: 0,Contaminant,People_served,Contaminant_Factor
0,Haloacetic acids (HAA9)†,178243,1097.333333
1,Haloacetic acids (HAA5)†,178243,638.53
2,Total trihalomethanes (TTHMs)†,178243,526.866667
3,Chromium (hexavalent),178243,60.345
4,Nitrate,104311,19.370714
5,Nitrate and nitrite,104311,17.992857
6,Bromate,22729,12.9
7,Vanadium,178243,0.544143
8,Chlorate,157743,0.524667
9,Manganese,178243,0.19997
