# Preparing Data for Machine Learning
The asthma emergency department visit data for California is loaded, transformed to separate by demographics, and cleaned. The correspondings air quality summaries for the location and time of each row are pulled from the EPA AQS API and attached to the asthma data. The combined data is written to a SQL database for later use in training a ML model.

## EXTRACT

## 1. Load asthma data from the data lake.

In [0]:
storageAccount = "gen10datafund2111"
storageContainer = "jadr-health-insights"
clientSecret = dbutils.secrets.get(scope = "jadr_blob", key = "clientSecret")
clientid = dbutils.secrets.get(scope = "jadr_blob", key = "clientid")
mount_point = "/mnt/darrellgerber/jadr"


configs = {"fs.azure.account.auth.type": "OAuth",
       "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
       "fs.azure.account.oauth2.client.id": clientid,
       "fs.azure.account.oauth2.client.secret": clientSecret,
       "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
       "fs.azure.createRemoteFileSystemDuringInitialization": "true"}

try: 
    dbutils.fs.unmount(mount_point)
except:
    pass

dbutils.fs.mount(
    source = "abfss://"+storageContainer+"@"+storageAccount+".dfs.core.windows.net/",
    mount_point = mount_point,
    extra_configs = configs)
display(dbutils.fs.ls(mount_point))

path,name,size,modificationTime
dbfs:/mnt/darrellgerber/jadr/Data/,Data/,0,1643742636000
dbfs:/mnt/darrellgerber/jadr/ML-Models/,ML-Models/,0,1643906451000
dbfs:/mnt/darrellgerber/jadr/deleteme.txt,deleteme.txt,8,1643742578000


## TRANSFORM

In [0]:
asthmaDF = spark.read.options(inferSchema = 'True', delimiter=',', header='True').csv('/mnt/darrellgerber/jadr/Data/asthma-emergency-department-visit-rates-fbyr_b/asthma-ed-visit-rates-by-county.csv')
display(asthmaDF)
print(asthmaDF.printSchema())
print(asthmaDF.count())
from pyspark.sql.functions import isnan, when, count, col
# Count of missing values in each column
asthmaDF.select([count(when(isnan(c), c)).alias(c) for c in asthmaDF.columns]).show()
# Count of null values in each column
asthmaDF.select([count(when(col(c).isNull(), c)).alias(c) for c in asthmaDF.columns]).show()
# Count of zero values in each column
asthmaDF.select([count(when(col(c)=="0", c)).alias(c) for c in asthmaDF.columns]).show()

COUNTY,YEAR,STRATA,STRATA NAME,AGE GROUP,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE,COMMENT
California,2015,Total population,All ages,All ages,191904.0,50.4,
Alameda,2015,Total population,All ages,All ages,9939.0,64.3,
Alpine,2015,Total population,All ages,All ages,0.0,0.0,
Amador,2015,Total population,All ages,All ages,196.0,58.4,
Butte,2015,Total population,All ages,All ages,1044.0,50.2,
Calaveras,2015,Total population,All ages,All ages,185.0,48.0,
Colusa,2015,Total population,All ages,All ages,97.0,41.4,
Contra Costa,2015,Total population,All ages,All ages,6858.0,65.2,
Del Norte,2015,Total population,All ages,All ages,140.0,53.0,
El Dorado,2015,Total population,All ages,All ages,592.0,36.4,


Cleanup that is needed from this quick look:
1. Remove numeric data that is zero or null
2. Change the type of numeric columns to int or float

In [0]:
asthmaDF = asthmaDF.filter((asthmaDF['NUMBER OF ED VISITS'] != "0") & (asthmaDF['NUMBER OF ED VISITS'].isNotNull()))
asthmaDF = asthmaDF.filter((asthmaDF['AGE-ADJUSTED ED VISIT RATE'] != "0") & (asthmaDF['AGE-ADJUSTED ED VISIT RATE'].isNotNull()))
print(asthmaDF.count())
from pyspark.sql.functions import isnan, when, count, col
# Count of missing values in each column
asthmaDF.select([count(when(isnan(c), c)).alias(c) for c in asthmaDF.columns]).show()
# Count of null values in each column
asthmaDF.select([count(when(col(c).isNull(), c)).alias(c) for c in asthmaDF.columns]).show()
# Count of zero values in each column
asthmaDF.select([count(when(col(c)=="0", c)).alias(c) for c in asthmaDF.columns]).show()

In [0]:
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
# Remove the commas in the column because the cast function will change those into null values instead of converting the number
asthmaDF = asthmaDF.withColumn('NUMBER OF ED VISITS(int)', F.regexp_replace(F.col('NUMBER OF ED VISITS'), ',', '').cast('integer'))
asthmaDF = asthmaDF.drop('NUMBER OF ED VISITS')
asthmaDF = asthmaDF.withColumnRenamed("NUMBER OF ED VISITS(int)","NUMBER OF ED VISITS")
display(asthmaDF)

COUNTY,YEAR,STRATA,STRATA NAME,AGE GROUP,AGE-ADJUSTED ED VISIT RATE,COMMENT,NUMBER OF ED VISITS
California,2015,Total population,All ages,All ages,50.4,,191904
Alameda,2015,Total population,All ages,All ages,64.3,,9939
Amador,2015,Total population,All ages,All ages,58.4,,196
Butte,2015,Total population,All ages,All ages,50.2,,1044
Calaveras,2015,Total population,All ages,All ages,48.0,,185
Colusa,2015,Total population,All ages,All ages,41.4,,97
Contra Costa,2015,Total population,All ages,All ages,65.2,,6858
Del Norte,2015,Total population,All ages,All ages,53.0,,140
El Dorado,2015,Total population,All ages,All ages,36.4,,592
Fresno,2015,Total population,All ages,All ages,77.0,,7936


Remove the Comment column

In [0]:
asthmaDF = asthmaDF.drop('COMMENT')

## 2. Pull out just the total population data

In [0]:
asthmaTotalDF = asthmaDF.filter(asthmaDF.STRATA == 'Total population')
display(asthmaTotalDF)


COUNTY,YEAR,STRATA,STRATA NAME,AGE GROUP,AGE-ADJUSTED ED VISIT RATE,NUMBER OF ED VISITS
California,2015,Total population,All ages,All ages,50.4,191904
Alameda,2015,Total population,All ages,All ages,64.3,9939
Amador,2015,Total population,All ages,All ages,58.4,196
Butte,2015,Total population,All ages,All ages,50.2,1044
Calaveras,2015,Total population,All ages,All ages,48.0,185
Colusa,2015,Total population,All ages,All ages,41.4,97
Contra Costa,2015,Total population,All ages,All ages,65.2,6858
Del Norte,2015,Total population,All ages,All ages,53.0,140
El Dorado,2015,Total population,All ages,All ages,36.4,592
Fresno,2015,Total population,All ages,All ages,77.0,7936


Remove the columns that we don't need: Strata, Strata Name, and Age Group.

In [0]:
asthmaTotalDF = asthmaTotalDF.drop('STRATA', 'STRATA NAME', 'AGE GROUP')
print(asthmaTotalDF.printSchema())

Even though we know this data is from California, let's add a column for state name to keep in the database for possible later use.

In [0]:
from pyspark.sql.functions import col,lit
asthmaTotalDF = asthmaTotalDF.withColumn("STATE",lit("CA"))
asthmaTotalDF = asthmaTotalDF['STATE', 'COUNTY', 'YEAR', 'NUMBER OF ED VISITS', 'AGE-ADJUSTED ED VISIT RATE']
display(asthmaTotalDF)

STATE,COUNTY,YEAR,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE
CA,California,2015,191904,50.4
CA,Alameda,2015,9939,64.3
CA,Amador,2015,196,58.4
CA,Butte,2015,1044,50.2
CA,Calaveras,2015,185,48.0
CA,Colusa,2015,97,41.4
CA,Contra Costa,2015,6858,65.2
CA,Del Norte,2015,140,53.0
CA,El Dorado,2015,592,36.4
CA,Fresno,2015,7936,77.0


Write just the Asthma data to the SQL database. Note: We may want to change this to combine with air quality data first

## EXTRACT

### 3. Read in the air quality data and match up to the county and year in our asthma dataset

In [0]:
import requests
import json
### Read in API

# 2021 data 
# Documentation for API: https://aqs.epa.gov/aqsweb/documents/data_api.html#annual

# By county, annual data

# List of all counties in California
# https://aqs.epa.gov/data/api/list/countiesByState?email=test@aqs.api&key=test&state=06      
email = "dgerber@dev-10.com"
key = confluentRegistrySecret = dbutils.secrets.get(scope = "jadr_blob", key = "AQKey_dg")
# This returns Lead (TSP) LC, Carbon monoxide, Sulfur dioxide, Nitrogen dioxide (NO2), Ozone, PM10 Total 0-10um STP, Lead PM10 LC FRM/FEM,  and PM2.5 - Local Conditions
params = "14129,42401,42602,44201,81102,85129,88101" 


state = "06" # California
col_to_keep = ["state_code", "county_code", "latitude","longitude","parameter","metric_used","method","year","units_of_measure",
               "state","county","city","arithmetic_mean","standard_deviation", "first_max_value", "second_max_value", 
               "ninety_ninth_percentile", "cbsa_code"]

AQIList = []
# Our CA dataset goes from 2015 to 2019
dates = ['2015', '2016', '2017', '2018', '2019']

for date in dates:
    bdate = f"{date}0101"
    edate = f"{date}1231"

    stateurl = f'https://aqs.epa.gov/data/api/list/countiesByState?email={email}&key={key}&state={state}'
    for countyrow in json.loads(requests.get(stateurl).text)['Data']:
        county = countyrow['code']
        URL = f'https://aqs.epa.gov/data/api/annualData/byCounty?email={email}&key={key}&param={params}&bdate={bdate}&edate={edate}&state={state}&county={county}'
        for aDict in json.loads(requests.get(URL).text)['Data']:
            AQIList.append(dict((k, aDict[k]) for k in col_to_keep)) # Grab only the columns we want, and append it to the list of dictionaries



In [0]:
AQIDF = spark.createDataFrame(AQIList)


## TRANSFORM

In [0]:
from pyspark.sql.functions import col,lit
AQIDF = AQIDF.withColumn("state",lit("CA"))
display(AQIDF)
AQIDF.printSchema()
display(AQIDF.select("parameter").distinct())

arithmetic_mean,cbsa_code,city,county,county_code,first_max_value,latitude,longitude,method,metric_used,ninety_ninth_percentile,parameter,second_max_value,standard_deviation,state,state_code,units_of_measure,year
0.758855,41860.0,Oakland,Alameda,1,21.6,37.814781,-122.282347,INSTRUMENTAL - PULSED FLUORESCENT,Observed Values,4.2,Sulfur dioxide,15.7,0.923692,CA,6,Parts per billion,2015
2.125905,41860.0,Oakland,Alameda,1,21.6,37.814781,-122.282347,INSTRUMENTAL - PULSED FLUORESCENT,Daily maximum 1-hour average,11.2,Sulfur dioxide,15.7,2.497825,CA,6,Parts per billion,2015
0.762953,41860.0,Oakland,Alameda,1,3.9,37.814781,-122.282347,,Daily Average of observed values,2.3,Sulfur dioxide,2.8,0.57172,CA,6,Parts per billion,2015
0.743659,41860.0,Oakland,Alameda,1,13.3,37.814781,-122.282347,,3-Hour block average of observed hourly values,3.8,Sulfur dioxide,7.8,0.826127,CA,6,Parts per billion,2015
2.972808,41860.0,,Alameda,1,18.9,37.689615,-121.631916,INSTRUMENTAL - CHEMILUMINESCENCE,Observed values,10.1,Nitrogen dioxide (NO2),17.7,1.979219,CA,6,Parts per billion,2015
5.761826,41860.0,,Alameda,1,18.9,37.689615,-121.631916,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,15.6,Nitrogen dioxide (NO2),17.2,3.139326,CA,6,Parts per billion,2015
18.12485,41860.0,Oakland,Alameda,1,72.5,37.793624,-122.263376,INSTRUMENTAL - CHEMILUMINESCENCE,Observed values,43.1,Nitrogen dioxide (NO2),59.8,9.450608,CA,6,Parts per billion,2015
29.338187,41860.0,Oakland,Alameda,1,72.5,37.793624,-122.263376,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,55.4,Nitrogen dioxide (NO2),59.8,10.039421,CA,6,Parts per billion,2015
13.975694,41860.0,Oakland,Alameda,1,57.1,37.814781,-122.282347,INSTRUMENTAL - CHEMILUMINESCENCE,Observed values,39.7,Nitrogen dioxide (NO2),56.6,9.404179,CA,6,Parts per billion,2015
24.574095,41860.0,Oakland,Alameda,1,57.1,37.814781,-122.282347,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,49.8,Nitrogen dioxide (NO2),56.6,10.749651,CA,6,Parts per billion,2015


parameter
PM10 Total 0-10um STP
Lead (TSP) LC
PM2.5 - Local Conditions
Sulfur dioxide
Ozone
Nitrogen dioxide (NO2)
Lead PM10 LC FRM/FEM


Let's take a look at just Los Angeles County for Lead for 2015 to see what we have to help narrow down what to keep.

In [0]:

LosAngelesDF = AQIDF.filter(
                           (AQIDF['parameter']  == 'Lead PM10 LC FRM/FEM') &
                           (AQIDF['year']  == '2015'))
display(LosAngelesDF)
display(LosAngelesDF.select("method").distinct())
display(LosAngelesDF.select("metric_used").distinct())

arithmetic_mean,cbsa_code,city,county,county_code,first_max_value,latitude,longitude,method,metric_used,ninety_ninth_percentile,parameter,second_max_value,standard_deviation,state,state_code,units_of_measure,year
0.001529,40900,Arden-Arcade,Sacramento,67,0.007,38.613779,-121.368014,Thermo/R & P 2025 PM10 - X-ray Fluorescence (EDXRF) FRM,Observed Values,0.007,Lead PM10 LC FRM/FEM,0.007,0.001901,CA,6,Micrograms/cubic meter (LC),2015
0.003151,41940,San Jose,Santa Clara,85,0.0121,37.348497,-121.894898,Thermo/R & P 2025 Teflon - ICPMS,Observed Values,0.0121,Lead PM10 LC FRM/FEM,0.0103,0.002745,CA,6,Micrograms/cubic meter (LC),2015


method
Thermo/R & P 2025 PM10 - X-ray Fluorescence (EDXRF) FRM
Thermo/R & P 2025 Teflon - ICPMS


metric_used
Observed Values


The dataset contains multiple measurements for each pollutant using different methods and metrics. We will narrow down the measurements used so we can get only one per site per year.

In [0]:
parameters = ['PM10 Total 0-10um STP',
              'Lead (TSP) LC',
              'PM2.5 - Local Conditions',
              'Sulfur dioxide',
              'Ozone',
              'Nitrogen dioxide (NO2)']

preferredMethods = ['Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min',
                    'INSTRUMENTAL - GAS PHASE CHEMILUMINESCENCE',
                   'INSTRUMENTAL - CHEMILUMINESCENCE',
                   'INSTRUMENTAL - ULTRA VIOLET ABSORPTION',
                   'INSTRUMENTAL - ULTRA VIOLET',
                   'Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIMETRIC',
                   'R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC - Gravimetric',
                   'Multiple Methods Used',
                   'INSTRUMENTAL - Pulsed Fluorescent 43C-TLE/43i-TLE']
preferredMetrics = ['Daily Maximum 1-hour average',
                    'Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)',
                    'Daily Mean',
                    'Daily maximum 1-hour average',
                   'Observed Values']

In [0]:
AQIDFclean = AQIDF.filter(AQIDF.parameter.isin(parameters))
display(AQIDFclean.select("parameter").distinct())
AQIDFclean = AQIDFclean.filter(AQIDF.method.isin(preferredMethods))
display(AQIDFclean.select("method").distinct())
AQIDFclean = AQIDFclean.filter(AQIDF.metric_used.isin(preferredMetrics))
display(AQIDFclean.select("metric_used").distinct())

parameter
PM10 Total 0-10um STP
Lead (TSP) LC
PM2.5 - Local Conditions
Sulfur dioxide
Ozone
Nitrogen dioxide (NO2)


method
INSTRUMENTAL - Pulsed Fluorescent 43C-TLE/43i-TLE
Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIMETRIC
Multiple Methods Used
R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC - Gravimetric
Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min
INSTRUMENTAL - ULTRA VIOLET ABSORPTION
INSTRUMENTAL - CHEMILUMINESCENCE
INSTRUMENTAL - ULTRA VIOLET
INSTRUMENTAL - GAS PHASE CHEMILUMINESCENCE


metric_used
Daily maximum 1-hour average
Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)
Daily Mean
Observed Values
Daily Maximum 1-hour average


In [0]:
LosAngelesDF = AQIDFclean.filter(
                           (AQIDFclean['metric_used']  == 'Observed Values') &
                           (AQIDFclean['year']  == '2015'))
display(LosAngelesDF)
display(LosAngelesDF.select("method").distinct())
display(LosAngelesDF.select("parameter").distinct())

arithmetic_mean,cbsa_code,city,county,county_code,first_max_value,latitude,longitude,method,metric_used,ninety_ninth_percentile,parameter,second_max_value,standard_deviation,state,state_code,units_of_measure,year
0.003579,23420,Fresno,Fresno,19,0.014,36.78538,-119.77321,Multiple Methods Used,Observed Values,0.014,Lead (TSP) LC,0.007,0.002129,CA,6,Micrograms/cubic meter (LC),2015
0.506257,23420,Fresno,Fresno,19,10.8,36.78538,-119.77321,INSTRUMENTAL - Pulsed Fluorescent 43C-TLE/43i-TLE,Observed Values,3.0,Sulfur dioxide,7.7,0.646618,CA,6,Parts per billion,2015
0.017739,20940,Calexico,Imperial,25,0.087,32.67618,-115.48307,Multiple Methods Used,Observed Values,0.087,Lead (TSP) LC,0.046,0.01728,CA,6,Micrograms/cubic meter (LC),2015
0.726755,20940,Calexico,Imperial,25,16.1,32.67618,-115.48307,INSTRUMENTAL - Pulsed Fluorescent 43C-TLE/43i-TLE,Observed Values,4.5,Sulfur dioxide,13.0,0.975822,CA,6,Parts per billion,2015
0.051669,13860,,Inyo,27,0.3,37.360684,-118.330783,INSTRUMENTAL - Pulsed Fluorescent 43C-TLE/43i-TLE,Observed Values,0.2,Sulfur dioxide,0.3,0.056891,CA,6,Parts per billion,2015
0.002931,31080,Los Angeles,Los Angeles,37,0.01,33.95507,-118.43049,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.01,Lead (TSP) LC,0.01,0.002784,CA,6,Micrograms/cubic meter (LC),2015
0.00445,31080,Long Beach,Los Angeles,37,0.017,33.79236,-118.17533,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.017,Lead (TSP) LC,0.012,0.003753,CA,6,Micrograms/cubic meter (LC),2015
0.007362,31080,Pico Rivera,Los Angeles,37,0.026,34.01029,-118.0685,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.026,Lead (TSP) LC,0.022,0.005074,CA,6,Micrograms/cubic meter (LC),2015
0.016517,31080,Commerce,Los Angeles,37,0.031,34.008333,-118.190556,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.031,Lead (TSP) LC,0.031,0.007486,CA,6,Micrograms/cubic meter (LC),2015
0.011488,31080,Commerce,Los Angeles,37,0.025,34.008333,-118.190556,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.025,Lead (TSP) LC,0.024,0.00573,CA,6,Micrograms/cubic meter (LC),2015


method
INSTRUMENTAL - Pulsed Fluorescent 43C-TLE/43i-TLE
Multiple Methods Used
Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min


parameter
Lead (TSP) LC
Sulfur dioxide


In [0]:
# Handle bringing in some SO2 types we don't want with 'Observed Values'
AQIDFclean = AQIDFclean.filter(~((AQIDF.parameter == 'Sulfur dioxide') & 
    (AQIDF.metric_used == 'Observed Values')))


In [0]:
LosAngelesDF = AQIDFclean.filter((AQIDFclean['county']  == 'Los Angeles') &
                           (AQIDFclean['metric_used']  == 'Observed Values') &
                           (AQIDFclean['year']  == '2015'))
display(LosAngelesDF)
display(LosAngelesDF.select("method").distinct())
display(LosAngelesDF.select("parameter").distinct())

arithmetic_mean,cbsa_code,city,county,county_code,first_max_value,latitude,longitude,method,metric_used,ninety_ninth_percentile,parameter,second_max_value,standard_deviation,state,state_code,units_of_measure,year
0.002931,31080,Los Angeles,Los Angeles,37,0.01,33.95507,-118.43049,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.01,Lead (TSP) LC,0.01,0.002784,CA,6,Micrograms/cubic meter (LC),2015
0.00445,31080,Long Beach,Los Angeles,37,0.017,33.79236,-118.17533,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.017,Lead (TSP) LC,0.012,0.003753,CA,6,Micrograms/cubic meter (LC),2015
0.007362,31080,Pico Rivera,Los Angeles,37,0.026,34.01029,-118.0685,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.026,Lead (TSP) LC,0.022,0.005074,CA,6,Micrograms/cubic meter (LC),2015
0.016517,31080,Commerce,Los Angeles,37,0.031,34.008333,-118.190556,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.031,Lead (TSP) LC,0.031,0.007486,CA,6,Micrograms/cubic meter (LC),2015
0.011488,31080,Commerce,Los Angeles,37,0.025,34.008333,-118.190556,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.025,Lead (TSP) LC,0.024,0.00573,CA,6,Micrograms/cubic meter (LC),2015
0.017859,31080,Vernon,Los Angeles,37,0.052,34.006389,-118.193056,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.038,Lead (TSP) LC,0.038,0.007715,CA,6,Micrograms/cubic meter (LC),2015
0.020161,31080,Vernon,Los Angeles,37,0.067,34.006389,-118.193056,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.06,Lead (TSP) LC,0.06,0.010168,CA,6,Micrograms/cubic meter (LC),2015
0.021236,31080,Vernon,Los Angeles,37,0.062,34.006389,-118.193056,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.057,Lead (TSP) LC,0.057,0.010019,CA,6,Micrograms/cubic meter (LC),2015
0.011389,31080,Industry (corporate name for City of Industry),Los Angeles,37,0.034,34.02637,-117.9822,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.034,Lead (TSP) LC,0.023,0.005584,CA,6,Micrograms/cubic meter (LC),2015
0.028678,31080,Santa Fe Springs,Los Angeles,37,0.097,33.954952,-118.055768,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min,Observed Values,0.097,Lead (TSP) LC,0.064,0.016331,CA,6,Micrograms/cubic meter (LC),2015


method
Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min


parameter
Lead (TSP) LC


In [0]:
from pyspark.sql import functions as F
AQIDFAgg = AQIDFclean.groupBy( 'state', 'county','year','parameter').pivot('parameter').agg(F.mean('arithmetic_mean'),
                              F.mean('first_max_value'),
                              F.mean('ninety_ninth_percentile'), 
                              F.mean('standard_deviation'),
                              F.mean('second_max_value'),
                              F.first('method'),
                              F.first('metric_used'),                            
                              F.first('units_of_measure'))

display(AQIDFAgg)

state,county,year,parameter,Lead (TSP) LC_avg(arithmetic_mean),Lead (TSP) LC_avg(first_max_value),Lead (TSP) LC_avg(ninety_ninth_percentile),Lead (TSP) LC_avg(standard_deviation),Lead (TSP) LC_avg(second_max_value),Lead (TSP) LC_first(method),Lead (TSP) LC_first(metric_used),Lead (TSP) LC_first(units_of_measure),Nitrogen dioxide (NO2)_avg(arithmetic_mean),Nitrogen dioxide (NO2)_avg(first_max_value),Nitrogen dioxide (NO2)_avg(ninety_ninth_percentile),Nitrogen dioxide (NO2)_avg(standard_deviation),Nitrogen dioxide (NO2)_avg(second_max_value),Nitrogen dioxide (NO2)_first(method),Nitrogen dioxide (NO2)_first(metric_used),Nitrogen dioxide (NO2)_first(units_of_measure),Ozone_avg(arithmetic_mean),Ozone_avg(first_max_value),Ozone_avg(ninety_ninth_percentile),Ozone_avg(standard_deviation),Ozone_avg(second_max_value),Ozone_first(method),Ozone_first(metric_used),Ozone_first(units_of_measure),PM10 Total 0-10um STP_avg(arithmetic_mean),PM10 Total 0-10um STP_avg(first_max_value),PM10 Total 0-10um STP_avg(ninety_ninth_percentile),PM10 Total 0-10um STP_avg(standard_deviation),PM10 Total 0-10um STP_avg(second_max_value),PM10 Total 0-10um STP_first(method),PM10 Total 0-10um STP_first(metric_used),PM10 Total 0-10um STP_first(units_of_measure),PM2.5 - Local Conditions_avg(arithmetic_mean),PM2.5 - Local Conditions_avg(first_max_value),PM2.5 - Local Conditions_avg(ninety_ninth_percentile),PM2.5 - Local Conditions_avg(standard_deviation),PM2.5 - Local Conditions_avg(second_max_value),PM2.5 - Local Conditions_first(method),PM2.5 - Local Conditions_first(metric_used),PM2.5 - Local Conditions_first(units_of_measure),Sulfur dioxide_avg(arithmetic_mean),Sulfur dioxide_avg(first_max_value),Sulfur dioxide_avg(ninety_ninth_percentile),Sulfur dioxide_avg(standard_deviation),Sulfur dioxide_avg(second_max_value),Sulfur dioxide_first(method),Sulfur dioxide_first(metric_used),Sulfur dioxide_first(units_of_measure)
CA,Alameda,2015,Nitrogen dioxide (NO2),,,,,,,,,20.1278364,49.22,41.7,9.059481,45.28,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2015,Ozone,,,,,,,,,,,,,,,,,0.0409086,0.0984,0.0862,0.0122206,0.092,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2016,Nitrogen dioxide (NO2),,,,,,,,,19.29764116666667,46.116666666666674,37.73333333333333,7.934909333333333,39.2,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2016,Ozone,,,,,,,,,,,,,,,,,0.0373358333333333,0.0821666666666666,0.074,0.011103,0.0781666666666666,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2017,Nitrogen dioxide (NO2),,,,,,,,,20.515038,61.19999999999999,44.43333333333334,9.129422,49.883333333333326,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2017,Ozone,,,,,,,,,,,,,,,,,0.038411,0.0976666666666666,0.071,0.0107346666666666,0.0873333333333333,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2018,Nitrogen dioxide (NO2),,,,,,,,,22.827882166666665,69.06666666666666,59.2,10.554471166666668,65.26666666666667,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2018,Ozone,,,,,,,,,,,,,,,,,0.0360527999999999,0.0713999999999999,0.0612,0.0095216,0.0673999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2019,Nitrogen dioxide (NO2),,,,,,,,,21.414331500000003,55.18333333333334,45.63333333333333,9.367031333333331,50.48333333333334,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2019,Ozone,,,,,,,,,,,,,,,,,0.0366726,0.0914,0.0679999999999999,0.0104352,0.0849999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,


Need to aggregate the parameters into a single row for each year and county

In [0]:
print(AQIDFAgg.printSchema())

Rename the PM2.5 columns because the agg() function does not handle column names with periods in them

In [0]:
AQIDFAgg = AQIDFAgg.withColumnRenamed("PM2.5 - Local Conditions_avg(ninety_ninth_percentile)","PM25 - Local Conditions_avg(ninety_ninth_percentile)")
AQIDFAgg = AQIDFAgg.withColumnRenamed("PM2.5 - Local Conditions_avg(arithmetic_mean)","PM25 - Local Conditions_avg(arithmetic_mean)")
AQIDFAgg = AQIDFAgg.withColumnRenamed("PM2.5 - Local Conditions_avg(first_max_value)","PM25 - Local Conditions_avg(first_max_value)")
AQIDFAgg = AQIDFAgg.withColumnRenamed("PM2.5 - Local Conditions_avg(standard_deviation)","PM25 - Local Conditions_avg(standard_deviation)")
AQIDFAgg = AQIDFAgg.withColumnRenamed("PM2.5 - Local Conditions_avg(second_max_value)","PM25 - Local Conditions_avg(second_max_value)")
AQIDFAgg = AQIDFAgg.withColumnRenamed("PM2.5 - Local Conditions_first(method)","PM25 - Local Conditions_first(method)")
AQIDFAgg = AQIDFAgg.withColumnRenamed("PM2.5 - Local Conditions_first(metric_used)","PM25 - Local Conditions_first(metric_used)")
AQIDFAgg = AQIDFAgg.withColumnRenamed("PM2.5 - Local Conditions_first(units_of_measure)","PM25 - Local Conditions_first(units_of_measure)")


AQIDFAgg.printSchema()


In [0]:
AQIDFAgg2 = AQIDFAgg.groupBy('state','county','year').agg(F.first('Lead (TSP) LC_avg(arithmetic_mean)', ignorenulls=True).alias('LEAD_MEAN'),
                             F.first('Lead (TSP) LC_avg(first_max_value)', ignorenulls=True).alias('LEAD_1STMAX'),
                              F.first('Lead (TSP) LC_avg(ninety_ninth_percentile)', ignorenulls=True).alias('LEAD_99PERC'), 
                              F.first('Lead (TSP) LC_avg(standard_deviation)', ignorenulls=True).alias('LEAD_STD'),
                              F.first('Lead (TSP) LC_avg(second_max_value)', ignorenulls=True).alias('LEAD_2NDMAX'),
                              F.first('Lead (TSP) LC_first(method)', ignorenulls=True).alias('LEAD_METHOD'),
                              F.first('Lead (TSP) LC_first(metric_used)', ignorenulls=True).alias('LEAD_METRIC'),                            
                              F.first('Lead (TSP) LC_first(units_of_measure)', ignorenulls=True).alias('LEAD_UNITS'),
                              F.first('Nitrogen dioxide (NO2)_avg(arithmetic_mean)', ignorenulls=True).alias('NO2_MEAN'),
                              F.first('Nitrogen dioxide (NO2)_avg(first_max_value)', ignorenulls=True).alias('NO2_1STMAX'),
                              F.first('Nitrogen dioxide (NO2)_avg(ninety_ninth_percentile)', ignorenulls=True).alias('NO2_99PERC'),
                              F.first('Nitrogen dioxide (NO2)_avg(standard_deviation)', ignorenulls=True).alias('NO2_STD'),
                              F.first('Nitrogen dioxide (NO2)_avg(second_max_value)', ignorenulls=True).alias('NO2_2NDMAX'),
                              F.first('Nitrogen dioxide (NO2)_first(method)', ignorenulls=True).alias('NO2_METHOD'),
                              F.first('Nitrogen dioxide (NO2)_first(metric_used)', ignorenulls=True).alias('NO2_METRIC'),
                              F.first('Nitrogen dioxide (NO2)_first(units_of_measure)', ignorenulls=True).alias('NO2_UNITS'),
                              F.first('Ozone_avg(arithmetic_mean)', ignorenulls=True).alias('OZONE_MEAN'),
                              F.first('Ozone_avg(first_max_value)', ignorenulls=True).alias('OZONE_1STMAX'),
                              F.first('Ozone_avg(ninety_ninth_percentile)', ignorenulls=True).alias('OZONE_99PERC'),
                              F.first('Ozone_avg(standard_deviation)', ignorenulls=True).alias('OZONE_STD'),
                              F.first('Ozone_avg(second_max_value)', ignorenulls=True).alias('OZONE_2NDMAX'),
                              F.first('Ozone_first(method)', ignorenulls=True).alias('OZONE_METHOD'),
                              F.first('Ozone_first(metric_used)', ignorenulls=True).alias('OZONE_METRIC'),
                              F.first('Ozone_first(units_of_measure)', ignorenulls=True).alias('OZONE_UNITS'),                                                                       
                              F.first('PM10 Total 0-10um STP_avg(arithmetic_mean)', ignorenulls=True).alias('PM10_MEAN'),
                              F.first('PM10 Total 0-10um STP_avg(first_max_value)', ignorenulls=True).alias('PM10_1STMAX'),
                              F.first('PM10 Total 0-10um STP_avg(ninety_ninth_percentile)', ignorenulls=True).alias('PM10_99PERC'),
                              F.first('PM10 Total 0-10um STP_avg(standard_deviation)', ignorenulls=True).alias('PM10_STD'),
                              F.first('PM10 Total 0-10um STP_avg(second_max_value)', ignorenulls=True).alias('PM10_2NDMAX'),
                              F.first('PM10 Total 0-10um STP_first(method)', ignorenulls=True).alias('PM10_METHOD'),
                              F.first('PM10 Total 0-10um STP_first(metric_used)', ignorenulls=True).alias('PM10_METRIC'),
                              F.first('PM10 Total 0-10um STP_first(units_of_measure)', ignorenulls=True).alias('PM10_UNITS'),
                              F.first('PM25 - Local Conditions_avg(arithmetic_mean)', ignorenulls=True).alias('PM25_MEAN'),
                              F.first('PM25 - Local Conditions_avg(first_max_value)', ignorenulls=True).alias('PM25_1STMAX'),
                              F.first('PM25 - Local Conditions_avg(ninety_ninth_percentile)', ignorenulls=True).alias('PM25_99PERC'),
                              F.first('PM25 - Local Conditions_avg(standard_deviation)', ignorenulls=True).alias('PM25_STD'),
                              F.first('PM25 - Local Conditions_avg(second_max_value)', ignorenulls=True).alias('PM25_2NDMAX'),
                              F.first('PM25 - Local Conditions_first(method)', ignorenulls=True).alias('PM25_METHOD'),
                              F.first('PM25 - Local Conditions_first(metric_used)', ignorenulls=True).alias('PM25_METRIC'),
                              F.first('PM25 - Local Conditions_first(units_of_measure)', ignorenulls=True).alias('PM25_UNITS'),
                              F.first('Sulfur dioxide_avg(arithmetic_mean)', ignorenulls=True).alias('SO2_MEAN'),                                     
                              F.first('Sulfur dioxide_avg(first_max_value)', ignorenulls=True).alias('SO2_1STMAX'),
                              F.first('Sulfur dioxide_avg(ninety_ninth_percentile)', ignorenulls=True).alias('SO2_99PERC'),
                              F.first('Sulfur dioxide_avg(standard_deviation)', ignorenulls=True).alias('SO2_STD'),
                              F.first('Sulfur dioxide_avg(second_max_value)', ignorenulls=True).alias('SO2_2NDMAX'),
                              F.first('Sulfur dioxide_first(method)', ignorenulls=True).alias('SO2_METHOD'),
                              F.first('Sulfur dioxide_first(metric_used)', ignorenulls=True).alias('SO2_METRIC'),
                              F.first('Sulfur dioxide_first(units_of_measure)', ignorenulls=True).alias('SO2_UNITS'))
                                                                                                    
display(AQIDFAgg2)

state,county,year,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD,LEAD_METRIC,LEAD_UNITS,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD,NO2_METRIC,NO2_UNITS,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD,OZONE_METRIC,OZONE_UNITS,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD,PM10_METRIC,PM10_UNITS,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD,PM25_METRIC,PM25_UNITS,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD,SO2_METRIC,SO2_UNITS
CA,Alameda,2015,,,,,,,,,20.1278364,49.22,41.7,9.059481,45.28,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0409086,0.0984,0.0862,0.0122206,0.092,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2016,,,,,,,,,19.29764116666667,46.116666666666674,37.73333333333333,7.934909333333333,39.2,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0373358333333333,0.0821666666666666,0.074,0.011103,0.0781666666666666,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2017,,,,,,,,,20.515038,61.19999999999999,44.43333333333334,9.129422,49.883333333333326,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.038411,0.0976666666666666,0.071,0.0107346666666666,0.0873333333333333,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2018,,,,,,,,,22.827882166666665,69.06666666666666,59.2,10.554471166666668,65.26666666666667,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0360527999999999,0.0713999999999999,0.0612,0.0095216,0.0673999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Alameda,2019,,,,,,,,,21.414331500000003,55.18333333333334,45.63333333333333,9.367031333333331,50.48333333333334,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0366726,0.0914,0.0679999999999999,0.0104352,0.0849999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Amador,2015,,,,,,,,,,,,,,,,,0.048449,0.103,0.088,0.015178,0.094,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Amador,2016,,,,,,,,,,,,,,,,,0.047055,0.104,0.081,0.014501,0.084,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Amador,2017,,,,,,,,,,,,,,,,,0.047845,0.094,0.085,0.014494,0.088,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Amador,2018,,,,,,,,,,,,,,,,,0.049264,0.109,0.092,0.015534,0.093,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,
CA,Amador,2019,,,,,,,,,,,,,,,,,0.0475,0.084,0.078,0.011916,0.083,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:

print(AQIDFAgg2.printSchema())

## LOAD

### 5. Link the asthma dataframe to the counties table

In [0]:
server = 'gen10-data-fundamentals-21-11-sql-server.database.windows.net'
database = 'jadr-SQL-Database'
port = '1433'
user = dbutils.secrets.get(scope = "jadr_blob", key = "SQLUser_dg")
password = dbutils.secrets.get(scope = "jadr_blob", key = "SQLPassword_dg")
url = f"jdbc:sqlserver://{server}:{port};databaseName={database};user={user};password={password};" 

Read in the current county table and state table and join them. We will match up the county state in each and join to the asthma dataframe. If a match is not found, do a quick check that there isn't an error. If not, go ahead and add the mising county to the database.

In [0]:

countyDF = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", "County") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()
display(countyDF)
countySchema = countyDF.schema
print(countySchema)
print(countySchema[1].name)

stateDF = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", "State") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()
display(stateDF)


countyStateDF = countyDF.join(stateDF, on='STATE_ID', how='inner')
display(countyStateDF)

COUNTY_ID,COUNTY_NAME,STATE_ID
1,Tuscaloosa,28
2,Sumter,28
3,Fremont,22
4,Pratt,42
5,Baltimo,3
6,Oregon,43
7,Carroll,20
8,Warren,19
9,Cattaraugus,19
10,Jefferson,26


STATE_ID,STATE_ABBR,STATE_NAME
1,MT,Montana
2,NC,North Carolina
3,MD,Maryland
4,CO,Colorado
5,CT,Connecticut
6,IL,Illinois
7,WY,Wyoming
8,NJ,New Jersey
9,DE,Delaware
10,DC,District of Columbia


STATE_ID,COUNTY_ID,COUNTY_NAME,STATE_ABBR,STATE_NAME
1,3109,Big Horn,MT,Montana
1,2975,Petroleum,MT,Montana
1,2953,Granite,MT,Montana
1,2924,Carter,MT,Montana
1,2832,Rosebud,MT,Montana
1,2770,Flathead,MT,Montana
1,2639,Daniels,MT,Montana
1,2568,Deer Lodge,MT,Montana
1,2498,Ravalli,MT,Montana
1,2497,McCone,MT,Montana


In [0]:
from pyspark.sql.types import StructType,StructField, StringType
import pyodbc
# If nothing has been loaded in the county table in the database, we need to construct the first set from the asthma dataset
if(countyStateDF.count() == 0):
    schema = StructType([ \
        StructField("STATE_ABBR",StringType(),True), \
        StructField("STATE_NAME",StringType(),True)])
    newStateDF = spark.createDataFrame([("CA", "California")],schema)
    
    # Save and reload to get the latest ID numbers
    # Put in a try statement in case the state already exists
    try:
        writer = newStateDF.write.format("jdbc").option("url", url) \
            .mode("append") \
            .option("dbtable", 'State') \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
        writer.save()
    except Exception as e:
        print(e)
    
    stateDF = spark.read.format("jdbc") \
        .option("url", url) \
        .option("dbtable", "State") \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    display(stateDF)
    
    countyDF = asthmaTotalDFjoined.select('COUNTY').distinct()
    # Need to remember that this list has statewide data. Remove California from the county list
    countyDF = countyDF.filter(countyDF.COUNTY != 'California')
    display(countyDF)
    countyDF = countyDF.withColumn('STATE_ID', lit(stateDF.filter((stateDF.STATE_ABBR == 'CA')).first()['STATE_ID']))
    display(countyDF)
    # Rename the column to match the SQL table
    countyDF = countyDF.withColumnRenamed("COUNTY",countySchema[1].name)
    print(countyDF.printSchema())
    # Write and reload to get the latest ID numbers
    try:
        writer = countyDF.write.format("jdbc").option("url", url) \
            .mode("append") \
            .option("dbtable", 'County') \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
        writer.save()
    except Exception as e:
        print(e)
    
    countyDF = spark.read.format("jdbc") \
        .option("url", url) \
        .option("dbtable", "County") \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    display(countyDF)
    
    # Join the two new dataframes and continue
    countyStateDF = countyDF.join(stateDF, on='STATE_ID', how='inner')
    display(countyStateDF)

In [0]:
import pandas as pd
# Add in an ID to the table 
asthmaTotalDF = asthmaTotalDF.withColumn('AsthmaID',F.monotonically_increasing_id()+1)

matched = pd.DataFrame([])
noMatchFound = False
display(asthmaTotalDF)
# If there are current countyies in the database, look through them for a match
if(countyStateDF.count() > 0):
    for countyAsthma in asthmaTotalDF.toLocalIterator():
        matchedCol = countyStateDF.filter((countyStateDF.COUNTY_NAME == countyAsthma.COUNTY) & (countyStateDF.STATE_ABBR == countyAsthma.STATE)).select('COUNTY_ID')  
        # There are conditions where there isn't a match like if the county isn't loaded in the in the table, yet, or we have statewide data.
        if(matchedCol.count() >0):
            matched = matched.append({'AsthmaID': countyAsthma.AsthmaID, 
                                      'COUNTY_ID': matchedCol.collect()[0][0]},
                                     ignore_index=True)
        else: noMatchFound = True
    if(len(matched)>0):    
        matched = spark.createDataFrame(matched)
        display(matched)
        # if everything went ok, then join the matched tabled of county ID's to the asthma dataframe
        asthmaTotalDF = asthmaTotalDF.join(matched, on='AsthmaID', how='left_outer')
        display(asthmaTotalDF)
    else:
        print("NOTHING MATCHED")
        asthmaTotalDF = asthmaTotalDF.withColumn('COUNTY_ID', lit(None))

display(asthmaTotalDF)


STATE,COUNTY,YEAR,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE,AsthmaID
CA,California,2015,191904,50.4,1
CA,Alameda,2015,9939,64.3,2
CA,Amador,2015,196,58.4,3
CA,Butte,2015,1044,50.2,4
CA,Calaveras,2015,185,48.0,5
CA,Colusa,2015,97,41.4,6
CA,Contra Costa,2015,6858,65.2,7
CA,Del Norte,2015,140,53.0,8
CA,El Dorado,2015,592,36.4,9
CA,Fresno,2015,7936,77.0,10


AsthmaID,COUNTY_ID
2.0,3085.0
3.0,843.0
4.0,152.0
5.0,65.0
6.0,545.0
7.0,46.0
8.0,2947.0
9.0,1751.0
10.0,2778.0
11.0,2490.0


AsthmaID,STATE,COUNTY,YEAR,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE,COUNTY_ID
147,CA,Riverside,2017,10076,43.2,586.0
170,CA,Yolo,2017,894,43.3,1994.0
184,CA,Imperial,2018,1147,58.1,1113.0
160,CA,Siskiyou,2017,159,42.1,534.0
169,CA,Ventura,2017,2836,34.8,1406.0
8,CA,Del Norte,2015,140,53.0,2947.0
67,CA,Fresno,2016,6930,67.4,2778.0
70,CA,Imperial,2016,1287,66.3,1113.0
168,CA,Tuolumne,2017,244,53.6,1144.0
69,CA,Humboldt,2016,815,64.5,224.0


AsthmaID,STATE,COUNTY,YEAR,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE,COUNTY_ID
147,CA,Riverside,2017,10076,43.2,586.0
170,CA,Yolo,2017,894,43.3,1994.0
184,CA,Imperial,2018,1147,58.1,1113.0
160,CA,Siskiyou,2017,159,42.1,534.0
169,CA,Ventura,2017,2836,34.8,1406.0
8,CA,Del Norte,2015,140,53.0,2947.0
67,CA,Fresno,2016,6930,67.4,2778.0
70,CA,Imperial,2016,1287,66.3,1113.0
168,CA,Tuolumne,2017,244,53.6,1144.0
69,CA,Humboldt,2016,815,64.5,224.0


In [0]:
# If there was a missing county, we need to add it to the database
if noMatchFound:
    missingCountyAsthmaDF = asthmaTotalDF.filter((asthmaTotalDF.COUNTY.isNull()))
   
    display(missingCountyAsthmaDF)

    missingStates = pd.DataFrame()
    missingCounties = pd.DataFrame()
    for countyAsthma in missingCountyAsthmaDF.toLocalIterator():
        # We need to check to see if this is the statewide row. All others, add them to the county table. 
        if countyAsthma.COUNTY == 'California': continue
        else:
            # Check if state already exists
            if(stateDF.filter(stateDF.STATE_ABBR == countyAsthma.STATE).count() == 0):
                # If not, add it to a list of missing states
                missingStates.append({'STATE_ABBR': countyAsthma.STATE, 
                                      'STATE_NAME': "California"}, # Need to get a full name for general use
                                     ignore_index=True)      
            # Add to the list of missing counties
            missingCounties.append({'COUNTY_NAME': countyAsthma.COUNTY,
                                   'STATE': countyAsthma.STATE},
                                   ignore_index=True)
                                      
    # If we were missing states, add them to the DB                                  
    if(len(missingStates) > 0):       
        statesSchema = StructType([ \
            StructField("STATE_ABBR",StringType(),True), \
            StructField("STATE_NAME",StringType(),True)]) 
        newStatesDF = spark.createDataFrame(missingStates,schema)
        # Save and reload to get the latest ID numbers
        # Put in a try statement in case the state already exists
        try:
            writer = newStatesDF.write.format("jdbc").option("url", url) \
                .mode("append") \
                .option("dbtable", 'State') \
                .option("user", user) \
                .option("password", password) \
                .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
            writer.save()
        except Exception as e:
            print(e)

        stateDF = spark.read.format("jdbc") \
            .option("url", url) \
            .option("dbtable", "State") \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .load()
        display(stateDF)    
        
    if(len(missingCounties) > 0):   
        # Get the state ID and add to the dataframe
        missingCounties = spark.createDataFrame(missingCounties)
        newcountyDF = missingCounties.withColumn('STATE_ID', lit(stateDF.filter((stateDF.STATE_ABBR == missingCounties.STATE)).first()['STATE_ID']))
        # Append and reload to get the latest ID numbers
        try:
            writer = newcountyDF.write.format("jdbc").option("url", url) \
                .mode("append") \
                .option("dbtable", 'County') \
                .option("user", user) \
                .option("password", password) \
                .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
            writer.save()
        except Exception as e:
            print(e)

        countyDF = spark.read.format("jdbc") \
            .option("url", url) \
            .option("dbtable", "County") \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .load()
        display(countyDF)

        # Get the new combined county state dataframe
        countyStateDF = countyDF.join(stateDF, on='STATE_ID', how='inner')
        display(countyStateDF)           
            
        # Redo the step of finding the county matches in the asthma dataframe
        matched = pd.DataFrame([])
        for countyAsthma in asthmaTotalDFjoined.toLocalIterator():
            matchedCol = countyStateDF.filter((countyStateDF.COUNTY_NAME == countyAsthma.COUNTY) & (countyStateDF.STATE_ABBR == countyAsthma.STATE)).select('COUNTY_ID')  
            # There are conditions where there isn't a match still like if the we have statewide data.
            if(matchedCol.count() >0):
                matched = matched.append({'AsthmaID': countyAsthma.AsthmaID, 
                                          'COUNTY_ID': matchedCol.collect()[0][0]},
                                         ignore_index=True)
        if(len(matched) > 0):
            matched = spark.createDataFrame(matched)
            display(matched)
            # if everything went ok, then join the matched tabled of county ID's to the asthma dataframe
            # Since we ran this once already to find the missing columns, there may already be a COUNTY_ID column. If so remove it
            try:
                asthmaTotalDFjoined = asthmaTotalDFjoined.drop('COUNTY_ID')
            except Exception as e:
                print(e)
            asthmaTotalDFjoined = asthmaTotalDFjoined.join(matched, on='AsthmaID', how='left_outer')
            display(asthmaTotalDFjoined)
        else:
            print("NOTHING MATCHED")
 
            
    

AsthmaID,STATE,COUNTY,YEAR,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE,COUNTY_ID


Since we merged in the COUNTY_ID, there is no need for the STATE and COUNTY columns

In [0]:
asthmaTotalDF = asthmaTotalDF.drop('STATE', 'COUNTY')
display(asthmaTotalDF)

AsthmaID,YEAR,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE,COUNTY_ID
147,2017,10076,43.2,586.0
170,2017,894,43.3,1994.0
184,2018,1147,58.1,1113.0
160,2017,159,42.1,534.0
169,2017,2836,34.8,1406.0
8,2015,140,53.0,2947.0
67,2016,6930,67.4,2778.0
70,2016,1287,66.3,1113.0
168,2017,244,53.6,1144.0
69,2016,815,64.5,224.0


### 6. Create the metric, method, and unit tables and link to the air quality dataset. Link  the County Table , too.

In [0]:
# Add in an ID to the table 
AQIDFAgg2 = AQIDFAgg2.withColumn('AQ_ID',F.monotonically_increasing_id()+1)
print(AQIDFAgg2.printSchema())
display(AQIDFAgg2)

state,county,year,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD,LEAD_METRIC,LEAD_UNITS,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD,NO2_METRIC,NO2_UNITS,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD,OZONE_METRIC,OZONE_UNITS,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD,PM10_METRIC,PM10_UNITS,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD,PM25_METRIC,PM25_UNITS,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD,SO2_METRIC,SO2_UNITS,AQ_ID
CA,Alameda,2015,,,,,,,,,20.1278364,49.22,41.7,9.059481,45.28,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0409086,0.0984,0.0862,0.0122206,0.092,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,1
CA,Alameda,2016,,,,,,,,,19.29764116666667,46.116666666666674,37.73333333333333,7.934909333333333,39.2,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0373358333333333,0.0821666666666666,0.074,0.011103,0.0781666666666666,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,2
CA,Alameda,2017,,,,,,,,,20.515038,61.19999999999999,44.43333333333334,9.129422,49.883333333333326,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.038411,0.0976666666666666,0.071,0.0107346666666666,0.0873333333333333,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3
CA,Alameda,2018,,,,,,,,,22.827882166666665,69.06666666666666,59.2,10.554471166666668,65.26666666666667,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0360527999999999,0.0713999999999999,0.0612,0.0095216,0.0673999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,4
CA,Alameda,2019,,,,,,,,,21.414331500000003,55.18333333333334,45.63333333333333,9.367031333333331,50.48333333333334,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0366726,0.0914,0.0679999999999999,0.0104352,0.0849999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,5
CA,Amador,2015,,,,,,,,,,,,,,,,,0.048449,0.103,0.088,0.015178,0.094,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,6
CA,Amador,2016,,,,,,,,,,,,,,,,,0.047055,0.104,0.081,0.014501,0.084,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,7
CA,Amador,2017,,,,,,,,,,,,,,,,,0.047845,0.094,0.085,0.014494,0.088,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,8
CA,Amador,2018,,,,,,,,,,,,,,,,,0.049264,0.109,0.092,0.015534,0.093,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,9
CA,Amador,2019,,,,,,,,,,,,,,,,,0.0475,0.084,0.078,0.011916,0.083,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,10


Method

In [0]:
# Get the unique methods across all method columns in the airquality dataframe
methodsDF = AQIDFAgg2.select('LEAD_METHOD').distinct()
methodsDF = methodsDF.withColumnRenamed("LEAD_METHOD","METHOD_NAME")
methodsDF2 = AQIDFAgg2.select('NO2_METHOD').distinct()
methodsDF2 = methodsDF2.withColumnRenamed("NO2_METHOD","METHOD_NAME")
methodsDF = methodsDF2.union(methodsDF)

methodsDF2 = AQIDFAgg2.select('OZONE_METHOD').distinct()
methodsDF2 = methodsDF2.withColumnRenamed("OZONE_METHOD","METHOD_NAME")
methodsDF = methodsDF2.union(methodsDF)

methodsDF2 = AQIDFAgg2.select('PM10_METHOD').distinct()
methodsDF2 = methodsDF2.withColumnRenamed("PM10_METHOD","METHOD_NAME")
methodsDF = methodsDF2.union(methodsDF)

methodsDF2 = AQIDFAgg2.select('PM25_METHOD').distinct()
methodsDF2 = methodsDF2.withColumnRenamed("PM25_METHOD","METHOD_NAME")
methodsDF = methodsDF2.union(methodsDF)

methodsDF2 = AQIDFAgg2.select('SO2_METHOD').distinct()
methodsDF2 = methodsDF2.withColumnRenamed("SO2_METHOD","METHOD_NAME")
methodsDF = methodsDF2.union(methodsDF)

methodsDF = methodsDF.select('METHOD_NAME').distinct()
display(methodsDF)

METHOD_NAME
INSTRUMENTAL - Pulsed Fluorescent 43C-TLE/43i-TLE
""
Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIMETRIC
Multiple Methods Used
R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC - Gravimetric
INSTRUMENTAL - ULTRA VIOLET ABSORPTION
INSTRUMENTAL - ULTRA VIOLET
INSTRUMENTAL - CHEMILUMINESCENCE
INSTRUMENTAL - GAS PHASE CHEMILUMINESCENCE
Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min


In [0]:
try:
    writer = methodsDF.write.format("jdbc").option("url", url) \
        .mode("append") \
        .option("dbtable", 'Method') \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    writer.save()
except Exception as e:
    print(e)

    df_loaded = spark.read.format("jdbc") \
        .option("url", url) \
        .option("dbtable", "Method") \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    display(df_loaded)   
        
    # Subtract the two to only leave what is not in the database
    # remove the index column of the loaded dataframe
    df_loaded = df_loaded.drop('METHOD_ID')
    df_toload = methodsDF.subtract(df_loaded)
    display(df_toload)
    
    # Upload all the data that wasn't already in the database
    df_toload.write.format("jdbc")  \
        .option("url", url) \
        .mode("append") \
        .option("dbtable", "Method") \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .save()
    
methodsDF = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", "Method") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()
display(methodsDF)  


METHOD_ID,METHOD_NAME
3,
50,Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIMETRIC
2,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min
4,INSTRUMENTAL - CHEMILUMINESCENCE
5,INSTRUMENTAL - GAS PHASE CHEMILUMINESCENCE
9,INSTRUMENTAL - Pulsed Fluorescent 43C-TLE/43i-TLE
6,INSTRUMENTAL - ULTRA VIOLET
7,INSTRUMENTAL - ULTRA VIOLET ABSORPTION
1,Multiple Methods Used
8,R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC - Gravimetric


METHOD_NAME


METHOD_ID,METHOD_NAME
3,
50,Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIMETRIC
2,Hi-Vol - ICAP SPECTRA (ICP-MS); 0.45M HNO3 Boil30 min
4,INSTRUMENTAL - CHEMILUMINESCENCE
5,INSTRUMENTAL - GAS PHASE CHEMILUMINESCENCE
9,INSTRUMENTAL - Pulsed Fluorescent 43C-TLE/43i-TLE
6,INSTRUMENTAL - ULTRA VIOLET
7,INSTRUMENTAL - ULTRA VIOLET ABSORPTION
1,Multiple Methods Used
8,R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC - Gravimetric


In [0]:
if(methodsDF.count() > 0):
    lead_matched = pd.DataFrame([])    
    NO2_matched = pd.DataFrame([])
    ozone_matched = pd.DataFrame([])
    PM10_matched = pd.DataFrame([])
    PM25_matched = pd.DataFrame([])
    SO2_matched = pd.DataFrame([])

    # Loop through all the rows in the AQ dataframe and find the matching method in the methods dataframe
    for AQrow in AQIDFAgg2.toLocalIterator():
        if(AQrow.LEAD_METHOD == None): lead_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME.isNull())).select('METHOD_ID')
        else: lead_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME == AQrow.LEAD_METHOD)).select('METHOD_ID')  
        
        if(AQrow.NO2_METHOD == None): NO2_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME.isNull())).select('METHOD_ID')
        else: NO2_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME == AQrow.NO2_METHOD)).select('METHOD_ID')  
            
        if(AQrow.OZONE_METHOD == None): ozone_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME.isNull())).select('METHOD_ID')
        else: ozone_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME == AQrow.OZONE_METHOD)).select('METHOD_ID')  
            
        if(AQrow.PM10_METHOD == None): PM10_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME.isNull())).select('METHOD_ID')
        else: PM10_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME == AQrow.PM10_METHOD)).select('METHOD_ID')  
            
        if(AQrow.PM25_METHOD == None): PM25_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME.isNull())).select('METHOD_ID')
        else: PM25_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME == AQrow.PM25_METHOD)).select('METHOD_ID')  
            
        if(AQrow.SO2_METHOD == None): SO2_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME.isNull())).select('METHOD_ID')
        else: SO2_matchedCol = methodsDF.filter((methodsDF.METHOD_NAME == AQrow.SO2_METHOD)).select('METHOD_ID')  

        if(lead_matchedCol.count() > 0): lead_matched = lead_matched.append({'AQ_ID': AQrow.AQ_ID, 'LEAD_METHOD_ID': lead_matchedCol.collect()[0][0]}, ignore_index=True)        
        if(NO2_matchedCol.count() > 0): NO2_matched = NO2_matched.append({'AQ_ID': AQrow.AQ_ID, 'NO2_METHOD_ID': NO2_matchedCol.collect()[0][0]}, ignore_index=True)
        if(ozone_matchedCol.count() > 0): ozone_matched = ozone_matched.append({'AQ_ID': AQrow.AQ_ID, 'OZONE_METHOD_ID': ozone_matchedCol.collect()[0][0]}, ignore_index=True)
        if(PM10_matchedCol.count() > 0): PM10_matched = PM10_matched.append({'AQ_ID': AQrow.AQ_ID, 'PM10_METHOD_ID': PM10_matchedCol.collect()[0][0]}, ignore_index=True)
        if(PM25_matchedCol.count() > 0): PM25_matched = PM25_matched.append({'AQ_ID': AQrow.AQ_ID, 'PM25_METHOD_ID': PM25_matchedCol.collect()[0][0]}, ignore_index=True)
        if(SO2_matchedCol.count() > 0): SO2_matched = SO2_matched.append({'AQ_ID': AQrow.AQ_ID, 'SO2_METHOD_ID': SO2_matchedCol.collect()[0][0]}, ignore_index=True)

                
    if(len(lead_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(lead_matched), on='AQ_ID', how='left_outer')
    if(len(NO2_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(NO2_matched), on='AQ_ID', how='left_outer')
    if(len(ozone_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(ozone_matched), on='AQ_ID', how='left_outer')
    if(len(PM10_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(PM10_matched), on='AQ_ID', how='left_outer')
    if(len(PM25_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(PM25_matched), on='AQ_ID', how='left_outer')
    if(len(SO2_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(SO2_matched), on='AQ_ID', how='left_outer')
   

    display(AQIDFAgg2)


AQ_ID,state,county,year,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD,LEAD_METRIC,LEAD_UNITS,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD,NO2_METRIC,NO2_UNITS,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD,OZONE_METRIC,OZONE_UNITS,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD,PM10_METRIC,PM10_UNITS,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD,PM25_METRIC,PM25_UNITS,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD,SO2_METRIC,SO2_UNITS,LEAD_METHOD_ID,NO2_METHOD_ID,OZONE_METHOD_ID,PM10_METHOD_ID,PM25_METHOD_ID,SO2_METHOD_ID
1,CA,Alameda,2015,,,,,,,,,20.1278364,49.22,41.7,9.059481,45.28,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0409086,0.0984,0.0862,0.0122206,0.092,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0
2,CA,Alameda,2016,,,,,,,,,19.29764116666667,46.116666666666674,37.73333333333333,7.934909333333333,39.2,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0373358333333333,0.0821666666666666,0.074,0.011103,0.0781666666666666,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0
3,CA,Alameda,2017,,,,,,,,,20.515038,61.19999999999999,44.43333333333334,9.129422,49.883333333333326,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.038411,0.0976666666666666,0.071,0.0107346666666666,0.0873333333333333,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0
4,CA,Alameda,2018,,,,,,,,,22.827882166666665,69.06666666666666,59.2,10.554471166666668,65.26666666666667,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0360527999999999,0.0713999999999999,0.0612,0.0095216,0.0673999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0
5,CA,Alameda,2019,,,,,,,,,21.414331500000003,55.18333333333334,45.63333333333333,9.367031333333331,50.48333333333334,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0366726,0.0914,0.0679999999999999,0.0104352,0.0849999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0
6,CA,Amador,2015,,,,,,,,,,,,,,,,,0.048449,0.103,0.088,0.015178,0.094,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0
7,CA,Amador,2016,,,,,,,,,,,,,,,,,0.047055,0.104,0.081,0.014501,0.084,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0
8,CA,Amador,2017,,,,,,,,,,,,,,,,,0.047845,0.094,0.085,0.014494,0.088,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0
9,CA,Amador,2018,,,,,,,,,,,,,,,,,0.049264,0.109,0.092,0.015534,0.093,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0
10,CA,Amador,2019,,,,,,,,,,,,,,,,,0.0475,0.084,0.078,0.011916,0.083,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0


Metric

In [0]:
# Get the unique methods across all method columns in the airquality dataframe
metricsDF = AQIDFAgg2.select('LEAD_METRIC').distinct()
metricsDF = metricsDF.withColumnRenamed("LEAD_METRIC","METRIC_NAME")
metricsDF2 = AQIDFAgg2.select('NO2_METRIC').distinct()
metricsDF2 = metricsDF2.withColumnRenamed("NO2_METRIC","METRIC_NAME")
metricsDF = metricsDF2.union(metricsDF)

metricsDF2 = AQIDFAgg2.select('OZONE_METRIC').distinct()
metricsDF2 = metricsDF2.withColumnRenamed("OZONE_METRIC","METRIC_NAME")
metricsDF = metricsDF2.union(metricsDF)

metricsDF2 = AQIDFAgg2.select('PM10_METRIC').distinct()
metricsDF2 = metricsDF2.withColumnRenamed("PM10_METRIC","METRIC_NAME")
metricsDF = metricsDF2.union(metricsDF)

metricsDF2 = AQIDFAgg2.select('PM25_METRIC').distinct()
metricsDF2 = metricsDF2.withColumnRenamed("PM25_METRIC","METRIC_NAME")
metricsDF = metricsDF2.union(metricsDF)

metricsDF2 = AQIDFAgg2.select('SO2_METRIC').distinct()
metricsDF2 = metricsDF2.withColumnRenamed("SO2_METRIC","METRIC_NAME")
metricsDF = metricsDF2.union(metricsDF)

metricsDF = metricsDF.select('METRIC_NAME').distinct()
display(metricsDF)

METRIC_NAME
Daily maximum 1-hour average
""
Daily Mean
Observed Values
Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)
Daily Maximum 1-hour average


In [0]:
try:
    metricsDF = metricsDF.sort('METRIC_NAME')    
    display(metricsDF)
    writer = metricsDF.write.format("jdbc").option("url", url) \
        .mode("append") \
        .option("dbtable", 'Metric') \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    writer.save()
except Exception as e:
    print(e)

    df_loaded = spark.read.format("jdbc") \
        .option("url", url) \
        .option("dbtable", "Metric") \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    print('df_loaded')
    display(df_loaded)   
        
    # Subtract the two to only leave what is not in the database
    # remove the index column of the loaded dataframe
    df_loaded = df_loaded.drop('METRIC_ID')
    df_toload = metricsDF.subtract(df_loaded)
    print('df_toload')
    display(df_toload)
    
    if(df_toload.count() > 0):
        # Upload all the data that wasn't already in the database
        df_toload.write.format("jdbc")  \
            .option("url", url) \
            .mode("append") \
            .option("dbtable", "Metric") \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .save()
    
metricsDF = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", "Metric") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()
display(metricsDF)  


METRIC_NAME
""
Daily Maximum 1-hour average
Daily Mean
Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)
Daily maximum 1-hour average
Observed Values


METRIC_ID,METRIC_NAME
2,
4,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)
3,Daily Maximum 1-hour average
6,Daily maximum 1-hour average
5,Daily Mean
1,Observed Values


METRIC_NAME


METRIC_ID,METRIC_NAME
2,
4,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)
3,Daily Maximum 1-hour average
6,Daily maximum 1-hour average
5,Daily Mean
1,Observed Values


In [0]:
if(metricsDF.count() > 0):
    lead_matched = pd.DataFrame([])    
    NO2_matched = pd.DataFrame([])
    ozone_matched = pd.DataFrame([])
    PM10_matched = pd.DataFrame([])
    PM25_matched = pd.DataFrame([])
    SO2_matched = pd.DataFrame([])

    # Loop through all the rows in the AQ dataframe and find the matching method in the metrics dataframe
    for AQrow in AQIDFAgg2.toLocalIterator():
        if(AQrow.LEAD_METRIC == None): lead_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME.isNull())).select('METRIC_ID')
        else: lead_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME == AQrow.LEAD_METRIC)).select('METRIC_ID')  
        
        if(AQrow.NO2_METRIC == None): NO2_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME.isNull())).select('METRIC_ID')
        else: NO2_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME == AQrow.NO2_METRIC)).select('METRIC_ID')  
            
        if(AQrow.OZONE_METRIC == None): ozone_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME.isNull())).select('METRIC_ID')
        else: ozone_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME == AQrow.OZONE_METRIC)).select('METRIC_ID')  
            
        if(AQrow.PM10_METRIC == None): PM10_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME.isNull())).select('METRIC_ID')
        else: PM10_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME == AQrow.PM10_METRIC)).select('METRIC_ID')  
            
        if(AQrow.PM25_METRIC == None): PM25_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME.isNull())).select('METRIC_ID')
        else: PM25_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME == AQrow.PM25_METRIC)).select('METRIC_ID')  
            
        if(AQrow.SO2_METRIC == None): SO2_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME.isNull())).select('METRIC_ID')
        else: SO2_matchedCol = metricsDF.filter((metricsDF.METRIC_NAME == AQrow.SO2_METRIC)).select('METRIC_ID')  

        if(lead_matchedCol.count() > 0): lead_matched = lead_matched.append({'AQ_ID': AQrow.AQ_ID, 'LEAD_METRIC_ID': lead_matchedCol.collect()[0][0]}, ignore_index=True)        
        if(NO2_matchedCol.count() > 0): NO2_matched = NO2_matched.append({'AQ_ID': AQrow.AQ_ID, 'NO2_METRIC_ID': NO2_matchedCol.collect()[0][0]}, ignore_index=True)
        if(ozone_matchedCol.count() > 0): ozone_matched = ozone_matched.append({'AQ_ID': AQrow.AQ_ID, 'OZONE_METRIC_ID': ozone_matchedCol.collect()[0][0]}, ignore_index=True)
        if(PM10_matchedCol.count() > 0): PM10_matched = PM10_matched.append({'AQ_ID': AQrow.AQ_ID, 'PM10_METRIC_ID': PM10_matchedCol.collect()[0][0]}, ignore_index=True)
        if(PM25_matchedCol.count() > 0): PM25_matched = PM25_matched.append({'AQ_ID': AQrow.AQ_ID, 'PM25_METRIC_ID': PM25_matchedCol.collect()[0][0]}, ignore_index=True)
        if(SO2_matchedCol.count() > 0): SO2_matched = SO2_matched.append({'AQ_ID': AQrow.AQ_ID, 'SO2_METRIC_ID': SO2_matchedCol.collect()[0][0]}, ignore_index=True)

                
    if(len(lead_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(lead_matched), on='AQ_ID', how='left_outer')
    if(len(NO2_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(NO2_matched), on='AQ_ID', how='left_outer')
    if(len(ozone_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(ozone_matched), on='AQ_ID', how='left_outer')
    if(len(PM10_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(PM10_matched), on='AQ_ID', how='left_outer')
    if(len(PM25_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(PM25_matched), on='AQ_ID', how='left_outer')
    if(len(SO2_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(SO2_matched), on='AQ_ID', how='left_outer')

        
    display(AQIDFAgg2)


AQ_ID,state,county,year,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD,LEAD_METRIC,LEAD_UNITS,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD,NO2_METRIC,NO2_UNITS,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD,OZONE_METRIC,OZONE_UNITS,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD,PM10_METRIC,PM10_UNITS,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD,PM25_METRIC,PM25_UNITS,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD,SO2_METRIC,SO2_UNITS,LEAD_METHOD_ID,NO2_METHOD_ID,OZONE_METHOD_ID,PM10_METHOD_ID,PM25_METHOD_ID,SO2_METHOD_ID,LEAD_METRIC_ID,NO2_METRIC_ID,OZONE_METRIC_ID,PM10_METRIC_ID,PM25_METRIC_ID,SO2_METRIC_ID
1,CA,Alameda,2015,,,,,,,,,20.1278364,49.22,41.7,9.059481,45.28,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0409086,0.0984,0.0862,0.0122206,0.092,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0
2,CA,Alameda,2016,,,,,,,,,19.29764116666667,46.116666666666674,37.73333333333333,7.934909333333333,39.2,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0373358333333333,0.0821666666666666,0.074,0.011103,0.0781666666666666,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0
3,CA,Alameda,2017,,,,,,,,,20.515038,61.19999999999999,44.43333333333334,9.129422,49.883333333333326,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.038411,0.0976666666666666,0.071,0.0107346666666666,0.0873333333333333,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0
4,CA,Alameda,2018,,,,,,,,,22.827882166666665,69.06666666666666,59.2,10.554471166666668,65.26666666666667,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0360527999999999,0.0713999999999999,0.0612,0.0095216,0.0673999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0
5,CA,Alameda,2019,,,,,,,,,21.414331500000003,55.18333333333334,45.63333333333333,9.367031333333331,50.48333333333334,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0366726,0.0914,0.0679999999999999,0.0104352,0.0849999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0
6,CA,Amador,2015,,,,,,,,,,,,,,,,,0.048449,0.103,0.088,0.015178,0.094,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0
7,CA,Amador,2016,,,,,,,,,,,,,,,,,0.047055,0.104,0.081,0.014501,0.084,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0
8,CA,Amador,2017,,,,,,,,,,,,,,,,,0.047845,0.094,0.085,0.014494,0.088,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0
9,CA,Amador,2018,,,,,,,,,,,,,,,,,0.049264,0.109,0.092,0.015534,0.093,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0
10,CA,Amador,2019,,,,,,,,,,,,,,,,,0.0475,0.084,0.078,0.011916,0.083,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0


Units

In [0]:
# Get the unique methods across all method columns in the airquality dataframe
unitsDF = AQIDFAgg2.select('LEAD_UNITS').distinct()
unitsDF = unitsDF.withColumnRenamed("LEAD_UNITS","UNIT_NAME")
unitsDF2 = AQIDFAgg2.select('NO2_UNITS').distinct()
unitsDF2 = unitsDF2.withColumnRenamed("NO2_UNITS","UNIT_NAME")
unitsDF = unitsDF2.union(unitsDF)

unitsDF2 = AQIDFAgg2.select('OZONE_UNITS').distinct()
unitsDF2 = unitsDF2.withColumnRenamed("OZONE_UNITS","UNIT_NAME")
unitsDF = unitsDF2.union(unitsDF)

unitsDF2 = AQIDFAgg2.select('PM10_UNITS').distinct()
unitsDF2 = unitsDF2.withColumnRenamed("PM10_UNITS","UNIT_NAME")
unitsDF = unitsDF2.union(unitsDF)

unitsDF2 = AQIDFAgg2.select('PM25_UNITS').distinct()
unitsDF2 = unitsDF2.withColumnRenamed("PM25_UNITS","UNIT_NAME")
unitsDF = unitsDF2.union(unitsDF)

unitsDF2 = AQIDFAgg2.select('SO2_UNITS').distinct()
unitsDF2 = unitsDF2.withColumnRenamed("SO2_UNITS","UNIT_NAME")
unitsDF = unitsDF2.union(unitsDF)

unitsDF = unitsDF.select('UNIT_NAME').distinct()
display(unitsDF)

UNIT_NAME
""
Parts per billion
Micrograms/cubic meter (LC)
Micrograms/cubic meter (25 C)
Parts per million


In [0]:
try:
    writer = unitsDF.write.format("jdbc").option("url", url) \
        .mode("append") \
        .option("dbtable", 'Unit') \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    writer.save()
except Exception as e:
    print(e)

    df_loaded = spark.read.format("jdbc") \
        .option("url", url) \
        .option("dbtable", "Unit") \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    print('df_loaded')
    display(df_loaded)   
        
    # Subtract the two to only leave what is not in the database
    # remove the index column of the loaded dataframe
    df_loaded = df_loaded.drop('UNIT_ID')
    df_toload = unitsDF.subtract(df_loaded)
    print('df_toload')
    display(df_toload)
    
    if(df_toload.count() > 0):
        # Upload all the data that wasn't already in the database
        df_toload.write.format("jdbc")  \
            .option("url", url) \
            .mode("append") \
            .option("dbtable", "Unit") \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .save()
    
unitsDF = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", "Unit") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()
display(unitsDF)  


UNIT_ID,UNIT_NAME
2,
5,Micrograms/cubic meter (25 C)
1,Micrograms/cubic meter (LC)
3,Parts per billion
4,Parts per million


UNIT_NAME


UNIT_ID,UNIT_NAME
2,
5,Micrograms/cubic meter (25 C)
1,Micrograms/cubic meter (LC)
3,Parts per billion
4,Parts per million


In [0]:
if(unitsDF.count() > 0):
    lead_matched = pd.DataFrame([])    
    NO2_matched = pd.DataFrame([])
    ozone_matched = pd.DataFrame([])
    PM10_matched = pd.DataFrame([])
    PM25_matched = pd.DataFrame([])
    SO2_matched = pd.DataFrame([])

    # Loop through all the rows in the AQ dataframe and find the matching method in the metrics dataframe
    for AQrow in AQIDFAgg2.toLocalIterator():
        if(AQrow.LEAD_UNITS == None): lead_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME.isNull())).select('UNIT_ID')
        else: lead_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME == AQrow.LEAD_UNITS)).select('UNIT_ID')  
        
        if(AQrow.NO2_UNITS == None): NO2_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME.isNull())).select('UNIT_ID')
        else: NO2_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME == AQrow.NO2_UNITS)).select('UNIT_ID')  
            
        if(AQrow.OZONE_UNITS == None): ozone_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME.isNull())).select('UNIT_ID')
        else: ozone_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME == AQrow.OZONE_UNITS)).select('UNIT_ID')  
            
        if(AQrow.PM10_UNITS == None): PM10_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME.isNull())).select('UNIT_ID')
        else: PM10_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME == AQrow.PM10_UNITS)).select('UNIT_ID')  
            
        if(AQrow.PM25_UNITS == None): PM25_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME.isNull())).select('UNIT_ID')
        else: PM25_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME == AQrow.PM25_UNITS)).select('UNIT_ID')  
            
        if(AQrow.SO2_UNITS == None): SO2_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME.isNull())).select('UNIT_ID')
        else: SO2_matchedCol = unitsDF.filter((unitsDF.UNIT_NAME == AQrow.SO2_UNITS)).select('UNIT_ID')  

        if(lead_matchedCol.count() > 0): lead_matched = lead_matched.append({'AQ_ID': AQrow.AQ_ID, 'LEAD_UNITS_ID': lead_matchedCol.collect()[0][0]}, ignore_index=True)        
        if(NO2_matchedCol.count() > 0): NO2_matched = NO2_matched.append({'AQ_ID': AQrow.AQ_ID, 'NO2_UNITS_ID': NO2_matchedCol.collect()[0][0]}, ignore_index=True)
        if(ozone_matchedCol.count() > 0): ozone_matched = ozone_matched.append({'AQ_ID': AQrow.AQ_ID, 'OZONE_UNITS_ID': ozone_matchedCol.collect()[0][0]}, ignore_index=True)
        if(PM10_matchedCol.count() > 0): PM10_matched = PM10_matched.append({'AQ_ID': AQrow.AQ_ID, 'PM10_UNITS_ID': PM10_matchedCol.collect()[0][0]}, ignore_index=True)
        if(PM25_matchedCol.count() > 0): PM25_matched = PM25_matched.append({'AQ_ID': AQrow.AQ_ID, 'PM25_UNITS_ID': PM25_matchedCol.collect()[0][0]}, ignore_index=True)
        if(SO2_matchedCol.count() > 0): SO2_matched = SO2_matched.append({'AQ_ID': AQrow.AQ_ID, 'SO2_UNITS_ID': SO2_matchedCol.collect()[0][0]}, ignore_index=True)

                
    if(len(lead_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(lead_matched), on='AQ_ID', how='left_outer')
    if(len(NO2_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(NO2_matched), on='AQ_ID', how='left_outer')
    if(len(ozone_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(ozone_matched), on='AQ_ID', how='left_outer')
    if(len(PM10_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(PM10_matched), on='AQ_ID', how='left_outer')
    if(len(PM25_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(PM25_matched), on='AQ_ID', how='left_outer')
    if(len(SO2_matched) > 0): AQIDFAgg2 = AQIDFAgg2.join(spark.createDataFrame(SO2_matched), on='AQ_ID', how='left_outer')

        
    display(AQIDFAgg2)


AQ_ID,state,county,year,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD,LEAD_METRIC,LEAD_UNITS,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD,NO2_METRIC,NO2_UNITS,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD,OZONE_METRIC,OZONE_UNITS,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD,PM10_METRIC,PM10_UNITS,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD,PM25_METRIC,PM25_UNITS,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD,SO2_METRIC,SO2_UNITS,LEAD_METHOD_ID,NO2_METHOD_ID,OZONE_METHOD_ID,PM10_METHOD_ID,PM25_METHOD_ID,SO2_METHOD_ID,LEAD_METRIC_ID,NO2_METRIC_ID,OZONE_METRIC_ID,PM10_METRIC_ID,PM25_METRIC_ID,SO2_METRIC_ID,LEAD_UNITS_ID,NO2_UNITS_ID,OZONE_UNITS_ID,PM10_UNITS_ID,PM25_UNITS_ID,SO2_UNITS_ID
1,CA,Alameda,2015,,,,,,,,,20.1278364,49.22,41.7,9.059481,45.28,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0409086,0.0984,0.0862,0.0122206,0.092,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
2,CA,Alameda,2016,,,,,,,,,19.29764116666667,46.116666666666674,37.73333333333333,7.934909333333333,39.2,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0373358333333333,0.0821666666666666,0.074,0.011103,0.0781666666666666,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
3,CA,Alameda,2017,,,,,,,,,20.515038,61.19999999999999,44.43333333333334,9.129422,49.883333333333326,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.038411,0.0976666666666666,0.071,0.0107346666666666,0.0873333333333333,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
4,CA,Alameda,2018,,,,,,,,,22.827882166666665,69.06666666666666,59.2,10.554471166666668,65.26666666666667,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0360527999999999,0.0713999999999999,0.0612,0.0095216,0.0673999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
5,CA,Alameda,2019,,,,,,,,,21.414331500000003,55.18333333333334,45.63333333333333,9.367031333333331,50.48333333333334,INSTRUMENTAL - CHEMILUMINESCENCE,Daily Maximum 1-hour average,Parts per billion,0.0366726,0.0914,0.0679999999999999,0.0104352,0.0849999999999999,INSTRUMENTAL - ULTRA VIOLET,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
6,CA,Amador,2015,,,,,,,,,,,,,,,,,0.048449,0.103,0.088,0.015178,0.094,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0
7,CA,Amador,2016,,,,,,,,,,,,,,,,,0.047055,0.104,0.081,0.014501,0.084,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0
8,CA,Amador,2017,,,,,,,,,,,,,,,,,0.047845,0.094,0.085,0.014494,0.088,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0
9,CA,Amador,2018,,,,,,,,,,,,,,,,,0.049264,0.109,0.092,0.015534,0.093,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0
10,CA,Amador,2019,,,,,,,,,,,,,,,,,0.0475,0.084,0.078,0.011916,0.083,INSTRUMENTAL - ULTRA VIOLET ABSORPTION,Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM),Parts per million,,,,,,,,,,,,,,,,,,,,,,,,,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0


The method, metric, and unit ID columns have now been added. Deleted the columns they are replacing. ALso, drop the AQ_ID column because the SQL database is set to fill that in itself.

In [0]:
AQIDFAgg2 = AQIDFAgg2.drop('LEAD_METHOD','LEAD_METRIC', 'LEAD_UNITS')
AQIDFAgg2 = AQIDFAgg2.drop('NO2_METHOD','NO2_METRIC', 'NO2_UNITS')
AQIDFAgg2 = AQIDFAgg2.drop('OZONE_METHOD','OZONE_METRIC', 'OZONE_UNITS')
AQIDFAgg2 = AQIDFAgg2.drop('PM10_METHOD','PM10_METRIC', 'PM10_UNITS')
AQIDFAgg2 = AQIDFAgg2.drop('PM25_METHOD','PM25_METRIC', 'PM25_UNITS')
AQIDFAgg2 = AQIDFAgg2.drop('SO2_METHOD','SO2_METRIC', 'SO2_UNITS')


In [0]:
AQIDFAgg2 = AQIDFAgg2.drop('AQ_ID')
display(AQIDFAgg2)

state,county,year,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,LEAD_METHOD_ID,NO2_METHOD_ID,OZONE_METHOD_ID,PM10_METHOD_ID,PM25_METHOD_ID,SO2_METHOD_ID,LEAD_METRIC_ID,NO2_METRIC_ID,OZONE_METRIC_ID,PM10_METRIC_ID,PM25_METRIC_ID,SO2_METRIC_ID,LEAD_UNITS_ID,NO2_UNITS_ID,OZONE_UNITS_ID,PM10_UNITS_ID,PM25_UNITS_ID,SO2_UNITS_ID
CA,Sonoma,2018,,,,,,10.831921,65.1,45.0,8.739281,56.4,0.0379805,0.073,0.0615,0.0094615,0.0675,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
CA,Stanislaus,2015,,,,,,18.95788,42.0,38.0,8.758836,38.0,0.051584,0.112,0.0955,0.018962,0.1,32.051724,76.0,76.0,15.31359,64.0,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
CA,Calaveras,2017,,,,,,,,,,,0.052957,0.109,0.089,0.015065,0.091,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
CA,Contra Costa,2019,,,,,,12.65093925,39.325,33.325,7.3735775,35.825,0.04185325,0.093,0.0765,0.0117075,0.08625,,,,,,6.133333,25.5,25.5,3.794034,17.2,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
CA,Lake,2016,,,,,,,,,,,0.041474,0.068,0.065,0.008504,0.068,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0
CA,Santa Clara,2016,,,,,,23.385278,51.650000000000006,46.0,9.6735115,50.25,0.04473725,0.08825,0.07775,0.01179375,0.08275,,,,,,8.045263,22.7,22.7,3.834162000000001,20.3,0.537047,1.8,1.6,0.381402,1.7,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0
CA,Los Angeles,2015,0.0123738571428571,0.0363571428571428,0.0345,0.0067071428571428,0.0298571428571428,29.4755605,73.84285714285714,62.38571428571429,12.8423125,67.30714285714286,0.0548571538461538,0.1123076923076923,0.0999230769230769,0.0172545384615384,0.1053846153846154,,,,,,10.92832075,51.410000000000046,40.33500000000003,6.861900049999998,42.68500000000003,1.8496146666666669,21.666666666666668,8.3,1.8388513333333332,9.7,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0
CA,San Diego,2016,,,,,,19.314231777777778,57.66666666666666,46.66666666666666,10.879725888888888,50.55555555555556,0.0506423636363636,0.0857272727272727,0.0807272727272727,0.0108840909090909,0.0828181818181817,,,,,,8.451768666666668,23.533333333333328,19.866666666666664,3.427879,18.6,0.2253495,1.2,0.65,0.1548435,0.65,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0
CA,Riverside,2017,0.005121,0.015,0.015,0.003032,0.014,17.986095,50.11428571428571,41.8,9.269774571428572,43.85714285714285,0.0603519090909091,0.1143636363636363,0.1045454545454545,0.0179740909090909,0.1116363636363636,,,,,,12.455024749999998,46.15,40.6,7.090440500000001,40.87500000000001,0.617355,2.5,1.9,0.414118,2.3,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0
CA,Orange,2016,,,,,,26.712886,64.925,58.6,13.15272775,61.525000000000006,0.0510245,0.1044999999999999,0.08925,0.01382375,0.0967499999999999,,,,,,8.3535625,34.550000000000004,22.95,4.102276000000001,26.3,0.465903,3.3,2.1,0.496436,3.2,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0


County

In [0]:
display(countyStateDF)

STATE_ID,COUNTY_ID,COUNTY_NAME,STATE_ABBR,STATE_NAME
1,3109,Big Horn,MT,Montana
1,2975,Petroleum,MT,Montana
1,2953,Granite,MT,Montana
1,2924,Carter,MT,Montana
1,2832,Rosebud,MT,Montana
1,2770,Flathead,MT,Montana
1,2639,Daniels,MT,Montana
1,2568,Deer Lodge,MT,Montana
1,2498,Ravalli,MT,Montana
1,2497,McCone,MT,Montana


In [0]:
import pandas as pd
# Add in an ID to the table 
AQIDFAgg2 = AQIDFAgg2.withColumn('AQ_ID',F.monotonically_increasing_id()+1)

matched = pd.DataFrame([])
noMatchFound = False
display(AQIDFAgg2)
display(countyStateDF)
# If there are current countyies in the database, look through them for a match
if(countyStateDF.count() > 0):
    for countyAQ in AQIDFAgg2.toLocalIterator():
        matchedCol = countyStateDF.filter((countyStateDF.COUNTY_NAME == countyAQ.county) & (countyStateDF.STATE_ABBR == countyAQ.state)).select('COUNTY_ID')  
        # There are conditions where there isn't a match like if the county isn't loaded in the in the table, yet, or we have statewide data.
        if(matchedCol.count() >0):
            matched = matched.append({'AQ_ID': countyAQ.AQ_ID, 
                                      'COUNTY_ID': matchedCol.collect()[0][0]},
                                     ignore_index=True)
        else: noMatchFound = True
    if(len(matched)>0):    
        matched = spark.createDataFrame(matched)
        display(matched)
        # if everything went ok, then join the matched tabled of county ID's to the asthma dataframe
        AQIDFAgg2 = AQIDFAgg2.join(matched, on='AQ_ID', how='left_outer')
        display(AQIDFAgg2)
    else:
        print("NOTHING MATCHED")
        AQIDFAgg2 = AQIDFAgg2.withColumn('COUNTY_ID', lit(None))

display(AQIDFAgg2)
print(noMatchFound)

state,county,year,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,LEAD_METHOD_ID,NO2_METHOD_ID,OZONE_METHOD_ID,PM10_METHOD_ID,PM25_METHOD_ID,SO2_METHOD_ID,LEAD_METRIC_ID,NO2_METRIC_ID,OZONE_METRIC_ID,PM10_METRIC_ID,PM25_METRIC_ID,SO2_METRIC_ID,LEAD_UNITS_ID,NO2_UNITS_ID,OZONE_UNITS_ID,PM10_UNITS_ID,PM25_UNITS_ID,SO2_UNITS_ID,AQ_ID
CA,Sonoma,2018,,,,,,10.831921,65.1,45.0,8.739281,56.4,0.0379805,0.073,0.0615,0.0094615,0.0675,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,1
CA,Stanislaus,2015,,,,,,18.95788,42.0,38.0,8.758836,38.0,0.051584,0.112,0.0955,0.018962,0.1,32.051724,76.0,76.0,15.31359,64.0,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,2
CA,Calaveras,2017,,,,,,,,,,,0.052957,0.109,0.089,0.015065,0.091,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,3
CA,Contra Costa,2019,,,,,,12.65093925,39.325,33.325,7.3735775,35.825,0.04185325,0.093,0.0765,0.0117075,0.08625,,,,,,6.133333,25.5,25.5,3.794034,17.2,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,4
CA,Lake,2016,,,,,,,,,,,0.041474,0.068,0.065,0.008504,0.068,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,5
CA,Santa Clara,2016,,,,,,23.385278,51.650000000000006,46.0,9.6735115,50.25,0.04473725,0.08825,0.07775,0.01179375,0.08275,,,,,,8.045263,22.7,22.7,3.834162000000001,20.3,0.537047,1.8,1.6,0.381402,1.7,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,6
CA,Los Angeles,2015,0.0123738571428571,0.0363571428571428,0.0345,0.0067071428571428,0.0298571428571428,29.4755605,73.84285714285714,62.38571428571429,12.8423125,67.30714285714286,0.0548571538461538,0.1123076923076923,0.0999230769230769,0.0172545384615384,0.1053846153846154,,,,,,10.92832075,51.410000000000046,40.33500000000003,6.861900049999998,42.68500000000003,1.8496146666666669,21.666666666666668,8.3,1.8388513333333332,9.7,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,7
CA,San Diego,2016,,,,,,19.314231777777778,57.66666666666666,46.66666666666666,10.879725888888888,50.55555555555556,0.0506423636363636,0.0857272727272727,0.0807272727272727,0.0108840909090909,0.0828181818181817,,,,,,8.451768666666668,23.533333333333328,19.866666666666664,3.427879,18.6,0.2253495,1.2,0.65,0.1548435,0.65,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,8
CA,Riverside,2017,0.005121,0.015,0.015,0.003032,0.014,17.986095,50.11428571428571,41.8,9.269774571428572,43.85714285714285,0.0603519090909091,0.1143636363636363,0.1045454545454545,0.0179740909090909,0.1116363636363636,,,,,,12.455024749999998,46.15,40.6,7.090440500000001,40.87500000000001,0.617355,2.5,1.9,0.414118,2.3,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,9
CA,Orange,2016,,,,,,26.712886,64.925,58.6,13.15272775,61.525000000000006,0.0510245,0.1044999999999999,0.08925,0.01382375,0.0967499999999999,,,,,,8.3535625,34.550000000000004,22.95,4.102276000000001,26.3,0.465903,3.3,2.1,0.496436,3.2,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,10


STATE_ID,COUNTY_ID,COUNTY_NAME,STATE_ABBR,STATE_NAME
1,3109,Big Horn,MT,Montana
1,2975,Petroleum,MT,Montana
1,2953,Granite,MT,Montana
1,2924,Carter,MT,Montana
1,2832,Rosebud,MT,Montana
1,2770,Flathead,MT,Montana
1,2639,Daniels,MT,Montana
1,2568,Deer Lodge,MT,Montana
1,2498,Ravalli,MT,Montana
1,2497,McCone,MT,Montana


AQ_ID,COUNTY_ID
1.0,862.0
2.0,809.0
3.0,65.0
4.0,46.0
5.0,603.0
6.0,1737.0
7.0,3118.0
8.0,2343.0
9.0,586.0
10.0,2491.0


AQ_ID,state,county,year,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,LEAD_METHOD_ID,NO2_METHOD_ID,OZONE_METHOD_ID,PM10_METHOD_ID,PM25_METHOD_ID,SO2_METHOD_ID,LEAD_METRIC_ID,NO2_METRIC_ID,OZONE_METRIC_ID,PM10_METRIC_ID,PM25_METRIC_ID,SO2_METRIC_ID,LEAD_UNITS_ID,NO2_UNITS_ID,OZONE_UNITS_ID,PM10_UNITS_ID,PM25_UNITS_ID,SO2_UNITS_ID,COUNTY_ID
1,CA,Sonoma,2018,,,,,,10.831921,65.1,45.0,8.739281,56.4,0.0379805,0.073,0.0615,0.0094615,0.0675,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,862.0
2,CA,Stanislaus,2015,,,,,,18.95788,42.0,38.0,8.758836,38.0,0.051584,0.112,0.0955,0.018962,0.1,32.051724,76.0,76.0,15.31359,64.0,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,809.0
3,CA,Calaveras,2017,,,,,,,,,,,0.052957,0.109,0.089,0.015065,0.091,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,65.0
4,CA,Contra Costa,2019,,,,,,12.65093925,39.325,33.325,7.3735775,35.825,0.04185325,0.093,0.0765,0.0117075,0.08625,,,,,,6.133333,25.5,25.5,3.794034,17.2,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,46.0
5,CA,Lake,2016,,,,,,,,,,,0.041474,0.068,0.065,0.008504,0.068,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,603.0
6,CA,Santa Clara,2016,,,,,,23.385278,51.650000000000006,46.0,9.6735115,50.25,0.04473725,0.08825,0.07775,0.01179375,0.08275,,,,,,8.045263,22.7,22.7,3.834162000000001,20.3,0.537047,1.8,1.6,0.381402,1.7,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,1737.0
7,CA,Los Angeles,2015,0.0123738571428571,0.0363571428571428,0.0345,0.0067071428571428,0.0298571428571428,29.4755605,73.84285714285714,62.38571428571429,12.8423125,67.30714285714286,0.0548571538461538,0.1123076923076923,0.0999230769230769,0.0172545384615384,0.1053846153846154,,,,,,10.92832075,51.410000000000046,40.33500000000003,6.861900049999998,42.68500000000003,1.8496146666666669,21.666666666666668,8.3,1.8388513333333332,9.7,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,3118.0
8,CA,San Diego,2016,,,,,,19.314231777777778,57.66666666666666,46.66666666666666,10.879725888888888,50.55555555555556,0.0506423636363636,0.0857272727272727,0.0807272727272727,0.0108840909090909,0.0828181818181817,,,,,,8.451768666666668,23.533333333333328,19.866666666666664,3.427879,18.6,0.2253495,1.2,0.65,0.1548435,0.65,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,2343.0
9,CA,Riverside,2017,0.005121,0.015,0.015,0.003032,0.014,17.986095,50.11428571428571,41.8,9.269774571428572,43.85714285714285,0.0603519090909091,0.1143636363636363,0.1045454545454545,0.0179740909090909,0.1116363636363636,,,,,,12.455024749999998,46.15,40.6,7.090440500000001,40.87500000000001,0.617355,2.5,1.9,0.414118,2.3,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,586.0
10,CA,Orange,2016,,,,,,26.712886,64.925,58.6,13.15272775,61.525000000000006,0.0510245,0.1044999999999999,0.08925,0.01382375,0.0967499999999999,,,,,,8.3535625,34.550000000000004,22.95,4.102276000000001,26.3,0.465903,3.3,2.1,0.496436,3.2,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,2491.0


AQ_ID,state,county,year,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,LEAD_METHOD_ID,NO2_METHOD_ID,OZONE_METHOD_ID,PM10_METHOD_ID,PM25_METHOD_ID,SO2_METHOD_ID,LEAD_METRIC_ID,NO2_METRIC_ID,OZONE_METRIC_ID,PM10_METRIC_ID,PM25_METRIC_ID,SO2_METRIC_ID,LEAD_UNITS_ID,NO2_UNITS_ID,OZONE_UNITS_ID,PM10_UNITS_ID,PM25_UNITS_ID,SO2_UNITS_ID,COUNTY_ID
1,CA,Sonoma,2018,,,,,,10.831921,65.1,45.0,8.739281,56.4,0.0379805,0.073,0.0615,0.0094615,0.0675,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,862.0
2,CA,Stanislaus,2015,,,,,,18.95788,42.0,38.0,8.758836,38.0,0.051584,0.112,0.0955,0.018962,0.1,32.051724,76.0,76.0,15.31359,64.0,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,809.0
3,CA,Calaveras,2017,,,,,,,,,,,0.052957,0.109,0.089,0.015065,0.091,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,65.0
4,CA,Contra Costa,2019,,,,,,12.65093925,39.325,33.325,7.3735775,35.825,0.04185325,0.093,0.0765,0.0117075,0.08625,,,,,,6.133333,25.5,25.5,3.794034,17.2,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,46.0
5,CA,Lake,2016,,,,,,,,,,,0.041474,0.068,0.065,0.008504,0.068,,,,,,,,,,,,,,,,3.0,4.0,6.0,3.0,3.0,3.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,4.0,2.0,2.0,2.0,603.0
6,CA,Santa Clara,2016,,,,,,23.385278,51.650000000000006,46.0,9.6735115,50.25,0.04473725,0.08825,0.07775,0.01179375,0.08275,,,,,,8.045263,22.7,22.7,3.834162000000001,20.3,0.537047,1.8,1.6,0.381402,1.7,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,1737.0
7,CA,Los Angeles,2015,0.0123738571428571,0.0363571428571428,0.0345,0.0067071428571428,0.0298571428571428,29.4755605,73.84285714285714,62.38571428571429,12.8423125,67.30714285714286,0.0548571538461538,0.1123076923076923,0.0999230769230769,0.0172545384615384,0.1053846153846154,,,,,,10.92832075,51.410000000000046,40.33500000000003,6.861900049999998,42.68500000000003,1.8496146666666669,21.666666666666668,8.3,1.8388513333333332,9.7,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,3118.0
8,CA,San Diego,2016,,,,,,19.314231777777778,57.66666666666666,46.66666666666666,10.879725888888888,50.55555555555556,0.0506423636363636,0.0857272727272727,0.0807272727272727,0.0108840909090909,0.0828181818181817,,,,,,8.451768666666668,23.533333333333328,19.866666666666664,3.427879,18.6,0.2253495,1.2,0.65,0.1548435,0.65,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,2343.0
9,CA,Riverside,2017,0.005121,0.015,0.015,0.003032,0.014,17.986095,50.11428571428571,41.8,9.269774571428572,43.85714285714285,0.0603519090909091,0.1143636363636363,0.1045454545454545,0.0179740909090909,0.1116363636363636,,,,,,12.455024749999998,46.15,40.6,7.090440500000001,40.87500000000001,0.617355,2.5,1.9,0.414118,2.3,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,586.0
10,CA,Orange,2016,,,,,,26.712886,64.925,58.6,13.15272775,61.525000000000006,0.0510245,0.1044999999999999,0.08925,0.01382375,0.0967499999999999,,,,,,8.3535625,34.550000000000004,22.95,4.102276000000001,26.3,0.465903,3.3,2.1,0.496436,3.2,3.0,3.0,7.0,3.0,3.0,3.0,2.0,2.0,4.0,2.0,2.0,2.0,2.0,2.0,4.0,2.0,2.0,2.0,2491.0


In [0]:
AQIDFAgg2 = AQIDFAgg2.drop('state','county', 'AQ_ID')

Write the AQ data to the SQL database

In [0]:
print(AQIDFAgg2.printSchema())
from pyspark.sql.types import LongType
AQIDFAgg3 = AQIDFAgg2.withColumn("COUNTY_ID", AQIDFAgg2["COUNTY_ID"].cast(LongType()))
print(AQIDFAgg3.printSchema())

In [0]:
# Copy schema from database and make DataFrame with that schema

# Load in database
df_loaded = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", "AirQualityDataCounty") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()
df_loaded = df_loaded.drop("AQ_ID") # We don't need the IDs

print(df_loaded.printSchema())
print(AQIDFAgg2.printSchema())
# Reorder columns to match order from database
AQIDFAgg2 = AQIDFAgg2.select("COUNTY_ID", "year",
                             "LEAD_MEAN", "LEAD_1STMAX", "LEAD_99PERC", "LEAD_STD",
                             "LEAD_2NDMAX","LEAD_METHOD_ID","LEAD_METRIC_ID","LEAD_UNITS_ID",
                             "NO2_MEAN", "NO2_1STMAX", "NO2_99PERC", "NO2_STD",
                             "NO2_2NDMAX", "NO2_METHOD_ID","NO2_METRIC_ID","NO2_UNITS_ID",
                             "OZONE_MEAN", "OZONE_1STMAX", "OZONE_99PERC", "OZONE_STD",
                             "OZONE_2NDMAX", "OZONE_METHOD_ID","OZONE_METRIC_ID","OZONE_UNITS_ID", 
                             "PM10_MEAN", "PM10_1STMAX", "PM10_99PERC", "PM10_STD",
                             "PM10_2NDMAX", "PM10_METHOD_ID","PM10_METRIC_ID","PM10_UNITS_ID",
                             "PM25_MEAN", "PM25_1STMAX", "PM25_99PERC", "PM25_STD", 
                             "PM25_2NDMAX", "PM25_METHOD_ID","PM25_METRIC_ID","PM25_UNITS_ID",
                             "SO2_MEAN", "SO2_1STMAX", "SO2_99PERC", "SO2_STD",
                             "SO2_2NDMAX", "SO2_METHOD_ID","SO2_METRIC_ID","SO2_UNITS_ID")

AQIDFAgg2 = AQIDFAgg2.withColumnRenamed("year","YEAR")
AQIDFAgg2 = AQIDFAgg2.withColumn("COUNTY_ID", AQIDFAgg2.COUNTY_ID.cast("integer"))   

AQIDFAgg2 = AQIDFAgg2.withColumn("LEAD_METHOD_ID", AQIDFAgg2.LEAD_METHOD_ID.cast("integer"))   
AQIDFAgg2 = AQIDFAgg2.withColumn("NO2_METHOD_ID", AQIDFAgg2.NO2_METHOD_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("OZONE_METHOD_ID", AQIDFAgg2.OZONE_METHOD_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("PM10_METHOD_ID", AQIDFAgg2.PM10_METHOD_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("PM25_METHOD_ID", AQIDFAgg2.PM25_METHOD_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("SO2_METHOD_ID", AQIDFAgg2.SO2_METHOD_ID.cast("integer")) 

AQIDFAgg2 = AQIDFAgg2.withColumn("LEAD_METRIC_ID", AQIDFAgg2.LEAD_METRIC_ID.cast("integer"))   
AQIDFAgg2 = AQIDFAgg2.withColumn("NO2_METRIC_ID", AQIDFAgg2.NO2_METRIC_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("OZONE_METRIC_ID", AQIDFAgg2.OZONE_METRIC_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("PM10_METRIC_ID", AQIDFAgg2.PM10_METRIC_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("PM25_METRIC_ID", AQIDFAgg2.PM25_METRIC_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("SO2_METRIC_ID", AQIDFAgg2.SO2_METRIC_ID.cast("integer")) 

AQIDFAgg2 = AQIDFAgg2.withColumn("LEAD_UNITS_ID", AQIDFAgg2.LEAD_UNITS_ID.cast("integer"))   
AQIDFAgg2 = AQIDFAgg2.withColumn("NO2_UNITS_ID", AQIDFAgg2.NO2_UNITS_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("OZONE_UNITS_ID", AQIDFAgg2.OZONE_UNITS_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("PM10_UNITS_ID", AQIDFAgg2.PM10_UNITS_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("PM25_UNITS_ID", AQIDFAgg2.PM25_UNITS_ID.cast("integer"))    
AQIDFAgg2 = AQIDFAgg2.withColumn("SO2_UNITS_ID", AQIDFAgg2.SO2_UNITS_ID.cast("integer")) 

try:
    AQIDFAgg3 = sqlContext.createDataFrame(AQIDFAgg2.rdd, df_loaded.schema)
    display(AQIDFAgg3)
except Exception as e:
    print(f"SCHEMA: {e}")

COUNTY_ID,YEAR,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD_ID,LEAD_METRIC_ID,LEAD_UNITS_ID,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD_ID,NO2_METRIC_ID,NO2_UNITS_ID,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD_ID,OZONE_METRIC_ID,OZONE_UNITS_ID,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD_ID,PM10_METRIC_ID,PM10_UNITS_ID,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD_ID,PM25_METRIC_ID,PM25_UNITS_ID,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD_ID,SO2_METRIC_ID,SO2_UNITS_ID
862,2018,,,,,,3,2,2,10.831921,65.1,45.0,8.739281,56.4,4,3,3,0.0379805,0.073,0.0615,0.0094615,0.0675,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
809,2015,,,,,,3,2,2,18.95788,42.0,38.0,8.758836,38.0,4,3,3,0.051584,0.112,0.0955,0.018962,0.1,6,4,4,32.051724,76.0,76.0,15.31359,64.0,3,2,2,,,,,,3,2,2,,,,,,3,2,2
65,2017,,,,,,3,2,2,,,,,,4,3,3,0.052957,0.109,0.089,0.015065,0.091,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
46,2019,,,,,,3,2,2,12.65093925,39.325,33.325,7.3735775,35.825,4,3,3,0.04185325,0.093,0.0765,0.0117075,0.08625,6,4,4,,,,,,3,2,2,6.133333,25.5,25.5,3.794034,17.2,3,2,2,,,,,,3,2,2
603,2016,,,,,,3,2,2,,,,,,4,3,3,0.041474,0.068,0.065,0.008504,0.068,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
1737,2016,,,,,,3,2,2,23.385278,51.650000000000006,46.0,9.6735115,50.25,3,2,2,0.04473725,0.08825,0.07775,0.01179375,0.08275,7,4,4,,,,,,3,2,2,8.045263,22.7,22.7,3.834162000000001,20.3,3,2,2,0.537047,1.8,1.6,0.381402,1.7,3,2,2
3118,2015,0.0123738571428571,0.0363571428571428,0.0345,0.0067071428571428,0.0298571428571428,3,2,2,29.4755605,73.84285714285714,62.38571428571429,12.8423125,67.30714285714286,3,2,2,0.0548571538461538,0.1123076923076923,0.0999230769230769,0.0172545384615384,0.1053846153846154,7,4,4,,,,,,3,2,2,10.92832075,51.410000000000046,40.33500000000003,6.861900049999998,42.68500000000003,3,2,2,1.8496146666666669,21.666666666666668,8.3,1.8388513333333332,9.7,3,2,2
2343,2016,,,,,,3,2,2,19.314231777777778,57.66666666666666,46.66666666666666,10.879725888888888,50.55555555555556,3,2,2,0.0506423636363636,0.0857272727272727,0.0807272727272727,0.0108840909090909,0.0828181818181817,7,4,4,,,,,,3,2,2,8.451768666666668,23.533333333333328,19.866666666666664,3.427879,18.6,3,2,2,0.2253495,1.2,0.65,0.1548435,0.65,3,2,2
586,2017,0.005121,0.015,0.015,0.003032,0.014,3,2,2,17.986095,50.11428571428571,41.8,9.269774571428572,43.85714285714285,3,2,2,0.0603519090909091,0.1143636363636363,0.1045454545454545,0.0179740909090909,0.1116363636363636,7,4,4,,,,,,3,2,2,12.455024749999998,46.15,40.6,7.090440500000001,40.87500000000001,3,2,2,0.617355,2.5,1.9,0.414118,2.3,3,2,2
2491,2016,,,,,,3,2,2,26.712886,64.925,58.6,13.15272775,61.525000000000006,3,2,2,0.0510245,0.1044999999999999,0.08925,0.01382375,0.0967499999999999,7,4,4,,,,,,3,2,2,8.3535625,34.550000000000004,22.95,4.102276000000001,26.3,3,2,2,0.465903,3.3,2.1,0.496436,3.2,3,2,2


In [0]:
print(AQIDFAgg3.printSchema())


In [0]:
try:
    writer = AQIDFAgg3.write.format("jdbc").option("url", url) \
        .mode("append") \
        .option("dbtable", 'AirQualityDataCounty') \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    writer.save()
except Exception as e:
    print(e)

    df_loaded = spark.read.format("jdbc") \
        .option("url", url) \
        .option("dbtable", "AirQualityDataCounty") \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    print('df_loaded')
    display(df_loaded)   
        
    # Subtract the two to only leave what is not in the database
    # remove the index column of the loaded dataframe
    df_loaded = df_loaded.drop('AQ_ID')
    df_toload = AQIDFAgg3.subtract(df_loaded)
    print('df_toload')
    display(df_toload)
    
    if(df_toload.count() > 0):
        # Upload all the data that wasn't already in the database
        df_toload.write.format("jdbc")  \
            .option("url", url) \
            .mode("append") \
            .option("dbtable", "AirQualityDataCounty") \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .save()
    
AQIDFAggDF = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", "AirQualityDataCounty") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()
display(AQIDFAggDF) 

AQ_ID,COUNTY_ID,YEAR,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD_ID,LEAD_METRIC_ID,LEAD_UNITS_ID,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD_ID,NO2_METRIC_ID,NO2_UNITS_ID,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD_ID,OZONE_METRIC_ID,OZONE_UNITS_ID,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD_ID,PM10_METRIC_ID,PM10_UNITS_ID,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD_ID,PM25_METRIC_ID,PM25_UNITS_ID,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD_ID,SO2_METRIC_ID,SO2_UNITS_ID
1,1371,2021,,,,,,3,2,2,,,,,,3,2,2,0.054537,0.095,0.088,0.01188,0.093,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
2,714,2021,,,,,,3,2,2,,,,,,3,2,2,0.041595,0.089,0.0825,0.0122104999999999,0.0825,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
3,1286,2021,,,,,,3,2,2,,,,,,3,2,2,0.047252,0.079,0.076,0.009666,0.078,7,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
4,1796,2021,,,,,,3,2,2,4.42316,13.833333333333334,11.333333333333334,2.5284210000000003,11.466666666666669,4,3,3,0.0385808888888888,0.0654444444444444,0.0612222222222222,0.0067007777777777,0.0609999999999999,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
5,1303,2021,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2,1.890776,30.5,19.0,2.906337,19.5,9,6,3
6,3126,2021,,,,,,3,2,2,,,,,,3,2,2,0.0464555,0.107,0.092,0.0147595,0.0955,7,4,4,,,,,,3,2,2,7.237862,24.95,24.0,4.3267165,24.0,8,5,1,0.9241095,4.6,3.45,0.7187600000000001,4.05,9,6,3
7,2422,2021,,,,,,3,2,2,,,,,,3,2,2,0.043206,0.071,0.065,0.01211,0.066,7,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
8,921,2021,,,,,,3,2,2,,,,,,3,2,2,0.041337,0.0743333333333333,0.0696666666666666,0.0117803333333333,0.0713333333333333,7,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
9,2073,2021,,,,,,3,2,2,9.6700714,29.4,23.2,4.461328399999999,24.76,4,3,3,0.0512102,0.0806,0.0754,0.0098608,0.0788,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
10,1379,2021,,,,,,3,2,2,,,,,,3,2,2,0.05092325,0.121,0.1105,0.017247,0.116,7,4,4,,,,,,3,2,2,7.42,25.75,25.75,5.3271945,20.3,8,5,1,0.602247,3.4,2.9,0.618568,2.9,9,6,3


COUNTY_ID,YEAR,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD_ID,LEAD_METRIC_ID,LEAD_UNITS_ID,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD_ID,NO2_METRIC_ID,NO2_UNITS_ID,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD_ID,OZONE_METRIC_ID,OZONE_UNITS_ID,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD_ID,PM10_METRIC_ID,PM10_UNITS_ID,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD_ID,PM25_METRIC_ID,PM25_UNITS_ID,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD_ID,SO2_METRIC_ID,SO2_UNITS_ID


AQ_ID,COUNTY_ID,YEAR,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD_ID,LEAD_METRIC_ID,LEAD_UNITS_ID,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD_ID,NO2_METRIC_ID,NO2_UNITS_ID,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD_ID,OZONE_METRIC_ID,OZONE_UNITS_ID,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD_ID,PM10_METRIC_ID,PM10_UNITS_ID,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD_ID,PM25_METRIC_ID,PM25_UNITS_ID,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD_ID,SO2_METRIC_ID,SO2_UNITS_ID
1,1371,2021,,,,,,3,2,2,,,,,,3,2,2,0.054537,0.095,0.088,0.01188,0.093,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
2,714,2021,,,,,,3,2,2,,,,,,3,2,2,0.041595,0.089,0.0825,0.0122104999999999,0.0825,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
3,1286,2021,,,,,,3,2,2,,,,,,3,2,2,0.047252,0.079,0.076,0.009666,0.078,7,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
4,1796,2021,,,,,,3,2,2,4.42316,13.833333333333334,11.333333333333334,2.5284210000000003,11.466666666666669,4,3,3,0.0385808888888888,0.0654444444444444,0.0612222222222222,0.0067007777777777,0.0609999999999999,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
5,1303,2021,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2,1.890776,30.5,19.0,2.906337,19.5,9,6,3
6,3126,2021,,,,,,3,2,2,,,,,,3,2,2,0.0464555,0.107,0.092,0.0147595,0.0955,7,4,4,,,,,,3,2,2,7.237862,24.95,24.0,4.3267165,24.0,8,5,1,0.9241095,4.6,3.45,0.7187600000000001,4.05,9,6,3
7,2422,2021,,,,,,3,2,2,,,,,,3,2,2,0.043206,0.071,0.065,0.01211,0.066,7,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
8,921,2021,,,,,,3,2,2,,,,,,3,2,2,0.041337,0.0743333333333333,0.0696666666666666,0.0117803333333333,0.0713333333333333,7,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
9,2073,2021,,,,,,3,2,2,9.6700714,29.4,23.2,4.461328399999999,24.76,4,3,3,0.0512102,0.0806,0.0754,0.0098608,0.0788,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
10,1379,2021,,,,,,3,2,2,,,,,,3,2,2,0.05092325,0.121,0.1105,0.017247,0.116,7,4,4,,,,,,3,2,2,7.42,25.75,25.75,5.3271945,20.3,8,5,1,0.602247,3.4,2.9,0.618568,2.9,9,6,3


### 4. Add an index in the asthma dataframe referencing the matching data from the air quality dataframe

In [0]:
display(AQIDFAggDF)
display(asthmaTotalDF)

AQ_ID,COUNTY_ID,YEAR,LEAD_MEAN,LEAD_1STMAX,LEAD_99PERC,LEAD_STD,LEAD_2NDMAX,LEAD_METHOD_ID,LEAD_METRIC_ID,LEAD_UNITS_ID,NO2_MEAN,NO2_1STMAX,NO2_99PERC,NO2_STD,NO2_2NDMAX,NO2_METHOD_ID,NO2_METRIC_ID,NO2_UNITS_ID,OZONE_MEAN,OZONE_1STMAX,OZONE_99PERC,OZONE_STD,OZONE_2NDMAX,OZONE_METHOD_ID,OZONE_METRIC_ID,OZONE_UNITS_ID,PM10_MEAN,PM10_1STMAX,PM10_99PERC,PM10_STD,PM10_2NDMAX,PM10_METHOD_ID,PM10_METRIC_ID,PM10_UNITS_ID,PM25_MEAN,PM25_1STMAX,PM25_99PERC,PM25_STD,PM25_2NDMAX,PM25_METHOD_ID,PM25_METRIC_ID,PM25_UNITS_ID,SO2_MEAN,SO2_1STMAX,SO2_99PERC,SO2_STD,SO2_2NDMAX,SO2_METHOD_ID,SO2_METRIC_ID,SO2_UNITS_ID
1,1371,2021,,,,,,3,2,2,,,,,,3,2,2,0.054537,0.095,0.088,0.01188,0.093,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
2,714,2021,,,,,,3,2,2,,,,,,3,2,2,0.041595,0.089,0.0825,0.0122104999999999,0.0825,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
3,1286,2021,,,,,,3,2,2,,,,,,3,2,2,0.047252,0.079,0.076,0.009666,0.078,7,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
4,1796,2021,,,,,,3,2,2,4.42316,13.833333333333334,11.333333333333334,2.5284210000000003,11.466666666666669,4,3,3,0.0385808888888888,0.0654444444444444,0.0612222222222222,0.0067007777777777,0.0609999999999999,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
5,1303,2021,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2,1.890776,30.5,19.0,2.906337,19.5,9,6,3
6,3126,2021,,,,,,3,2,2,,,,,,3,2,2,0.0464555,0.107,0.092,0.0147595,0.0955,7,4,4,,,,,,3,2,2,7.237862,24.95,24.0,4.3267165,24.0,8,5,1,0.9241095,4.6,3.45,0.7187600000000001,4.05,9,6,3
7,2422,2021,,,,,,3,2,2,,,,,,3,2,2,0.043206,0.071,0.065,0.01211,0.066,7,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
8,921,2021,,,,,,3,2,2,,,,,,3,2,2,0.041337,0.0743333333333333,0.0696666666666666,0.0117803333333333,0.0713333333333333,7,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
9,2073,2021,,,,,,3,2,2,9.6700714,29.4,23.2,4.461328399999999,24.76,4,3,3,0.0512102,0.0806,0.0754,0.0098608,0.0788,6,4,4,,,,,,3,2,2,,,,,,3,2,2,,,,,,3,2,2
10,1379,2021,,,,,,3,2,2,,,,,,3,2,2,0.05092325,0.121,0.1105,0.017247,0.116,7,4,4,,,,,,3,2,2,7.42,25.75,25.75,5.3271945,20.3,8,5,1,0.602247,3.4,2.9,0.618568,2.9,9,6,3


AsthmaID,YEAR,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE,COUNTY_ID
1,2015,191904,50.4,
2,2015,9939,64.3,3085.0
3,2015,196,58.4,843.0
4,2015,1044,50.2,152.0
5,2015,185,48.0,65.0
6,2015,97,41.4,545.0
7,2015,6858,65.2,46.0
8,2015,140,53.0,2947.0
9,2015,592,36.4,1751.0
10,2015,7936,77.0,2778.0


In [0]:
# Create a table relating the asthma indices matched to AQ indices by county and year. 
# There is no more than one match for each but not always a match for all of them
import pandas as pd
matched = pd.DataFrame([])
count = 0
for countyAsthma in asthmaTotalDF.toLocalIterator():
    matchedCol = AQIDFAggDF.filter((AQIDFAggDF.COUNTY_ID == countyAsthma.COUNTY_ID) & (AQIDFAggDF.YEAR == countyAsthma.YEAR)).select('AQ_ID')  
    if(matchedCol.count() >0):
        matched = matched.append({'AsthmaID': countyAsthma.AsthmaID, 
                                  'AQ_ID': matchedCol.collect()[0][0]},
                                 ignore_index=True)
matched = spark.createDataFrame(matched)
display(matched)

AQ_ID,AsthmaID
989.0,2.0
910.0,3.0
886.0,4.0
812.0,5.0
808.0,6.0
802.0,7.0
889.0,9.0
857.0,10.0
816.0,11.0
939.0,12.0


In [0]:
# Use a left outer join then
asthmaTotalDFjoined = asthmaTotalDF.join(matched, on='AsthmaID', how='left_outer')
display(asthmaTotalDFjoined)

AsthmaID,YEAR,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE,COUNTY_ID,AQ_ID
1,2015,191904,50.4,,
2,2015,9939,64.3,3085.0,989.0
3,2015,196,58.4,843.0,910.0
4,2015,1044,50.2,152.0,886.0
5,2015,185,48.0,65.0,812.0
6,2015,97,41.4,545.0,808.0
7,2015,6858,65.2,46.0,802.0
8,2015,140,53.0,2947.0,
9,2015,592,36.4,1751.0,889.0
10,2015,7936,77.0,2778.0,857.0


Look at the counties with missing air quality data. Let's see if there are missmatched spellings or other issues causing problems.

In [0]:
display(asthmaTotalDFjoined.filter((asthmaTotalDFjoined.AQ_ID.isNull())))

AsthmaID,YEAR,NUMBER OF ED VISITS,AGE-ADJUSTED ED VISIT RATE,COUNTY_ID,AQ_ID
1,2015,191904,50.4,,
8,2015,140,53.0,2947.0,
18,2015,148,45.3,29.0,
23,2015,479,57.4,568.0,
25,2015,53,67.9,2173.0,
26,2015,43,32.0,3048.0,
52,2015,67,60.5,2966.0,
57,2015,346,45.4,782.0,
58,2016,174733,45.8,,
65,2016,172,71.1,2947.0,


California is the statewide average. We can leave that in case we want to display that. The others don't appear to have any obvious mismatches in spelling. Some are missing for all years, while some are missing AQ data for one or two years. We will keep all of this is the dataset for now and handle it later before doing the model.

### 5. Move the data to a SQL database in our datalake

We need to first reorder the dataframe columns to match the schema in the database

Load in the air quality data first since the asthma data is linked to it

In [0]:
jdbcDF = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", "CAAsthmaData") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()
display(jdbcDF)
print(jdbcDF.printSchema())
asthmaTotalDFjoined = asthmaTotalDFjoined.select('AsthmaID', 'COUNTY_ID', 
                                                 'YEAR', 'NUMBER OF ED VISITS', 'AGE-ADJUSTED ED VISIT RATE', 'AQ_ID')

asthmaTotalDFjoined = asthmaTotalDFjoined.withColumn("AQ_ID", asthmaTotalDFjoined["AQ_ID"].cast('int'))
asthmaTotalDFjoined = asthmaTotalDFjoined.withColumn("COUNTY_ID", asthmaTotalDFjoined["COUNTY_ID"].cast('int'))
print(asthmaTotalDFjoined.printSchema())
try:
    asthmaTotalDFjoined = sqlContext.createDataFrame(asthmaTotalDFjoined.rdd, jdbcDF.schema)
except Exception as e:
    print(f"SCHEMA: {e}")
    



ASTHMA_ID,COUNTY_ID,YEAR,NUM_ED_VISITS,AGE_ADJ_ED_VISITS,AQ_ID
1,,2015,191904,50.4,
2,3085.0,2015,9939,64.3,989.0
3,843.0,2015,196,58.4,910.0
4,152.0,2015,1044,50.2,886.0
5,65.0,2015,185,48.0,812.0
6,545.0,2015,97,41.4,808.0
7,46.0,2015,6858,65.2,802.0
8,2947.0,2015,140,53.0,
9,1751.0,2015,592,36.4,889.0
10,2778.0,2015,7936,77.0,857.0


In [0]:
asthmaTotalDFjoined = asthmaTotalDFjoined.drop("ASTHMA_ID")

In [0]:
try:
    writer = asthmaTotalDFjoined.write.format("jdbc").option("url", url) \
        .mode("append") \
        .option("dbtable", 'CAAsthmaData') \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    writer.save()
except Exception as e:
    print(e)

    df_loaded = spark.read.format("jdbc") \
        .option("url", url) \
        .option("dbtable", "CAAsthmaData") \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    print('df_loaded')
    display(df_loaded)   
        
    # Subtract the two to only leave what is not in the database
    # remove the index column of the loaded dataframe
    df_loaded = df_loaded.drop('ASTHMA_ID')
    df_toload = asthmaTotalDFjoined.subtract(df_loaded)
    print('df_toload')
    display(df_toload)
    
    if(df_toload.count() > 0):
        # Upload all the data that wasn't already in the database
        df_toload.write.format("jdbc")  \
            .option("url", url) \
            .mode("append") \
            .option("dbtable", "CAAsthmaData") \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .save()
    
asthmaTotalDF = spark.read.format("jdbc") \
    .option("url", url) \
    .option("dbtable", "CAAsthmaData") \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()
display(asthmaTotalDF) 

ASTHMA_ID,COUNTY_ID,YEAR,NUM_ED_VISITS,AGE_ADJ_ED_VISITS,AQ_ID
1,,2015,191904,50.4,
2,3085.0,2015,9939,64.3,989.0
3,843.0,2015,196,58.4,910.0
4,152.0,2015,1044,50.2,886.0
5,65.0,2015,185,48.0,812.0
6,545.0,2015,97,41.4,808.0
7,46.0,2015,6858,65.2,802.0
8,2947.0,2015,140,53.0,
9,1751.0,2015,592,36.4,889.0
10,2778.0,2015,7936,77.0,857.0


COUNTY_ID,YEAR,NUM_ED_VISITS,AGE_ADJ_ED_VISITS,AQ_ID


ASTHMA_ID,COUNTY_ID,YEAR,NUM_ED_VISITS,AGE_ADJ_ED_VISITS,AQ_ID
1,,2015,191904,50.4,
2,3085.0,2015,9939,64.3,989.0
3,843.0,2015,196,58.4,910.0
4,152.0,2015,1044,50.2,886.0
5,65.0,2015,185,48.0,812.0
6,545.0,2015,97,41.4,808.0
7,46.0,2015,6858,65.2,802.0
8,2947.0,2015,140,53.0,
9,1751.0,2015,592,36.4,889.0
10,2778.0,2015,7936,77.0,857.0
