In [1]:
%matplotlib inline

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

##### This is the Jupyter Notebook for my project exploring land use and water pollution. The original datasets, as well as all but one of the output files (manufacturing.json), are not included in the repository at this time because of their size. Links to the original datasets can be found in the README.

In [4]:
# Read both source layers (water-pollution listings first, then land-use
# parcels) into GeoDataFrames; the paths are relative to the notebook.
water, land_use = (
    gpd.read_file(path)
    for path in ('../data/water-pollution.geojson', '../data/land-use.geojson')
)

##### The datasets are loaded. This notebook is used to produce JSON files containing only the required data. First, extraneous columns are removed.

In [9]:
# Drop the columns that are not needed in the exported data.
# NOTE: this rebinds `water`, so running the cell a second time raises a
# KeyError because the listed columns are already gone.
water = water.drop(
    columns=[
        'AssessmentUnitNumber',
        'AssessmentUnitTypeCode',
        'NHDReachCode',
        'NHDFromMeasurePercent',
        'NHDToMeasurePercent',
        'GridCellNumber',
        'UnmappableCode',
    ]
)
# Show five random rows to confirm which columns remain.
water.sample(n=5)

Unnamed: 0,OBJECTID,ListingNumber,CategoryCode,ParameterName,MediumName,ListingWaterbodyName,EnvironmentTypeCode,Shape_Length,Shape_Area,geometry
12535,12536,614464,1,Bis(2-Ethylhexyl)phthalate,Sediment,SNOHOMISH RIVER,Marine,4833.126902,1095488.0,"MULTIPOLYGON (((1226090.435 984247.373, 122617..."
11303,11304,510530,1,Mercury,Sediment,PUGET SOUND (CENTRAL),Marine,4640.712296,1115342.0,"MULTIPOLYGON (((1176325.464 892357.360, 117630..."
3976,3977,15703,1,Bacteria,Water,SARATOGA PASSAGE,Marine,12186.563926,8918984.0,"MULTIPOLYGON (((1153844.417 1018783.500, 11537..."
25876,25877,48403,1,Temperature,Water,LANE CREEK,Freshwater,27072.76621,1344217.0,"MULTIPOLYGON (((2172282.853 1192969.153, 21721..."
15011,15012,622844,1,Fluorene,Sediment,PORT ANGELES HARBOR,Marine,6092.14413,2228722.0,"MULTIPOLYGON (((923283.500 1033287.417, 923215..."


In [10]:
# Drop the columns that are not needed in the exported data.
# NOTE: this rebinds `land_use`, so running the cell a second time raises a
# KeyError because the listed columns are already gone.
land_use = land_use.drop(
    columns=[
        'COUNTY_NM',
        'Perimeter',
        'New_Area',
        'New_Acres',
        'NewArea',
        'NewAcres',
        'layer',
        'path',
    ]
)
# Show five random rows to confirm which columns remain.
land_use.sample(n=5)

Unnamed: 0,OBJECTID,LANDUSE_CD,Shape_Length,Shape_Area,geometry
50296,50167,11,665.873538,25538.809016,"MULTIPOLYGON (((2268984.500 1184452.250, 22691..."
581035,580906,11,1781.364482,149885.217907,"MULTIPOLYGON (((1977377.083 545242.333, 197728..."
577057,576928,18,792.565993,34235.918693,"MULTIPOLYGON (((1103485.750 557580.833, 110335..."
708828,708699,94,7308.622898,460074.011384,"MULTIPOLYGON (((1304417.250 142267.500, 130414..."
556414,556285,68,1517.749146,137218.33735,"MULTIPOLYGON (((1630460.083 607384.167, 163011..."


##### The land use file is split into eight separate datasets by land use type, using ranges of the LANDUSE_CD code. This will reduce file size and simplify the final code.

In [42]:
# Residential parcels: LANDUSE_CD strictly between 10 and 20.
residential = land_use[
    land_use['LANDUSE_CD'].gt(10) & land_use['LANDUSE_CD'].lt(20)
]

# Export the subset as GeoJSON text in the working directory ...
with open('residential.json', 'w') as f:
    f.write(residential.to_json())

# ... and rebind the name to the frame read back from that file, so later
# cells work with exactly what was written to disk.
residential = gpd.read_file('residential.json')

In [43]:
# Show five random rows of the reloaded residential subset.
residential.sample(n=5)

Unnamed: 0,id,LANDUSE_CD,OBJECTID,Shape_Area,Shape_Length,geometry
69846,145784,11,145655,726991.332948,5552.196067,"MULTIPOLYGON (((1177234.750 990771.083, 117742..."
327036,655145,13,655016,346897.585944,3241.058102,"MULTIPOLYGON (((1948379.000 354505.333, 194798..."
98093,206272,14,206143,17652.651748,531.996078,"MULTIPOLYGON (((1189393.250 905353.417, 118953..."
114794,238770,11,238641,87102.579494,1365.313209,"MULTIPOLYGON (((1193192.500 874291.667, 119325..."
293982,583065,11,582936,173877.522363,1650.32808,"MULTIPOLYGON (((1007907.583 538614.167, 100790..."


In [45]:
# Manufacturing parcels: LANDUSE_CD strictly between 20 and 40.
manufacturing = land_use[
    land_use['LANDUSE_CD'].gt(20) & land_use['LANDUSE_CD'].lt(40)
]

# Export the subset as GeoJSON text in the working directory ...
with open('manufacturing.json', 'w') as f:
    f.write(manufacturing.to_json())

# ... and rebind the name to the frame read back from that file.
manufacturing = gpd.read_file('manufacturing.json')

In [46]:
# Transportation parcels: LANDUSE_CD strictly between 40 and 50.
transportation = land_use[
    land_use['LANDUSE_CD'].gt(40) & land_use['LANDUSE_CD'].lt(50)
]

# Export the subset as GeoJSON text in the working directory ...
with open('transportation.json', 'w') as f:
    f.write(transportation.to_json())

# ... and rebind the name to the frame read back from that file.
transportation = gpd.read_file('transportation.json')

In [47]:
# Trade parcels: LANDUSE_CD strictly between 49 and 60.
# Unlike the other subsets, the lower bound here is 49 rather than a round
# ten, so code 50 itself is part of this subset.
trade = land_use[
    land_use['LANDUSE_CD'].gt(49) & land_use['LANDUSE_CD'].lt(60)
]

# Export the subset as GeoJSON text in the working directory ...
with open('trade.json', 'w') as f:
    f.write(trade.to_json())

# ... and rebind the name to the frame read back from that file.
trade = gpd.read_file('trade.json')

In [48]:
# Service parcels: LANDUSE_CD strictly between 60 and 70.
service = land_use[
    land_use['LANDUSE_CD'].gt(60) & land_use['LANDUSE_CD'].lt(70)
]

# Export the subset as GeoJSON text in the working directory ...
with open('service.json', 'w') as f:
    f.write(service.to_json())

# ... and rebind the name to the frame read back from that file.
service = gpd.read_file('service.json')

In [49]:
# Recreational parcels: LANDUSE_CD strictly between 70 and 80.
recreational = land_use[
    land_use['LANDUSE_CD'].gt(70) & land_use['LANDUSE_CD'].lt(80)
]

# Export the subset as GeoJSON text in the working directory ...
with open('recreational.json', 'w') as f:
    f.write(recreational.to_json())

# ... and rebind the name to the frame read back from that file.
recreational = gpd.read_file('recreational.json')

In [50]:
# Production parcels: LANDUSE_CD strictly between 80 and 90.
production = land_use[
    land_use['LANDUSE_CD'].gt(80) & land_use['LANDUSE_CD'].lt(90)
]

# Export the subset as GeoJSON text in the working directory ...
with open('production.json', 'w') as f:
    f.write(production.to_json())

# ... and rebind the name to the frame read back from that file.
production = gpd.read_file('production.json')

In [51]:
# Undeveloped parcels: every LANDUSE_CD strictly greater than 90 (there is
# no upper bound on this last subset).
undeveloped = land_use[land_use['LANDUSE_CD'].gt(90)]

# Export the subset as GeoJSON text in the working directory ...
with open('undeveloped.json', 'w') as f:
    f.write(undeveloped.to_json())

# ... and rebind the name to the frame read back from that file.
undeveloped = gpd.read_file('undeveloped.json')

##### In order to focus the project, the water pollution file is searched and sorted for the most common kinds of pollution. This automates the process of finding the most substantive data points to investigate.

In [60]:
# Rank pollutants by how many listings mention them.
#
# For each ParameterName, count the rows with a non-null ListingNumber, then
# sort so the most frequently listed pollutants come first.
#
# This uses groupby(...).count() rather than DataFrame.count(level=...):
# the `level` argument was deprecated in pandas 1.3 and removed in pandas 2.0,
# while the groupby form gives the same per-ParameterName counts on old and
# new pandas alike.
pollution = (
    water.groupby("ParameterName")["ListingNumber"]
    .count()
    .sort_values(ascending=False)
)
# Display the ten most common pollutants.
pollution.head(10)

ParameterName
Temperature                         3629
Bacteria                            3112
Dissolved Oxygen                    2185
pH                                  1387
Mercury                              700
Ammonia-N                            461
Copper                               439
Polychlorinated Biphenyls (PCBs)     427
Zinc                                 416
Arsenic                              399
Name: ListingNumber, dtype: int64