# Find the Nearest Neighboring Emitter to Each Emitter

## ER 131 Project | Group 4
**Author:** Marshall Worsham <br>
**Date:** 11-24-2020

## Front matter

In [7]:
# import libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import glob
import time
import matplotlib.pyplot as plt

In [8]:
# Root folders for the project: raw inputs are read from datadir and
# derived products are written to (and re-read from) outdir.
datadir = os.path.join('/Volumes', 'Brain', 'GIS', 'ER131', 'Project', 'Data')
outdir = os.path.join('/Volumes', 'Brain', 'GIS', 'ER131', 'Project', 'Output')

## Import VISTA points

In [10]:
# The emitter layer is stored in the CH4_Emitters folder under the output directory.
vistadir = os.path.join(outdir, 'CH4_Emitters')
vista = gpd.read_file(os.path.join(vistadir, 'CH4_Emitters_All.geojson'))

In [11]:
# Print the (rows, columns) shape, then display the first rows as the cell output.
print(vista.shape)
vista.head()

(234576, 12)


Unnamed: 0,Latitude,Longitude,Vista_IPCC,Vista_Name,Vista_SType,Vista_ID,Vista_Date,State,City,Source,County,geometry
0,37.72799,-120.89865,4B Biological Treatment of Solid Waste,"3D Ag, LLC",Composting Sites,CMP000001,2019-08-20,CA,Oakdale,CalRecycle,,"POLYGON ((-120.89865 37.72808, -120.89863 37.7..."
1,33.70999,-117.32864,4B Biological Treatment of Solid Waste,"A. Lua Recycling, Inc.",Composting Sites,CMP000002,2019-08-20,CA,Lake Elsinore,CalRecycle,,"POLYGON ((-117.32864 33.71008, -117.32862 33.7..."
2,38.01667,-122.13333,4B Biological Treatment of Solid Waste,Acme Composting Compost Facility,Composting Sites,CMP000003,2019-08-20,CA,Martinez,CalRecycle,,"POLYGON ((-122.13333 38.01676, -122.13331 38.0..."
3,34.03281,-118.35424,4B Biological Treatment of Solid Waste,Adams Garden Supply,Composting Sites,CMP000004,2019-08-20,CA,Los Angeles (City),CalRecycle,,"POLYGON ((-118.35424 34.03290, -118.35422 34.0..."
4,34.44517,-117.29007,4B Biological Treatment of Solid Waste,Advance Disposal Center for the Environm,Composting Sites,CMP000005,2019-08-20,CA,Hesperia,CalRecycle,,"POLYGON ((-117.29007 34.44526, -117.29005 34.4..."


## Get a sample of VISTA points

In [7]:
def get_sample(gpdf, p, seed):
    '''
    Draws a proportional stratified random sample from gpdf.

    The rows of gpdf are grouped by their 'Vista_SType' value and the fraction p (a float) of each
    group is selected at random. seed is passed on as the random state, so the same seed gives the
    same sample. Returns the sampled rows.
    '''

    strata = gpdf.groupby('Vista_SType')

    return strata.sample(frac = p, random_state = seed)

In [8]:
# Draw a 5% stratified sample with a fixed seed so the draw is repeatable.
vista_samp = get_sample(vista, p = 0.05, seed = 999)

In [9]:
# Print the (rows, columns) shape of the sample, then display its first rows as the cell output.
print(vista_samp.shape)
vista_samp.head()

(11730, 17)


Unnamed: 0,City,Latitude,Longitude,Source,State,Vista_Date,Vista_IPCC,Vista_Name,Vista_SType,Vista_ID,TROPOMI_CH4_Min,TROPOMI_CH4_Max,TROPOMI_CH4_Mean,TROPOMI_CH4_Count,TROPOMI_CH4_MEDIAN,County,geometry
140,Marina,36.716,-121.77222,CalRecycle,CA,2019-08-20,4B Biological Treatment of Solid Waste,Gabilan Ag Services,Composting Sites,CMP000141,1868.018443,1870.914246,1869.466344,2,1869.466344,,"POLYGON ((-121.77925 36.81584, -121.76519 36.8..."
40,Los Banos,37.04268,-120.96817,CalRecycle,CA,2019-08-20,4B Biological Treatment of Solid Waste,Billy Wright Composting Facility,Composting Sites,CMP000041,1870.707528,1881.189352,1875.702961,4,1875.457481,,"POLYGON ((-120.97517 37.14253, -120.96117 37.1..."
246,San Rafael,37.95416,-122.50517,CalRecycle,CA,2019-08-20,4B Biological Treatment of Solid Waste,Marin Sanitary Service Transfer Station,Composting Sites,CMP000247,,,,0,,,"POLYGON ((-122.51208 38.05401, -122.49826 38.0..."
391,Heber,32.7185,-115.5189,CalRecycle,CA,2019-08-20,4B Biological Treatment of Solid Waste,TruSource LLC- El Toro Yard,Composting Sites,CMP000392,1885.213989,1896.025464,1889.500953,3,1887.263406,,"POLYGON ((-115.52628 32.81832, -115.51890 32.8..."
333,Inglewood,33.97452,-118.33643,CalRecycle,CA,2019-08-20,4B Biological Treatment of Solid Waste,RJ`s Chipping and Grinding Operation,Composting Sites,CMP000334,1854.87068,1890.234865,1872.552773,2,1872.552773,,"POLYGON ((-118.34370 34.07435, -118.32916 34.0..."


## Find Euclidean distance

In [10]:
def find_distance(x, y, a, b):
    '''Computes the straight-line (Euclidean) distance from the point (a, b) to every point of a
    series whose x-coordinates are given by the one-dimensional array x and whose y-coordinates are
    given by the one-dimensional array y. Returns the distances as a one-dimensional array.'''

    # coordinate offsets between the reference point and each point in the series
    dx = np.subtract(a, x)
    dy = np.subtract(b, y)

    return np.sqrt(dx**2 + dy**2)

## Find nearest neighbor

In [11]:
def nearest_neighbor(gpdf):
    '''
    Takes in a gpdf and, for each observation, finds the nearest other observation by Euclidean
    distance between geometry centroids.

    The work is done on the frame returned by reprojecting gpdf to EPSG:3310 (CA Albers, NAD83,
    meters), so distances are in meters. Observations at zero distance (the observation itself and
    any observation whose centroid coincides with it) are never reported as the nearest neighbor.

    Returns the reprojected frame, re-indexed 0..n-1, with these columns added:
        Cent_Long_TA83m, Cent_Lat_TA83m : centroid x / y in the projected CRS
        NearestNeighbor                 : 'Vista_ID' of the nearest neighbor ('' if there is none)
        NN_Distance_m                   : distance to that neighbor in meters (NaN if there is none)
        NN_Type                         : 'Vista_SType' of the nearest neighbor ('' if there is none)

    Every observation is compared against every other one, so runtime grows with the square of
    the number of rows.
    '''

    # get the crs into CA Albers, NAD1983 meters
    gpdf = gpdf.to_crs(epsg=3310)

    # find the centroid coordinates in the new projection (computed once) and add to gpdf
    centroids = gpdf.centroid
    gpdf['Cent_Long_TA83m'] = centroids.x
    gpdf['Cent_Lat_TA83m'] = centroids.y

    # make sure the index is full and ordered
    gpdf.index = range(len(gpdf))

    # pull everything the loop needs out as plain arrays so each pass is positional numpy work
    xs = gpdf['Cent_Long_TA83m'].to_numpy(dtype=float)
    ys = gpdf['Cent_Lat_TA83m'].to_numpy(dtype=float)
    ids = gpdf['Vista_ID'].to_numpy()
    stypes = gpdf['Vista_SType'].to_numpy()

    # result buffers, pre-filled with the "no neighbor" values
    n = len(gpdf)
    nn_ids = [''] * n
    nn_dists = np.full(n, np.nan)
    nn_types = [''] * n

    for i in range(n):
        dist = np.asarray(find_distance(xs, ys, xs[i], ys[i]), dtype=float)

        # a zero distance is the observation itself or a coincident centroid; ignore those
        dist[dist == 0] = np.nan

        # nothing left to compare against (single row, all coincident, or missing centroid):
        # keep the "no neighbor" defaults instead of asking for the argmin of an all-NaN array
        if np.isnan(dist).all():
            continue

        nn = np.nanargmin(dist)
        nn_ids[i] = ids[nn]
        nn_dists[i] = dist[nn]
        nn_types[i] = stypes[nn]

    # assign whole columns at once; element-wise gpdf[col][i] = ... is chained assignment and
    # is not guaranteed to write back into the frame
    gpdf['NearestNeighbor'] = nn_ids
    gpdf['NN_Distance_m'] = nn_dists
    gpdf['NN_Type'] = nn_types

    return gpdf

In [12]:
# Time the nearest-neighbor search over the full emitter layer.
start = time.time()
print(start)
vista_nn = nearest_neighbor(vista)
end = time.time()
print('Runtime: {} seconds'.format(end - start))

Runtime: 4340.113498210907 seconds


In [27]:
# Spot-check the result for one emitter, selected by its Vista_ID.
vista_nn[vista_nn['Vista_ID'] == 'OGW192803']

Unnamed: 0,City,Latitude,Longitude,Source,State,Vista_Date,Vista_IPCC,Vista_Name,Vista_SType,Vista_ID,...,TROPOMI_CH4_Mean,TROPOMI_CH4_Count,TROPOMI_CH4_MEDIAN,County,geometry,Cent_Long_TA83m,Cent_Lat_TA83m,NearestNeighbor,NN_Distance_m,NN_Type
132407,San Joaquin,37.829213,-121.292317,CEC DOGGR,CA,7/12/2019,1B2 Oil and Natural Gas,"""G.B. Unit 22"" 1",Oil and Gas Well,OGW192803,...,1873.519901,2,1873.519901,,"POLYGON ((-104799.816 -20146.156, -104855.846 ...",-113588.920535,-20033.509374,OGW218373,1821.959329,Oil and Gas Well


In [13]:
# Display the first three rows of the nearest-neighbor result.
vista_nn.head(3)

Unnamed: 0,City,Latitude,Longitude,Source,State,Vista_Date,Vista_IPCC,Vista_Name,Vista_SType,Vista_ID,...,TROPOMI_CH4_Mean,TROPOMI_CH4_Count,TROPOMI_CH4_MEDIAN,County,geometry,Cent_Long_TA83m,Cent_Lat_TA83m,NearestNeighbor,NN_Distance_m,NN_Type
0,Oakdale,37.72799,-120.89865,CalRecycle,CA,2019-08-20,4B Biological Treatment of Solid Waste,"3D Ag, LLC",Composting Sites,CMP000001,...,1867.685878,2,1867.685878,,"POLYGON ((-79599.335 -20577.676, -78380.719 -2...",-79095.261295,-31685.429251,CMP000064,473.383614,Composting Sites
1,Lake Elsinore,33.70999,-117.32864,CalRecycle,CA,2019-08-20,4B Biological Treatment of Solid Waste,"A. Lua Recycling, Inc.",Composting Sites,CMP000002,...,1877.675215,2,1877.675215,,"POLYGON ((246679.581 -463794.602, 248029.681 -...",247666.543658,-474842.654463,OGW169487,3414.947497,Oil and Gas Well
2,Martinez,38.01667,-122.13333,CalRecycle,CA,2019-08-20,4B Biological Treatment of Solid Waste,Acme Composting Compost Facility,Composting Sites,CMP000003,...,1844.04057,2,1844.04057,,"POLYGON ((-187385.989 13249.108, -186177.082 1...",-187031.341105,2135.619288,COM000435,1382.42094,Natural Gas Station


In [20]:
# Save the nearest-neighbor result in the emitter folder as a shapefile and as GeoJSON.
# NOTE: the GeoJSON path is the same file the "Import VISTA points" cell reads, so after this
# cell runs, a later Run All starts from the already-processed layer.
vista_nn.to_file(os.path.join(vistadir, 'CH4_Emitters_All.shp'))
vista_nn.to_file(os.path.join(vistadir, 'CH4_Emitters_All.geojson'), driver = 'GeoJSON')

## Plot histogram

In [1]:
# Histogram of nearest-neighbor distances (NN_Distance_m, meters).
# Rows without a neighbor carry NaN in that column; drop them so only real distances are binned.
nn_dist = vista_nn['NN_Distance_m'].dropna()

fig, ax = plt.subplots()
ax.hist(nn_dist, bins = 12, color = 'aquamarine', edgecolor = 'grey')
ax.set_title('Frequency distribution of distance between nearest neighboring emitters')
ax.set_xlabel('Distance between nearest neighbors (m)')
# the axis is log-scaled; the plotted values are raw counts, not their logarithm
ax.set_ylabel('Number of emitters (log scale)')
ax.set_yscale('log')

NameError: name 'plt' is not defined