## Front Matter

In [None]:
# import libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import glob
import time
import matplotlib.pyplot as plt

In [None]:
# Root folders used throughout the notebook: one for input data, one for outputs
datadir = os.path.join('/Volumes', 'Brain', 'GIS', 'ER131', 'Project', 'Data')
outdir = os.path.join('/Volumes', 'Brain', 'GIS', 'ER131', 'Project', 'Output')

## Import AQI

In [None]:
# Read the AQI observations with pandas so that numeric columns keep numeric dtypes.
# gpd.read_file would go through the OGR CSV driver, which by default returns every
# field as text and creates no geometry, breaking the distance and averaging steps.
aqidir = os.sep.join([datadir, 'AQI'])
aqi = pd.read_csv(os.sep.join([aqidir, 'AQI.csv']))

# Build point geometries from the coordinate columns that the KNN step reads.
# NOTE(review): assumes 'Longitude'/'Latitude' are decimal degrees (EPSG:4326) -- confirm
# against the AQI data source.
aqi = gpd.GeoDataFrame(
    aqi,
    geometry = gpd.points_from_xy(aqi['Longitude'], aqi['Latitude']),
    crs = 'EPSG:4326',
)

In [None]:
# Print the (rows, columns) shape of the AQI table, then display its first rows
print(aqi.shape)
aqi.head()

## Import VISTA points

In [None]:
# Folder inside the output directory that holds the CH4 emitter layers
vistadir = os.sep.join((outdir, 'CH4_Emitters'))
# Load the combined emitter GeoJSON into a GeoDataFrame
vista = gpd.read_file(os.sep.join((vistadir, 'CH4_Emitters_All.geojson')))

In [None]:
# Print the (rows, columns) shape of the VISTA table, then display its first rows
print(vista.shape)
vista.head()

## Get sample for testing

In [None]:
def get_sample(gpdf, p, seed, strata_col = 'Vista_SType'):
    '''
    Returns a reproducible random sample of the rows of gpdf.

    Parameters:
        gpdf: the (Geo)DataFrame to sample from (e.g. the VISTA emitters dataframe).
        p: a float representing the sample proportion, passed to pandas as `frac`.
        seed: any number; passed to pandas as `random_state` so the sample is repeatable.
        strata_col: name of the column to stratify on (default 'Vista_SType', the emitter type).
            Pass None to request a simple random sample.

    Returns: a dataframe of the same type as gpdf. When strata_col is a column of gpdf, the sample is
    drawn by proportional stratified sampling (a fraction p of every group). When strata_col is None or
    is not a column of gpdf (e.g. the AQI observations, which carry no emitter type), a simple random
    sample of a fraction p of all rows is returned instead.
    '''

    # no usable stratification column: fall back to simple random sampling
    if strata_col is None or strata_col not in gpdf.columns:
        return gpdf.sample(frac = p, random_state = seed)

    sampdata = gpdf.groupby(strata_col).sample(frac = p, random_state = seed)

    return sampdata

## Find Euclidean distance

In [None]:
def find_distance(x, y, a, b):
    '''
    Computes straight-line (Euclidean) distances from the single point (a, b) to many points.

    x and y are one-dimensional arrays holding the x- and y-coordinates of the points; the result is a
    one-dimensional array with one distance per (x, y) pair, in the same units as the inputs.
    '''

    # coordinate offsets between the reference point and every point in the series
    delta_x = np.subtract(a, x)
    delta_y = np.subtract(b, y)

    return np.sqrt(delta_x**2 + delta_y**2)

## Find nearest neighbors

In [None]:
def nearest_neighbor(target_gpdf, aqi_gpdf):
    '''
    Calculates the Euclidean distance from every target location to every AQI measurement location.

    Both sets of points are expressed in CA Albers, NAD1983 (EPSG:3310) before measuring, so the
    distances are in meters.

    Parameters:
        target_gpdf: GeoDataFrame of emitter locations (VISTA). It must have a CRS set. The centroid of
            each geometry is used as its location. The caller's dataframe is not modified.
        aqi_gpdf: (Geo)DataFrame of AQI observations with 'Longitude' and 'Latitude' columns.
            NOTE(review): these are assumed to be decimal degrees (EPSG:4326) -- confirm against the
            AQI data source.

    Returns: a numpy array with N rows (one for each emitter location in the target dataset, in row
    order) and M columns, where M is equal to the number of rows in aqi_gpdf (in row order). Element
    [i, j] is the distance from target i to AQI observation j.
    '''

    # get the crs into CA Albers, NAD1983 meters (to_crs returns a new object, the input is untouched)
    target_proj = target_gpdf.to_crs(epsg=3310)

    # centroid coordinates in the projected CRS, as plain arrays so the row index does not matter
    centroids = target_proj.centroid
    target_x = centroids.x.to_numpy()
    target_y = centroids.y.to_numpy()

    # project the AQI observation coordinates into the same CRS; to_numeric guards against
    # coordinate columns that were read in as text
    aqi_lon = pd.to_numeric(aqi_gpdf['Longitude']).to_numpy(dtype=float)
    aqi_lat = pd.to_numeric(aqi_gpdf['Latitude']).to_numpy(dtype=float)
    aqi_points = gpd.GeoSeries(gpd.points_from_xy(aqi_lon, aqi_lat), crs='EPSG:4326').to_crs(epsg=3310)
    aqi_x = aqi_points.x.to_numpy()
    aqi_y = aqi_points.y.to_numpy()

    # initialize an array of size NxM, filled with NaN
    dist_array = np.full((len(target_x), len(aqi_x)), np.nan)

    # one row of distances per target point
    for i, (loc_x, loc_y) in enumerate(zip(target_x, target_y)):
        dist_array[i, :] = find_distance(aqi_x, aqi_y, loc_x, loc_y)

    return dist_array

In [None]:
def predict_PM25(target_gpdf, aqi_gpdf, K):
    '''
    Predicts an AQI value for every target location as the mean of its K nearest AQI observations.

    Parameters:
        target_gpdf: the target emitters (VISTA) GeoDataFrame. An 'AQI_Value' column is added to it
            (or overwritten) in place.
        aqi_gpdf: the AQI dataframe; must contain the 'AQI_Value' column plus whatever
            nearest_neighbor needs to locate the observations.
        K: positive integer, the number of nearest neighbors to average.

    Returns: target_gpdf, with the 'AQI_Value' column holding the predicted value for each emitter
    point. If fewer than K observations are available, the mean of all available observations is used.

    Raises: ValueError if K is smaller than 1.
    '''

    if K < 1:
        raise ValueError('K must be a positive integer')

    # get distances between emitter points and measurement locations (computed once for all rows)
    distances = nearest_neighbor(target_gpdf, aqi_gpdf)

    # for each emitter (row), the column positions of the K closest AQI observations
    nearest_idx = np.argsort(distances, axis = 1)[:, :K]

    # observed values as a plain float array, so positions from argsort can index it directly
    aqi_obs = pd.to_numeric(aqi_gpdf['AQI_Value']).to_numpy(dtype = float)

    # average the K nearest observations per emitter; assigning the whole column at once avoids
    # chained, label-based assignment, which misbehaves when the index is not 0..N-1 (e.g. a sample)
    target_gpdf['AQI_Value'] = aqi_obs[nearest_idx].mean(axis = 1)

    return target_gpdf

## Test

In [None]:
# Dry run on 2% samples of the VISTA emitters and the AQI observations (seed 999), using K = 2
target_samp = get_sample(gpdf = vista, p = 0.02, seed = 999)
aqi_samp = get_sample(gpdf = aqi, p = 0.02, seed = 999)
test = predict_PM25(target_gpdf = target_samp, aqi_gpdf = aqi_samp, K = 2)

In [None]:
# Display the first 10 rows of the predictions made on the samples
test.head(10)

## Implement

In [None]:
# Run on full datasets
target_gpdf = vista
# use the full AQI table loaded above (the previous self-assignment referenced an undefined name)
aqi_gpdf = aqi
# predict with the 6 nearest AQI observations per emitter
knn_predictions = predict_PM25(target_gpdf, aqi_gpdf, 6)

In [None]:
# Display the first 10 rows of the predictions made on the full datasets
knn_predictions.head(10)

## Histogram

In [None]:
# Plot the full-dataset predictions (the K = 6 run named in the title), read from the
# 'AQI_Value' column written by predict_PM25. Missing predictions are dropped because
# plt.hist cannot bin NaN values.
plt.hist(knn_predictions['AQI_Value'].dropna(), bins = 12, color = 'coral', edgecolor = 'grey')
plt.title('Frequency distribution of AQI values estimated through KNN on 6 neighbors')
plt.xlabel('AQI value')
plt.ylabel('Log of frequency')
# log-scaled counts keep sparsely populated bins visible
plt.yscale('log')

## To file

In [None]:
# to geojson
# NOTE(review): this is the same path the VISTA layer was read from at the top of the notebook, so
# the input file is overwritten with the version that includes the predictions -- confirm intended.
knn_predictions.to_file(os.sep.join([vistadir, 'CH4_Emitters_All.geojson']), driver = 'GeoJSON')

# to shapefile
# (disabled alternative export of the same dataframe as a shapefile)
#knn_predictions.to_file(os.sep.join([vistadir, 'CH4_Emitters_All.shp']))