## Add Surrounding Data
Author: Jennifer Grant

This notebook takes the individual per-cell data frames and adds to each one the features of the cells within 35km of it: emissions, inter-cloud flashcounts, and cloud-ground flashcounts.

#### Import Libraries

In [None]:
# Cannot live without our libraries
!pip install netCDF4
from netCDF4 import Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import *

Collecting netCDF4
[?25l  Downloading https://files.pythonhosted.org/packages/09/39/3687b2ba762a709cd97e48dfaf3ae36a78ae603ec3d1487f767ad58a7b2e/netCDF4-1.5.4-cp36-cp36m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 4.8MB/s 
[?25hCollecting cftime
[?25l  Downloading https://files.pythonhosted.org/packages/81/f4/31cb9b65f462ea960bd334c5466313cb7b8af792f272546b68b7868fccd4/cftime-1.2.1-cp36-cp36m-manylinux1_x86_64.whl (287kB)
[K     |████████████████████████████████| 296kB 43.4MB/s 
Installing collected packages: cftime, netCDF4
Successfully installed cftime-1.2.1 netCDF4-1.5.4


  import pandas.util.testing as tm


#### Import Data

In [None]:
# Mount Google Drive first: every data path used below is relative to the
# mounted 'drive/My Drive' folder. force_remount=True asks Colab to mount again
# even if a mount from an earlier run of this cell is still present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Open the original, raw data files: the surrounding-cell features are read
# from these rather than from the cleaned per-cell CSVs.
file_endings = ['05', '07', '08', '09', '11', '12', '13', '14']
data_path = 'drive/My Drive/BroaderAtlanta/raw_data/met_broader_atlanta_20'

# one open Dataset per file ending, kept in the same order as file_endings
raw_data = [Dataset(data_path + ending) for ending in file_endings]

In [None]:
# Load the cleaned rural dataframes, one CSV per cell (the urban cells already
# had their dataframes constructed in previous iterations of our model).
# NOTE(review): `cells` is only defined in a later cell of this notebook, so on
# a fresh kernel this cell raises NameError unless that definition runs first.
data_path = 'drive/My Drive/urban_and_rural/rural/individual_data/rural_atlanta_'

rural_data = [pd.read_csv(data_path + str(cell) + '.csv', index_col=0)
              for cell in cells]

#### Find Surrounding Indices

Given a specific cell we want to be able to extract a certain radius of features to add onto its dataframe. We calculate distance from a cell using the [Haversine formula](https://en.wikipedia.org/wiki/Haversine_formula).

In [None]:
## Two helpers used to find the surrounding indices of each cell.
##
## haversine_distance computes the great-circle distance (in km) from one
## center cell to every cell of the grid, then hands those distances to
## find_surrounding_indices, which applies the radius filter (35 km there).
def haversine_distance(lons, lats, center_lon, center_lat):
  """Return the indices of the cells surrounding a center cell.

  lons, lats             -- coordinates in degrees, one entry per grid cell
  center_lon, center_lat -- coordinates in degrees of the center cell

  The return value is whatever find_surrounding_indices returns for the list
  of distances computed here (one distance per entry of `lats`).
  """
  earth_radius = 6371.009
  center_lat_rad = radians(center_lat)

  distances = []
  for idx in range(len(lats)):
    # half of the latitude / longitude separation, in radians
    half_dlat = radians(lats[idx] - center_lat) / 2
    half_dlon = radians(lons[idx] - center_lon) / 2

    # haversine of the central angle between the two points
    hav = sin(half_dlat)**2 + cos(radians(lats[idx]))*cos(center_lat_rad)*sin(half_dlon)**2
    central_angle = 2*atan2(sqrt(hav), sqrt(1 - hav))
    distances.append(earth_radius*central_angle)

  return find_surrounding_indices(distances)

# grabs indices of cells within a target_distance away
def find_surrounding_indices(distances, target_distance=35):
  """Return the positions in `distances` that lie within `target_distance`.

  distances       -- sequence of distances, one per grid cell
  target_distance -- inclusive radius, in the same unit as `distances`
                     (defaults to 35, the radius used by this notebook)

  A distance of exactly 0 is taken to be the center cell itself and is
  excluded. Positions come from enumerate rather than distances.index(), so
  two cells that are equally far from the center each keep their own index
  instead of both being reported as the first match.
  """
  return [idx for idx, distance in enumerate(distances)
          if distance != 0 and distance <= target_distance]

In [None]:
#indices of cells where we need to include surroundings
cells = [25, 26, 27, 28, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 56, 57, 58,
         59, 60, 61, 62, 63, 64, 65, 66, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 
         82, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 109, 110, 111, 112, 113,
         114]

# Grab the lon, lat data from any of the raw data files (all the same).
# These must come from raw_data: the entries of rural_data are pandas
# dataframes read from CSV and have no `.variables`.
lats = raw_data[0].variables['xlat'][:][0].data
lons = raw_data[0].variables['xlon'][:][0].data

# dictionary holding the surrounding indices of each cell; the coordinates of
# the center are looked up in the arrays read above instead of re-reading the
# file for every cell
surroundings = {}
for cell in cells:
  surroundings[cell] = list(haversine_distance(lons, lats, lons[cell], lats[cell]))

In [None]:
# Display the mapping from each center cell to the indices of its surrounding cells.
surroundings

{19: [1,
  2,
  3,
  4,
  5,
  9,
  10,
  11,
  12,
  13,
  17,
  18,
  20,
  21,
  25,
  26,
  27,
  28,
  29,
  33,
  34,
  35,
  36,
  37],
 20: [2,
  3,
  4,
  5,
  6,
  10,
  11,
  12,
  13,
  14,
  18,
  19,
  21,
  22,
  26,
  27,
  28,
  29,
  30,
  34,
  35,
  36,
  37,
  38],
 21: [3,
  4,
  5,
  6,
  7,
  11,
  12,
  13,
  14,
  15,
  19,
  20,
  22,
  23,
  27,
  28,
  29,
  30,
  31,
  35,
  36,
  37,
  38,
  39],
 22: [4,
  5,
  6,
  7,
  8,
  12,
  13,
  14,
  15,
  16,
  20,
  21,
  23,
  24,
  28,
  29,
  30,
  31,
  32,
  36,
  37,
  38,
  39,
  40],
 27: [9,
  10,
  11,
  12,
  13,
  17,
  18,
  19,
  20,
  21,
  25,
  26,
  28,
  29,
  33,
  34,
  35,
  36,
  37,
  41,
  42,
  43,
  44,
  45],
 28: [10,
  11,
  12,
  13,
  14,
  18,
  19,
  20,
  21,
  22,
  26,
  27,
  29,
  30,
  34,
  35,
  36,
  37,
  38,
  42,
  43,
  44,
  45,
  46],
 29: [11,
  12,
  13,
  14,
  15,
  19,
  20,
  21,
  22,
  23,
  27,
  28,
  30,
  31,
  35,
  36,
  37,
  38,
  39,
  43,
  44

#### Extract Features

Now that we have the indices of the cells within 35km of each center cell, we will extract the features we need from each of them: emissions, cloud to ground flashcount (CG flashcount), and inter cloud flashcount (IC flashcount). After extracting these features for each surrounding cell, we will label them according to where that cell lies geographically relative to its center and add them onto the center's dataframe.

In [None]:
# Number of iterations needed by the extraction loops: the size of the first
# dimension of 'no2' in the first raw file. Per the original note, any variable
# whose first dimension is the same (744) could have been used instead.
num_arrays = raw_data[0].variables['no2'].shape[0]

# function that extracts surrounding features and returns a dataframe 
# to be concatenated with its center
def extract_surroundings(surrounding_indices, datasets=None, n_steps=None):
  """Build the dataframe of surrounding-cell features for one center cell.

  surrounding_indices -- indices of the surrounding cells to extract
  datasets            -- objects exposing `.variables[...]` the way the raw
                         files do; defaults to the notebook-level raw_data
                         (one entry per file ending)
  n_steps             -- how many leading entries of each variable to read;
                         defaults to the notebook-level num_arrays

  Returns a dataframe with three columns per surrounding cell
  (E_NO_<idx>, IC_FLASHCOUNT_<idx>, CG_FLASHCOUNT_<idx>). Rows of the
  different datasets are stacked in order and numbered 0..n-1, so every cell
  shares the same unique row index. An empty dataframe is returned when there
  is nothing to extract.
  """
  emission_layers = 19  # layers stored per E_NO profile
  total_layers = 29     # layers of the other features; E_NO is padded up to this

  if datasets is None:
    datasets = [raw_data[i] for i in range(len(file_endings))]
  if n_steps is None:
    n_steps = num_arrays

  # nothing to read -> same empty frame the incremental version ended up with
  if len(surrounding_indices) == 0 or len(datasets) == 0:
    return pd.DataFrame()

  zeros = np.zeros(total_layers - emission_layers)
  cell_frames = []

  #extract our features
  for surr in surrounding_indices:
    frames_for_cell = []

    for dataset in datasets:
      e_no_var = dataset.variables['E_NO']
      ic_var = dataset.variables['IC_FLASHCOUNT']
      cg_var = dataset.variables['CG_FLASHCOUNT']

      emissions = []
      ic_flashcount = []
      cg_flashcount = []

      for j in range(n_steps):
        emissions.extend(e_no_var[j][surr].data)
        ic_flashcount.extend(ic_var[j][surr].data)
        cg_flashcount.extend(cg_var[j][surr].data)

      # We need to adjust the cumulative flashcounts to the actual flashcounts
      updated_ic_flash = list(np.diff(ic_flashcount))
      updated_cg_flash = list(np.diff(cg_flashcount))

      #since we don't have continuous data we append one last zero
      updated_ic_flash.append(0)
      updated_cg_flash.append(0)

      #need to repeat each entry of ic_flashcount, cg_flashcount for each layer
      updated_ic_flash = np.repeat(updated_ic_flash, total_layers)
      updated_cg_flash = np.repeat(updated_cg_flash, total_layers)

      # E_NO only has 19 layers per profile; pad every profile with zeros so
      # it lines up with the 29 layers of the other features
      num_obs = len(emissions) // emission_layers
      e_no = []
      for k in range(num_obs):
        start = k * emission_layers
        e_no.extend(emissions[start:start + emission_layers])
        e_no.extend(zeros)

      #dataframe for this surrounding cell and this dataset
      frames_for_cell.append(pd.DataFrame({'E_NO_' + str(surr): e_no,
                                           'IC_FLASHCOUNT_' + str(surr): updated_ic_flash,
                                           'CG_FLASHCOUNT_' + str(surr): updated_cg_flash}))

    # Stack the datasets once instead of growing a frame inside the loop.
    # ignore_index gives unique row labels; stacking without it repeats the
    # labels 0..n-1 for every dataset, which the column-wise concat below
    # cannot reliably align on.
    cell_frames.append(pd.concat(frames_for_cell, axis=0, ignore_index=True))

  # one block of three columns per surrounding cell, side by side
  return pd.concat(cell_frames, axis=1)

In [None]:
# Dictionary holding the dataframe of surrounding features for every center
# cell, keyed by the cell index. Each frame is built by extract_surroundings
# from that cell's list of surrounding indices; a progress line is printed per
# cell.
surrounding_frames = {}
for cell in cells:
  print('Working on cell: {}'.format(cell))
  surrounding_frames[cell] = extract_surroundings(surroundings[cell])

Working on cell: 19
Working on cell: 20
Working on cell: 21
Working on cell: 22
Working on cell: 27
Working on cell: 28
Working on cell: 29
Working on cell: 30
Working on cell: 35
Working on cell: 36
Working on cell: 37
Working on cell: 43
Working on cell: 44
Working on cell: 45
Working on cell: 51
Working on cell: 52
Working on cell: 53


Now that we have all the dataframes containing surrounding features for our centers we need to change the column labels so that they line up with where each surrounding cell lies with respect to its center.

In [None]:
# Relabels the columns of a center cell's surrounding-features frame so that
# each surrounding cell is named by its position relative to the center
# instead of by its grid index. Does not return anything.
def rename_cols(cell):
  """Rename, in place, the columns of surrounding_frames[cell].

  The k-th entry of surroundings[cell] receives the k-th label of
  `direction_labels` for each of its three features, e.g.
  'E_NO_<first index>' becomes 'E_NO_SW1'. The labels are assigned purely by
  position in that list, and all 24 positions are read, so a cell with fewer
  than 24 surrounding indices raises IndexError.
  """
  frame = surrounding_frames[cell]

  # positional labels, in the order the surrounding indices are stored
  direction_labels = ['SW1', 'SWS', 'S1', 'SES', 'SE1',
                      'SWW', 'SW', 'S', 'SE', 'SEE',
                      'W1', 'W', 'E', 'E1',
                      'NWW', 'NW', 'N', 'NE', 'NEE',
                      'NW1', 'NWN', 'N1', 'NEN', 'NE1']
  feature_prefixes = ('E_NO_', 'IC_FLASHCOUNT_', 'CG_FLASHCOUNT_')

  # surrounding indices as strings, matching the suffixes of the column names
  surrounding_ids = [str(index) for index in surroundings[cell]]

  new_col_mapping = {}
  for position, label in enumerate(direction_labels):
    cell_id = surrounding_ids[position]
    for prefix in feature_prefixes:
      new_col_mapping[prefix + cell_id] = prefix + label

  frame.rename(new_col_mapping, axis=1, inplace=True)

In [None]:
# Rename the columns of every center cell's surrounding frame. rename_cols
# modifies the frames stored in surrounding_frames in place, so nothing is
# collected here.
for cell in cells:
  rename_cols(cell)

Now that our surrounding dataframes are ready, we import the cleaned data for each center and concatenate the dataframes. We will then save these transformed dataframes to be used for creating our train and test sets.

In [None]:
# cell indices as strings, used to build the file names
str_cells = [str(cell_index) for cell_index in cells]

# cleaned dataframe of every center cell, in the same order as `cells`
centers = [
    pd.read_csv(r'drive/My Drive/urban_and_rural/rural/individual_data/rural_atlanta_' + cell_name + '.csv', index_col=0)
    for cell_name in str_cells
]

In [None]:
# Add the surrounding features to the cleaned dataframe of each center cell.
# centers was loaded in the order of `cells`, so each center is paired with
# its surrounding frame by cell id rather than by the iteration order of the
# dictionary. reset_index(drop=True) discards the surrounding frame's old row
# labels directly, so no helper 'index' column is created and then dropped.
# NOTE: this updates `centers` in place; running the cell a second time adds
# the surrounding columns again.
for i, cell in enumerate(cells):
  surrounding = surrounding_frames[cell].reset_index(drop=True)
  centers[i] = pd.concat([centers[i], surrounding], axis=1)

In [None]:
# write every updated dataframe out as its own CSV, named after its cell
for cell_name, center_frame in zip(str_cells, centers):
  center_frame.to_csv('drive/My Drive/urban_and_rural/rural/individual_with_surroundings/rural_atlanta_' + cell_name + '.csv')