# Main script to Join Data on Instrument 2 Fire Location

Modules: `modules.deep_ols` (project-local) <br>
Author: Jordan Meyer <br>
Email: jordan.meyer@berkeley.edu <br>
Date created: Feb 18, 2023 <br>

**Citations (data sources)**


**Citations (persons)**
1. Cornelia Ilin 

**Preferred environment**
1. Code written in Jupyter Notebooks

In [84]:
import sys

# Environment detection: `google.colab` only appears in sys.modules when the
# notebook is running inside a Google Colab runtime.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    # Colab branch: install geopandas into the runtime, then mount Google Drive
    # at /content/drive so the project folder can be read.
    !pip install geopandas --quiet
    from google.colab import drive

    drive.mount("/content/drive")
    # in_dir and in_instrument are two names for the same Drive folder.
    in_dir = (
        in_instrument
    ) = "/content/drive/MyDrive/capstone_fire/notebooks/instrument_2/"
    # NOTE(review): this statement imports a top-level package literally named
    # `drive` found on sys.path; it is not resolved through the
    # `google.colab.drive` object imported above. Presumably this relies on the
    # working directory being /content — confirm.
    import drive.MyDrive.capstone_fire.modules.deep_ols as deep_ols
else:
    # Local branch: a relative data path, resolved against whatever the working
    # directory is when the path is used.
    in_dir = in_instrument = "../data/instrument_2/"
    # Step up one directory so the `modules` package is importable, then step
    # back down into `notebooks` once the import has succeeded.
    %cd '..'
    import modules.deep_ols as deep_ols
    %cd 'notebooks'

/Users/jordan/Documents/GitHub/fire_capstone
/Users/jordan/Documents/GitHub/fire_capstone/notebooks


In [85]:
import calendar
import os
from datetime import date, timedelta

# geography
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import shapely

# Moved from sklearn.neighbors to sklearn.metrics following their package change
import sklearn.metrics
from shapely.geometry import Point

# import cartopy.crs as ccrs
# import contextily as ctx
# import fiona
# import netCDF4 as ncdf
# import osmnx as ox
# from cartopy.mpl.gridliner import LATITUDE_FORMATTER, LONGITUDE_FORMATTER
# from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable


# Distance-metric object for haversine distances; not used by any cell visible
# in this part of the notebook.
# NOTE(review): scikit-learn documents the haversine metric as operating on
# [latitude, longitude] pairs in radians — confirm callers convert from degrees.
dist = sklearn.metrics.DistanceMetric.get_metric("haversine")

# ignore warnings
import warnings

from tqdm.notebook import tqdm, trange

# Installs a blanket "ignore" filter: every warning category is suppressed from
# this point on.
warnings.filterwarnings("ignore")

# START HERE FOR AGGREGATION

## Aggregation of the dataframe for OLS

In [86]:
# Read the aggregated instrument table from the instrument folder and show it.
# The displayed frame carries an "Unnamed: 0" column because the CSV's first
# column has no header.
instrument_df = pd.read_csv(f"{in_instrument}aggregated_draft_2.csv")
instrument_df

Unnamed: 0,ZCTA,year_month,ins_1_no_bin_raw,ins_2_add_acres_raw,ins_3_norms,ins_4_add_acres_norms,ins_5_norm_bins_acres,ins_6_bins_raw,zip_pm25,zip_elevation,elevation_difference,zip_wspd
0,89010,199101,0.014313,2.171619,2.193307,2.237374e-03,0.024999,24.261588,12.450976,4784.0,-3523.444444,0.878208
1,89010,199102,0.003401,1.473353,0.522705,1.519277e-03,0.017402,16.875655,11.255854,4784.0,-3412.000000,0.283772
2,89010,199104,0.024555,4.317860,3.803710,4.474625e-03,0.053075,51.209927,7.899268,4784.0,-3777.333333,1.547416
3,89010,199106,0.000017,0.000382,0.002707,3.951177e-07,0.000100,0.096979,8.160244,4784.0,-3290.000000,0.893248
4,89010,199107,0.007432,0.586126,1.136627,5.962933e-04,0.006781,6.661194,9.101220,4784.0,-3369.000000,0.817715
...,...,...,...,...,...,...,...,...,...,...,...,...
212285,97635,201608,0.076445,215.996429,11.801323,2.240041e-01,2.681704,2586.061987,3.558125,7370.0,-6782.840909,0.782387
212286,97635,201609,0.063459,187.393049,9.758271,1.940952e-01,2.265046,2186.942079,1.619554,7370.0,-6757.846154,0.464931
212287,97635,201610,0.040482,66.587634,6.272099,6.879567e-02,0.864082,836.545477,2.673929,7370.0,-6184.090909,2.635240
212288,97635,201611,0.008002,8.879766,1.231006,9.119611e-03,0.099499,96.892259,3.386071,7370.0,-6325.900000,1.906592


In [87]:
# Read the demographics table that will be joined onto the instrument data.
demographics_df = pd.read_csv(f"{in_instrument}demographics.csv")

In [88]:
# Reduce `year` to the last four characters of its string form so it can be
# compared with the four-character year derived from `year_month`.
demographics_df['year'] = demographics_df['year'].map(str).str[-4:]

# Use the same key name as the instrument table for the ZIP-code column.
demographics_df = demographics_df.rename(
    columns={'zip_code': 'ZCTA'},
)

In [89]:
# Derive a four-character `year` key from `year_month` (e.g. 199101 -> "1991").
# The value is taken straight from instrument_df's own column, so rows stay
# aligned whatever the frame's index is; the previous detour through a
# reset_index() copy assigned values back by index label.
instrument_df['year'] = instrument_df['year_month'].astype(str).str[:4]

In [90]:
# Left join: every instrument row is kept, and demographic columns are attached
# where a (year, ZCTA) match exists; unmatched rows get nulls in those columns.
final_df = pd.merge(
    instrument_df,
    demographics_df,
    how='left',
    on=['year', 'ZCTA'],
)

In [91]:
# Persist the joined frame. to_csv writes the index as the first column by
# default, so readers of this file get one extra leading column.
# NOTE(review): a later cell reads 'finalish_df_4.csv', not this file — confirm
# which artifact is the intended one.
final_df.to_csv(f"{in_instrument}finalish_df_3.csv")

In [92]:
# Spot-check the first 36 values of the derived `year` key.
final_df['year'].iloc[:36]

0     1991
1     1991
2     1991
3     1991
4     1991
5     1991
6     1991
7     1991
8     1991
9     1991
10    1992
11    1992
12    1992
13    1992
14    1992
15    1992
16    1992
17    1992
18    1992
19    1993
20    1993
21    1993
22    1993
23    1993
24    1993
25    1993
26    1993
27    1994
28    1994
29    1994
30    1994
31    1994
32    1994
33    1994
34    1994
35    1995
Name: year, dtype: object

In [94]:
# Reload a merged frame from disk for inspection.
# NOTE(review): this reads 'finalish_df_4.csv' while the export cell earlier in
# the notebook writes 'finalish_df_3.csv' — confirm which file is intended.
# .iloc[:, 1:] drops the first CSV column (presumably the index column that
# DataFrame.to_csv writes by default — confirm).
raw = pd.read_csv(in_instrument+'finalish_df_4.csv').iloc[:,1:]
print('Dataframe shape: ',raw.shape)

# demographic data has null values
# DataFrame.info() prints its own report and returns None, so it is called on
# its own line; wrapping it in print() appended a stray "None".
print('Dataframe info: ')
raw.info()

# years are missing
# Only the last expression of a cell is displayed, so the intermediate checks
# are printed explicitly instead of being silently discarded.
print('Years present: ', raw['year'].unique())
print('Distinct ZCTAs: ', raw['ZCTA'].nunique())

# Distinct ZCTAs that have at least one row with a non-null households_count.
raw.loc[raw['households_count'].notna(), 'ZCTA'].nunique()

Dataframe shape:  (547651, 31)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 547651 entries, 0 to 547650
Data columns (total 31 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   ZCTA                               547651 non-null  int64  
 1   year_month                         547651 non-null  int64  
 2   ins_1_no_bin_raw                   547651 non-null  float64
 3   ins_2_add_acres_raw                547651 non-null  float64
 4   ins_3_norms                        547650 non-null  float64
 5   ins_4_add_acres_norms              547650 non-null  float64
 6   ins_5_norm_bins_acres              547650 non-null  float64
 7   ins_6_bins_raw                     547651 non-null  float64
 8   zip_pm25                           541289 non-null  float64
 9   zip_elevation                      545321 non-null  float64
 10  elevation_difference               210990 non-null  float64
 11  zip_wspd

1612

In [95]:
# ZCTAs that have at least one row with no households_count value.
raw.loc[raw['households_count'].isna(), 'ZCTA'].unique()

array([90263, 90747, 90831, 92132, 92135, 92147, 92155, 92222, 92267,
       92280, 92304, 92338, 92364, 93042, 93064, 93220, 93262, 93410,
       93524, 93592, 93627, 93633, 93943, 94128, 94573, 94575, 94613,
       94720, 94850, 94972, 95013, 95226, 95250, 95305, 95314, 95325,
       95364, 95375, 95430, 95431, 95463, 95486, 95537, 95559, 95587,
       95604, 95606, 95646, 95679, 95680, 95721, 95724, 95915, 95923,
       95925, 95930, 95944, 95978, 95981, 95984, 95986, 96108, 96126,
       96129, 96133, 96135])

In [96]:
# Ascending list of the distinct ZCTA codes in the demographics table.
test = sorted(demographics_df['ZCTA'].unique())


In [97]:
# Distinct ZCTA codes of each table as sets, ready for set-difference checks.
demos, zips = (
    set(frame['ZCTA']) for frame in (demographics_df, instrument_df)
)

In [98]:
# Number of distinct ZCTA codes in the demographics table.
len(demos)

1758

In [99]:
# Number of distinct ZCTA codes in the instrument table.
len(zips)

1633

In [100]:
# How many instrument ZCTAs have no entry at all in the demographics table.
len(zips.difference(demos))

20

In [101]:
# How many demographics ZCTAs never appear in the instrument table.
len(demos.difference(zips))

145

In [102]:
# Distinct year values present in the reloaded frame.
raw['year'].unique()

array([1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018])