In [2]:
import pandas as pd
import numpy as np
import gc
import os
import time

In [18]:
keep_wf_cols = [
    'S2_Cells_ID',
    'WF_FIRE_DATE',
    'WF_WildFire',
    'WF_WildFire_COUNT_1YR_AGO',
    'WF_WildFire_COUNT_2YR_AGO',
    'WF_WildFire_COUNT_3YR_AGO',
    'WF_WildFire_COUNT_4YR_AGO',
    'WF_WildFire_COUNT_5YR_AGO'
]

new_wf_cols = [
    's2_cell_id',
    'measure_date',
    'wf_wildfire_ext',         
    'wf_count_1yr_ago',    
    'wf_count_2yr_ago',    
    'wf_count_3yr_ago',    
    'wf_count_4yr_ago',    
    'wf_count_5yr_ago'    
]
wf_hist_cols = [
    'wf_count_1yr_ago',    
    'wf_count_2yr_ago',    
    'wf_count_3yr_ago',    
    'wf_count_4yr_ago',    
    'wf_count_5yr_ago'    
]

### Re-processing final dataset

- `WildFire_S2Cells_Extended.csv` is available on GitHub in DataPrep/Data/Procecced directory
- It's created in the `DataPrep/WildFire_S2_Ext.ipynb`
- "Extended" means that the wildfire indicator for the cell extends beyond alarm date (up to containment date)

In [19]:
new_wf_data = pd.read_csv('../Data/Processed/WildFire_S2Cells_Extended.csv')
new_wf_data = new_wf_data[keep_wf_cols]
new_wf_data.columns = new_wf_cols
new_wf_data.shape

(86689, 8)

In [17]:
new_wf_data.wf_wildfire.sum()

86689

In [20]:
def sample_data(filepath, sample=0.1):
    temp = pd.read_csv(filepath)
    print(f"Observations in raw: {temp.shape[0]}")
    temp.drop(columns=wf_hist_cols, inplace=True)
    temp = temp.merge(new_wf_data, on=['s2_cell_id', 'measure_date'], how='left')
    temp_wf = temp[~temp.wf_wildfire_ext.isna()]
    print(f"WF Observations in raw: {temp_wf.shape[0]}")
    temp_nwf = temp[temp.wf_wildfire_ext.isna()].sample(frac=sample, random_state=1)
    print(f"NWF Observations sampled: {temp_nwf.shape[0]}")
    temp = temp_wf.append(temp_nwf)
    print(f"Observations in processed: {temp.shape[0]}")
    return temp

In [7]:
allfiles = os.listdir('/tf/notebooks/W210/Model/Data')
allfiles

['consolidated_by_cell_day_4_000000000012',
 'new_weather_000000000000',
 'consolidated_by_cell_day_4_000000000007',
 'consolidated_by_cell_day_4_000000000008',
 'consolidated_by_cell_day_4_000000000015',
 'consolidated_by_cell_day_4_000000000014',
 'consolidated_by_cell_day_4_000000000003',
 'consolidated_by_cell_day_4_000000000011',
 'consolidated_by_cell_day_4_000000000002',
 'consolidated_by_cell_day_4_000000000005',
 'consolidated_by_cell_day_4_000000000000',
 'consolidated_by_cell_day_4_000000000004',
 'weather_ma7_000000000000',
 'consolidated_by_cell_day_4_000000000001',
 'new_weather_000000000002',
 'consolidated_by_cell_day_4_000000000013',
 'consolidated_by_cell_day_4_000000000006',
 'consolidated_by_cell_day_4_000000000010',
 'new_weather_000000000001',
 'weather_lag1_000000000000',
 'consolidated_by_cell_day_4_000000000009']

In [32]:
final_df = pd.DataFrame()
for file in allfiles:
    if file[0] == 'c':
        print(f"Processing File {file}")
        path = '/tf/notebooks/W210/Model/Data/' + file
        final_df = final_df.append(sample_data(path))
        print(f"Observations in the dataset so far: {final_df.shape[0]}")
final_df.reset_index(drop=True,inplace=True)

Processing File consolidated_by_cell_day_4_000000000012
Observations in raw: 728502
WF Observations in raw: 5429
NWF Observations sampled: 72307
Observations in processed: 77736
Observations in the dataset so far: 77736
Processing File consolidated_by_cell_day_4_000000000007
Observations in raw: 728591
WF Observations in raw: 5594
NWF Observations sampled: 72300
Observations in processed: 77894
Observations in the dataset so far: 155630
Processing File consolidated_by_cell_day_4_000000000008
Observations in raw: 728849
WF Observations in raw: 5295
NWF Observations sampled: 72355
Observations in processed: 77650
Observations in the dataset so far: 233280
Processing File consolidated_by_cell_day_4_000000000015
Observations in raw: 728849
WF Observations in raw: 5376
NWF Observations sampled: 72347
Observations in processed: 77723
Observations in the dataset so far: 311003
Processing File consolidated_by_cell_day_4_000000000014
Observations in raw: 729638
WF Observations in raw: 5479
NWF 

In [33]:
gc.collect()

45

In [34]:
final_df.shape

(1244493, 152)

In [37]:
final_df.s2_cell_id.nunique()

10643

In [38]:
final_df.wf_wildfire.sum(), final_df.wf_wildfire_ext.sum()

(3673.0, 86689.0)

In [39]:
final_df.wf_wildfire_ext.sum() / final_df.shape[0]

0.06965808566219336

In [40]:
fuel_mean = 78.74 # Calculated in BigQuery
final_df.fuel_percent.replace('backfill',str(fuel_mean),inplace = True)
final_df['fuel_percent'] = pd.to_numeric(final_df.fuel_percent)

In [41]:
final_df[[col for col in final_df if col.startswith('tl_')]].dtypes

tl_object_id                 float64
tl_kv_sort_sum               float64
tl_kv_sort_mean              float64
tl_kv_sort_max               float64
tl_owner                     float64
tl_circuit                   float64
tl_length_mil_sum            float64
tl_length_mil_mean           float64
tl_length_mil_max            float64
tl_owner_amp                 float64
tl_owner_anza                float64
tl_owner_apud                float64
tl_owner_blythe_energy       float64
tl_owner_bpa                 float64
tl_owner_bves                float64
tl_owner_calpeco             float64
tl_owner_ccsf                float64
tl_owner_esj                 float64
tl_owner_iid                 float64
tl_owner_ipa                 float64
tl_owner_kmpud               float64
tl_owner_ladwp               float64
tl_owner_lmud                float64
tl_owner_mid                 float64
tl_owner_mwd                 float64
tl_owner_npua                float64
tl_owner_nvenergy            float64
t

In [42]:
final_df[[col for col in final_df if col.startswith('tl_')]].isna().sum()

tl_object_id                 764837
tl_kv_sort_sum               764837
tl_kv_sort_mean              764837
tl_kv_sort_max               764837
tl_owner                     764837
tl_circuit                   764837
tl_length_mil_sum            764837
tl_length_mil_mean           764959
tl_length_mil_max            764959
tl_owner_amp                 764837
tl_owner_anza                764837
tl_owner_apud                764837
tl_owner_blythe_energy       764837
tl_owner_bpa                 764837
tl_owner_bves                764837
tl_owner_calpeco             764837
tl_owner_ccsf                764837
tl_owner_esj                 764837
tl_owner_iid                 764837
tl_owner_ipa                 764837
tl_owner_kmpud               764837
tl_owner_ladwp               764837
tl_owner_lmud                764837
tl_owner_mid                 764837
tl_owner_mwd                 764837
tl_owner_npua                764837
tl_owner_nvenergy            764837
tl_owner_pcorp              

In [43]:
final_df[[col for col in final_df if col.startswith('wf_')]].dtypes

wf_wildfire         float64
wf_gis_acres        float64
wf_fire_dur         float64
wf_agency_bia       float64
wf_agency_blm       float64
wf_agency_coco      float64
wf_agency_cdf       float64
wf_agency_dod       float64
wf_agency_lra       float64
wf_agency_nps       float64
wf_agency_usf       float64
wf_cause_1          float64
wf_cause_2          float64
wf_cause_3          float64
wf_cause_4          float64
wf_cause_5          float64
wf_cause_6          float64
wf_cause_7          float64
wf_cause_8          float64
wf_cause_9          float64
wf_cause_10         float64
wf_cause_11         float64
wf_cause_14         float64
wf_cause_15         float64
wf_cause_16         float64
wf_cause_18         float64
wf_c_method_1       float64
wf_c_method_2       float64
wf_c_method_3       float64
wf_c_method_4       float64
wf_c_method_5       float64
wf_c_method_6       float64
wf_c_method_7       float64
wf_c_method_8       float64
wf_cum_area         float64
wf_wildfire_ext     

In [44]:
final_df[[col for col in final_df if col.startswith('fuel_')]].dtypes

fuel_uid         object
fuel_percent    float64
dtype: object

In [45]:
final_df[[col for col in final_df if col.startswith('sat_')]].dtypes

sat_faparval_min        float64
sat_faparval_max        float64
sat_faparval_mean       float64
sat_faparval_median     float64
sat_faparval_std        float64
sat_faparval_size         int64
sat_faparval_count        int64
sat_faparval              int64
sat_faparmask_min          bool
sat_faparmask_max          bool
sat_faparmask_mean      float64
sat_faparmask_median     object
sat_faparmask_std       float64
sat_faparmask_size        int64
dtype: object

In [46]:
final_df[[col for col in final_df if col.startswith('sat_')]].isna().sum()

sat_faparval_min           0
sat_faparval_max           0
sat_faparval_mean          0
sat_faparval_median     1393
sat_faparval_std           0
sat_faparval_size          0
sat_faparval_count         0
sat_faparval               0
sat_faparmask_min          0
sat_faparmask_max          0
sat_faparmask_mean         0
sat_faparmask_median       0
sat_faparmask_std          0
sat_faparmask_size         0
dtype: int64

In [47]:
final_df[[col for col in final_df if col.startswith('sat_')]].describe()

Unnamed: 0,sat_faparval_min,sat_faparval_max,sat_faparval_mean,sat_faparval_median,sat_faparval_std,sat_faparval_size,sat_faparval_count,sat_faparval,sat_faparmask_mean,sat_faparmask_std,sat_faparmask_size
count,1244493.0,1244493.0,1244493.0,1243100.0,1244493.0,1244493.0,1244493.0,1244493.0,1244493.0,1244493.0,1244493.0
mean,0.167972,0.5789316,0.3036527,0.3837108,0.06749477,570.0334,570.0334,105.9598,0.04286636,0.05665585,570.0334
std,0.1567811,0.2571035,0.2487487,0.2117039,0.05484031,735.315,735.315,95.94004,0.1492667,0.1246532,735.315
min,0.0,0.0,0.0,0.0,0.0,182.0,182.0,1.0,0.0,0.0,182.0
25%,0.04,0.428,0.0,0.216,0.0,199.0,199.0,63.0,0.0,0.0,199.0
50%,0.116,0.632,0.3067501,0.386,0.06845509,211.0,211.0,85.0,0.0,0.0,211.0
75%,0.268,0.784,0.5099282,0.544,0.1028945,788.0,788.0,116.0,0.001105278,0.02884748,788.0
max,0.9400001,0.9400001,0.9171257,0.936,0.3436576,3647.0,3647.0,1761.0,1.0,0.5,3647.0


In [48]:
final_df.sat_faparval_median.isna().sum()

1393

In [49]:
final_df[final_df.sat_faparval_median.isna()][['s2_cell_id', 'measure_date', 'sat_faparval', 'sat_faparval_median', 'sat_faparmask_median']].head()

Unnamed: 0,s2_cell_id,measure_date,sat_faparval,sat_faparval_median,sat_faparmask_median
5669,8097cb,2017-01-21,1,,1.0
6080,80bfe1c,2016-03-27,1,,1.0
6566,54c9574,2016-01-05,1,,True
6587,80958a4,2016-05-28,1,,True
6918,8095fb4,2017-02-01,1,,True


In [50]:
weather = [
 'wea_air_temp_max',
 'wea_air_temp_mean',
 'wea_precip_accum_max',
 'relative_humidity_max',
 'relative_humidity_min',
 'relative_humidity_mean',
 'wea_wind_speed_max',
 'wea_wind_speed_min',
 'wea_wind_speed_mean',
 'wind_gust_max'
]

In [51]:
final_df[weather].dtypes

wea_air_temp_max          float64
wea_air_temp_mean         float64
wea_precip_accum_max      float64
relative_humidity_max     float64
relative_humidity_min     float64
relative_humidity_mean    float64
wea_wind_speed_max        float64
wea_wind_speed_min        float64
wea_wind_speed_mean       float64
wind_gust_max             float64
dtype: object

In [52]:
final_df[weather].isna().sum()

wea_air_temp_max          0
wea_air_temp_mean         0
wea_precip_accum_max      0
relative_humidity_max     0
relative_humidity_min     0
relative_humidity_mean    0
wea_wind_speed_max        0
wea_wind_speed_min        0
wea_wind_speed_mean       0
wind_gust_max             0
dtype: int64

In [53]:
final_df.fillna(0, inplace=True)

In [54]:
final_df.head()

Unnamed: 0,s2_cell_id,measure_date,tl_object_id,tl_kv_sort_sum,tl_kv_sort_mean,tl_kv_sort_max,tl_owner,tl_circuit,tl_length_mil_sum,tl_length_mil_mean,...,wea_wind_speed_min,wea_wind_speed_mean,wea_stid_wind_gust,wind_gust_max,wf_wildfire_ext,wf_count_1yr_ago,wf_count_2yr_ago,wf_count_3yr_ago,wf_count_4yr_ago,wf_count_5yr_ago
0,54cb13,2018-07-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.9,1.955,FLAC1,7.15,1.0,0.0,0.0,0.0,0.0,0.0
1,54cb65,2017-08-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.9,2.160833,FLAC1,7.6,1.0,0.0,0.0,0.0,0.0,1.0
2,54cb65,2017-08-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.9,2.029583,FLAC1,7.15,1.0,0.0,0.0,0.0,0.0,1.0
3,54cc87,2017-08-20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.45,2.198333,RSHC1,7.6,1.0,0.0,0.0,0.0,0.0,0.0
4,54cc87,2017-08-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.452917,RSHC1,11.62,1.0,0.0,0.0,0.0,0.0,0.0


In [55]:
gc.collect()

14

### Append new weather data

In [8]:
weather_df = pd.DataFrame()
for file in allfiles:
    if file[:3] == 'new':
        print("Processing {}".format(file))
        path = '/tf/notebooks/W210/Model/Data/' + file
        weather_df = weather_df.append(pd.read_csv(path))

Processing new_weather_000000000000


  interactivity=interactivity, compiler=compiler, result=result)


Processing new_weather_000000000002
Processing new_weather_000000000001


In [9]:
weather_temp1 = weather_df[weather_df.measure_date=='2016-01-01'].drop(columns=['fuel_percent_l1', 'wea_air_temp_mean_l1', 'wea_precip_accum_max_l1', 'relative_humidity_mean_l1'])
weather_temp2 = weather_df[weather_df.measure_date=='2016-01-02'][['s2_cell_id', 'fuel_percent_l1', 'wea_air_temp_mean_l1', 'wea_precip_accum_max_l1', 'relative_humidity_mean_l1']]
weather_temp1 = weather_temp1.merge(weather_temp2, on='s2_cell_id')
weather_temp1 = weather_temp1[weather_df.columns]
weather_df = weather_df[weather_df.measure_date!='2016-01-01'].append(weather_temp1)
weather_df.shape

(11664728, 14)

In [10]:
del weather_temp1, weather_temp2

In [11]:
gc.collect()

72

In [12]:
# final_df.drop(columns=[ 'wea_air_temp_mean_ma7',
#  'wea_precip_accum_max_ma7',
#  'relative_humidity_mean_ma7',
#  'fuel_percent_l1',
#  'wea_air_temp_mean_l1',
#  'wea_precip_accum_max_l1',
#  'relative_humidity_mean_l1'], inplace=True)

In [13]:
final_df = final_df.merge(weather_df, on=['s2_cell_id', 'measure_date'])
final_df['fuel_percent_l1'] = pd.to_numeric(final_df['fuel_percent_l1'], errors = 'coerce')
final_df['fuel_percent'].mean(), final_df['fuel_percent_l1'].mean()

(77.65220728441221, 77.60258163254542)

In [14]:
final_df['fuel_percent_l1'] = final_df['fuel_percent_l1'].fillna(78.8)

In [15]:
final_df.to_csv('./consolidated_4_10pct_sample_ext.csv', index=False)

In [4]:
# final_df = pd.read_csv('./consolidated_4_10pct_sample_ext.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [16]:
gc.collect()

28