In [1]:
import pandas as pd
import numpy as np
import gc
import os
import time

### Get the data from the GCS bucket (BigQuery export)

In [17]:
!gsutil -m cp gs://wildfire-yz/consolidated/consolidated_by_cell_day_4_* Data/

Copying gs://wildfire-yz/consolidated/consolidated_by_cell_day_4_000000000000...
Copying gs://wildfire-yz/consolidated/consolidated_by_cell_day_4_000000000001...
Copying gs://wildfire-yz/consolidated/consolidated_by_cell_day_4_000000000002...
Copying gs://wildfire-yz/consolidated/consolidated_by_cell_day_4_000000000003...
/ [4 files][  1.5 GiB/  1.5 GiB]   39.5 MiB/s                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://wildfire-yz/consolidated/consolidated_by_cell_day_4_000000000004...
Copying gs://wildfire-yz/consolidated/consolidated_by_cell_day_4_000000000005...
Copying gs://wildfire-yz/consolidated/consolidated_by_cell_day_4_000000000006...
Copying gs://wildfire-yz/consolidated/consolidated_by_cell_day_4_000000000007...
Copying gs://wildfi

In [2]:
!gsutil -m cp gs://wildfire-yz/new_weather/new_weather* Data/

/bin/sh: 1: gsutil: not found


### Get 10% sample of non-wildfire cells

In [32]:
def sample_data(filepath, sample=0.1, random_state=1):
    """
    Load one CSV shard and down-sample its non-wildfire rows.

    Every row whose ``wf_wildfire`` value is present is kept; rows where
    ``wf_wildfire`` is missing are randomly sampled.

    Parameters
    ----------
    filepath : str or path-like
        CSV file readable by ``pd.read_csv``; it must contain a
        ``wf_wildfire`` column.
    sample : float, default 0.1
        Fraction of the non-wildfire rows to keep.
    random_state : int, default 1
        Seed passed to ``DataFrame.sample`` so the draw is reproducible.

    Returns
    -------
    pandas.DataFrame
        The wildfire rows followed by the sampled non-wildfire rows. The
        index labels of the source file are preserved (not reset).
    """
    temp = pd.read_csv(filepath)
    print(f"Observations in raw: {temp.shape[0]}")
    # A missing wf_wildfire value marks a non-wildfire observation.
    is_nwf = temp.wf_wildfire.isna()
    temp_wf = temp[~is_nwf]
    print(f"WF Observations in raw: {temp_wf.shape[0]}")
    temp_nwf = temp[is_nwf].sample(frac=sample, random_state=random_state)
    print(f"NWF Observations sampled: {temp_nwf.shape[0]}")
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    temp = pd.concat([temp_wf, temp_nwf])
    print(f"Observations in processed: {temp.shape[0]}")
    return temp

In [2]:
# Directory that holds the downloaded data files.
# Alternative location (kept for reference): '/home/yulia/Documents/MIDS/W210/Model/Data'
data_dir = '/tf/notebooks/W210/Model/Data'
allfiles = os.listdir(data_dir)
allfiles

['consolidated_by_cell_day_4_000000000012',
 'new_weather_000000000000',
 'consolidated_by_cell_day_4_000000000007',
 'consolidated_by_cell_day_4_000000000008',
 'consolidated_by_cell_day_4_000000000015',
 'consolidated_by_cell_day_4_000000000014',
 'consolidated_by_cell_day_4_000000000003',
 'consolidated_by_cell_day_4_000000000011',
 'consolidated_by_cell_day_4_000000000002',
 'consolidated_by_cell_day_4_000000000005',
 'consolidated_by_cell_day_4_000000000000',
 'consolidated_by_cell_day_4_000000000004',
 'weather_ma7_000000000000',
 'consolidated_by_cell_day_4_000000000001',
 'new_weather_000000000002',
 'consolidated_by_cell_day_4_000000000013',
 'consolidated_by_cell_day_4_000000000006',
 'consolidated_by_cell_day_4_000000000010',
 'new_weather_000000000001',
 'weather_lag1_000000000000',
 'consolidated_by_cell_day_4_000000000009']

In [41]:
# Sample every consolidated shard (file names starting with 'c') and stack the
# results. The parts are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the growing frame on every iteration.
sampled_parts = []
rows_so_far = 0
for file in allfiles:
    if file.startswith('c'):
        print(f"Processing File {file}")
        path = '/tf/notebooks/W210/Model/Data/' + file
        part = sample_data(path)
        sampled_parts.append(part)
        rows_so_far += part.shape[0]
        print(f"Observations in final dataset: {rows_so_far}")
# pd.concat raises on an empty list, so fall back to an empty frame when no
# shard matched (the result the original loop produced in that case).
final_df = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()
final_df.reset_index(drop=True,inplace=True)

Processing File consolidated_by_cell_day_4_000000000012
Observations in raw: 728502
WF Observations in raw: 240
NWF Observations sampled: 72826
Observations in processed: 73066
Observations in final dataset: 73066
Processing File consolidated_by_cell_day_4_000000000007
Observations in raw: 728591
WF Observations in raw: 248
NWF Observations sampled: 72834
Observations in processed: 73082
Observations in final dataset: 146148
Processing File consolidated_by_cell_day_4_000000000008
Observations in raw: 728849
WF Observations in raw: 211
NWF Observations sampled: 72864
Observations in processed: 73075
Observations in final dataset: 219223
Processing File consolidated_by_cell_day_4_000000000015
Observations in raw: 728849
WF Observations in raw: 228
NWF Observations sampled: 72862
Observations in processed: 73090
Observations in final dataset: 292313
Processing File consolidated_by_cell_day_4_000000000014
Observations in raw: 729638
WF Observations in raw: 226
NWF Observations sampled: 729

In [42]:
gc.collect()

24

In [43]:
final_df.shape

(1169778, 151)

In [212]:
final_df.s2_cell_id.nunique()

10643

In [44]:
final_df.wf_wildfire.sum()

3673.0

In [45]:
final_df.wf_wildfire.sum() / final_df.shape[0]

0.0031399120174939177

### Checking features

In [49]:
fuel_mean = 78.74 # Calculated in BigQuery
# 'backfill' is a placeholder string in fuel_percent: substitute the mean and
# convert the column to numeric. The result is assigned back to the frame
# instead of calling replace(..., inplace=True) on the attribute-accessed
# column, which is chained assignment and is not guaranteed to modify
# final_df (it is a no-op under pandas Copy-on-Write).
final_df['fuel_percent'] = pd.to_numeric(
    final_df['fuel_percent'].replace('backfill', str(fuel_mean))
)

In [56]:
# Dtypes of every column whose name starts with 'tl_'.
tl_cols = [name for name in final_df if name.startswith('tl_')]
final_df[tl_cols].dtypes

tl_object_id                 float64
tl_kv_sort_sum               float64
tl_kv_sort_mean              float64
tl_kv_sort_max               float64
tl_owner                     float64
tl_circuit                   float64
tl_length_mil_sum            float64
tl_length_mil_mean           float64
tl_length_mil_max            float64
tl_owner_amp                 float64
tl_owner_anza                float64
tl_owner_apud                float64
tl_owner_blythe_energy       float64
tl_owner_bpa                 float64
tl_owner_bves                float64
tl_owner_calpeco             float64
tl_owner_ccsf                float64
tl_owner_esj                 float64
tl_owner_iid                 float64
tl_owner_ipa                 float64
tl_owner_kmpud               float64
tl_owner_ladwp               float64
tl_owner_lmud                float64
tl_owner_mid                 float64
tl_owner_mwd                 float64
tl_owner_npua                float64
tl_owner_nvenergy            float64
t

In [59]:
# Number of missing values in every column whose name starts with 'tl_'.
tl_cols = [name for name in final_df if name.startswith('tl_')]
final_df[tl_cols].isna().sum()

tl_object_id                 707861
tl_kv_sort_sum               707861
tl_kv_sort_mean              707861
tl_kv_sort_max               707861
tl_owner                     707861
tl_circuit                   707861
tl_length_mil_sum            707861
tl_length_mil_mean           707968
tl_length_mil_max            707968
tl_owner_amp                 707861
tl_owner_anza                707861
tl_owner_apud                707861
tl_owner_blythe_energy       707861
tl_owner_bpa                 707861
tl_owner_bves                707861
tl_owner_calpeco             707861
tl_owner_ccsf                707861
tl_owner_esj                 707861
tl_owner_iid                 707861
tl_owner_ipa                 707861
tl_owner_kmpud               707861
tl_owner_ladwp               707861
tl_owner_lmud                707861
tl_owner_mid                 707861
tl_owner_mwd                 707861
tl_owner_npua                707861
tl_owner_nvenergy            707861
tl_owner_pcorp              

In [61]:
# Dtypes of every column whose name starts with 'wf_'.
wf_cols = [name for name in final_df if name.startswith('wf_')]
final_df[wf_cols].dtypes

wf_wildfire         float64
wf_gis_acres        float64
wf_fire_dur         float64
wf_agency_bia       float64
wf_agency_blm       float64
wf_agency_coco      float64
wf_agency_cdf       float64
wf_agency_dod       float64
wf_agency_lra       float64
wf_agency_nps       float64
wf_agency_usf       float64
wf_cause_1          float64
wf_cause_2          float64
wf_cause_3          float64
wf_cause_4          float64
wf_cause_5          float64
wf_cause_6          float64
wf_cause_7          float64
wf_cause_8          float64
wf_cause_9          float64
wf_cause_10         float64
wf_cause_11         float64
wf_cause_14         float64
wf_cause_15         float64
wf_cause_16         float64
wf_cause_18         float64
wf_c_method_1       float64
wf_c_method_2       float64
wf_c_method_3       float64
wf_c_method_4       float64
wf_c_method_5       float64
wf_c_method_6       float64
wf_c_method_7       float64
wf_c_method_8       float64
wf_count_1yr_ago    float64
wf_count_2yr_ago    

In [62]:
# Dtypes of every column whose name starts with 'fuel_'.
fuel_cols = [name for name in final_df if name.startswith('fuel_')]
final_df[fuel_cols].dtypes

fuel_uid         object
fuel_percent    float64
dtype: object

In [63]:
# Dtypes of every column whose name starts with 'sat_'.
sat_cols = [name for name in final_df if name.startswith('sat_')]
final_df[sat_cols].dtypes

sat_faparval_min        float64
sat_faparval_max        float64
sat_faparval_mean       float64
sat_faparval_median     float64
sat_faparval_std        float64
sat_faparval_size         int64
sat_faparval_count        int64
sat_faparval              int64
sat_faparmask_min          bool
sat_faparmask_max          bool
sat_faparmask_mean      float64
sat_faparmask_median     object
sat_faparmask_std       float64
sat_faparmask_size        int64
dtype: object

In [64]:
# Number of missing values in every column whose name starts with 'sat_'.
sat_cols = [name for name in final_df if name.startswith('sat_')]
final_df[sat_cols].isna().sum()

sat_faparval_min           0
sat_faparval_max           0
sat_faparval_mean          0
sat_faparval_median     1400
sat_faparval_std           0
sat_faparval_size          0
sat_faparval_count         0
sat_faparval               0
sat_faparmask_min          0
sat_faparmask_max          0
sat_faparmask_mean         0
sat_faparmask_median       0
sat_faparmask_std          0
sat_faparmask_size         0
dtype: int64

In [88]:
# Summary statistics of the columns whose name starts with 'sat_'.
sat_cols = [name for name in final_df if name.startswith('sat_')]
final_df[sat_cols].describe()

Unnamed: 0,sat_faparval_min,sat_faparval_max,sat_faparval_mean,sat_faparval_median,sat_faparval_std,sat_faparval_size,sat_faparval_count,sat_faparval,sat_faparmask_mean,sat_faparmask_std,sat_faparmask_size
count,1169778.0,1169778.0,1169778.0,1169778.0,1169778.0,1169778.0,1169778.0,1169778.0,1169778.0,1169778.0,1169778.0
mean,0.1685741,0.5764128,0.3013177,0.3824433,0.06649632,580.3085,580.3085,106.4777,0.04531432,0.0586908,580.3085
std,0.1584758,0.2604321,0.2499692,0.2133349,0.05485793,743.6246,743.6246,97.49316,0.153791,0.1271404,743.6246
min,0.0,0.0,0.0,0.0,0.0,182.0,182.0,1.0,0.0,0.0,182.0
25%,0.04,0.42,0.0,0.212,0.0,200.0,200.0,63.0,0.0,0.0,200.0
50%,0.116,0.632,0.301537,0.384,0.06716503,212.0,212.0,85.0,0.0,0.0,212.0
75%,0.272,0.784,0.5098532,0.544,0.1014885,791.0,791.0,117.0,0.001196172,0.03436038,791.0
max,0.9400001,0.9400001,0.9171257,0.936,0.3436576,3647.0,3647.0,1832.0,1.0,0.5,3647.0


In [67]:
final_df.sat_faparval_median.isna().sum()

1400

In [69]:
final_df[final_df.sat_faparval_median.isna()][['s2_cell_id', 'measure_date', 'sat_faparval', 'sat_faparval_median', 'sat_faparmask_median']].head()

Unnamed: 0,s2_cell_id,measure_date,sat_faparval,sat_faparval_median,sat_faparmask_median
375,54c97cc,2016-01-06,1,,True
732,80be284,2016-03-22,1,,1.0
1453,809713c,2017-03-25,1,,1.0
1876,80bfc64,2017-05-09,1,,1.0
1961,80973d4,2018-03-20,1,,1.0


In [91]:
# Same-day weather columns inspected in the following cells.
weather = [
    'wea_air_temp_max',
    'wea_air_temp_mean',
    'wea_precip_accum_max',
    'relative_humidity_max',
    'relative_humidity_min',
    'relative_humidity_mean',
    'wea_wind_speed_max',
    'wea_wind_speed_min',
    'wea_wind_speed_mean',
    'wind_gust_max',
]

In [92]:
final_df[weather].dtypes

wea_air_temp_max          float64
wea_air_temp_mean         float64
wea_precip_accum_max      float64
relative_humidity_max     float64
relative_humidity_min     float64
relative_humidity_mean    float64
wea_wind_speed_max        float64
wea_wind_speed_min        float64
wea_wind_speed_mean       float64
wind_gust_max             float64
dtype: object

In [93]:
final_df[weather].isna().sum()

wea_air_temp_max          0
wea_air_temp_mean         0
wea_precip_accum_max      0
relative_humidity_max     0
relative_humidity_min     0
relative_humidity_mean    0
wea_wind_speed_max        0
wea_wind_speed_min        0
wea_wind_speed_mean       0
wind_gust_max             0
dtype: int64

In [73]:
# Zero-fill every remaining missing value in the frame. This covers the
# 'tl_' columns, sat_faparval_median and wf_wildfire (where a missing value
# marks a non-wildfire row, so it becomes 0).
final_df = final_df.fillna(0)

In [74]:
final_df.head()

Unnamed: 0,s2_cell_id,measure_date,tl_object_id,tl_kv_sort_sum,tl_kv_sort_mean,tl_kv_sort_max,tl_owner,tl_circuit,tl_length_mil_sum,tl_length_mil_mean,...,stid_relative_humidity,relative_humidity_max,relative_humidity_min,relative_humidity_mean,stid_wind_speed,wea_wind_speed_max,wea_wind_speed_min,wea_wind_speed_mean,wea_stid_wind_gust,wind_gust_max
0,54cb13,2018-07-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,FLAC1,67.0,13.0,42.0,FLAC1,3.13,0.9,1.955,FLAC1,7.15
1,54ce7c,2017-09-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,BOLC1,33.0,14.0,23.083333,BOLC1,3.58,0.9,2.4575,BOLC1,6.7
2,54d2fb,2018-07-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,WYTC1,64.0,19.0,39.423611,OBRC1,1.34,0.0,0.355417,WYTC1,3.66
3,54d361,2017-08-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,RLKC1,74.0,11.0,40.833333,RLKC1,1.79,0.0,0.635,RLKC1,5.37
4,808204,2018-07-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,MASC1,34.0,15.0,24.541667,MASC1,7.6,0.0,3.706667,MASC1,10.73


In [54]:
gc.collect()

349

### Getting the new weather data

In [4]:
# Read every 'new_weather' shard and stack them into one frame. The shards
# are collected in a list and concatenated once: DataFrame.append was removed
# in pandas 2.0, and appending inside the loop re-copied the growing frame on
# every iteration.
weather_parts = []
for file in allfiles:
    if file.startswith('new'):
        print("Processing {}".format(file))
        path = '/tf/notebooks/W210/Model/Data/' + file
        weather_parts.append(pd.read_csv(path))
# pd.concat raises on an empty list, so fall back to an empty frame when no
# shard matched (the result the original loop produced in that case).
weather_df = pd.concat(weather_parts) if weather_parts else pd.DataFrame()

Processing new_weather_000000000000


  interactivity=interactivity, compiler=compiler, result=result)


Processing new_weather_000000000002
Processing new_weather_000000000001


In [18]:
weather_df.shape

(11664728, 14)

In [19]:
weather_df.head()

Unnamed: 0,s2_cell_id,measure_date,fuel_percent_ma7,fuel_percent_ma30,fuel_percent_l1,wea_air_temp_mean_l1,wea_precip_accum_max_l1,relative_humidity_mean_l1,wea_air_temp_mean_ma7,wea_precip_accum_max_ma7,relative_humidity_mean_ma7,wea_air_temp_mean_ma30,wea_precip_accum_max_ma30,relative_humidity_mean_ma30
0,54d2af,2017-08-16,84.142857,76.0,112,19.16625,1648.968,58.708333,21.392067,1648.968,48.14881,23.362051,1648.2568,40.119444
1,80c1a4,2017-04-21,119.714286,116.966667,133,10.463333,1005.586,55.291667,11.415298,1005.041714,51.005952,9.535111,1002.529533,50.388889
2,54d361,2016-06-17,109.285714,120.066667,185,10.047083,1665.732,73.166667,13.429464,1664.897429,63.386905,15.630542,1661.778067,62.3125
3,54d1f0c,2017-03-19,127.428571,109.533333,328,9.9525,1307.084,98.958333,10.373122,1282.808857,82.406926,5.127119,1227.387267,87.42472
4,54d28f4,2017-06-01,78.142857,100.5,124,19.974583,2497.328,67.041667,21.39244,2496.021714,60.27381,20.150986,2495.465333,52.051389


In [20]:
weather_df.isna().sum()

s2_cell_id                         0
measure_date                       0
fuel_percent_ma7                   0
fuel_percent_ma30                  0
fuel_percent_l1                10643
wea_air_temp_mean_l1           10643
wea_precip_accum_max_l1        10643
relative_humidity_mean_l1      10643
wea_air_temp_mean_ma7              0
wea_precip_accum_max_ma7           0
relative_humidity_mean_ma7         0
wea_air_temp_mean_ma30             0
wea_precip_accum_max_ma30          0
relative_humidity_mean_ma30        0
dtype: int64

In [21]:
weather_df[weather_df.measure_date=='2016-01-01'].head()

Unnamed: 0,s2_cell_id,measure_date,fuel_percent_ma7,fuel_percent_ma30,fuel_percent_l1,wea_air_temp_mean_l1,wea_precip_accum_max_l1,relative_humidity_mean_l1,wea_air_temp_mean_ma7,wea_precip_accum_max_ma7,relative_humidity_mean_ma7,wea_air_temp_mean_ma30,wea_precip_accum_max_ma30,relative_humidity_mean_ma30
528,8086b44,2016-01-01,78.0,78.0,,,,,5.21625,423.672,80.423611,5.21625,423.672,80.423611
569,809b264,2016-01-01,78.0,78.0,,,,,0.460833,91.948,67.315694,0.460833,91.948,67.315694
1834,8094a1,2016-01-01,78.0,78.0,,,,,3.304947,0.0,80.964413,3.304947,0.0,80.964413
1971,54ceb24,2016-01-01,78.0,78.0,,,,,-17.409167,328.93,81.833333,-17.409167,328.93,81.833333
2044,808389c,2016-01-01,73.0,73.0,,,,,2.22125,0.254,36.666667,2.22125,0.254,36.666667


In [22]:
weather_df.shape

(11664728, 14)

#### Replace missing values for the lag variables

In [5]:
# The lag-1 columns are missing on the first date (2016-01-01). Fill them,
# per cell, with the lag-1 values recorded on 2016-01-02.
lag_cols = ['fuel_percent_l1', 'wea_air_temp_mean_l1', 'wea_precip_accum_max_l1', 'relative_humidity_mean_l1']
is_first_day = weather_df.measure_date == '2016-01-01'
weather_temp1 = weather_df[is_first_day].drop(columns=lag_cols)
weather_temp2 = weather_df[weather_df.measure_date == '2016-01-02'][['s2_cell_id'] + lag_cols]
# Inner join: a cell without a 2016-01-02 row would be dropped here, so the
# shape displayed below should match the shape before this cell.
weather_temp1 = weather_temp1.merge(weather_temp2, on='s2_cell_id')
# Restore the original column order before stacking.
weather_temp1 = weather_temp1[weather_df.columns]
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
weather_df = pd.concat([weather_df[~is_first_day], weather_temp1])
weather_df.shape

(11664728, 14)

In [26]:
weather_df.isna().sum()

s2_cell_id                     0
measure_date                   0
fuel_percent_ma7               0
fuel_percent_ma30              0
fuel_percent_l1                0
wea_air_temp_mean_l1           0
wea_precip_accum_max_l1        0
relative_humidity_mean_l1      0
wea_air_temp_mean_ma7          0
wea_precip_accum_max_ma7       0
relative_humidity_mean_ma7     0
wea_air_temp_mean_ma30         0
wea_precip_accum_max_ma30      0
relative_humidity_mean_ma30    0
dtype: int64

In [6]:
del weather_temp1, weather_temp2

In [7]:
gc.collect()

49

In [8]:
# final_df.drop(columns=[ 'wea_air_temp_mean_ma7',
#  'wea_precip_accum_max_ma7',
#  'relative_humidity_mean_ma7',
#  'fuel_percent_l1',
#  'wea_air_temp_mean_l1',
#  'wea_precip_accum_max_l1',
#  'relative_humidity_mean_l1'], inplace=True)

In [20]:
merge_keys = ['s2_cell_id', 'measure_date']
# Drop any weather_df columns already present in final_df (for example from an
# earlier run of this cell) so the merge never produces '_x'/'_y' suffixed
# duplicates and the cell can be re-run safely.
stale_cols = [col for col in weather_df.columns
              if col in final_df.columns and col not in merge_keys]
# Inner join: rows without a matching (s2_cell_id, measure_date) in weather_df
# are dropped.
final_df = final_df.drop(columns=stale_cols).merge(weather_df, on=merge_keys)
# Values that cannot be parsed as numbers become NaN.
final_df['fuel_percent_l1'] = pd.to_numeric(final_df['fuel_percent_l1'], errors = 'coerce')
final_df['fuel_percent'].mean(), final_df['fuel_percent_l1'].mean()

(78.79543935686944, 78.8062260156661)

In [21]:
# Fill the lag-1 fuel values that are still missing with a fixed value
# (78.8, close to the column means displayed by the merge cell).
fuel_l1_fill = 78.8
final_df['fuel_percent_l1'] = final_df['fuel_percent_l1'].fillna(fuel_l1_fill)

In [23]:
# Write the merged sample to disk without the index column.
output_path = './consolidated_4_10pct_sample.csv'
final_df.to_csv(output_path, index=False)

In [None]:
# final_df = pd.read_csv('./consolidated_4_10pct_sample.csv')

In [34]:
list(weather_df)

['s2_cell_id',
 'measure_date',
 'fuel_percent_ma7',
 'fuel_percent_ma30',
 'fuel_percent_l1',
 'wea_air_temp_mean_l1',
 'wea_precip_accum_max_l1',
 'relative_humidity_mean_l1',
 'wea_air_temp_mean_ma7',
 'wea_precip_accum_max_ma7',
 'relative_humidity_mean_ma7',
 'wea_air_temp_mean_ma30',
 'wea_precip_accum_max_ma30',
 'relative_humidity_mean_ma30']

In [12]:
final_df.isna().sum().sum()

0

In [15]:
# Column groups used to select features from final_df.

# 'tl_' columns.
tl_features = ['tl_object_id', 'tl_kv_sort_sum']

# Wildfire counts for each of the previous five years.
wf_features = [f'wf_count_{years}yr_ago' for years in range(1, 6)]

# 'sat_' columns. The two median columns are deliberately left out
# (NOTE(review): presumably because of the missing values / mixed dtype seen
# in the checks above; confirm).
sat_features = [
    'sat_faparval_min',
    'sat_faparval_max',
    'sat_faparval_mean',
    # 'sat_faparval_median',
    'sat_faparval_std',
    'sat_faparval_size',
    'sat_faparval_count',
    'sat_faparval',
    'sat_faparmask_min',
    'sat_faparmask_max',
    'sat_faparmask_mean',
    # 'sat_faparmask_median',
    'sat_faparmask_std',
    'sat_faparmask_size',
]

# Same-day weather columns plus their 7-day moving averages and 1-day lags.
wea_features = [
    'wea_air_temp_max',
    'wea_air_temp_mean',
    'wea_precip_accum_max',
    'relative_humidity_max',
    'relative_humidity_min',
    'relative_humidity_mean',
    'wea_wind_speed_max',
    'wea_wind_speed_min',
    'wea_wind_speed_mean',
    'wind_gust_max',
    'wea_air_temp_mean_ma7',
    'wea_precip_accum_max_ma7',
    'relative_humidity_mean_ma7',
    'wea_air_temp_mean_l1',
    'wea_precip_accum_max_l1',
    'relative_humidity_mean_l1',
]

# Fuel moving averages / lag and the 30-day weather moving averages.
new_features = [
    'fuel_percent_ma7',
    'fuel_percent_ma30',
    'fuel_percent_l1',
    'wea_air_temp_mean_ma30',
    'wea_precip_accum_max_ma30',
    'relative_humidity_mean_ma30',
]

In [22]:
# Column means of all selected model features.
summary_cols = wea_features + tl_features + sat_features + ['fuel_percent'] + new_features
final_df[summary_cols].mean()

wea_air_temp_max                20.899048
wea_air_temp_mean               13.745029
wea_precip_accum_max           454.564976
relative_humidity_max           75.537893
relative_humidity_min           32.317954
relative_humidity_mean          56.507760
wea_wind_speed_max               4.607323
wea_wind_speed_min               0.564945
wea_wind_speed_mean              2.114124
wind_gust_max                    8.257752
wea_air_temp_mean_ma7           13.713285
wea_precip_accum_max_ma7       453.093549
relative_humidity_mean_ma7      56.452337
wea_air_temp_mean_l1            13.679229
wea_precip_accum_max_l1        453.551936
relative_humidity_mean_l1       56.415700
tl_object_id                     1.884175
tl_kv_sort_sum                 288.744253
sat_faparval_min                 0.168574
sat_faparval_max                 0.576413
sat_faparval_mean                0.301318
sat_faparval_std                 0.066496
sat_faparval_size              580.308453
sat_faparval_count             580