In [118]:
import glob
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
import os
import cloudstorage as gcs
from joblib import dump,load

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score,recall_score,precision_score
from google.cloud import bigquery

#### [Minimize Usage to save Costs] Pull the data from GCP


In [None]:
# Takes too long dont use
# %%bigquery wildfire_data
# SELECT *
# FROM `wildfire-242801.WildFire.consolidated_by_cell_day` 
# WHERE measure_date < '2017-01-01'

In [2]:
path = r'../../full_dataset' # use your path
all_files = glob.glob(path + "/*.csv")

li = []
i = 0
for filename in all_files:
    print(filename)
    if i == 0:
        wildfire_data = pd.read_csv(filename)
    else:
        wildfire_data = wildfire_data.append(pd.read_csv(filename))
    i += 1
    print(wildfire_data.shape)
 



../../full_dataset/consolidated_data000000000001.csv
(1166860, 137)
../../full_dataset/consolidated_data000000000004.csv
(2333059, 137)
../../full_dataset/consolidated_data000000000007.csv
(3500338, 137)
../../full_dataset/consolidated_data000000000006.csv
(4667345, 137)
../../full_dataset/consolidated_data000000000008.csv
(5833798, 137)
../../full_dataset/consolidated_data000000000005.csv
(6999750, 137)
../../full_dataset/consolidated_data000000000000.csv
(8166773, 137)
../../full_dataset/consolidated_data000000000002.csv
(9332797, 137)
../../full_dataset/consolidated_data000000000003.csv
(10498695, 137)
../../full_dataset/consolidated_data000000000009.csv
(11664728, 137)


In [3]:
fuel_mean = 78.74
wildfire_data.fuel_percent.replace('backfill',str(fuel_mean),inplace = True)
wildfire_data['fuel_percent'] = pd.to_numeric(wildfire_data.fuel_percent)

train = wildfire_data[wildfire_data.measure_date < '2018-01-01'].copy()
test = wildfire_data[wildfire_data.measure_date >= '2018-01-01'].copy()

y_train = train.wf_wildfire.fillna(0).copy().values
y_test = test.wf_wildfire.fillna(0).copy().values

feature_cols = ['tl_object_id','fuel_percent',
                'wea_air_temp_max', 'wea_air_temp_mean', 
                'wea_precip_accum_max', 'wea_relative_humidity_max',
                'wea_relative_humidity_min', 'wea_relative_humidity_mean',
                'wea_wind_speed_max', 'wea_wind_speed_min', 'wea_wind_speed_mean',
                'wea_wind_gust_max']

train = train[feature_cols]
test = test[feature_cols]

train = train.fillna(0)
test = test.fillna(0)

train['fuel_percent'] = pd.to_numeric(train.fuel_percent)
test['fuel_percent'] = pd.to_numeric(test.fuel_percent)

del wildfire_data
gc.collect()

28

In [131]:
# Preprocessing 
scaler = MinMaxScaler()

# Initialize the model with defaults
lr = LogisticRegression()
lr.fit(scaler.fit_transform(train),y_train)

# lr = load('../../wildfire_lr.joblib')


# Predictions
predictions = lr.predict_proba(scaler.transform(test))
# predictions = lr.predict_proba(test)


# Custom classification threshold
pred = np.zeros(len(predictions))
pred[predictions[:,1] > .0007] = 1
predictions = pred


# Accuracy Metrics
f1 = f1_score(y_test.values.tolist(),pred)
precision = precision_score(y_test.values,pred)
recall = recall_score(y_test.values,pred)

FP = np.sum((y_test.values != predictions) & (predictions == 1))
FN = np.sum((y_test.values != predictions) & (predictions == 0))
TP = np.sum((y_test.values == predictions) & (predictions == 1))
TN = np.sum((y_test.values == predictions) & (predictions == 0))


# Results
print('Acc :', (TP + TN) / (TP + TN + FP + FN))
print('FP :', FP)
print('TP :', TP)
print('FN :', FN)
print('TN :', TN)

print('Pred_pos :',np.sum(predictions == 1))
print('Pred_neg :',np.sum(predictions == 0))
print('Total_pos :',np.sum(y_test.values == 1))
print('Total_neg :',np.sum(y_test.values == 0))

print('Precision : {} \nRecall : {} \nF1 {}'.format(precision,recall,f1))

# # dump(lr, ('../../wildfire_lr.joblib'))


Acc : 0.9439464359492831
FP : 216670
TP : 161
FN : 1081
TN : 3666783
Pred_pos : 216831
Pred_neg : 3667864
Total_pos : 1242
Total_neg : 3883453
Precision : 0.0007425137549520133 
Recall : 0.12962962962962962 
F1 0.0014765697725073715


In [132]:
for i_iter in range(len(test.columns)):
    print(test.columns[i_iter], lr.coef_[0][i_iter])

tl_object_id 3.2730610039769985
fuel_percent -1.173877748209051
wea_air_temp_max -0.08977758312553644
wea_air_temp_mean -0.14209920283792138
wea_precip_accum_max -0.9403963811860762
wea_relative_humidity_max 2.1871261109047917
wea_relative_humidity_min -10.00713375567121
wea_relative_humidity_mean -5.424576105020377
wea_wind_speed_max 0.03227479759846397
wea_wind_speed_min 0.49748645430157656
wea_wind_speed_mean -1.666761113271551
wea_wind_gust_max 1.3010496925961077


In [61]:
#Save the model
# dump(lr, ('../../wildfire_lr.joblib'))

3883453
