In [97]:
import cloudstorage as gcs
import glob
import gc
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas_gbq
import pandas as pd
import time
import tensorflow as tf
import geopandas as gpd

from tensorflow.keras import layers
from joblib import dump,load
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score,recall_score,precision_score
from google.cloud import bigquery

In [104]:
# Load each csv into a pandas DF

def load_data(path=None, max_files=1):
    """Loads the csvs into a single pandas df

    Kwargs:
        path - str - directory containing the csv data files; falls back to
            '../../full_dataset' when falsy
        max_files - int or None - maximum number of csv files to read, taken
            in sorted filename order. The default of 1 keeps the historical
            behaviour of this function (it used to stop after the first
            file); pass None to read every file.
    Returns:
        wildfire_data - Pandas DF - rows of every file read, re-indexed 0..n-1
    Raises:
        ValueError - if max_files is given and is smaller than 1
        FileNotFoundError - if no csv file is found under `path`
    """

    if not path:
        path = r'../../full_dataset' # use your path

    if max_files is not None and max_files < 1:
        raise ValueError('max_files must be None or >= 1, got {}'.format(max_files))

    # glob order is filesystem dependent; sort so "the first file" is stable
    all_files = sorted(glob.glob(path + "/*.csv"))
    if not all_files:
        raise FileNotFoundError('No csv files found in {}'.format(path))
    if max_files is not None:
        all_files = all_files[:max_files]

    # Collect the frames and concatenate once: DataFrame.append re-copied the
    # accumulated data on every iteration and no longer exists in pandas 2.x.
    frames = []
    for filename in all_files:
        print(filename)
        frame = pd.read_csv(filename)
        print(frame.shape)
        frames.append(frame)

    # ignore_index=True gives the same fresh RangeIndex reset_index(drop=True) did
    wildfire_data = pd.concat(frames, ignore_index=True)

    return wildfire_data

def preprocess_dataset(wildfire_data,downsample_size=None,random_state=None):
    """Fills in NA's, creates train/test split, removes unused cols

    Note: the `fuel_percent` column of the DataFrame passed in is converted
    to numeric in place (as it always was), so the caller's frame is modified.

    Args :
        wildfire_data - Pandas DF - consolidated dataset
        downsample_size - float - fraction of the negative (no wildfire)
            training rows to keep; falsy keeps every training row
        random_state - int or None - seed forwarded to the negative-row
            sampling so a downsampled split can be reproduced
    Returns :
        train - Pandas DF - input data dated before 2018-01-01
        test - Pandas DF - input data dated 2018-01-01 or later
        y_train - numpy array - labels for train (values > 0 mapped to 1)
        y_test - numpy array - labels for test (values > 0 mapped to 1)
        indexer - Pandas DF - s2_cell_id / measure_date of every input row,
            used to look up which cell/date a prediction index refers to
    """

    fuel_mean = 78.74 # Calculated in BigQuery
    # Assign the result back instead of calling replace(..., inplace=True) on
    # an attribute-accessed column, which is not guaranteed to reach the frame.
    wildfire_data['fuel_percent'] = pd.to_numeric(
        wildfire_data['fuel_percent'].replace('backfill', str(fuel_mean)))

    # DF used to resolve which date/s2 cell each prediction corresponds to
    indexer = wildfire_data[['s2_cell_id','measure_date']].copy()

    # measure_date is compared against a date string; rows with a missing
    # date satisfy neither comparison and land in neither split.
    train = wildfire_data[wildfire_data.measure_date < '2018-01-01'].copy()
    test = wildfire_data[wildfire_data.measure_date >= '2018-01-01'].copy()

    if downsample_size:
        # Keep a random fraction of the negatives and every positive row
        negatives = train[train.wf_wildfire.fillna(0) == 0]
        positives = train[train.wf_wildfire > 0]
        train = pd.concat([
            negatives.sample(frac=downsample_size, random_state=random_state),
            positives])
        gc.collect()

    # np.array() always copies, so the label arrays are writable no matter
    # how pandas hands out its underlying buffers.
    y_train = np.array(train.wf_wildfire.fillna(0))
    y_test = np.array(test.wf_wildfire.fillna(0))

    # Collapse any positive label value into the binary class 1
    y_train[y_train > 0] = 1
    y_test[y_test > 0] = 1

    # TODO identify any additional/engineered features to include
    feature_cols = ['tl_object_id','fuel_percent',
                    'wea_air_temp_max', 'wea_air_temp_mean',
                    'wea_precip_accum_max', 'relative_humidity_max',
                    'relative_humidity_min', 'relative_humidity_mean',
                    'wea_wind_speed_max', 'wea_wind_speed_min', 'wea_wind_speed_mean',
                    'wind_gust_max',
                    'sat_faparval_min',
                    'sat_faparval_max', 'sat_faparval_mean',
                    'sat_faparval_median'
                   ]

    # fuel_percent is already numeric at this point, so no second conversion
    # is needed after selecting the feature columns.
    train = train[feature_cols].fillna(0)
    test = test[feature_cols].fillna(0)

    # Drops only this function's reference; the caller's frame stays alive
    # for as long as the caller holds it.
    del wildfire_data
    gc.collect()

    return train,test,y_train,y_test,indexer

def train_models(models,train_data,train_labels,scaler=None,save=True):
    """Given a list of Sklearn models returns a list of trained models

    Args:
        models - list of Sklearn model objects - models to be trained
        train_data - Pandas DF - preprocessed training data
        train_labels - Numpy Array - training data labels
        scaler - Sklearn Scaler Object - [Optional] fitted on train_data and
            used to transform it before training
        save - Bool - if true, fits every model and writes each one to
            'wildfire_<ClassName>.joblib'. If false, nothing is trained:
            `models`, `train_data`, `train_labels` and `scaler` are ignored
            and every '*.joblib' file in the working directory is loaded
            and returned instead.
    Returns:
        list of fitted (or loaded) model objects

    Note: the file name is derived from the class name only, so two models
    of the same class overwrite each other's file.
    """
    trained_models = []
    model_path = 'wildfire_{}.joblib'
    if save == False:
        # joblib files are pickle-based: only load files from a trusted source
        return [load(model_file) for model_file in glob.glob('*.joblib')]

    # The scaled training matrix is the same for every model, so fit and
    # transform once instead of once per model.
    features = scaler.fit_transform(train_data) if scaler else train_data

    for model in models:
        # Assigned on every path; the unscaled path used to reach the dump()
        # call below without ever defining model_name.
        model_name = model.__class__.__name__
        print('Training -',model_name)
        t0 = time.time()

        trained_models.append(model.fit(features,train_labels))

        dump(trained_models[-1], (model_path.format(model_name)))
        print('Training time -',str(round(time.time() - t0,2))+'s')

    return trained_models
    

def make_probability_predictions(model,test_data,scaler=None):
    """Scores the validation data with a single trained model

    Args:
        model - fitted model object - a model whose class is named
            'IsolationForest' is scored with decision_function(); any other
            model is scored with predict_proba()
        test_data - Pandas DF - preprocessed validation data
        scaler - fitted Scaler object - [Optional] applied to test_data
            through its transform() method before scoring
    Returns:
        whatever the model's scoring method returns.
        NOTE(review): decision_function() output is an anomaly score, not a
        probability - confirm that downstream thresholding accounts for that.
    """

    features = scaler.transform(test_data) if scaler else test_data

    if model.__class__.__name__ == 'IsolationForest':
        # Score with the model that was passed in; this used to reach for a
        # notebook-level variable named `ifc` instead.
        return model.decision_function(features)
    return model.predict_proba(features)

def binarize_predictions(predictions,threshold=.5):
    """Implements a custom classification threshold

    Args:
        predictions - array-like - one of
            * shape (n,)   : one score per sample
            * shape (n, 1) : one score per sample in a single column
            * shape (n, 2) : two columns per sample; column 1 is thresholded
        threshold - float - scores strictly greater than this become 1
    Returns:
        Numpy Array - float array of shape (n,) holding 0.0 / 1.0
    Raises:
        ValueError - if predictions has any other shape
    """

    scores = np.asarray(predictions)

    if scores.ndim == 2:
        n_cols = scores.shape[1]
        if n_cols == 2:
            scores = scores[:, 1]
        elif n_cols == 1:
            # Single-column output; this used to come back as all zeros
            scores = scores[:, 0]
        else:
            raise ValueError(
                'Expected 1 or 2 prediction columns, got {}'.format(n_cols))
    elif scores.ndim != 1:
        raise ValueError(
            'Expected 1-D or 2-D predictions, got {} dimensions'.format(scores.ndim))

    # Explicit shape handling replaces the bare try/except, which hid real
    # indexing errors behind the 1-D fallback.
    return (scores > threshold).astype(float)

def model_accuracies(y_test,predictions, model=None):
    """Calculates and prints a variety of accuracy metrics

    Args:
        y_test - Numpy Array - true labels
        predictions - Numpy Array - binarized (0,1) model outputs
        model - Sklearn Classifier - [Optional] only its class name is used,
            as the heading of the printed report ('NoneType' when omitted)
    Returns:
        dict - the printed figures, keyed 'accuracy', 'precision', 'recall',
            'f1', 'TP', 'FP', 'FN', 'TN'
    """
    # Plain lists would make the element-wise comparisons below collapse into
    # a single bool, so normalise both inputs to arrays first.
    y_test = np.asarray(y_test)
    predictions = np.asarray(predictions)

    # # Accuracy Metrics
    f1 = f1_score(y_test,predictions)
    precision = precision_score(y_test,predictions)
    recall = recall_score(y_test,predictions)

    # Confusion-matrix counts
    mismatch = y_test != predictions
    FP = np.sum(mismatch & (predictions == 1))
    FN = np.sum(mismatch & (predictions == 0))
    TP = np.sum(~mismatch & (predictions == 1))
    TN = np.sum(~mismatch & (predictions == 0))

    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total else float('nan')

    # Results
    print('\n', model.__class__.__name__)
    print('Acc :', accuracy)
    print('FP :', FP)
    print('TP :', TP)
    print('FN :', FN)
    print('TN :', TN)

    print('Pred_pos :',np.sum(predictions == 1))
    print('Pred_neg :',np.sum(predictions == 0))
    print('Total_pos :',np.sum(y_test == 1))
    print('Total_neg :',np.sum(y_test == 0))

    print('Precision : {} \nRecall : {} \nF1 {}'.format(precision,recall,f1))

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall,
            'f1': f1, 'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN}
        
def run_pipeline(model_list,scaler=None,downsample_size=.01,save=False):
    """Function to call each individual pipeline step

    Loads and preprocesses the data, obtains the models, scores the test
    split with each of them at a 0.5 threshold and prints the metrics.

    Args:
        model_list - list of Sklearn model objects - handed to train_models
        scaler - Sklearn Scaler Object - [Optional] forwarded to training and
            to prediction (it used to be accepted but never used)
        downsample_size - float - fraction of negative training rows to keep
        save - Bool - forwarded to train_models. The default False keeps the
            existing behaviour, in which train_models loads previously saved
            '*.joblib' models rather than fitting model_list.
    Returns:
        prediction_list - list of Numpy Arrays - binarized test predictions,
            one array per model
    """
    train,test,y_train,y_test,indexer = preprocess_dataset(load_data(),downsample_size)
    trained_models = train_models(model_list,train,y_train,scaler,save=save)

    if scaler and save == False:
        # train_models does not touch the scaler when it only loads models,
        # so fit it here before it is used to transform the test data.
        # NOTE(review): this assumes the saved models were trained on data
        # scaled the same way - confirm before passing a scaler in this mode.
        scaler.fit(train)

    prediction_list = []
    for model in trained_models:
        scores = make_probability_predictions(model,test,scaler)
        predictions = binarize_predictions(scores,.5)
        prediction_list.append(predictions)
        model_accuracies(y_test,predictions,model)
    return prediction_list


In [102]:
# Load the California S2-cell shapes and reproject them to EPSG:4326.
ca_df = gpd.read_file("../Data/Processed/CA_S2Cells/CA_S2Cells.shp")
# epsg= replaces the {'init': 'epsg:4326'} dict form, which is deprecated
ca_df = ca_df.to_crs(epsg=4326)
# The shapefile column is named 'S2_Cells_I'; restore the full name used as
# the join key below.
ca_df = ca_df.rename(columns={'S2_Cells_I': 'S2_Cells_ID'})
ca_df.shape

selected_date = '2018-11-08'

# NOTE(review): ca_wf and the four mlp_*_probs frames are not created in any
# visible cell - they must already exist in the kernel for this cell to run.
wildfires_on_date = ca_wf[ca_wf.WF_ALARM_DATE_DT_DT==selected_date][['S2_Cells_ID','WF_WildFire']]

# Left join keeps every cell, including those without a wildfire record
ca_preds_df = ca_df.merge(wildfires_on_date, on='S2_Cells_ID', how='left')

# Attach each model's scores for the selected date, in the original order
for probs in (mlp_adam_noprecip_cw2000_probs,
              mlp_adam_precip_cw2000_probs,
              mlp_adam_noprecip_cwEQ_ext_probs,
              mlp_adam_precip_cwEQ_ext_probs):
    ca_preds_df = ca_preds_df.merge(
        probs[probs.Date==selected_date].drop(columns='Date'), on='S2_Cells_ID')

ca_preds_df = ca_preds_df.fillna(0)
ca_preds_df.shape

DriverError: ../Data/Processed/CA_S2Cells/CA_S2Cells.shp: No such file or directory

In [95]:
# Run a garbage-collection pass; as the cell's only expression, the value
# returned by gc.collect() is what the cell displays.
gc.collect()

0

In [None]:
gc.collect()
# Models to try
# NOTE(review): this scaler is created but not passed to run_pipeline() below.
scaler = MinMaxScaler()

# Initialize the models (only IsolationForest overrides a default: contamination)
lr = LogisticRegression()
ifc = IsolationForest(contamination=.001)
rf = RandomForestClassifier()

# add any additional models to model_list
# NOTE(review): run_pipeline() calls train_models(..., save=False), and in that
# mode train_models loads '*.joblib' files rather than fitting model_list -
# confirm that these freshly created models are meant to be used.
model_list = [lr,ifc,rf]
predictions = run_pipeline(model_list)

# Check column names and types
# for col in wildfire_data.columns:
#     print('Name :', col,'dtyp : ', wildfire_data[col].dtype)

../../full_dataset/consolidated-data-000000000000.csv


In [None]:
# TODO: unfinished cell - a "for model in ..." loop was started here but its
# iterable and body were never written. Left as a comment so that running all
# cells does not stop on a SyntaxError.

In [32]:
# NOTE(review): no visible cell assigns a notebook-level `train` (the only
# such assignment is commented out), so this relies on leftover kernel state.
print(train.shape)

(488051, 16)


In [83]:
gc.collect()

# NOTE(review): train_model() below reads train/test/y_train/y_test as
# notebook-level variables; the line that would create them is commented out.
# train,test,y_train,y_test,indexer = preprocess_dataset(load_data(),.01)
def train_model(num_layers,num_epochs=1,dropout_rate=.2):
    """Builds, trains and evaluates a small Conv1D network

    Reads the notebook-level variables `train`, `test`, `y_train` and
    `y_test`; they must exist before this function is called.

    Args:
        num_layers - int - number of Conv1D layers in total; the first one
            is always added, followed by num_layers - 1 Conv1D + Dropout pairs
        num_epochs - int - number of training epochs
        dropout_rate - float - rate used by every Dropout layer
    Returns:
        Numpy Array - the model's predictions for test.values
    Raises:
        ValueError - if the number of feature columns is odd (the features
            are reshaped into two rows of equal length)
    """
    input_dim = train.shape[1]
    if input_dim % 2:
        raise ValueError(
            'train_model needs an even number of features, got {}'.format(input_dim))

    model = tf.keras.Sequential()
    model.add(layers.Dense(input_dim, activation='relu'))
    # Two rows of input_dim / 2 values: (2, 8) for the 16 feature columns,
    # no longer hard-coded to that width.
    model.add(layers.Reshape((2,input_dim // 2)))
    model.add(layers.Conv1D(32,2, activation='relu'))

    for _ in range(num_layers -1):
        model.add(layers.Conv1D(32,1, activation='relu'))
        model.add(layers.Dropout(rate=dropout_rate))

    model.add(layers.Flatten())
    # sigmoid, not softmax: softmax over a single unit is always exactly 1.0,
    # so the network could never output anything but the positive class.
    model.add(layers.Dense(1, activation = 'sigmoid'))

    # Keras optimizer in place of the TF1-only tf.train.AdamOptimizer
    model.compile(optimizer=tf.keras.optimizers.Adam(.01),
                 loss='binary_crossentropy',
                 metrics=[tf.keras.metrics.Precision(),
                          tf.keras.metrics.Recall()])

    model.fit(train.values,y_train, epochs=num_epochs,
              validation_data=(test.values,y_test))

    print(model.evaluate(test.values,y_test))
    # predict() in place of Sequential.predict_proba(), which newer Keras
    # versions no longer provide.
    return model.predict(test.values)

In [84]:
# Train with num_layers=5 for num_epochs=10 and keep the test-set predictions
predictions1 = train_model(5,10)#.flatten()

# Histogram of the predicted values
plt.hist(predictions1)

# As the last expression, the flattened prediction array is the cell's output
predictions1.flatten()
# model_accuracies(y_test,predictions1, model)

Train on 5023 samples, validate on 243449 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: 

In [None]:
 # NOTE(review): `model` is not assigned at notebook level in any visible cell
 # (train_model keeps its model local), and model_accuracies documents its
 # second argument as binarized (0,1) outputs - confirm model.predict() gives that.
 model_accuracies(y_test,model.predict(test.values))