# Process WFABBA FIgLib Join

<b>Summary:</b><br>
Reads in parsed WFABBA data from csv files created from 1_process_wfabba_merge_files.ipynb (WFABBA GOES-16 & WFABBA GOES-17 detections), smokeynet_test.json & smokeynet_valid.json (SmokeyNet predictions), and camera_metadata_hpwren.csv (contains locations of camera stations associated with SmokeyNet predictions). Join SmokeyNet detections with camera metadata to associate coordinates with every SmokeyNet prediction. For every camera station, join SmokeyNet predictions with potential WFABBA GOES-16 and WFABBA GOES-17 detections, then output results to csv files.<br>

- Read in parsed WFABBA data (outputted from 1_process_wfabba_merge_files.ipynb), SmokeyNet predictions, and camera metadata.
- Join SmokeyNet predictions with potential WFABBA GOES-16/GOES-17 detections.
- Output results to csv files

<b>Output:</b><br>
../..<br>
└── data<br>
&emsp;&emsp;&emsp;└── processed<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── \<CAMERA_STATION_NAME\>_all_hard_voting_35.csv<br>

<b>Areas for Improvement:</b><br>
Further work is needed on how SmokeyNet detections are joined with WFABBA GOES-16/GOES-17 detections. The current approach joins SmokeyNet predictions with WFABBA detections by location proximity (default radius of 35 miles), by camera direction, and by whether the detections fall on the exact same minute. If the current join implementation is kept, temporal (time-window) joins may need to be explored. 
An alternative join approach to consider: each image is currently treated as an independent event, but it may be better to treat a group of images as a single event, e.g. by finding the first instance of a SmokeyNet, WFABBA GOES-16, or WFABBA GOES-17 detection.

In [44]:
import pandas as pd
import urllib.request
import datetime as dt
import requests
import os
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from haversine import haversine, Unit
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
import pytz
from sklearn.metrics import accuracy_score
import numpy as np
from prettytable import PrettyTable
from tqdm import tqdm
pd.set_option('display.max_columns', None)

## 1) Read in WFABBA Data and Consolidate into WFABBA GOES-16/GOES-17

In [45]:
# define the processed (WFABBA) and raw data directories as relative paths
processed_data_dir = "../../data/processed/wfabba/"
raw_data_dir = "../../data/raw/"

In [46]:
# Load the GOES-16 WFABBA tables, one csv per period, from processed_data_dir.
(
    wfabba_goes_16_2019_df,
    wfabba_goes_16_2020_df,
    wfabba_goes_16_jan_2021_df,
    wfabba_goes_16_2021_df,
    wfabba_goes_16_2022_df,
) = [
    pd.read_csv(f"{processed_data_dir}GOES-16-{period}.csv")
    for period in ("2019", "2020", "Jan-2021", "2021", "2022")
]

In [47]:
# Drop the columns that are not needed downstream, including those that hold
# the same value in every row or are entirely NaN.
goes_16_unneeded_columns = [
    "Unnamed: 0", "Algorithm", "Instrument", "DataSource", "DataCreationTimestamp",
    "NavProjSubPtLong", "ActualSatSubPtLong", "NumFire", "Line", "Element",
]
wfabba_goes_16_2019_df = wfabba_goes_16_2019_df.drop(columns=goes_16_unneeded_columns)
wfabba_goes_16_2020_df = wfabba_goes_16_2020_df.drop(columns=goes_16_unneeded_columns)
wfabba_goes_16_jan_2021_df = wfabba_goes_16_jan_2021_df.drop(columns=goes_16_unneeded_columns)
wfabba_goes_16_2021_df = wfabba_goes_16_2021_df.drop(columns=goes_16_unneeded_columns)  # 2021 detections
wfabba_goes_16_2022_df = wfabba_goes_16_2022_df.drop(columns=goes_16_unneeded_columns)  # 2022 detections

In [48]:
# January 2021 is already covered by wfabba_goes_16_jan_2021_df, so keep only the
# rows from 2021-02-01 onwards; the row count is printed before and after.
print(len(wfabba_goes_16_2021_df))
goes_16_from_february = wfabba_goes_16_2021_df["Timestamp"] >= "2021-02-01"
wfabba_goes_16_2021_df = (
    wfabba_goes_16_2021_df[goes_16_from_february]
    .reset_index()
    .drop(columns=["index"])
)
print(len(wfabba_goes_16_2021_df))
# wfabba_goes_16_2021_df

93784
92739


In [49]:
# Stack the per-period GOES-16 tables into wfabba_goes_16_df, parse the
# timestamps as UTC, then renumber the rows.
goes_16_parts = [
    wfabba_goes_16_2019_df,
    wfabba_goes_16_2020_df,
    wfabba_goes_16_jan_2021_df,
    wfabba_goes_16_2021_df,
    wfabba_goes_16_2022_df,
]
wfabba_goes_16_df = pd.concat(goes_16_parts)
wfabba_goes_16_df["timestamp_converted"] = pd.to_datetime(
    wfabba_goes_16_df["Timestamp"],
    infer_datetime_format=True,
    origin="unix",
    utc=True,
)
wfabba_goes_16_df = wfabba_goes_16_df.reset_index().drop(columns=["index"])
# wfabba_goes_16_df

In [50]:
# Load the GOES-17 WFABBA tables, one csv per period, from processed_data_dir.
(
    wfabba_goes_17_2019_df,
    wfabba_goes_17_2020_df,
    wfabba_goes_17_jan_2021_df,
    wfabba_goes_17_2021_df,
    wfabba_goes_17_2022_df,
) = [
    pd.read_csv(f"{processed_data_dir}GOES-17-{period}.csv")
    for period in ("2019", "2020", "Jan-2021", "2021", "2022")
]

In [51]:
# Drop the columns that are not needed downstream, including those that hold
# the same value in every row or are entirely NaN.
goes_17_unneeded_columns = [
    "Unnamed: 0", "Algorithm", "Instrument", "DataSource", "DataCreationTimestamp",
    "NavProjSubPtLong", "ActualSatSubPtLong", "NumFire", "Line", "Element",
]
wfabba_goes_17_2019_df = wfabba_goes_17_2019_df.drop(columns=goes_17_unneeded_columns)
wfabba_goes_17_2020_df = wfabba_goes_17_2020_df.drop(columns=goes_17_unneeded_columns)
wfabba_goes_17_jan_2021_df = wfabba_goes_17_jan_2021_df.drop(columns=goes_17_unneeded_columns)
wfabba_goes_17_2021_df = wfabba_goes_17_2021_df.drop(columns=goes_17_unneeded_columns)  # 2021 detections
wfabba_goes_17_2022_df = wfabba_goes_17_2022_df.drop(columns=goes_17_unneeded_columns)  # 2022 detections

In [52]:
# January 2021 is already covered by wfabba_goes_17_jan_2021_df, so keep only the
# rows from 2021-02-01 onwards; the row count is printed before and after.
print(len(wfabba_goes_17_2021_df))
goes_17_from_february = wfabba_goes_17_2021_df["Timestamp"] >= "2021-02-01"
wfabba_goes_17_2021_df = (
    wfabba_goes_17_2021_df[goes_17_from_february]
    .reset_index()
    .drop(columns=["index"])
)
print(len(wfabba_goes_17_2021_df))
# wfabba_goes_17_2021_df

376689
334881


In [53]:
# Stack the per-period GOES-17 tables into wfabba_goes_17_df, parse the
# timestamps as UTC, then renumber the rows.
goes_17_parts = [
    wfabba_goes_17_2019_df,
    wfabba_goes_17_2020_df,
    wfabba_goes_17_jan_2021_df,
    wfabba_goes_17_2021_df,
    wfabba_goes_17_2022_df,
]
wfabba_goes_17_df = pd.concat(goes_17_parts)
wfabba_goes_17_df["timestamp_converted"] = pd.to_datetime(
    wfabba_goes_17_df["Timestamp"],
    infer_datetime_format=True,
    origin="unix",
    utc=True,
)
wfabba_goes_17_df = wfabba_goes_17_df.reset_index().drop(columns=["index"])
# wfabba_goes_17_df

In [54]:
# Print the GOES-16 and GOES-17 table shapes before and after the low-probability
# fire-detection filter.
# NOTE: the filter itself (dropping rows with Code 15 or Code 35) is commented out
# below, so no rows are removed and both pairs of prints report the same shapes.
print(wfabba_goes_16_df.shape)
print(wfabba_goes_17_df.shape)

# wfabba_goes_16_df = wfabba_goes_16_df[(wfabba_goes_16_df["Code"] != 15) & (wfabba_goes_16_df["Code"] != 35)]
# wfabba_goes_17_df = wfabba_goes_17_df[(wfabba_goes_17_df["Code"] != 15) & (wfabba_goes_17_df["Code"] != 35)]

print(wfabba_goes_16_df.shape)
print(wfabba_goes_17_df.shape)

(314298, 24)
(1665536, 24)
(314298, 24)
(1665536, 24)


### Convert the coordinates of WFABBA GOES-17 and GOES-16 from EPSG 4326 to EPSG 3310 to allow for distance calculations down to the meter

In [55]:
# convert WFABBA GOES 16 coordinates from EPSG 4326 to EPSG 3310
# points_from_xy builds the whole geometry column in a single call instead of
# constructing one shapely Point per row in a Python-level loop
coords = gpd.points_from_xy(wfabba_goes_16_df['Longitude'], wfabba_goes_16_df['Latitude'])
wfabba_goes_16_df = GeoDataFrame(wfabba_goes_16_df, crs="EPSG:4326", geometry=coords)
wfabba_goes_16_df = wfabba_goes_16_df.to_crs('EPSG:3310')
# wfabba_goes_16_df[["Latitude","Longitude","geometry"]]

In [56]:
# convert WFABBA GOES 17 coordinates from EPSG 4326 to EPSG 3310
# points_from_xy builds the whole geometry column in a single call instead of
# constructing one shapely Point per row in a Python-level loop
coords = gpd.points_from_xy(wfabba_goes_17_df['Longitude'], wfabba_goes_17_df['Latitude'])
wfabba_goes_17_df = GeoDataFrame(wfabba_goes_17_df, crs="EPSG:4326", geometry=coords)
wfabba_goes_17_df = wfabba_goes_17_df.to_crs('EPSG:3310')
# wfabba_goes_17_df[["Latitude","Longitude","geometry"]]

## 2) Camera Metadata Processing

In [57]:
# read in camera metadata (HPWREN camera stations)
# NOTE: camera_metadata_df is reassigned by the next cell (camera_image_id_mappings.csv),
# so the frame loaded here is not the one used by the later join
camera_metadata_df = pd.read_csv("../../data/processed/camera_metadata_hpwren.csv")
# camera_metadata_df

In [58]:
# read in the camera image-id mappings
# NOTE: this rebinds camera_metadata_df, replacing the HPWREN metadata loaded in the
# previous cell; the later join matches SmokeyNet rows on this file's "image_id" column
camera_metadata_df = pd.read_csv("../../data/processed/camera_image_id_mappings.csv")
# camera_metadata_df

## 3) Matching WFABBA to SmokeyNet 

In [59]:
#Create dataframe for every minute of specified time period (both endpoints included)
start = datetime(2019, 6, 1, 0, 0, 0, 0, pytz.UTC)
end = datetime(2021, 7, 11, 23, 59, 0, 0, pytz.UTC)

# pd.date_range produces the minute grid directly; this replaces a Python loop of
# roughly 1.1 million iterations and leaves `start` unmodified
minutes_df = pd.DataFrame({"timestamp": pd.date_range(start, end, freq="min")})
# minutes_df

In [60]:
# Build the SmokeyNet test-split frame from the per-image prediction csv; the
# column names are supplied here and every row is tagged with type "test".
# df_test = pd.read_json(raw_data_dir + "smokeynet_test.json", orient="index").reset_index().rename(columns={"index":"filepath"})
header = ['index', 'image_pred', 'image_prob', 'image_loss']
df_test = pd.read_csv(
    "../../data/raw/smokeynet_outputs/image_preds_test.csv",
    names=header,
)
df_test = df_test.rename(columns={"index": "filepath"})
# df_test = pd.read_csv("../../data/raw/smokeynet_outputs/test_paper_baseline_version4_last_model_image_preds.csv", names=header).rename(columns={"index":"filepath"})
df_test = df_test.assign(type="test")
# df_test

In [61]:
# Build the SmokeyNet validation-split frame from the per-image prediction csv; the
# column names are supplied here and every row is tagged with type "valid".
# df_valid = pd.read_json(raw_data_dir + "smokeynet_valid.json", orient="index").reset_index().rename(columns={"index":"filepath"})
header = ['index', 'image_pred', 'image_prob']
df_valid = pd.read_csv(
    "../../data/raw/smokeynet_outputs/image_preds_valid.csv",
    names=header,
)
df_valid = df_valid.rename(columns={"index": "filepath"})
# df_valid = pd.read_csv("../../data/raw/smokeynet_outputs/val_paper_baseline_version4_last_model_image_preds.csv", names=header).rename(columns={"index":"filepath"})
df_valid = df_valid.assign(type="valid")
# df_valid

In [62]:
# Stack the test and validation SmokeyNet frames (only these two splits for now)
# and renumber the rows.
df_labels = pd.concat([df_test, df_valid])
df_labels = df_labels.reset_index()
df_labels = df_labels.drop(columns=["index"])
# df_labels

In [63]:
df_labels['camera_name'] = df_labels.filepath.str.split("/").str[0]

In [64]:
def get_ground_truth_label(image_name):
    """Return 1 when image_name contains a "+" (i.e. it is a positive), otherwise 0."""
    return int("+" in image_name)

df_labels['image_gt'] = df_labels.filepath.apply(get_ground_truth_label)

In [65]:
df_labels.head()
# df_labels.shape

Unnamed: 0,filepath,image_pred,image_prob,image_loss,type,camera_name,image_gt
0,20191001_FIRE_smer-tcs9-mobo-c/1569950517_-02400,0,0.213379,0.240006,test,20191001_FIRE_smer-tcs9-mobo-c,0
1,20191001_FIRE_smer-tcs9-mobo-c/1569950577_-02340,0,0.047089,0.048218,test,20191001_FIRE_smer-tcs9-mobo-c,0
2,20191001_FIRE_smer-tcs9-mobo-c/1569950637_-02280,0,0.046539,0.04767,test,20191001_FIRE_smer-tcs9-mobo-c,0
3,20191001_FIRE_smer-tcs9-mobo-c/1569950697_-02220,0,0.037201,0.037896,test,20191001_FIRE_smer-tcs9-mobo-c,0
4,20191001_FIRE_smer-tcs9-mobo-c/1569950757_-02160,0,0.063477,0.065587,test,20191001_FIRE_smer-tcs9-mobo-c,0


In [66]:
# Derive the date and year columns from the text before the first "_" of
# camera_name (which at this point still holds the event folder name).
event_date_text = df_labels["camera_name"].str.split("_", n=1).str[0]
df_labels["date"] = pd.to_datetime(event_date_text)
df_labels["year"] = event_date_text.str[:4]
# df_labels

In [67]:
# Keep only the entries dated 2019-06-01 or later, then renumber the rows.
on_or_after_start = df_labels["date"] >= "2019-06-01"
df_labels_filtered = (
    df_labels.loc[on_or_after_start]
    .reset_index()
    .drop(columns=["index"])
)
# df_labels_filtered

In [68]:
#create time, datetime, event_name, camera_name attributes
# filepath looks like "<event_name>/<unix_seconds>_<offset>"; split it once and reuse the parts
filepath_parts = df_labels_filtered["filepath"].str.split("/")
df_labels_filtered["time"] = filepath_parts.str[1].str.split("_").str[0]
# "time" holds epoch seconds as strings; convert them to numbers before applying
# unit="s", because parsing strings together with `unit` is deprecated in pandas >= 2.1
df_labels_filtered["datetime"] = pd.to_datetime(
    pd.to_numeric(df_labels_filtered["time"]), unit="s", origin="unix", utc=True
)
df_labels_filtered["event_name"] = filepath_parts.str[0]
# camera_name so far holds the event folder name; keep only the text after its last "_"
df_labels_filtered["camera_name"] = df_labels_filtered["camera_name"].str.split("_").str[-1]
# df_labels_filtered

In [69]:
# Attach the camera metadata to every SmokeyNet row (left join, so rows without a
# metadata match are kept).
df_labels_filtered = pd.merge(
    df_labels_filtered,
    camera_metadata_df,
    how="left",
    left_on="camera_name",
    right_on="image_id",
)
# df_labels_filtered

In [70]:
# Build a point geometry from each row's (long, lat) pair, declare it as EPSG 4326
# and reproject the joined SmokeyNet/camera-metadata frame to EPSG 3310.
coords = list(map(Point, df_labels_filtered['long'], df_labels_filtered['lat']))
df_labels_filtered = GeoDataFrame(
    df_labels_filtered,
    crs="EPSG:4326",
    geometry=coords,
).to_crs('EPSG:3310')
# df_labels_filtered

In [71]:
# moves a timestamp onto the minute grid ("on the dot") so that it can be joined
# against per-minute timestamps
def round_secs(x):
    """Return ``x`` advanced to the start of the following minute.

    One minute is added and the sub-minute fields are then cleared, so both
    12:00:00 and 12:00:59 map to 12:01:00. This is not round-to-nearest: the
    result is always later than the input, by up to one full minute.

    Microseconds are cleared together with seconds; without that, an input
    carrying fractional seconds would not land exactly on the minute and could
    never equal a whole-minute timestamp in an equality join.

    Args:
        x: a datetime-like value supporting ``+ timedelta`` and
            ``.replace(second=..., microsecond=...)``.

    Returns:
        The shifted value with second and microsecond set to 0.
    """
    x = x + timedelta(minutes=1)
    return x.replace(second=0, microsecond=0)

In [72]:
# determines if a WFABBA detection is within the same direction as the camera
def is_in_camera_direction(camera_geometry_pt, direction, wfabba_geometry_pt):
    """Return whether the detection lies on the side of the camera that it faces.

    The test is a half-plane check on one coordinate: for "north"/"south" the
    ``y`` values are compared, for "east"/"west" the ``x`` values. A detection
    exactly level with the camera counts as in direction.

    Args:
        camera_geometry_pt: camera location; must expose ``.x`` and ``.y``.
        direction: one of "north", "south", "east", "west".
        wfabba_geometry_pt: detection location; must expose ``.x`` and ``.y``.

    Returns:
        The comparison result for a known direction. For any other value
        (unknown, missing, NaN) an explicit ``False`` is returned instead of an
        implicit ``None``.
    """
    if direction == "north":
        # Has to be true for the image to be in front of the camera
        return wfabba_geometry_pt.y >= camera_geometry_pt.y
    if direction == "south":
        return wfabba_geometry_pt.y <= camera_geometry_pt.y
    if direction == "east":
        return wfabba_geometry_pt.x >= camera_geometry_pt.x
    if direction == "west":
        return wfabba_geometry_pt.x <= camera_geometry_pt.x
    # unknown or something else: never treated as a match
    return False
    

In [73]:
# finds any matches with specified WFABBA dataset based off of 
# whether distance to camera is within specified radius & camera direction
def matches_distance_prox(camera_geometry, direction, radius_miles, wfabba_df):
    """Return the WFABBA detections within ``radius_miles`` of the camera, on the side it faces.

    Args:
        camera_geometry: camera location, passed to ``wfabba_df["geometry"].distance``.
            NOTE(review): distances are divided by 1609.344 (metres per mile), so both
            geometries are presumably in a metre-based projection -- confirm at the call site.
        direction: camera direction, forwarded to ``is_in_camera_direction``.
        radius_miles: maximum distance (inclusive) for a detection to be kept.
        wfabba_df: detections with a "geometry" column. NOTE: this frame is modified
            in place -- "distance_m" and "distance_mi" are written on every call.

    Returns:
        A copy of the matching rows, with the added columns "distance_m",
        "distance_mi" and "is_in_direction". Empty when nothing matches.
    """
    wfabba_df["distance_m"] = wfabba_df["geometry"].distance(camera_geometry)
    wfabba_df["distance_mi"] = wfabba_df["distance_m"] / 1609.344
    match_results_df = wfabba_df[wfabba_df["distance_mi"] <= radius_miles].copy()

    #filter for detections within same direction
    # A plain list is used instead of DataFrame.apply(axis=1): on an empty frame
    # apply() hands back an empty DataFrame, which cannot be assigned to a single
    # column, so a camera with no detections in range would raise.
    match_results_df["is_in_direction"] = [
        is_in_camera_direction(camera_geometry, direction, detection_pt)
        for detection_pt in match_results_df["geometry"]
    ]
    match_results_df = match_results_df[match_results_df["is_in_direction"] == True]

    return match_results_df

In [74]:
# Put every SmokeyNet timestamp on the minute grid by passing it through round_secs.
smokeynet_datetimes = df_labels_filtered["datetime"]
df_labels_filtered["datetime_rounded"] = smokeynet_datetimes.apply(round_secs)
# df_labels_filtered

In [75]:
# get all unique cameras being considered
unique_cameras = df_labels_filtered["camera_name"].unique()
# unique_cameras

In [76]:
# Cameras with no associated direction cannot be direction-filtered, so remove them
# from the list of cameras to process.
has_no_direction = df_labels_filtered["direction"].isna()
unusable_cameras = df_labels_filtered.loc[has_no_direction, "camera_name"].unique()
unique_cameras = np.setdiff1d(unique_cameras, unusable_cameras)
unique_cameras 

array(['69bravo-e-mobo-c', 'bh-w-mobo-c', 'bl-n-mobo-c', 'bl-s-mobo-c',
       'bm-e-mobo-c', 'bm-w-mobo-c', 'cp-s-mobo-c', 'dwpgm-n-mobo-c',
       'hp-e-mobo-c', 'hp-n-mobo-c', 'hp-s-mobo-c', 'hp-w-mobo-c',
       'lo-s-mobo-c', 'lp-e-mobo-c', 'lp-n-mobo-c', 'lp-s-mobo',
       'lp-s-mobo-c', 'lp-w-mobo-c', 'marconi-n-mobo-c', 'mg-n-mobo-c',
       'ml-s-mobo-c', 'ml-w-mobo-c', 'mlo-n-mobo-c', 'mlo-s-mobo-c',
       'om-e-mobo-c', 'om-n-mobo-c', 'om-s-mobo', 'om-s-mobo-c',
       'om-w-mobo', 'om-w-mobo-c', 'pi-e-mobo-c', 'pi-n-mobo-c',
       'pi-s-mobo', 'pi-s-mobo-c', 'pi-w-mobo-c', 'rm-e-mobo-c',
       'rm-n-mobo-c', 'rm-w-mobo-c', 'sclm-e-mobo-c', 'sjh-n-mobo-c',
       'sm-e-mobo-c', 'sm-n-mobo-c', 'sm-s-mobo-c', 'sm-w-mobo-c',
       'smer-tcs8-mobo-c', 'smer-tcs9-mobo-c', 'so-w-mobo-c',
       'sp-n-mobo-c', 'syp-w-mobo-c', 'tp-s-mobo-c', 'tp-w-mobo-c',
       'vo-n-mobo-c', 'wc-e-mobo-c', 'wc-n-mobo-c', 'wc-s-mobo-c'],
      dtype=object)

### Join smokeynet preds, goes16 and goes17 (direct join)

In [35]:
%%time

scores_arr = []
csv_suffix = "_all_hard_voting_35.csv"

# spatial radius of potential WFABBA matches
distance_miles = 35
# distance_miles = 80

# WFABBA columns carried into the joined output (loop-invariant, so defined once)
select_goes_cols = ["timestamp_converted_rounded", "geometry", "Code", "FRP", "Fire Temp",
                    "Pixel Size", "Obs BT4", "Obs BT11", "Bkg BT4", "Bkg BT11"]

#looping for each camera station
for camera in tqdm(unique_cameras):

    # print("Camera:",camera)
    # Exact equality instead of str.contains(): contains() is a substring/regex match,
    # so a name such as "lp-s-mobo" would also select the rows of "lp-s-mobo-c".
    camera_df = df_labels_filtered[df_labels_filtered["camera_name"] == camera].copy()

    # the first row supplies the station geometry and direction used for matching
    camera_instance = camera_df.iloc[0]

    #Find GOES-16 matches
    goes_16_dist_match_df = matches_distance_prox(camera_instance["geometry"], camera_instance["direction"], distance_miles, wfabba_goes_16_df)
    goes_16_dist_match_df["timestamp_converted_rounded"] = goes_16_dist_match_df["timestamp_converted"].apply(round_secs)
    # keep one detection per minute so the joins below cannot duplicate image rows
    goes_16_dist_match_df = goes_16_dist_match_df.drop_duplicates(subset=["timestamp_converted_rounded"], keep="last")

    #Find GOES-17 matches
    goes_17_dist_match_df = matches_distance_prox(camera_instance["geometry"], camera_instance["direction"], distance_miles, wfabba_goes_17_df)
    goes_17_dist_match_df["timestamp_converted_rounded"] = goes_17_dist_match_df["timestamp_converted"].apply(round_secs)
    goes_17_dist_match_df = goes_17_dist_match_df.drop_duplicates(subset=["timestamp_converted_rounded"], keep="last")

    #SmokeyNet_join: one row per minute, carrying the image prediction where one exists
    test_df = minutes_df.merge(camera_df, left_on="timestamp", right_on="datetime_rounded", how="left")
    test_df = test_df.rename(columns={"geometry": "HPWREN_Station_geometry", "lat": "HPWREN_lat", "long": "HPWREN_long", "datetime_rounded": "SmokeyNet_datetime_rounded"})
    # print("joined SmokeyNet")

    #GOES-16 Join (exact same minute)
    test_df = test_df.merge(goes_16_dist_match_df[select_goes_cols], left_on="timestamp", right_on="timestamp_converted_rounded", how="left")
    renamed_col_dict = {x: "WFABBA_GOES16_" + x.replace(" ", "_") for x in select_goes_cols}
    test_df = test_df.rename(columns=renamed_col_dict)
    test_df = test_df[["timestamp", "camera_name", "image_gt", "image_pred", "image_prob", "type"] + list(renamed_col_dict.values())]
    # notna() rather than `!= None`: pandas treats a comparison against None as a
    # comparison against a missing value, so `!= None` can be True for every row.
    # 1.0 = a GOES-16 detection was joined to this minute, 0.0 = none.
    test_df["goes16_pred"] = test_df["WFABBA_GOES16_geometry"].notna().astype(float)
    # print("joined GOES16")

    #GOES-17 Join (exact same minute)
    old_columns = test_df.columns.to_list()
    test_df = test_df.merge(goes_17_dist_match_df[select_goes_cols], left_on="timestamp", right_on="timestamp_converted_rounded", how="left")
    renamed_col_dict = {x: "WFABBA_GOES17_" + x.replace(" ", "_") for x in select_goes_cols}
    test_df = test_df.rename(columns=renamed_col_dict)
    test_df = test_df[old_columns + list(renamed_col_dict.values())]
    test_df["goes17_pred"] = test_df["WFABBA_GOES17_geometry"].notna().astype(float)
    # print("joined GOES17")

    #Get all votes and determine if smoke was detected by majority rule
    test_df["final_vote"] = test_df["image_pred"] + test_df["goes16_pred"] + test_df["goes17_pred"]
    test_df.loc[test_df["final_vote"] >= 2, 'final_pred'] = 1
    test_df.loc[test_df["final_vote"] < 2, 'final_pred'] = 0

    # only the minutes that actually have an image are scored and written out
    labelled_df = test_df[test_df["image_gt"].notna()]
    image_labels = labelled_df["image_gt"]
    smokeynet_preds = labelled_df["image_pred"]
    ensemble_preds = labelled_df["final_pred"]

    baseline_score = accuracy_score(image_labels, smokeynet_preds)
    ensemble_score = accuracy_score(image_labels, ensemble_preds)
    scores_arr.append([camera, baseline_score, ensemble_score])

    # print("Baseline score:", baseline_score)
    # print("Ensemble score:", ensemble_score)
    labelled_df.to_csv(processed_data_dir + camera + csv_suffix)
    # print("=====================================================")
    # print("=====================================================")

100%|██████████| 55/55 [16:33<00:00, 18.06s/it]

Wall time: 16min 33s





In [156]:
# Tabulate the scores of the most recent run: one row per processed camera, so the
# slice length follows unique_cameras instead of a hard-coded camera count.
output_table = PrettyTable(["Camera", "Baseline Acc", "Ensemble Acc"])
output_table.add_rows(scores_arr[-len(unique_cameras):])
output_table

Camera,Baseline Acc,Ensemble Acc
69bravo-e-mobo-c,0.9625,0.5125
bh-w-mobo-c,0.961783439490446,0.7070063694267515
bl-n-mobo-c,0.75,0.4875
bl-s-mobo-c,0.9230769230769232,0.6410256410256411
bm-e-mobo-c,0.935897435897436,0.5256410256410257
bm-w-mobo-c,0.8625,0.4875
cp-s-mobo-c,0.9113924050632912,0.7531645569620253
dwpgm-n-mobo-c,0.975,1.0
hp-e-mobo-c,0.935064935064935,0.5064935064935064
hp-n-mobo-c,0.8625,0.81875


## Join smokeynet, goes16, goes17 based on sliding window join

In [36]:
scores_arr = []

In [39]:
%%time

csv_suffix = "_all_hard_voting_35.csv"
sliding_window_data_dir = '../../data/processed/wfabba_sliding_time_offset_120/'
time_offset = 120
# make sure the output directory exists before any csv is written
os.makedirs(sliding_window_data_dir, exist_ok=True)

# spatial radius of potential WFABBA matches
distance_miles = 35
# distance_miles = 80

# WFABBA columns carried into the joined output (loop-invariant, so defined once)
select_goes_cols = ["timestamp_converted_rounded", "geometry", "Code", "FRP", "Fire Temp",
                    "Pixel Size", "Obs BT4", "Obs BT11", "Bkg BT4", "Bkg BT11"]

for camera in tqdm(unique_cameras):
    # Exact equality instead of str.contains(): contains() is a substring/regex match,
    # so a name such as "lp-s-mobo" would also select the rows of "lp-s-mobo-c".
    camera_df = df_labels_filtered[df_labels_filtered["camera_name"] == camera].copy()
    # the first row supplies the station geometry and direction used for matching
    camera_instance = camera_df.iloc[0]

    #Find GOES-16 matches
    goes_16_dist_match_df = matches_distance_prox(camera_instance["geometry"], camera_instance["direction"], distance_miles, wfabba_goes_16_df)
    goes_16_dist_match_df["timestamp_converted_rounded"] = goes_16_dist_match_df["timestamp_converted"].apply(round_secs)
    # keep one detection per minute
    goes_16_dist_match_df = goes_16_dist_match_df.drop_duplicates(subset=["timestamp_converted_rounded"], keep="last")

    #Find GOES-17 matches
    goes_17_dist_match_df = matches_distance_prox(camera_instance["geometry"], camera_instance["direction"], distance_miles, wfabba_goes_17_df)
    goes_17_dist_match_df["timestamp_converted_rounded"] = goes_17_dist_match_df["timestamp_converted"].apply(round_secs)
    goes_17_dist_match_df = goes_17_dist_match_df.drop_duplicates(subset=["timestamp_converted_rounded"], keep="last")

    # merge_asof needs its join keys sorted, so order the WFABBA frames by time
    camera_df.sort_values(by=['event_name', 'datetime_rounded'], inplace=True)
    goes_16_dist_match_df.sort_values(by=['timestamp_converted_rounded'], inplace=True)
    goes_17_dist_match_df.sort_values(by=['timestamp_converted_rounded'], inplace=True)

    #SmokeyNet_join: one row per minute, carrying the image prediction where one exists
    test_df = minutes_df.merge(camera_df, left_on="timestamp", right_on="datetime_rounded", how="left")
    test_df = test_df.rename(columns={"geometry": "HPWREN_Station_geometry", "lat": "HPWREN_lat", "long": "HPWREN_long", "datetime_rounded": "SmokeyNet_datetime_rounded"})
    # print("joined SmokeyNet")

    #GOES-16 Join: first detection at or after each minute, at most time_offset minutes later
    test_df = pd.merge_asof(
        left=test_df,
        right=goes_16_dist_match_df[select_goes_cols],
        left_on='timestamp',
        right_on='timestamp_converted_rounded',
        tolerance=pd.Timedelta(minutes=time_offset),
        direction='forward'
    )
    renamed_col_dict = {x: "WFABBA_GOES16_" + x.replace(" ", "_") for x in select_goes_cols}
    test_df = test_df.rename(columns=renamed_col_dict)
    test_df = test_df[["timestamp", "camera_name", "image_gt", "image_pred", "image_prob", "type"] + list(renamed_col_dict.values())]
    # notna() rather than `!= None`: pandas treats a comparison against None as a
    # comparison against a missing value, so `!= None` can be True for every row.
    # 1.0 = a GOES-16 detection was joined to this minute, 0.0 = none.
    test_df["goes16_pred"] = test_df["WFABBA_GOES16_geometry"].notna().astype(float)
    # print("joined GOES16")

    #GOES-17 Join: same forward window as GOES-16
    old_columns = test_df.columns.to_list()
    test_df = pd.merge_asof(
        left=test_df,
        right=goes_17_dist_match_df[select_goes_cols],
        left_on='timestamp',
        right_on='timestamp_converted_rounded',
        tolerance=pd.Timedelta(minutes=time_offset),
        direction='forward'
    )
    renamed_col_dict = {x: "WFABBA_GOES17_" + x.replace(" ", "_") for x in select_goes_cols}
    test_df = test_df.rename(columns=renamed_col_dict)
    test_df = test_df[old_columns + list(renamed_col_dict.values())]
    test_df["goes17_pred"] = test_df["WFABBA_GOES17_geometry"].notna().astype(float)
    # print("joined GOES17")

    #Get all votes and determine if smoke was detected by majority rule
    test_df["final_vote"] = test_df["image_pred"] + test_df["goes16_pred"] + test_df["goes17_pred"]
    test_df.loc[test_df["final_vote"] >= 2, 'final_pred'] = 1
    test_df.loc[test_df["final_vote"] < 2, 'final_pred'] = 0

    # only the minutes that actually have an image are scored and written out
    labelled_df = test_df[test_df["image_gt"].notna()]
    image_labels = labelled_df["image_gt"]
    smokeynet_preds = labelled_df["image_pred"]
    ensemble_preds = labelled_df["final_pred"]

    baseline_score = accuracy_score(image_labels, smokeynet_preds)
    ensemble_score = accuracy_score(image_labels, ensemble_preds)
    scores_arr.append([camera, baseline_score, ensemble_score])

    # print("Baseline score:", baseline_score)
    # print("Ensemble score:", ensemble_score)
    labelled_df.to_csv(sliding_window_data_dir + camera + csv_suffix)
    # print("=====================================================")
    # print("=====================================================")

100%|██████████| 55/55 [18:53<00:00, 20.60s/it]

Wall time: 18min 53s





In [35]:
# Tabulate the scores of the most recent run: one row per processed camera, so the
# slice length follows unique_cameras instead of a hard-coded camera count.
output_table = PrettyTable(["Camera", "Baseline Acc", "Ensemble Acc"])
output_table.add_rows(scores_arr[-len(unique_cameras):])
output_table

Camera,Baseline Acc,Ensemble Acc
69bravo-e-mobo-c,0.9625,0.4375
bh-w-mobo-c,0.961783439490446,0.7006369426751592
bl-n-mobo-c,0.75,0.4875
bl-s-mobo-c,0.9230769230769232,0.5128205128205128
bm-e-mobo-c,0.935897435897436,0.0384615384615384
bm-w-mobo-c,0.8625,0.4875
cp-s-mobo-c,0.9113924050632912,0.5063291139240507
dwpgm-n-mobo-c,0.975,0.9875
hp-e-mobo-c,0.935064935064935,0.5064935064935064
hp-n-mobo-c,0.8625,0.5


### Sliding Window: join based on nearest time observation (caveat: the nearest WFABBA detection may lie in the past as well as in the future)

In [56]:
df_labels_filtered.columns
# df_labels_filtered['type'].value_counts()

Index(['filepath', 'image_pred', 'image_prob', 'type', 'camera_name', 'date',
       'year', 'image_gt', 'time', 'datetime', 'event_name', 'camera_id',
       'image_id', 'lat', 'long', 'direction', 'geometry', 'datetime_rounded'],
      dtype='object')

In [54]:
scores_arr = []

In [73]:
print (df_labels_filtered[df_labels_filtered['type'] == 'test'].shape)
print (df_labels_filtered[df_labels_filtered['type'] == 'valid'].shape)

(4584, 18)
(4584, 18)


In [78]:
%%time

csv_suffix = "_all_hard_voting_10_window_20.csv"
sliding_window_data_dir = '../../data/processed/wfabba_sliding_window_new_data/'
# sliding_window_data_dir = '../../data/processed/wfabba_sliding_window_nearest_without_code_15/'
# make sure the output directory exists before any csv is written
os.makedirs(sliding_window_data_dir, exist_ok=True)

# spatial radius of potential WFABBA matches
distance_miles = 20
time_offset = 10
# distance_miles = 80

# WFABBA columns carried into the joined output (loop-invariant, so defined once)
select_goes_cols = ["timestamp_converted_rounded", "geometry", "Code", "FRP", "Fire Temp",
                    "Pixel Size", "Obs BT4", "Obs BT11", "Bkg BT4", "Bkg BT11"]

for camera in tqdm(unique_cameras):
    # Exact equality instead of str.contains(): contains() is a substring/regex match,
    # so a name such as "lp-s-mobo" would also select the rows of "lp-s-mobo-c".
    camera_df = df_labels_filtered[df_labels_filtered["camera_name"] == camera].copy()
    # the first row supplies the station geometry and direction used for matching
    camera_instance = camera_df.iloc[0]

    #Find GOES-16 matches
    goes_16_dist_match_df = matches_distance_prox(camera_instance["geometry"], camera_instance["direction"], distance_miles, wfabba_goes_16_df)
    goes_16_dist_match_df["timestamp_converted_rounded"] = goes_16_dist_match_df["timestamp_converted"].apply(round_secs)
    # keep one detection per minute
    goes_16_dist_match_df = goes_16_dist_match_df.drop_duplicates(subset=["timestamp_converted_rounded"], keep="last")

    #Find GOES-17 matches
    goes_17_dist_match_df = matches_distance_prox(camera_instance["geometry"], camera_instance["direction"], distance_miles, wfabba_goes_17_df)
    goes_17_dist_match_df["timestamp_converted_rounded"] = goes_17_dist_match_df["timestamp_converted"].apply(round_secs)
    goes_17_dist_match_df = goes_17_dist_match_df.drop_duplicates(subset=["timestamp_converted_rounded"], keep="last")

    # merge_asof needs its join keys sorted, so order the WFABBA frames by time
    camera_df.sort_values(by=['event_name', 'datetime_rounded'], inplace=True)
    goes_16_dist_match_df.sort_values(by=['timestamp_converted_rounded'], inplace=True)
    goes_17_dist_match_df.sort_values(by=['timestamp_converted_rounded'], inplace=True)

    #SmokeyNet_join: one row per minute, carrying the image prediction where one exists
    test_df = minutes_df.merge(camera_df, left_on="timestamp", right_on="datetime_rounded", how="left")
    test_df = test_df.rename(columns={"geometry": "HPWREN_Station_geometry", "lat": "HPWREN_lat", "long": "HPWREN_long", "datetime_rounded": "SmokeyNet_datetime_rounded"})
    # print("joined SmokeyNet")

    #GOES-16 Join: closest detection in time, before or after, within time_offset minutes
    test_df = pd.merge_asof(
        left=test_df,
        right=goes_16_dist_match_df[select_goes_cols],
        left_on='timestamp',
        right_on='timestamp_converted_rounded',
        tolerance=pd.Timedelta(minutes=time_offset),
        direction='nearest'
    )
    renamed_col_dict = {x: "WFABBA_GOES16_" + x.replace(" ", "_") for x in select_goes_cols}
    test_df = test_df.rename(columns=renamed_col_dict)
    test_df = test_df[["timestamp", "camera_name", "image_gt", "image_pred", "image_prob", "type"] + list(renamed_col_dict.values())]
    # notna() rather than `!= None`: pandas treats a comparison against None as a
    # comparison against a missing value, so `!= None` can be True for every row.
    # 1.0 = a GOES-16 detection was joined to this minute, 0.0 = none.
    test_df["goes16_pred"] = test_df["WFABBA_GOES16_geometry"].notna().astype(float)
    # print("joined GOES16")

    #GOES-17 Join: same nearest-in-time window as GOES-16
    old_columns = test_df.columns.to_list()
    test_df = pd.merge_asof(
        left=test_df,
        right=goes_17_dist_match_df[select_goes_cols],
        left_on='timestamp',
        right_on='timestamp_converted_rounded',
        tolerance=pd.Timedelta(minutes=time_offset),
        direction='nearest'
    )
    renamed_col_dict = {x: "WFABBA_GOES17_" + x.replace(" ", "_") for x in select_goes_cols}
    test_df = test_df.rename(columns=renamed_col_dict)
    test_df = test_df[old_columns + list(renamed_col_dict.values())]
    test_df["goes17_pred"] = test_df["WFABBA_GOES17_geometry"].notna().astype(float)
    # print("joined GOES17")

    #Get all votes and determine if smoke was detected by majority rule
    test_df["final_vote"] = test_df["image_pred"] + test_df["goes16_pred"] + test_df["goes17_pred"]
    test_df.loc[test_df["final_vote"] >= 2, 'final_pred'] = 1
    test_df.loc[test_df["final_vote"] < 2, 'final_pred'] = 0

    # only the minutes that actually have an image are scored and written out
    labelled_df = test_df[test_df["image_gt"].notna()]
    image_labels = labelled_df["image_gt"]
    smokeynet_preds = labelled_df["image_pred"]
    ensemble_preds = labelled_df["final_pred"]

    baseline_score = accuracy_score(image_labels, smokeynet_preds)
    ensemble_score = accuracy_score(image_labels, ensemble_preds)
    scores_arr.append([camera, baseline_score, ensemble_score])

    # print("Baseline score:", baseline_score)
    # print("Ensemble score:", ensemble_score)
    # test_df[~test_df["image_gt"].isna()][["timestamp","image_gt", "image_pred", "goes16_pred", "goes17_pred", "final_pred", "type", "WFABBA_GOES16_Code", "WFABBA_GOES17_Code"]]\
    #     .to_csv(sliding_window_data_dir + camera + csv_suffix)
    labelled_df.to_csv(sliding_window_data_dir + camera + csv_suffix)
    # print("=====================================================")
    # print("=====================================================")

100%|██████████| 55/55 [15:35<00:00, 17.01s/it]

Wall time: 15min 35s





In [43]:
# Scratch cell: candidate column list (current test_df columns plus the raw
# WFABBA field names) used while working out the renaming scheme.
l = [
    *test_df.columns.to_list(),
    "timestamp_converted_rounded",
    "geometry",
    "Code",
    "FRP",
    "Fire Temp",
    "Pixel Size",
    "Obs BT4",
    "Obs BT11",
    "Bkg BT4",
    "Bkg BT11",
]
# Built but not shown: a notebook only echoes the last expression of a cell.
dict((col, "Goes16" + col) for col in l)

# Echo the renamed column names produced by the most recent join cell.
[*renamed_col_dict.values()]

['WFABBA_GOES16_timestamp_converted_rounded',
 'WFABBA_GOES16_geometry',
 'WFABBA_GOES16_Code',
 'WFABBA_GOES16_FRP',
 'WFABBA_GOES16_Fire_Temp',
 'WFABBA_GOES16_Pixel_Size',
 'WFABBA_GOES16_Obs_BT4',
 'WFABBA_GOES16_Obs_BT11',
 'WFABBA_GOES16_Bkg_BT4',
 'WFABBA_GOES16_Bkg_BT11']

### Sliding Window: Manual Calculation

In [None]:
%%time

csv_suffix = "_all_hard_voting_35.csv"
sliding_window_data_dir = '../../data/processed/wfabba_sliding_window_nearest/'
# sliding_window_data_dir = '../../data/processed/wfabba_sliding_window_nearest_without_code_15/'

# spatial radius of potential WFABBA matches
distance_miles = 35
time_offset = 60
# distance_miles = 80

for camera in tqdm(unique_cameras):
    camera_df = df_labels_filtered[df_labels_filtered["camera_name"].str.contains(camera)].copy()
    camera_instance = camera_df.iloc[0]
    
    #Find GOES-16 matches
    goes_16_dist_match_df = matches_distance_prox(camera_instance["geometry"], camera_instance["direction"], distance_miles, wfabba_goes_16_df)
    goes_16_dist_match_df["timestamp_converted_rounded"] = goes_16_dist_match_df["timestamp_converted"].apply(lambda x: round_secs(x))
    goes_16_dist_match_df = goes_16_dist_match_df.drop_duplicates(subset = ["timestamp_converted_rounded"], keep="last")

    #Find GOES-17 matches
    goes_17_dist_match_df = matches_distance_prox(camera_instance["geometry"], camera_instance["direction"], distance_miles, wfabba_goes_17_df)
    goes_17_dist_match_df["timestamp_converted_rounded"] = goes_17_dist_match_df["timestamp_converted"].apply(lambda x: round_secs(x))
    goes_17_dist_match_df = goes_17_dist_match_df.drop_duplicates(subset = ["timestamp_converted_rounded"], keep="last")

    camera_df.sort_values(by=['event_name', 'datetime_rounded'], inplace=True)
    goes_16_dist_match_df.sort_values(by=['timestamp_converted_rounded'], inplace=True)
    goes_17_dist_match_df.sort_values(by=['timestamp_converted_rounded'], inplace=True)

    #SmokeyNet_join
    test_df = minutes_df.merge(camera_df, left_on = "timestamp", right_on = "datetime_rounded", how = "left")
    test_df = test_df.rename(columns = {"geometry":"HPWREN_Station_geometry", "lat":"HPWREN_lat", "long":"HPWREN_long", "datetime_rounded":"SmokeyNet_datetime_rounded"})
    # print("joined SmokeyNet")
    
    #GOES-16 Join
    test_df = pd.merge_asof(
        left=test_df,
        right=goes_16_dist_match_df[["timestamp_converted_rounded", "geometry", "Code"]],
        left_on='timestamp',
        right_on='timestamp_converted_rounded',
        tolerance=pd.Timedelta(minutes=time_offset),
        direction='nearest'
    )
    test_df = test_df.rename(columns = {"geometry":"WFABBA_GOES16_geometry", "timestamp_converted_rounded":"WFABBA_GOES16_timestamp_converted_rounded", "Code":"WFABBA_GOES16_Code"})
    test_df = test_df[["timestamp","camera_name", "image_gt", "image_pred", "type", "WFABBA_GOES16_geometry", "WFABBA_GOES16_Code"]]
    test_df.loc[test_df["WFABBA_GOES16_geometry"] != None,'goes16_pred'] = 1
    test_df.loc[test_df["WFABBA_GOES16_geometry"] == None,'goes16_pred'] = 0
    # print("joined GOES16")

    #GOES-17 Join
    test_df = pd.merge_asof(
        left=test_df,
        right=goes_17_dist_match_df[["timestamp_converted_rounded", "geometry", "Code"]],
        left_on='timestamp',
        right_on='timestamp_converted_rounded',
        tolerance=pd.Timedelta(minutes=time_offset),
        direction='nearest'
    )
    test_df = test_df.rename(columns = {"geometry":"WFABBA_GOES17_geometry", "timestamp_converted_rounded":"WFABBA_GOES17_timestamp_converted_rounded", "Code":"WFABBA_GOES17_Code"})
    test_df = test_df[["timestamp","camera_name", "image_gt", "image_pred", "type", "WFABBA_GOES16_geometry", "goes16_pred", "WFABBA_GOES17_geometry", "WFABBA_GOES16_Code", "WFABBA_GOES17_Code"]]
    test_df.loc[test_df["WFABBA_GOES17_geometry"] != None,'goes17_pred'] = 1
    test_df.loc[test_df["WFABBA_GOES17_geometry"] == None,'goes17_pred'] = 0
    # print("joined GOES17")

    #Get all votes and determine if smoke was detected by majority rule
    test_df["final_vote"] = test_df["image_pred"] + test_df["goes16_pred"] + test_df["goes17_pred"]
    test_df.loc[test_df["final_vote"] >= 2,'final_pred'] = 1
    test_df.loc[test_df["final_vote"] < 2,'final_pred'] = 0

    image_labels = test_df[~test_df["image_gt"].isna()]["image_gt"]
    smokeynet_preds = test_df[~test_df["image_gt"].isna()]["image_pred"]
    ensemble_preds = test_df[~test_df["image_gt"].isna()]["final_pred"]

    baseline_score = accuracy_score(image_labels, smokeynet_preds)
    ensemble_score = accuracy_score(image_labels, ensemble_preds)
    scores_arr.append([camera, baseline_score, ensemble_score])
    
    # print("Baseline score:", baseline_score)
    # print("Ensemble score:", ensemble_score)
    test_df[~test_df["image_gt"].isna()][["timestamp","image_gt", "image_pred", "goes16_pred", "goes17_pred", "final_pred", "type", "WFABBA_GOES16_Code", "WFABBA_GOES17_Code"]]\
        .to_csv(sliding_window_data_dir + camera + csv_suffix)
    # print("=====================================================")