# Meteorology Data Cleaning
Cleaning and preprocessing involved dropping unneeded columns, calculating average atmospheric measurements over the entire date range of each wildfire, renaming columns, converting data types, and merging the datasets together.

In [117]:
# Imports
import pandas as pd
import numpy as np

# Importing sys
import sys

# Adding Config file
sys.path.insert(0, '../config/')

from config import Config

In [118]:
# Raw weather tables collected through the APIs: Meteostat first, then NASA
mdf, ndf = (
    pd.read_csv(Config().get_raw_meteorology_path(dataset_name))
    for dataset_name in ("meteostat_weather", "nasa_weather")
)

# Processed wildfire records
fdf = pd.read_csv("../../data/processed/wildfire.csv")


In [119]:
# Preview the first rows of the Meteostat frame
mdf.head()

Unnamed: 0.1,Unnamed: 0,time,tavg,tmin,tmax,prcp,snow,wdir,wspd,wpgt,pres,tsun,lat,long,pid,index,station
0,0,2020-07-22,35.4,30.0,40.0,,,,,,,,33.19581,-111.3487,0,,
1,1,2020-07-23,30.6,27.0,34.0,,,,,,,,33.19581,-111.3487,0,,
2,0,2020-07-19,25.9,14.4,35.6,0.0,,,8.0,,1013.1,,40.602563,-115.719777,1,,
3,1,2020-07-20,26.3,13.3,35.6,0.0,,,5.7,,1012.4,,40.602563,-115.719777,1,,
4,2,2020-07-21,25.3,15.0,33.9,0.0,,359.0,9.0,,1010.6,,40.602563,-115.719777,1,,


The data we got via the Meteostat API had too many missing values, so we didn't use it in the project.

In [120]:
# Checking the column names of the NASA frame as loaded, before any renaming
ndf.columns

Index(['YEAR', 'MO', 'DY', 'T2M', 'T2M_MAX', 'QV2M', 'PRECTOTCORR', 'WS2M',
       'WS2M_MAX', 'WS10M', 'WS10M_MAX', 'GWETTOP', 'GWETPROF', 'LAT', 'LONG',
       'PID'],
      dtype='object')

In [121]:
# Renaming POWER API dataset columns and dropping year, month, day, lat, long.
# Done without inplace=True and with errors="ignore" so that re-running this
# cell on the already-cleaned frame does not raise a KeyError.
ndf = ndf.rename(
    columns={
        "WS2M": "wind_speed_2m_mean",
        "WS2M_MAX": "wind_speed_2m_max_mean",
        "WS10M": "wind_speed_10m_mean",
        "WS10M_MAX": "wind_speed_10m_max_mean",
        "QV2M": "humidity_mean",
        "PRECTOTCORR": "rain_sum",
        "PID": "pid",
        "T2M": "temp_2m",
        "GWETTOP": "surface_soil_wetness_5cm_below",
        "GWETPROF": "surface_soil_wetness_to_bedrock",
        "T2M_MAX": "temp_2m_max",
    }
).drop(columns=["YEAR", "MO", "DY", "LAT", "LONG"], errors="ignore")

# Columns averaged over all rows of a fire
mean_columns = [
    "wind_speed_2m_mean",
    "wind_speed_2m_max_mean",
    "wind_speed_10m_mean",
    "wind_speed_10m_max_mean",
    "humidity_mean",
    "temp_2m",
]

# Missing measurements are labeled -999 in this data. Mask them before
# aggregating: a single -999 otherwise drags a mean or sum far off, and the
# result no longer equals -999, so a later "== -999" filter cannot catch it.
# Only this working copy is masked; ndf itself keeps the raw values.
measurements = ndf[mean_columns + ["rain_sum"]].replace(-999, np.nan)
by_pid = measurements.groupby(ndf["pid"])

# Get mean by pid (only the selected columns are aggregated)
ndf_mean = by_pid[mean_columns].mean()

# Calculate sum of rain; min_count=1 yields NaN instead of 0 when every
# value of a fire is missing
ndf_sum = by_pid[["rain_sum"]].sum(min_count=1)

# Check results
print(ndf_sum.head(1))
ndf_mean.head(1)


     rain_sum
pid          
0         2.6


Unnamed: 0_level_0,wind_speed_2m_mean,wind_speed_2m_max_mean,wind_speed_10m_mean,wind_speed_10m_max_mean,humidity_mean,temp_2m
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.41,3.09,1.895,4.18,13.21,29.9


Since the POWER API collects data for every day of a fire, we got rid of everything except the first day of each fire.

In [122]:
# One row per fire: the first row found for each pid, indexed by pid.
# NOTE(review): this is the fire start date only if the rows of each pid are
# stored in chronological order — confirm against the collection step.
first_date_fire_weather = (
    ndf
    .drop_duplicates(subset="pid", keep="first")
    .set_index("pid")
)

first_date_fire_weather.head()


Unnamed: 0_level_0,temp_2m,temp_2m_max,humidity_mean,rain_sum,wind_speed_2m_mean,wind_speed_2m_max_mean,wind_speed_10m_mean,wind_speed_10m_max_mean,surface_soil_wetness_5cm_below,surface_soil_wetness_to_bedrock
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,32.63,38.06,12.02,0.16,1.26,2.77,1.71,3.84,0.12,0.38
1,24.8,33.19,5.92,0.0,1.76,3.42,2.61,4.52,0.31,0.37
2,18.91,26.46,7.93,3.44,2.5,3.65,4.16,5.82,0.26,0.35
3,5.48,16.3,2.56,0.0,1.25,2.98,1.88,4.02,0.17,0.34
4,31.5,39.08,8.91,0.26,2.3,4.45,3.26,5.9,0.15,0.36


We also considered using average measurements over the entire date range of each fire, so we calculated those and merged them with the first-day weather data.

In [123]:
# Attach the per-fire aggregates to the first-day rows, matching on the pid index.
# Column names present on both sides receive pandas' default suffixes:
# "_x" for the first-day value and "_y" for the aggregate.
first_date_fire_weather = (
    first_date_fire_weather
    .merge(ndf_mean, how="inner", left_index=True, right_index=True)  # means
    .merge(ndf_sum, how="inner", left_index=True, right_index=True)  # sum of rain
)


Then we merged the dataset from above with the wildfire dataset.

In [124]:
# Index the wildfire table by its id column
fdf = fdf.set_index("id")

In [125]:
# Left-join the weather columns onto the wildfire table by index, so fires
# without a weather row are kept (with nulls in the weather columns)
merged_df = fdf.merge(
    right=first_date_fire_weather,
    left_index=True,
    right_index=True,
    how="left",
)


In [126]:
# Check nulls: count of missing values per column after the left merge
merged_df.isnull().sum()

X                                  0
Y                                  0
ContainmentDateTime                0
ControlDateTime                    0
DailyAcres                         0
DiscoveryAcres                     0
FireCause                          0
FireDiscoveryDateTime              0
IncidentTypeCategory               0
IncidentTypeKind                   0
InitialLatitude                    0
InitialLongitude                   0
IrwinID                            0
LocalIncidentIdentifier            0
POOCounty                          0
POODispatchCenterID                0
POOFips                            0
POOState                           0
UniqueFireIdentifier               0
temp_2m_x                          4
temp_2m_max                        4
humidity_mean_x                    4
rain_sum_x                         4
wind_speed_2m_mean_x               4
wind_speed_2m_max_mean_x           4
wind_speed_10m_mean_x              4
wind_speed_10m_max_mean_x          4
s

Drop the 4 rows with missing weather values.

In [127]:
# Discard every row that contains a missing value
merged_df = merged_df.dropna()

Renaming columns to lower case so the wildfire columns follow the same naming convention as the renamed POWER weather columns.

In [128]:
# Lower-case every column label
merged_df.columns = [label.lower() for label in merged_df.columns]

In [129]:
# Inspect the first row to confirm the lower-cased column names
merged_df.head(1)

Unnamed: 0_level_0,x,y,containmentdatetime,controldatetime,dailyacres,discoveryacres,firecause,firediscoverydatetime,incidenttypecategory,incidenttypekind,...,wind_speed_10m_max_mean_x,surface_soil_wetness_5cm_below,surface_soil_wetness_to_bedrock,wind_speed_2m_mean_y,wind_speed_2m_max_mean_y,wind_speed_10m_mean_y,wind_speed_10m_max_mean_y,humidity_mean_y,temp_2m_y,rain_sum_y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-111.348611,33.195755,2020-07-23 05:29:59+00:00,2020-07-23 05:29:59+00:00,8.0,2.5,Human,2020-07-22 21:51:00+00:00,WF,FI,...,3.84,0.12,0.38,1.41,3.09,1.895,4.18,13.21,29.9,2.6


During EDA, we've discovered that some wildfires' start points were typed in incorrectly, so we got rid of them.

In [130]:
# Wildfire ids whose start coordinates were found to be wrong
wrong_lat_long = [
    748, 967, 3462, 3501, 4010, 5067, 5148, 5642,
    6740, 7809, 9293, 10254, 11920, 13328, 13802, 15195,
    15601, 16424, 17029, 19384, 19768, 20767,
]

# Keep only the rows whose index is not one of those ids
has_wrong_lat_long = merged_df.index.isin(wrong_lat_long)
merged_df = merged_df.loc[~has_wrong_lat_long]


In [131]:
# Write the wildfire + weather table to disk; reset_index() turns the id
# index back into a regular column, so the positional index is not written
merged_df.reset_index().to_csv(
    path_or_buf=Config().get_cleaned_fire_all_path("weather"),
    index=False,
)


In [132]:
# Checking dimensions (rows, columns) of the dataset that was just stored
merged_df.shape

(21518, 36)

----
Add extra dataset

In [133]:
# Extra weather dataset, available for big fires only
# (acres > 1 and longer than 1 day)
extra_weather = pd.read_csv(
    filepath_or_buffer=Config().get_raw_meteorology_path("nasa_weather_extra")
)
extra_weather.head(1)


Unnamed: 0,YEAR,MO,DY,T2MDEW,T2MWET,RH2M,CLRSKY_SFC_PAR_TOT,ALLSKY_SFC_PAR_TOT,ALLSKY_SFC_UV_INDEX,LAT,LONG,PID
0,2020,7,19,3.02,13.91,27.31,155.74,135.5,2.55,40.602563,-115.719777,1


In [134]:
# Remove the date and coordinate columns
extra_weather = extra_weather.drop(columns=["YEAR", "MO", "DY", "LAT", "LONG"])

In [135]:
# Keep only the first row of each PID (the start date) and discard the rest
extra_weather = extra_weather.drop_duplicates(subset="PID", keep="first")

In [136]:
# Map the raw column codes to descriptive snake_case names
extra_column_names = {
    "T2MDEW": "dew_frost_point_2m",
    "T2MWET": "wet_bulb_temp_2m",
    "RH2M": "relative_humidity",
    "CLRSKY_SFC_PAR_TOT": "clear_sky_photosynthetically_active_radiation",
    "ALLSKY_SFC_PAR_TOT": "all_sky_photosynthetically_active_radiation",
    "ALLSKY_SFC_UV_INDEX": "all_sky_surface_uv_index",
    "PID": "pid",
}
extra_weather = extra_weather.rename(columns=extra_column_names)

extra_weather.head(1)

Unnamed: 0,dew_frost_point_2m,wet_bulb_temp_2m,relative_humidity,clear_sky_photosynthetically_active_radiation,all_sky_photosynthetically_active_radiation,all_sky_surface_uv_index,pid
0,3.02,13.91,27.31,155.74,135.5,2.55,1


In [137]:
# Index the extra weather table by pid so it can be joined on the index
extra_weather = extra_weather.set_index("pid")

In [138]:
# Dimensions of the extra weather table after keeping one row per pid
extra_weather.shape

(2858, 6)

In [139]:
# Look at the wildfire + weather frame before the extra columns are added
merged_df.head(1)

Unnamed: 0_level_0,x,y,containmentdatetime,controldatetime,dailyacres,discoveryacres,firecause,firediscoverydatetime,incidenttypecategory,incidenttypekind,...,wind_speed_10m_max_mean_x,surface_soil_wetness_5cm_below,surface_soil_wetness_to_bedrock,wind_speed_2m_mean_y,wind_speed_2m_max_mean_y,wind_speed_10m_mean_y,wind_speed_10m_max_mean_y,humidity_mean_y,temp_2m_y,rain_sum_y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-111.348611,33.195755,2020-07-23 05:29:59+00:00,2020-07-23 05:29:59+00:00,8.0,2.5,Human,2020-07-22 21:51:00+00:00,WF,FI,...,3.84,0.12,0.38,1.41,3.09,1.895,4.18,13.21,29.9,2.6


In [140]:
# Left-join the extra weather columns onto the wildfire table by index
merged_df = merged_df.merge(
    right=extra_weather,
    left_index=True,
    right_index=True,
    how="left",
)

In [141]:
# Discard every row that contains a missing value; after the left merge this
# removes the fires that have no extra weather row
merged_df = merged_df.dropna()

In [142]:
# Dimensions after the extra weather merge and the null drop
merged_df.shape

(2850, 42)

Drop rows containing -999.00, the value that labels missing measurements.

In [143]:
# Remove every row in which any column holds -999.00, the label for missing values
contains_missing_label = merged_df.eq(-999.00).any(axis=1)
merged_df = merged_df.loc[~contains_missing_label]

In [144]:
# Dimensions after removing the rows labeled with -999.00
merged_df.shape

(2109, 42)

In [145]:
# Write the big-fire table with extra weather to disk; reset_index() turns the
# id index back into a regular column
merged_df.reset_index().to_csv(
    path_or_buf=Config().get_cleaned_fire_filtered_path("weather_extra"),
    index=False,
)

In [146]:
# Preview the first row of the frame that was just stored
merged_df.head(1)

Unnamed: 0_level_0,x,y,containmentdatetime,controldatetime,dailyacres,discoveryacres,firecause,firediscoverydatetime,incidenttypecategory,incidenttypekind,...,wind_speed_10m_max_mean_y,humidity_mean_y,temp_2m_y,rain_sum_y,dew_frost_point_2m,wet_bulb_temp_2m,relative_humidity,clear_sky_photosynthetically_active_radiation,all_sky_photosynthetically_active_radiation,all_sky_surface_uv_index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-115.748812,40.617506,2020-08-03 23:00:00+00:00,2020-09-02 15:00:00+00:00,5985.9,5.0,Natural,2020-07-19 23:00:00+00:00,WF,FI,...,5.237174,5.657391,23.396304,12.62,3.02,13.91,27.31,155.74,135.5,2.55


----
Merge big fire 180 days dataset

In [147]:
# Rain and snow sums for the 6 months preceding each wildfire's start date
prec_180d = pd.read_csv(
    filepath_or_buffer=Config().get_processed_meteorology_path(
        "historical_rain_snow_sum_6mo"
    )
)

prec_180d.head(1)

Unnamed: 0,UniqueFireIdentifier,id,sum_rain,sum_snow
0,2020-NVECFX-010145,1,158.2,2.64


In [148]:
# Index the precipitation table by the wildfire id
prec_180d = prec_180d.set_index("id")

In [149]:
# Inner-join (the pandas default) the 6-month sums onto the wildfire table by index.
# NOTE: prec_180d also carries UniqueFireIdentifier, which ends up next to the
# lower-cased uniquefireidentifier column already present in merged_df.
merged_df = merged_df.merge(
    right=prec_180d,
    how="inner",
    left_index=True,
    right_index=True,
)

In [150]:
# Preview the result of the merge with the 6-month rain and snow sums
merged_df.head()

Unnamed: 0_level_0,x,y,containmentdatetime,controldatetime,dailyacres,discoveryacres,firecause,firediscoverydatetime,incidenttypecategory,incidenttypekind,...,rain_sum_y,dew_frost_point_2m,wet_bulb_temp_2m,relative_humidity,clear_sky_photosynthetically_active_radiation,all_sky_photosynthetically_active_radiation,all_sky_surface_uv_index,UniqueFireIdentifier,sum_rain,sum_snow
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-115.748812,40.617506,2020-08-03 23:00:00+00:00,2020-09-02 15:00:00+00:00,5985.9,5.0,Natural,2020-07-19 23:00:00+00:00,WF,FI,...,12.62,3.02,13.91,27.31,155.74,135.5,2.55,2020-NVECFX-010145,158.2,2.64
5,-113.751212,35.515265,2020-10-17 21:59:59+00:00,2020-10-19 15:00:00+00:00,135.0,1.0,Human,2020-10-15 18:17:00+00:00,WF,FI,...,0.0,-6.5,8.11,13.62,97.43,97.5,1.37,2020-AZCRD-002177,10.54,0.0
10,-120.073602,39.769989,2020-06-24 19:02:00+00:00,2020-06-25 16:05:59+00:00,132.0,15.0,Natural,2020-06-24 00:17:59+00:00,WF,FI,...,0.16,7.16,16.69,33.44,160.5,160.38,2.82,2020-NVCCD-030327,152.93,2.64
12,-120.772613,39.361785,2021-01-20 03:29:00+00:00,2021-01-21 00:57:00+00:00,1.88,1.0,Human,2021-01-19 19:40:00+00:00,WF,FI,...,0.0,-9.35,-4.95,51.38,59.11,57.91,0.37,2021-CATNF-000111,284.77,7.83
15,-122.974081,41.825916,2020-07-23 17:00:00+00:00,2020-07-26 15:00:00+00:00,13.0,0.1,Human,2020-07-22 19:09:59+00:00,WF,FI,...,1.61,8.76,15.53,48.62,142.94,114.08,1.96,2020-CAKNF-005480,395.5,4.62


In [151]:
# Write the wildfire table merged with the 6-month rain and snow sums to disk;
# reset_index() turns the id index back into a regular column
merged_df.reset_index().to_csv(
    path_or_buf=Config().get_cleaned_fire_filtered_path("weather_extra_180d"),
    index=False,
)