<a href="https://colab.research.google.com/github/knattarina/wildfire_challenge/blob/main/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
##### IMPORTS AND SETUP #####

import sys
import pandas as pd
import numpy as np
import datetime
import os

#from project_lib import Project
#from pyspark.sql import SparkSession
from google.colab import drive

#project = Project(sc,"ff75f9c0-4f85-495d-bbc1-6305f6b1dbb8", "p-12f1b4bbd0ab3b00eacd4ce1201f20ce039b72ec")
#spark = SparkSession.builder.getOrCreate()

In [None]:
# Mount Google Drive inside the Colab VM, then make the shared data folder the
# working directory so the relative CSV paths used by later cells resolve.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Wildfire_Challenge/Data')

Mounted at /content/drive


In [63]:
def get_df_watson(file_name):
    """Read a CSV through the module-level ``project`` object and return a DataFrame.

    ``project`` is not created in this notebook (its construction is commented
    out in the setup cell), so calling this helper as-is raises ``NameError``.

    Parameters
    ----------
    file_name : name passed to ``project.get_file``.

    Returns
    -------
    pandas.DataFrame parsed from the returned file object.
    """
    csv_handle = project.get_file(file_name)
    # Rewind before parsing so the reader starts at the first byte.
    csv_handle.seek(0)
    frame = pd.read_csv(csv_handle)
    return frame

# Read the four raw CSV files from the current working directory, in this order.
wildfires_df, weather_df, forecasts_df, landclass_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Historical_Wildfires.csv',
        'HistoricalWeather.csv',
        'HistoricalWeatherForecasts.csv',
        'LandClass.csv',
    )
)

# Convert the 'Date' column of the three dated frames to datetime values.
for dated_frame in (forecasts_df, wildfires_df, weather_df):
    dated_frame['Date'] = pd.to_datetime(dated_frame['Date'])

In [64]:
##### ADD FIXED (PER-REGION) DATA TO LANDCLASS #####
# One row per distinct (Region, area) pair taken from the raw weather table.
temp_df = weather_df[['Region', 'count()[unit: km^2]']].drop_duplicates()
# No explicit key: the merge joins on whatever columns both frames share.
landclass_df = landclass_df.merge(temp_df)

In [65]:
##### REPLACE ZEROES WITH NAN #####
# Every cell equal to 0 becomes NaN; afterwards the 'Replaced' and 'Count'
# columns are removed from the wildfire table.
wildfires_df = wildfires_df.mask(wildfires_df.eq(0)).drop(columns=['Replaced', 'Count'])

In [66]:
##### REFORMAT WEATHER DATA #####
def reformat_weather_data(df):
    """Pivot the long weather table into one row per (Date, Region).

    The input has one row per Date/Region/Parameter with the statistic columns
    ``min()``, ``max()``, ``mean()`` and ``variance()``. The result has the
    columns ``Date``, ``Region`` followed by one ``<Parameter>_<Statistic>``
    column per combination, sorted alphabetically.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format weather (or forecast) data with the columns listed above
        plus ``Date``, ``Region`` and ``Parameter``.

    Returns
    -------
    pandas.DataFrame
        A new wide-format frame; ``df`` itself is not modified.
    """
    index_cols = ['Date', 'Region']
    stat_cols = ['Min', 'Max', 'Mean', 'Variance']

    renamed = df.rename(columns={"count()[unit: km^2]": "Area", "min()": "Min", "max()": "Max", "mean()": "Mean", "variance()": "Variance"})

    # Pivot to (statistic, parameter) columns, then turn the index back into columns.
    pivoted = renamed.pivot_table(values=stat_cols, index=index_cols, columns=['Parameter']).reset_index()

    # Flatten the two-level column labels: index columns have an empty second
    # level and keep their name, the rest become '<Parameter>_<Statistic>'.
    pivoted.columns = [
        stat if not parameter else '{1}_{0}'.format(stat, parameter)
        for stat, parameter in pivoted.columns.values
    ]

    # Sort every non-index column. The previous hard-coded slice at position 3
    # counted the first statistic column as an index column, so that column
    # was excluded from the sort.
    param_cols = sorted(col for col in pivoted.columns if col not in index_cols)
    return pivoted[index_cols + param_cols].copy()

# Replace both long-format frames with their wide (one row per Date/Region) form.
# The names are reused, so this cell cannot be re-run without reloading the raw CSVs.
weather_df = reformat_weather_data(weather_df)
forecasts_df = reformat_weather_data(forecasts_df)

In [67]:
##### MERGE DATA #####
# Left join on (Date, Region): every wildfire row is kept and the weather
# columns are attached where a matching weather row exists.
aggr_df = pd.merge(wildfires_df, weather_df, on=['Date', 'Region'], how='left')

# Number of missing values per column after the join
aggr_df.isnull().sum()

Region                                    0
Date                                      0
Estimated_fire_area                       0
Mean_estimated_fire_brightness            0
Mean_estimated_fire_radiative_power       7
Mean_confidence                           0
Std_confidence                         2388
Var_confidence                         2388
Precipitation_Max                         4
Precipitation_Mean                        4
Precipitation_Min                         4
Precipitation_Variance                    4
RelativeHumidity_Max                     27
RelativeHumidity_Mean                    27
RelativeHumidity_Min                     27
RelativeHumidity_Variance                27
SoilWaterContent_Max                      0
SoilWaterContent_Mean                     0
SoilWaterContent_Min                      0
SoilWaterContent_Variance                 0
SolarRadiation_Max                        6
SolarRadiation_Mean                       6
SolarRadiation_Min              

In [83]:
##### Prepare Data for Injection in aggr_df ####
# Forecast columns whose gaps are filled below (the Precipitation_* columns are not in this list).
columns = ['RelativeHumidity_Max', 'RelativeHumidity_Mean', 'RelativeHumidity_Min', 'RelativeHumidity_Variance', 'SolarRadiation_Max', 'SolarRadiation_Mean', 'SolarRadiation_Min', 'SolarRadiation_Variance', 'Temperature_Max', 'Temperature_Mean', 'Temperature_Min', 'Temperature_Variance', 'WindSpeed_Max', 'WindSpeed_Mean', 'WindSpeed_Min', 'WindSpeed_Variance']
# Replace each NaN by the average of the nearest non-missing value above and
# below it in the same column. `.ffill()` / `.bfill()` replace the deprecated
# `fillna(method=...)` spelling and give the same result.
# A NaN with no valid value on one side (start or end of a column) stays NaN,
# because NaN + value is NaN.
# NOTE(review): "above/below" follows row order, not time within a region. If
# rows are ordered by (Date, Region), the neighbours belong to other regions;
# confirm whether a per-region fill was intended.
forecasts_df[columns] = forecasts_df[columns].fillna(
    (forecasts_df[columns].ffill() + forecasts_df[columns].bfill()) / 2
)

##### FILL MISSING DATES ######
def fill_frames(df, regions=None):
    """Insert a row for every missing calendar day, region by region.

    For each region the rows are resampled to a daily grid between that
    region's first and last date. Days without data become rows whose value
    columns are NaN and whose ``Region`` is set to the region name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``Date`` column and a ``Region`` column.
    regions : iterable, optional
        Regions to process. Defaults to ``landclass_df.Region`` (module-level
        frame), which is what the function always used before.

    Returns
    -------
    pandas.DataFrame
        All regions stacked, with a fresh 0..n-1 index. An empty DataFrame is
        returned when no region has any rows.
    """
    if regions is None:
        regions = landclass_df.Region

    region_frames = []
    for region in regions:
        df_region = df[df['Region'] == region]
        if df_region.empty:
            # Nothing to resample for this region.
            continue
        df_daily = (
            df_region.resample('1D', on='Date')
            .first()
            # Some pandas versions keep the `on` column in the result, others
            # do not; remove it if present so reset_index() can re-insert it.
            .drop(columns='Date', errors='ignore')
            .reset_index()
        )
        # Rows created for missing days have no region yet.
        df_daily['Region'] = df_daily['Region'].fillna(region)
        region_frames.append(df_daily)

    if not region_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(region_frames, ignore_index=True)

# Insert the missing calendar days per region; the inserted rows are NaN in
# every column except Date and Region.
aggr_df = fill_frames(aggr_df)
# Fill remaining NaNs from the forecast frame. fillna with a DataFrame aligns on
# the (Date, Region) index and on column names, so only columns that exist in
# both frames can be filled.
aggr_df = aggr_df.set_index(['Date','Region']).fillna(forecasts_df.set_index(['Date','Region'])).reset_index()

In [84]:
# Missing values per column after inserting the missing days and injecting forecasts
aggr_df.isna().sum()

Date                                       0
Region                                     0
Estimated_fire_area                    14031
Mean_estimated_fire_brightness         14031
Mean_estimated_fire_radiative_power    14038
Mean_confidence                        14031
Std_confidence                         16419
Var_confidence                         16419
Precipitation_Max                       9420
Precipitation_Mean                      9420
Precipitation_Min                       9420
Precipitation_Variance                  9420
RelativeHumidity_Max                    8134
RelativeHumidity_Mean                   8134
RelativeHumidity_Min                    8134
RelativeHumidity_Variance               8134
SoilWaterContent_Max                   14031
SoilWaterContent_Mean                  14031
SoilWaterContent_Min                   14031
SoilWaterContent_Variance              14031
SolarRadiation_Max                      8125
SolarRadiation_Mean                     8125
SolarRadia

In [None]:
def fill_missing_values(df, columns):
    """Fill NaNs in ``columns`` with the mean of the same calendar month.

    Rows are processed month by month (1..12, selected via ``df['Month']``).
    Within each month every NaN in ``columns`` is replaced by that column's
    mean over the rows of that month. A column that is entirely NaN within a
    month stays NaN. Rows whose ``Month`` is not in 1..12 are not returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date``, ``Region`` and ``Month`` columns.
    columns : list of str
        Columns to keep and fill.

    Returns
    -------
    pandas.DataFrame
        ``columns`` followed by ``Date`` and ``Region``, rows grouped by month
        in ascending order, original index labels preserved.
    """
    month_of_row = df['Month']
    filled_months = []
    for month in range(1, 13):
        month_rows = df[month_of_row == month]
        if month_rows.empty:
            continue
        month_values = month_rows[columns]
        # fillna returns a new frame. The earlier `df = df.fillna(..., inplace=True)`
        # rebound df to None and never touched the month slice.
        month_filled = month_values.fillna(month_values.mean(numeric_only=True))
        # Attach the keys positionally so a non-unique index cannot break alignment.
        filled_months.append(
            month_filled.assign(
                Date=month_rows['Date'].values,
                Region=month_rows['Region'].values,
            )
        )

    if not filled_months:
        empty = df.iloc[:0]
        return empty[columns].assign(Date=empty['Date'].values, Region=empty['Region'].values)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(filled_months)

# Column lists intended as the `columns` argument of fill_missing_values.
# NOTE(review): both lists name 'Year' and 'Month', but at this point in the
# notebook no cell has created columns with those names (segregate_date, further
# down, adds lowercase 'day'/'month'/'year'). Confirm the intended spelling.
aggr_columns = ['Estimated_fire_area', 'Mean_estimated_fire_brightness', 'Mean_estimated_fire_radiative_power', 'Mean_confidence', 'Std_confidence', 'Var_confidence', 'Precipitation_Max', 'Precipitation_Mean', 'Precipitation_Min', 'Precipitation_Variance', 'RelativeHumidity_Max', 'RelativeHumidity_Mean', 'RelativeHumidity_Min', 'RelativeHumidity_Variance', 'SoilWaterContent_Max', 'SoilWaterContent_Mean', 'SoilWaterContent_Min', 'SoilWaterContent_Variance', 'SolarRadiation_Max', 'SolarRadiation_Mean', 'SolarRadiation_Min', 'SolarRadiation_Variance', 'Temperature_Max', 'Temperature_Mean', 'Temperature_Min', 'Temperature_Variance', 'WindSpeed_Max', 'WindSpeed_Mean', 'WindSpeed_Min', 'WindSpeed_Variance', 'Year', 'Month']
forecasts_columns = ['Precipitation_Max', 'Precipitation_Mean', 'Precipitation_Min', 'Precipitation_Variance', 'RelativeHumidity_Max', 'RelativeHumidity_Mean', 'RelativeHumidity_Min', 'RelativeHumidity_Variance', 'SolarRadiation_Max', 'SolarRadiation_Mean', 'SolarRadiation_Min', 'SolarRadiation_Variance', 'Temperature_Max', 'Temperature_Mean', 'Temperature_Min', 'Temperature_Variance', 'WindSpeed_Max', 'WindSpeed_Mean', 'WindSpeed_Min', 'WindSpeed_Variance', 'Month', 'Year']
# Disabled calls. NOTE(review): the first refers to `fill_frame`, which is not
# defined in this notebook, and the second omits the `columns` argument.
#aggr_df = fill_frame(aggr_df, aggr_columns)
#forecasts_df = fill_missing_values(forecasts_df)

# No SoilWaterContent in the forecast data --> remove it in the weather model

In [None]:
##### Prepare Weather Data for Weather Model #####
def prepare(df, columns=None):
    """Resample each region to daily rows and fill gaps with monthly means.

    For every region in ``landclass_df.Region`` (module-level frame) the rows
    are put on a daily grid, ``Month`` and ``Year`` columns are derived from
    ``Date``, and NaNs are filled by ``fill_missing_values``.

    The capitalised ``Month`` / ``Year`` names match what ``fill_missing_values``
    filters on and what the ``aggr_columns`` / ``forecasts_columns`` lists name.
    The previous lowercase names, plus the missing ``columns`` argument, made
    the call fail.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``Date`` column and a ``Region`` column.
    columns : list of str, optional
        Columns passed to ``fill_missing_values``. Defaults to every column
        except ``Date`` and ``Region`` (this includes ``Month`` and ``Year``).

    Returns
    -------
    pandas.DataFrame
        All regions stacked, with a fresh 0..n-1 index. An empty DataFrame is
        returned when no region has any rows.
    """
    region_frames = []
    for region in landclass_df.Region:
        df_region = df[df['Region'] == region]
        if df_region.empty:
            continue
        df_daily = (
            df_region.resample('1D', on='Date')
            .first()
            # The `on` column is kept by some pandas versions and not by others.
            .drop(columns='Date', errors='ignore')
            .reset_index()
        )
        # Rows created for missing days have no region yet.
        df_daily['Region'] = df_daily['Region'].fillna(region)
        df_daily['Month'] = df_daily['Date'].dt.month
        df_daily['Year'] = df_daily['Date'].dt.year

        fill_columns = columns
        if fill_columns is None:
            fill_columns = [col for col in df_daily.columns if col not in ('Date', 'Region')]
        region_frames.append(fill_missing_values(df_daily, fill_columns))

    if not region_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(region_frames, ignore_index=True)

# Missing values per column in the forecast frame
forecasts_df.isna().sum()

In [None]:
# Missing values per column in the aggregated wildfire/weather frame
aggr_df.isna().sum()

In [None]:
# Split 'Date' into separate day / month / year columns to inspect seasonality
def segregate_date(df):
    """Add ``day``, ``month`` and ``year`` columns derived from ``df['Date']``.

    The columns are written onto ``df`` itself (the caller's frame is
    modified), and the same object is returned.
    """
    date_parts = df["Date"].dt
    for part in ("day", "month", "year"):
        df[part] = getattr(date_parts, part)
    return df

# Add day / month / year columns to both frames (segregate_date modifies the
# frame it is given and returns it).
aggr_df = segregate_date(aggr_df)
forecasts_df = segregate_date(forecasts_df)

In [None]:
# Use (Date, Region) as the row index of both frames. The names are reused, so
# re-running this cell fails because 'Date' and 'Region' are no longer columns.
aggr_df = aggr_df.set_index(['Date', 'Region'])
forecasts_df = forecasts_df.set_index(['Date', 'Region'])

In [None]:
##### SAVE FILES #####
def save_files_watson():
    """Save the three result frames as CSV through the module-level ``project``.

    ``project`` is not created in this notebook (its construction is commented
    out in the setup cell), so calling this as-is raises ``NameError``.

    The row index is written only when it carries data, i.e. when it is not a
    default ``RangeIndex``. After the set_index cell, ``aggr_df`` and
    ``forecasts_df`` hold Date/Region in their index; the previous
    unconditional ``index=False`` dropped both key columns from the output.
    Frames with a default index are written exactly as before.
    """
    outputs = (
        ("Cleansed_Data.csv", aggr_df),
        ("LandClass.csv", landclass_df),
        ("Cleansed_Forecasts.csv", forecasts_df),
    )
    for file_name, frame in outputs:
        keep_index = not isinstance(frame.index, pd.RangeIndex)
        project.save_data(file_name, frame.to_csv(index=keep_index), overwrite=True)

# Write the cleaned frames as CSV into the current working directory.
# index=True writes the row index as leading column(s).
# NOTE(review): for landclass_df this is presumably a default integer index,
# which adds an unnamed column to the file; confirm that is wanted.
aggr_df.to_csv('Cleansed_Data.csv', index = True)
landclass_df.to_csv('Cleansed_LandClass.csv', index = True)
forecasts_df.to_csv('Cleansed_Forecasts.csv', index = True)

In [None]:
# Download as CSV: data frame, optional title and filename
def create_download_link_csv(df, title = "Download CSV file", filename = "data.csv"):
    """Build a download link that embeds ``df`` as a base64-encoded CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; it is serialised without its row index.
    title : str
        Link text.
    filename : str
        File name suggested to the browser through the ``download`` attribute.

    Returns
    -------
    The result of ``HTML(<anchor markup>)``.

    Notes
    -----
    ``title`` and ``filename`` are inserted into the markup without escaping.
    """
    # base64 is not imported in the notebook's import cell, so import it here.
    import base64

    # NOTE(review): `HTML` is not imported anywhere in this notebook either;
    # presumably it is IPython.display.HTML. Add that import to the setup cell
    # before using this helper.

    # generate in-memory CSV, then base64-encode it
    csv_text = df.to_csv(index=False)
    payload = base64.b64encode(csv_text.encode()).decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

#create_download_link_csv(df_NSW,"Download my data","NSW.csv")