<a href="https://colab.research.google.com/github/knattarina/wildfire_challenge/blob/main/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
##### IMPORTS AND SETUP #####

import sys
import pandas as pd
import numpy as np
import datetime
import os

#from project_lib import Project
#from pyspark.sql import SparkSession
from google.colab import drive

#project = Project(sc,"ff75f9c0-4f85-495d-bbc1-6305f6b1dbb8", "p-12f1b4bbd0ab3b00eacd4ce1201f20ce039b72ec")
#spark = SparkSession.builder.getOrCreate()

In [46]:
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Wildfire_Challenge/Data')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
def get_df_watson(file_name):
    file = project.get_file(file_name)
    file.seek(0)
    return pd.read_csv(file)

wildfires_df = pd.read_csv('Historical_Wildfires.csv')
weather_df = pd.read_csv('HistoricalWeather.csv')
forecasts_df = pd.read_csv('HistoricalWeatherForecasts.csv')
landclass_df = pd.read_csv('LandClass.csv')

forecasts_df['Date'] = pd.to_datetime(forecasts_df['Date'])
wildfires_df['Date'] = pd.to_datetime(wildfires_df['Date'])
weather_df['Date'] = pd.to_datetime(weather_df['Date'])

In [48]:
##### ADD FIX DATA TO LANDCLASS #####
temp_df = pd.DataFrame.drop_duplicates(weather_df[['Region','count()[unit: km^2]']])
landclass_df = pd.merge(landclass_df, temp_df)

In [49]:
##### REPLACE ZEROES WITH NAN #####
#wildfires_df = wildfires_df.mask(wildfires_df==0)
wildfires_df = wildfires_df.drop(columns = ['Replaced', 'Count'])

In [50]:
##### REFORMAT WEATHER DATA #####
def reformat_weather_data(df):
    df = df.rename(columns={"count()[unit: km^2]": "Area", "min()": "Min", "max()": "Max", "mean()": "Mean", "variance()": "Variance"})

    # Reformat the data
    df_pivot = df.pivot_table(values=['Min','Max','Mean','Variance'], index=['Date','Region'], columns=['Parameter'])
    # Reset dataframe index
    df_pivot.reset_index(inplace=True)

    # Renaming Column names
    df_pivot.columns = [col[0] if not(col[1]) else '{1}_{0}'.format(*col) for col in df_pivot.columns.values]

    # Rearranging Data and column
    params = df_pivot.columns.tolist()[3:]
    params.sort()
    return df_pivot[df_pivot.columns.tolist()[:3] + params].copy()

weather_df = reformat_weather_data(weather_df)
forecasts_df = reformat_weather_data(forecasts_df)

In [51]:
##### FUNCTIONS FOR MISSING VALUES #####

def fill_frames(df, columns):
    df_all = pd.DataFrame()
    for region in landclass_df.Region:
      df_temp = df[df['Region'] == region]
      df_temp = df_temp.resample('1D', on='Date').first().drop('Date', 1).reset_index()
      df_temp[['Region']] = df_temp[['Region']].fillna(value = region)
      df_temp[columns]=df_temp[columns].interpolate(method='linear', direction = 'backward')
      df_temp = df_temp.fillna(method='ffill')
      df_temp = df_temp.fillna(method='bfill')
      df_all = df_all.append(df_temp)
    return segregate_date(df_all)

def segregate_date(df):
    #df["Day"] = df["Date"].dt.day
    df["Month"] = df["Date"].dt.month
    df["Year"] = df["Date"].dt.year
    return df

In [52]:
##### PREPROCESSING FORECAST_DF ######
forecasts_df.drop_duplicates(inplace = True)
forecasts_columns = ['Precipitation_Max', 'Precipitation_Mean', 'Precipitation_Min', 'Precipitation_Variance', 'RelativeHumidity_Max', 'RelativeHumidity_Mean', 'RelativeHumidity_Min', 'RelativeHumidity_Variance', 'SolarRadiation_Max', 'SolarRadiation_Mean', 'SolarRadiation_Min', 'SolarRadiation_Variance', 'Temperature_Max', 'Temperature_Mean', 'Temperature_Min', 'Temperature_Variance', 'WindSpeed_Max', 'WindSpeed_Mean', 'WindSpeed_Min', 'WindSpeed_Variance']
forecasts_df = fill_frames(forecasts_df, forecasts_columns)
#KEIN SOILWATER IN FORECAST --> in weather model löschen

In [59]:
##### PROCESSING DATA #####
aggr_df = wildfires_df.merge(weather_df, how='left', on=['Date', 'Region'])

aggr_df = aggr_df.set_index(['Date','Region']).fillna(forecasts_df.set_index(['Date','Region'])).reset_index()
aggr_df.drop(['Std_confidence', 'Var_confidence'], axis=1, inplace = True)
aggr_columns = ['Estimated_fire_area','Mean_estimated_fire_brightness','Mean_estimated_fire_radiative_power','Mean_confidence','Precipitation_Max','Precipitation_Mean','Precipitation_Min','Precipitation_Variance','RelativeHumidity_Max','RelativeHumidity_Mean','RelativeHumidity_Min','RelativeHumidity_Variance','SoilWaterContent_Max','SoilWaterContent_Mean','SoilWaterContent_Min','SoilWaterContent_Variance','SolarRadiation_Max','SolarRadiation_Mean','SolarRadiation_Min','SolarRadiation_Variance','Temperature_Max','Temperature_Mean','Temperature_Min','Temperature_Variance', 'WindSpeed_Max','WindSpeed_Mean','WindSpeed_Min','WindSpeed_Variance']
aggr_df = fill_frames(aggr_df, aggr_columns)

#add vegetation data
aggr_df = aggr_df.merge(landclass_df, how='left', on=['Region'])
#TOdo: aggr_df wetter erst außen vor und dann erst mit 2014 Daten auffüllen? oder ncícht sinnvoll weil feuerdaten interpoliert

In [61]:
##### SAVE FILES #####
def save_files_watson():
  project.save_data("Cleansed_Data.csv", aggr_df.to_csv(index=False), overwrite=True)
  project.save_data("LandClass.csv", landclass_df.to_csv(index=False), overwrite=True)
  project.save_data("Cleansed_Forecasts.csv", forecasts_df.to_csv(index=False), overwrite=True)

aggr_df.to_csv('Cleansed_Data.csv', index = False)
landclass_df.to_csv('Cleansed_LandClass.csv', index = False)
forecasts_df.to_csv('Cleansed_Forecasts.csv', index = False)

In [55]:
# Download as CSV: data frame, optional title and filename
def create_download_link_csv(df, title = "Download CSV file", filename = "data.csv"):  
    # generate in-memory CSV, then base64-encode it
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

#create_download_link_csv(df_NSW,"Download my data","NSW.csv")

In [56]:
sample_dict = { 'Date': [10, 20, np.NaN, np.NaN],
                'Region': [5, np.NaN, np.NaN, 29],
                'S3': [15, np.NaN, np.NaN, 11],
                'S4': [21, 22, 23, 25],
                'Subjects': ['Maths', 'Finance', 'History', 'Geography']}
df = pd.DataFrame(sample_dict)
columns = ['Maths', 'Finance', 'History', 'Geography']
df
#df.fillna(value=df.mean(), inplace=True)

Unnamed: 0,Date,Region,S3,S4,Subjects
0,10.0,5.0,15.0,21,Maths
1,20.0,,,22,Finance
2,,,,23,History
3,,29.0,11.0,25,Geography


In [57]:
data=aggr_df[aggr_df['Region']== 'SA']
fires_SA = data[['Date' , 'Region','Estimated_fire_area']]
fires_SA = fires_SA.resample('1D', on='Date').first().drop('Date', 1).reset_index()
fires_SA[['Region']] = fires_SA[['Region']].fillna(value = 'SA')


fires_SA['Date'] = pd.to_datetime(fires_SA['Date'])
fires_SA.set_index(['Date'])
#print(fires_SA.head(20))
#print("\nInterpolate the missing values using the Linear Interpolation method (purch_amt):")
#fires_SA['Estimated_fire_area'].interpolate(method='from_derivatives', order=30, inplace=True) #, direction = 'forward'

fires_SA['Fill']= fires_SA['Estimated_fire_area'].fillna((fires_SA['Estimated_fire_area'].fillna(method='ffill') + fires_SA['Estimated_fire_area'].fillna(method='bfill'))/2)
fires_SA['Fill'].fillna(method='ffill', inplace=True)
fires_SA['Fill'].fillna(method='bfill', inplace=True)
fires_SA['Interpolate']=fires_SA['Estimated_fire_area'].interpolate(method='linear',direction = 'backward', inplace=False)
fires_SA['MA'] = fires_SA['Fill'].rolling(window=5).mean()
fires_SA['BFILL'] = fires_SA['Estimated_fire_area'].fillna(method='bfill')

mean = fires_SA.mean()
data = fires_SA.iloc[3000:3400,]
plt.figure(figsize=(20, 6))
e = sns.lineplot(x="Date", y="Estimated_fire_area", data=data, color='black')
e.set_yscale("log")
plt.figure(figsize=(20, 6))
g = sns.lineplot(x="Date", y="Interpolate", data=data, color='red')
g.set_yscale("log")
plt.figure(figsize=(20, 6))
h = sns.lineplot(x="Date", y="Fill", data=data,color='orange')
h.set_yscale("log")
plt.figure(figsize=(20, 6))
i = sns.lineplot(x="Date", y="MA", data=data,color='green')
i.set_yscale("log")
plt.figure(figsize=(20, 6))
j = sns.lineplot(x="Date", y="BFILL", data=data,color='pink')
j.set_yscale("log")

dRan = pd.date_range(start ='1-1-2018', end ='8-01-2018', freq ='D') 
print(fires_SA.isna().sum())



NameError: ignored