In [1]:
##### IMPORTS AND SETUP #####

import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas

import pandas as pd
import numpy as np
import ibm_boto3
import datetime

from project_lib import Project
from pyspark.sql import SparkSession

# NOTE(review): the project id and what appears to be a project access token are
# hard-coded in the call below. Consider loading them from the environment or a
# secret store, and rotating the token if this notebook has been shared.
# NOTE(review): `sc` is not defined in the visible cells -- presumably the
# SparkContext injected by the hosted kernel; confirm.
project = Project(sc,"ff75f9c0-4f85-495d-bbc1-6305f6b1dbb8", "p-12f1b4bbd0ab3b00eacd4ce1201f20ce039b72ec")
# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20201215101308-0000
KERNEL_ID = 11fb5412-3630-4e0d-ad9b-30ef5227cabb
Collecting numpy
  Downloading numpy-1.19.4-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
[K     |████████████████████████████████| 14.5 MB 9.3 MB/s eta 0:00:01
[31mERROR: tensorflow 2.1.0 has requirement scipy==1.4.1; python_version >= "3", but you'll have scipy 1.5.0 which is incompatible.[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.19.4
Collecting pandas
  Downloading pandas-1.1.5-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 18.2 MB/s eta 0:00:01
[?25hCollecting numpy>=1.15.4
  Using cached numpy-1.19.4-cp37-cp37m-manylinux2010_x86_64.whl (14.5 MB)
Collecting python-dateutil>=2.7.3
  Downloading python_dateutil-2.8.1-py2.py3-none-any.whl (227 kB)
[K     |████████████████████████████████| 227 kB 39.8 MB/s eta 0:00:01
[?25hCollecting pytz>=2017.2


In [103]:
##### IMPORT DATA #####
def get_df(file_name):
    """Load a CSV asset from project storage into a pandas DataFrame.

    Parameters
    ----------
    file_name : str
        Asset name handed to ``project.get_file``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the fetched file object.
    """
    buffer = project.get_file(file_name)
    # Rewind first so parsing starts at the beginning of the buffer.
    buffer.seek(0)
    frame = pd.read_csv(buffer)
    return frame

# Load the four raw CSV assets from project storage.
wildfires_df = get_df('Historical_Wildfires.csv')
weather_df = get_df('HistoricalWeather.csv')
forecasts_df = get_df('HistoricalWeatherForecasts.csv')
landclass_df = get_df('LandClass.csv')

# Parse the 'Date' column of every dated frame into datetime values.
for dated_frame in (forecasts_df, wildfires_df, weather_df):
    dated_frame['Date'] = pd.to_datetime(dated_frame['Date'])

In [104]:
##### ADD REGION AREA TO LANDCLASS #####
# One row per distinct (Region, area) pair found in the raw weather data.
temp_df = weather_df[['Region', 'count()[unit: km^2]']].drop_duplicates()
# No explicit key: pandas joins on the columns the two frames have in common.
landclass_df = landclass_df.merge(temp_df)

In [105]:
##### REPLACE ZEROES WITH NAN #####
# Blank out every cell equal to 0, then discard the 'Replaced' column.
wildfires_df = wildfires_df.mask(wildfires_df.eq(0)).drop(columns='Replaced')

In [106]:
##### REFORMAT WEATHER DATA #####
def reformat_weather_data(df):
    """Pivot long-format weather statistics into one wide row per (Date, Region).

    The input holds one row per Date / Region / Parameter with the statistic
    columns ``min()``, ``max()``, ``mean()`` and ``variance()``. The result has
    one row per (Date, Region) and one column per parameter/statistic pair,
    named ``<Parameter>_<Stat>`` (for example ``Temperature_Max``).

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with ``Date``, ``Region``, ``Parameter`` and the four
        statistic columns. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``Date`` and ``Region`` first, followed by all value columns in
        alphabetical order. Rows sharing the same Date / Region / Parameter are
        combined by ``pivot_table``'s default aggregation (mean).
    """
    id_cols = ['Date', 'Region']
    stat_cols = ['Min', 'Max', 'Mean', 'Variance']

    renamed = df.rename(columns={"count()[unit: km^2]": "Area", "min()": "Min", "max()": "Max", "mean()": "Mean", "variance()": "Variance"})

    # Long -> wide: the statistics become the first column level, the weather
    # parameter the second; the id columns come back via reset_index().
    wide = renamed.pivot_table(values=stat_cols, index=id_cols, columns=['Parameter']).reset_index()

    # Flatten the two-level header. Id columns have an empty second level and
    # keep their plain name; value columns become '<Parameter>_<Stat>'.
    wide.columns = [
        stat if not param else '{1}_{0}'.format(stat, param)
        for stat, param in wide.columns.values
    ]

    # Sort every value column. The id columns are identified by name rather
    # than by a fixed position, so no value column is accidentally left out of
    # the sort.
    value_cols = sorted(col for col in wide.columns if col not in id_cols)
    return wide[id_cols + value_cols].copy()

# Bring both the observed weather and the forecasts into the wide layout.
weather_df = weather_df.pipe(reformat_weather_data)
forecasts_df = forecasts_df.pipe(reformat_weather_data)

In [107]:
##### MERGE DATA #####
# Left join: every wildfire row is kept and gets the weather of the same day and region.
aggr_df = pd.merge(wildfires_df, weather_df, on=['Date', 'Region'], how='left')
# Missing values per column after the join
aggr_df.isna().sum()

Region                                    0
Date                                      0
Estimated_fire_area                       0
Mean_estimated_fire_brightness            0
Mean_estimated_fire_radiative_power       7
Mean_confidence                           0
Std_confidence                         2388
Var_confidence                         2388
Count                                     0
Precipitation_Max                         4
Precipitation_Mean                        4
Precipitation_Min                         4
Precipitation_Variance                    4
RelativeHumidity_Max                     27
RelativeHumidity_Mean                    27
RelativeHumidity_Min                     27
RelativeHumidity_Variance                27
SoilWaterContent_Max                      0
SoilWaterContent_Mean                     0
SoilWaterContent_Min                      0
SoilWaterContent_Variance                 0
SolarRadiation_Max                        6
SolarRadiation_Mean             

In [108]:
##### FILL MISSING DATES ######
def fill_frame(df):
    """Put each region's rows on a gap-free daily calendar and fill the gaps.

    For every region listed in the module-level ``landclass_df``, the rows of
    ``df`` belonging to that region are resampled to one row per calendar day
    (taking the first value of each day; days without data become empty rows),
    the ``Region`` label is restored on the newly created rows, and
    ``fill_missing_values`` fills the remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``Date`` column and a ``Region`` column.

    Returns
    -------
    pandas.DataFrame
        The per-region results stacked in ``landclass_df.Region`` order. Each
        region keeps its own row index. An empty frame is returned when
        ``landclass_df`` lists no regions.
    """
    region_frames = []

    for region in landclass_df.Region:
        daily = (
            df[df['Region'] == region]
            .resample('1D', on='Date')
            .first()
            # Depending on the pandas version the `on` column may or may not
            # survive the resample; drop it if present so reset_index() can
            # re-create 'Date' from the index without a name clash.
            .drop(columns='Date', errors='ignore')
            .reset_index()
        )
        # Days inserted by the resample have no region label yet.
        daily['Region'] = daily['Region'].fillna(region)
        region_frames.append(fill_missing_values(daily))

    if not region_frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
    return pd.concat(region_frames)

def fill_missing_values(df):
    """Fill gaps in the value columns of a single-region, date-ordered frame.

    Only a fixed set of columns is kept. If ``Estimated_fire_area`` is present
    the frame is treated as the merged fire + weather data (fire columns plus
    six weather parameters including ``SoilWaterContent``); otherwise it is
    treated as forecast data (five weather parameters, no soil water).

    Filling strategy, applied column-wise in row order:
      1. a gap with valid values on both sides gets the mean of the previous
         and the next valid value;
      2. trailing gaps are forward-filled;
      3. leading gaps are backward-filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date``, ``Region`` and every expected value column
        (a missing column raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        The filled value columns followed by ``Date`` and ``Region``.
    """
    stats = ('Max', 'Mean', 'Min', 'Variance')
    fire_cols = [
        'Estimated_fire_area', 'Mean_estimated_fire_brightness',
        'Mean_estimated_fire_radiative_power', 'Mean_confidence',
        'Std_confidence', 'Var_confidence', 'Count',
    ]

    if 'Estimated_fire_area' in df.columns:
        leading_cols = fire_cols
        params = ['Precipitation', 'RelativeHumidity', 'SoilWaterContent',
                  'SolarRadiation', 'Temperature', 'WindSpeed']
    else:
        leading_cols = []
        params = ['Precipitation', 'RelativeHumidity',
                  'SolarRadiation', 'Temperature', 'WindSpeed']
    value_cols = leading_cols + ['{}_{}'.format(p, s) for p in params for s in stats]

    keys = df[['Date', 'Region']]
    values = df[value_cols]

    # Midpoint of the neighbouring valid values; NaN wherever one side is
    # missing, so leading/trailing gaps are left for the next two steps.
    midpoint = (values.ffill() + values.bfill()) / 2
    filled = values.fillna(midpoint).ffill().bfill()
    return filled.assign(Date=keys['Date'], Region=keys['Region'])
    
# Put both frames on a continuous daily calendar per region and fill the gaps.
aggr_df = aggr_df.pipe(fill_frame)
forecasts_df = forecasts_df.pipe(fill_frame)
# TODO: no SoilWaterContent in the forecasts --> find a solution

In [101]:
# Sanity check: number of missing values left in each forecast column.
forecasts_df.isna().sum(axis=0)

Precipitation_Max            0
Precipitation_Mean           0
Precipitation_Min            0
Precipitation_Variance       0
RelativeHumidity_Max         0
RelativeHumidity_Mean        0
RelativeHumidity_Min         0
RelativeHumidity_Variance    0
SolarRadiation_Max           0
SolarRadiation_Mean          0
SolarRadiation_Min           0
SolarRadiation_Variance      0
Temperature_Max              0
Temperature_Mean             0
Temperature_Min              0
Temperature_Variance         0
WindSpeed_Max                0
WindSpeed_Mean               0
WindSpeed_Min                0
WindSpeed_Variance           0
dtype: int64

In [109]:
# segregation of day, month, year in the original dataset to check the seasonality
def segregate_date(df):
    """Add ``day``, ``month`` and ``year`` columns derived from ``Date``.

    The columns are written onto the frame that was passed in, and that same
    frame object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the three extra columns.
    """
    date_parts = df["Date"].dt
    for part in ("day", "month", "year"):
        df[part] = getattr(date_parts, part)
    return df

# Add the calendar columns to both the training data and the forecasts.
aggr_df, forecasts_df = segregate_date(aggr_df), segregate_date(forecasts_df)

In [113]:
# Key both frames by the (Date, Region) pair.
aggr_df = aggr_df.set_index(keys=['Date', 'Region'])
forecasts_df = forecasts_df.set_index(keys=['Date', 'Region'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Estimated_fire_area,Mean_estimated_fire_brightness,Mean_estimated_fire_radiative_power,Mean_confidence,Std_confidence,Var_confidence,Count,Precipitation_Max,Precipitation_Mean,Precipitation_Min,...,Temperature_Mean,Temperature_Min,Temperature_Variance,WindSpeed_Max,WindSpeed_Mean,WindSpeed_Min,WindSpeed_Variance,day,month,year
Date,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2005-01-04,NSW,8.68000,312.266667,42.400000,78.666667,2.886751,8.333333,3.0,22.842566,2.808620,0.000000,...,23.055527,12.495799,9.455474,7.364222,4.841764,1.401951,1.494301,4,1,2005
2005-01-05,NSW,16.61125,322.475000,62.362500,85.500000,8.088793,65.428571,8.0,7.657155,0.157935,0.000000,...,22.425765,9.611800,13.352380,7.091141,4.014080,1.011328,1.043316,5,1,2005
2005-01-06,NSW,5.52000,325.266667,38.400000,78.333333,3.214550,10.333333,3.0,27.616505,0.434833,0.000000,...,20.621405,9.024039,11.764178,9.905821,4.477879,0.930842,1.953685,6,1,2005
2005-01-07,NSW,6.26400,313.870000,33.800000,92.200000,7.529940,56.700000,5.0,3.839235,0.064224,0.000000,...,19.632722,8.691216,7.940373,10.446799,4.017578,1.601724,1.102751,7,1,2005
2005-01-08,NSW,5.40000,337.383333,122.533333,91.000000,7.937254,63.000000,3.0,2.866673,0.203352,0.000000,...,21.477315,10.821342,12.063971,6.671862,3.884550,1.167861,0.774612,8,1,2005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-10-26,VI,2.00000,328.750000,101.100000,96.500000,4.949747,24.500000,2.0,7.093964,1.533564,0.000000,...,12.001367,5.329288,4.654061,12.514585,4.551744,2.424077,2.599498,26,10,2020
2020-10-27,VI,2.00000,320.387500,60.550000,88.750000,3.889087,16.250000,2.0,8.532687,1.741583,0.003191,...,13.240903,7.097615,4.175251,10.048282,3.308991,1.604777,1.572674,27,10,2020
2020-10-28,VI,2.00000,312.025000,20.000000,81.000000,2.828427,8.000000,2.0,9.971410,1.949601,0.006383,...,14.480439,8.865943,3.696442,7.581979,2.066238,0.785478,0.545850,28,10,2020
2020-10-29,VI,2.00000,314.812500,33.350000,86.750000,3.181981,10.250000,2.0,16.502216,2.032358,0.003250,...,14.601535,9.629849,3.045914,7.708210,3.245172,1.067496,1.364007,29,10,2020


In [114]:
##### SAVE FILES #####
# aggr_df and forecasts_df carry Date and Region in their index (see the
# set_index cell), so the index has to be written out: index=False would
# silently drop both key columns from the saved CSVs.
project.save_data("Cleansed_Data.csv", aggr_df.to_csv(index=True), overwrite=True)
# landclass_df only has a default positional index, which is not worth saving.
project.save_data("LandClass.csv", landclass_df.to_csv(index=False), overwrite=True)
project.save_data("Cleansed_Forecasts.csv", forecasts_df.to_csv(index=True), overwrite=True)

{'file_name': 'Cleansed_Forecasts.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'challengepredictionwildfires-donotdelete-pr-mn6u50a9owtjfc',
 'asset_id': 'df67fbc9-02cd-4a68-b373-897e15068dc4'}

In [None]:
# Download as CSV: data frame, optional title and filename
def create_download_link_csv(df, title = "Download CSV file", filename = "data.csv"):
    """Build a clickable download link that embeds ``df`` as a CSV file.

    The frame is serialised to CSV (without the index), base64-encoded and
    placed in a ``data:`` URI inside an ``<a download=...>`` tag.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to offer for download.
    title : str, optional
        Link text.
    filename : str, optional
        File name placed in the link's ``download`` attribute.

    Returns
    -------
    HTML display object wrapping the generated anchor tag.
    """
    # Neither base64 nor HTML is imported in the notebook's import cell, so
    # resolve both here to keep the function usable on a fresh kernel.
    import base64
    try:
        html_factory = HTML  # use the notebook's HTML if it has been imported
    except NameError:
        from IPython.display import HTML as html_factory

    # generate in-memory CSV, then base64-encode it
    csv_text = df.to_csv(index=False)
    payload = base64.b64encode(csv_text.encode()).decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return html_factory(html)

#create_download_link_csv(df_NSW,"Download my data","NSW.csv")