In [3]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import rasterio 
import rasterio.plot
from rasterio.mask import mask
from glob import glob
import time
import re

import warnings
# Silence every warning for the rest of the notebook to keep the cell outputs readable.
# NOTE(review): this blanket filter also hides deprecation warnings of the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

# never truncate the column list when a dataframe is displayed
pd.options.display.max_columns= None

In [4]:
# English column names that replace the headers of the Excel sheet (order has to match the sheet)
infestation_columns = [
    'county_acronym', 'county_nr', 'fdist_nr', 'fdist_id', 'year',
    'timeframe', 'forest_ownership', 'infested_wood', 'disposed_wood',
]

# read the infestation history into a dataframe
infestation_history = pd.read_excel(r'data_raw/ML_BDR_20201019.xlsx', names=infestation_columns)

In [5]:
# LK = county_acronym
# LK-Nr = county_nr
# LK-Rev = fdist_nr
# fdist_id = fdist_id
# Jahr = year
# ZR = timeframe
# Eigentumsgruppe = forest_ownership
# Zugang = infested_wood
# Abgang = disposed_wood

In [6]:
# preview the first observations to check that the renamed columns line up with the data
infestation_history.head()

Unnamed: 0,county_acronym,county_nr,fdist_nr,fdist_id,year,timeframe,forest_ownership,infested_wood,disposed_wood
0,BZ,25,1,2501,2007,06 Juni,SW,5.0,0.0
1,BZ,25,1,2501,2007,08 August,SW,12.0,12.0
2,BZ,25,1,2501,2007,10 Oktober-Dezember,SW,2.0,0.0
3,BZ,25,1,2501,2008,04 April,SW,1.0,0.0
4,BZ,25,1,2501,2008,06 Juni,SW,2.0,0.0


## Forestry Districts

The fdist_id column contains a unique identifier for the forestry districts. The first two digits indicate the county (Landkreis) and the last two digits indicate the number of the district in this county. 

In some forestry districts the district number (last two digits) begins with a leading 9 instead of a leading 0:

In [7]:
# display all distinct forestry district IDs (county number followed by the two-digit district number)
infestation_history['fdist_id'].unique()

array([2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 1101,
       1201, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2191, 2192, 2193,
       2194, 2195, 2196, 2197, 2198, 2201, 2202, 2203, 2204, 2601, 2602,
       2603, 2604, 2605, 2606, 2691, 2901, 2902, 2701, 2702, 2703, 2704,
       2791, 2792, 2793, 2801, 2802, 2803, 2804, 2805, 3001, 3002, 3003,
       2301, 2302, 2303, 2304, 2305, 2306, 2401, 2402], dtype=int64)

During the observation timeframe, some of the districts (*Erzgebirgskreis* and *Meißen*) underwent a restructuring process. A leading 9 instead of a leading 0 signifies that the border of the district was different than it is today.  We can see when these changes happened with the following code:

In [8]:
# keep only the districts whose district number starts with a 9 (fdist_nr >= 90) and take the column-wise
# maximum per district ID; the 'year' column then shows the last year in which each of these districts was reported
infestation_history[infestation_history['fdist_nr'] >= 90].groupby('fdist_id').max()

Unnamed: 0_level_0,county_acronym,county_nr,fdist_nr,year,timeframe,forest_ownership,infested_wood,disposed_wood
fdist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2191,ERZ,21,91,2014,10 Oktober-Dezember,SW,4430.82,4701.61
2192,ERZ,21,92,2014,10 Oktober-Dezember,SW,2029.06,2185.31
2193,ERZ,21,93,2014,10 Oktober-Dezember,SW,231.0,238.0
2194,ERZ,21,94,2014,10 Oktober-Dezember,SW,445.0,460.0
2195,ERZ,21,95,2014,10 Oktober-Dezember,SW,1238.38,1219.88
2196,ERZ,21,96,2014,10 Oktober-Dezember,SW,1140.88,1157.92
2197,ERZ,21,97,2014,10 Oktober-Dezember,SW,1035.0,1268.0
2198,ERZ,21,98,2014,10 Oktober-Dezember,SW,175.75,164.35
2691,GR,26,91,2020,10 Oktober-Dezember,NSW,18000.0,15200.0
2791,MEI,27,91,2013,01 Januar-März,NSW,15.0,15.0


We were able to filter on 'fdist_nr' because the two parts of 'fdist_id' also appear separately in the 'county_nr' and 'fdist_nr' columns. This also means that these columns are redundant. We check whether the three columns really contain the same information for every observation and then drop 'county_nr' and 'fdist_nr':

In [9]:
# county number as text, e.g. 25 -> '25'
county_nr = infestation_history['county_nr'].astype(str)
# district number as two-character text, padded with a leading zero where needed, e.g. 1 -> '01'
fdist_nr = infestation_history['fdist_nr'].astype(str).str.zfill(2)

# the ID rebuilt from its two parts has to equal the 'fdist_id' column for every single observation
rebuilt_fdist_id = county_nr + fdist_nr
(rebuilt_fdist_id == infestation_history['fdist_id'].astype(str)).all()

True

In [10]:
# drop 'county_nr' and 'fdist_nr' because the same information is encoded in 'fdist_id';
# 'county_acronym' is dropped here as well
infestation_history.drop(['county_nr', 'fdist_nr', 'county_acronym'], axis=1, inplace=True)

# TODO: decide whether 'county_acronym' (German header: 'LK') should be kept instead.
# It is redundant as well (it matches the county part of the district ID), but it would be more intuitive
# to use in the EDA than the plain 'fdist_id' numbers.
# NOTE: contrary to an earlier version of this comment, the drop above currently removes it.

Now we continue examining the cases with leading 9s. The *Stadtwald Zittau* (fdist_id 2691) is a special case among those special cases. According to Sachsenforst the correct procedure is to just add the corresponding observations to the forestry district *Zittau* (fdist_id 2601).

In [11]:
# relabel the Stadtwald Zittau observations (2691) as forestry district Zittau (2601)
infestation_history['fdist_id'] = infestation_history['fdist_id'].replace({2691: 2601})

# observations that agree in all of these columns describe the same district, period and ownership type
group_keys = ['fdist_id', 'year', 'timeframe', 'forest_ownership']
wood_columns = ['infested_wood', 'disposed_wood']

# give every row the group total of both wood columns, so rows of the same group become exact duplicates ...
group_totals = infestation_history.groupby(group_keys)[wood_columns].transform('sum')
infestation_history[wood_columns] = group_totals

# ... which are then collapsed to a single row; finally renumber the index
infestation_history = infestation_history.drop_duplicates().reset_index(drop=True)

In [12]:
# number of observations and columns that remain after folding district 2691 into 2601
infestation_history.shape

(8008, 6)

For the remaining forestry districts we need to distinguish between the old borders and the new ones. Sachsenforst supplied us with two shape files, one with all current district borders and one with only borders of old districts that were different than they are now. We only have to change the 'fdist_id' numbers for the abolished districts so they match the format with the leading 9s and then merge both geodataframes.

In [13]:
# English names for the attribute columns of the shape file
# (original headers: KREIS_NAME, REVUFB_NM, REVUFBADR, NSW_FI, NSW_SONST, SW_FI, SW_SONST, geometry)
new_district_columns = [
    'county_name', 'fdist_name', 'fdist_id',
    'area_nse', 'area_nsne', 'area_se', 'area_sne',
    'geometry',
]

# shape file with the current district borders, read as a geopandas geodataframe
districts_new = gpd.read_file(r'data_raw/shape/ufb_rev_wald_teil.shp', encoding='utf-8')

# gpd.read_file cannot rename columns while reading, so the names are replaced afterwards
districts_new.columns = new_district_columns

districts_new.head(3)

Unnamed: 0,county_name,fdist_name,fdist_id,area_nse,area_nsne,area_se,area_sne,geometry
0,Mittelsachsen,Reinsberg,2203,1597.32,3274.630917,2706.18,2133.910411,"POLYGON ((386902.476 5656907.025, 386910.595 5..."
1,Mittelsachsen,Geringswalde,2201,841.61,3508.60581,196.15,1453.972847,"POLYGON ((332902.962 5650328.573, 332905.989 5..."
2,Leipzig,Leipziger Land,2902,401.71,8199.85385,615.51,5314.476829,"POLYGON ((332897.160 5650325.466, 332893.592 5..."


In [14]:
# KREIS_NAME = county_name
# REVUFB_NM = fdist_name
# REVUFBADR = fdist_id
# NSW_FI = area_nse
# NSW_SONST = area_nsne
# SW_FI = area_se
# SW_SONST = area_sne
# geometry

In [15]:
# English names for the attribute columns of the shape file
# (original headers: KREIS_NAME, REVUFB_NM, REVUFBADR, NSW_FI, NSW_SONST, SW_FI, SW_SONST, geometry)
old_district_columns = [
    'county_name', 'fdist_name', 'fdist_id',
    'area_nse', 'area_nsne', 'area_se', 'area_sne',
    'geometry',
]

# shape file with the district borders before the restructuring, read as a geopandas geodataframe
districts_old = gpd.read_file(r'data_raw/shape/ufb_rev_vorUmstrukturierungen.shp', encoding='utf-8')

# gpd.read_file cannot rename columns while reading, so the names are replaced afterwards
districts_old.columns = old_district_columns

districts_old.head(3)

Unnamed: 0,county_name,fdist_name,fdist_id,area_nse,area_nsne,area_se,area_sne,geometry
0,Meißen,Nord,2703,143.31,5780.407594,1.09,768.093453,"POLYGON ((418952.942 5692288.782, 418909.147 5..."
1,Meißen,West,2701,22.8,4255.041515,3.93,3650.063576,"POLYGON ((389635.997 5699901.234, 389648.747 5..."
2,Meißen,Süd,2702,411.13,4543.837549,381.83,1975.417673,"POLYGON ((378695.051 5678837.912, 378676.082 5..."


In [16]:
# add 90 to every 'fdist_id' in the districts_old dataframe to get the leading 9 notation for abolished forestry districts
# (e.g. 2703 -> 2793)
# NOTE: this cell is not idempotent — running it a second time adds another 90 to the already shifted IDs
districts_old['fdist_id'] = districts_old['fdist_id'].astype(int) + 90

In [17]:
# change 'fdist_id' of districts_new to type int (districts_old was already converted when 90 was added)
districts_new['fdist_id'] = districts_new['fdist_id'].astype(int)

# merge the geodataframes
# no key is given, so the outer merge joins on all columns the two frames share; both frames were given
# the same eight column names above, which makes this a union of the rows of both frames
districts = pd.merge(districts_new, districts_old, how ='outer') 

# shape should be 64x8 now
districts.shape

(64, 8)

The old and new district borders are now present as well as correctly labeled in both the geodata and the observations.

In [18]:
# area of every forestry district polygon in square kilometres:
# reproject to an equal area projection (cea) first so that the polygon areas are not distorted,
# then scale the resulting areas down by a factor of 1,000,000
districts_equal_area = districts.to_crs({'proj':'cea'})
districts['area_fdist'] = districts_equal_area['geometry'].area/1000000

# plausibility check with the town of Leipzig, whose area is 297.8 km^2 according to wikipedia
is_leipzig = districts['county_name'] == 'Kreisfreie Stadt Leipzig'
kfs_leipzig_area = districts[is_leipzig]['area_fdist'].sum()
print(f'Area for Kreisfreie Stadt Leipzig is {round(kfs_leipzig_area, 1)} km^2, should be 297.8 km^2')

Area for Kreisfreie Stadt Leipzig is 297.8 km^2, should be 297.8 km^2


In [19]:
# endangered forest density: endangered forest area of a district relative to the total district area
# 'area_nse' and 'area_se' are the endangered areas of the two ownership types (non-state / state owned forest)
# NOTE(review): the recorded output contains values above 100 (e.g. 135.3 for 'M Süd'), so this is not a
# percentage — confirm the units of the area_* columns (presumably hectares, while 'area_fdist' is in km^2)
endangered_area = districts['area_nse'] + districts['area_se']
districts['endangered_forest_density'] = endangered_area*100/districts['area_fdist']

In [20]:
# x and y coordinate of the centroid of every district polygon
# (possible location features for the model, as an alternative to one dummy variable per district)
district_centroids = districts['geometry'].map(lambda geom: geom.centroid)
districts['centroid_xcoord'] = district_centroids.map(lambda point: point.coords[0][0])
districts['centroid_ycoord'] = district_centroids.map(lambda point: point.coords[0][1])

We make one more modification and change some forestry district names slightly to make them unambiguous. Currently, Meißen as well as Zwickau have their districts labeled as *Nord* (north), *Süd* (south) etc. Since we also have the 'county_name' column to distinguish them, this is not a dealbreaker. In case we ever just want to use the district names, however, we should be able to differentiate between them. Thus we add the first letter of the county name to the district name ('M Nord', 'Z Nord' and so on) for those forestry districts only.

In [21]:
# prepend the first letter of the county to the district names of the two counties with ambiguous names
# NOTE: not idempotent — running this cell again prepends the letter a second time
for county, prefix in (('Zwickau', 'Z '), ('Meißen', 'M ')):
    in_county = districts['county_name'] == county
    districts.loc[in_county, 'fdist_name'] = districts.loc[in_county, 'fdist_name'].map(lambda name: prefix + name)

In [22]:
# show all districts of Meißen (current and abolished ones) to check the renamed districts
districts[districts['county_name'] == 'Meißen'].sort_values('fdist_id')

Unnamed: 0,county_name,fdist_name,fdist_id,area_nse,area_nsne,area_se,area_sne,geometry,area_fdist,endangered_forest_density,centroid_xcoord,centroid_ycoord
36,Meißen,M Nord,2701,33.41,3794.019452,3.85,243.218414,"POLYGON ((408736.142 5692125.831, 408767.450 5...",267.290745,13.939877,398763.981752,5688815.0
31,Meißen,M Ost,2702,114.56,4945.945638,0.6,762.36579,"POLYGON ((413698.678 5674573.351, 413686.981 5...",236.609842,48.670841,410803.924166,5679526.0
4,Meißen,M Süd,2703,392.75,4365.001441,381.91,1973.920712,"POLYGON ((377329.166 5657157.286, 377285.838 5...",572.408624,135.333391,391122.095938,5666630.0
14,Meißen,M West,2704,36.08,1499.801018,0.08,3411.198775,"MULTIPOLYGON (((378097.915 5695126.311, 378079...",378.29004,9.558803,384941.58554,5684987.0
54,Meißen,M West,2791,22.8,4255.041515,3.93,3650.063576,"POLYGON ((389635.997 5699901.234, 389648.747 5...",551.16662,4.849713,388723.285894,5687100.0
55,Meißen,M Süd,2792,411.13,4543.837549,381.83,1975.417673,"POLYGON ((378695.051 5678837.912, 378676.082 5...",578.545545,137.060947,391113.136653,5666717.0
53,Meißen,M Nord,2793,143.31,5780.407594,1.09,768.093453,"POLYGON ((418952.942 5692288.782, 418909.147 5...",325.119391,44.414453,408599.357355,5680757.0


## Aggregating the different datasets

For this project there are three different data sources. 

The data sources are:
1. **The infestation history**
    * contains all observations for the amount of damaged wood (target variable)
    * also contains the timeframe for these observations, the respective forestry district, the type of forest (separation by private/state owned) and the amount of refurbished wood in this time period
    * data supplied by Sachsenforst
    * already read in and stored in the infestation_history dataframe


2. **Information on the forestry districts (new and old)**
    * contains the geodata (polygons) of these districts
    * also contains, for every district, the area covered by forest, separated by private/state owned forest as well as endangered and safe forest area (endangered are only sections that consist predominantly of adult spruce trees)
    * data supplied by Sachsenforst
    * already read in and stored in the districts geodataframe


3. **Meteorological raster data**
    * contain certain climatic parameters such as the maximum, mean, minimum temperature, humidity, wind speeds etc. (15 variables total)
    * one raster file for every variable and every day of the covered time period (from January 2006 up to February 2020, so more than 80,000 files)
    * 5000mx5000m raster
    * supplied by ReKIS (*Regionales Klima-Informationssystem Sachsen, Sachsen-Anhalt und Thüringen*, https://rekis.hydro.tu-dresden.de/)

To make sense of the data we will have to aggregate this information into a single dataframe that can be used for an EDA and the modeling process. This will be done in the following sections.

The data aggregation will take place in a function that iterates over the rows (observations) of our infestation_history dataframe and supplements them with the information from the other data sources. The infestation_history dataframe was chosen as the skeleton to which information is added because of the iterative nature of the data science life cycle. In case we later drop observations from the get-go, create new synthetic observations or engineer our features differently, we need to ensure that this function still operates as expected. Thus the approach of taking infestation_history as the base and then specifying what to do with the rest of the data for every observation was chosen.

In [21]:
# count the observations that already report neither infested nor disposed wood
is_zero_row = (infestation_history['infested_wood'] == 0) & (infestation_history['disposed_wood'] == 0)
n_zrows = infestation_history[is_zero_row].shape[0]
print(f'Initially {n_zrows} observations with neither infested wood nor disposed wood (out of {infestation_history.shape[0]} toal observations).')

Initially 839 observations with neither infested wood nor disposed wood (out of 8008 toal observations).


In [22]:
# The set of valid forestry districts depends on the point in time, so one district table is built per period.
new_in_erzgebirgskreis = districts_new['county_name'] == 'Erzgebirgskreis'
new_in_meissen = districts_new['county_name'] == 'Meißen'
old_in_erzgebirgskreis = districts_old['county_name'] == 'Erzgebirgskreis'

# before July 2013: all old districts, plus the current districts of every county except Erzgebirgskreis and Meißen
districts_before_jul2013 = pd.concat(
    [districts_old, districts_new[~new_in_erzgebirgskreis & ~new_in_meissen]],
    axis=0)

# July 2013 until September 2014: only Erzgebirgskreis still uses its old districts
districts_jul2013_sep2014 = pd.concat(
    [districts_old[old_in_erzgebirgskreis], districts_new[~new_in_erzgebirgskreis]],
    axis=0)

# afterwards: the current districts everywhere
districts_after_sep2014 = districts_new

In [23]:
def create_zero_row(obs, district, year, timeframe, forest_type):
    '''
    Build a zero-valued observation for a combination of parameters that is not yet present in `obs`.

    inputs:
        - obs: dataframe of observations with the columns 'fdist_id', 'year', 'timeframe' and 'forest_ownership'
        - district: forestry district ID to look for in 'fdist_id'
        - year: year to look for in 'year'
        - timeframe: timeframe label to look for in 'timeframe'
        - forest_type: ownership type to look for in 'forest_ownership'

    returns:
        - a dict describing the new observation with 'infested_wood' and 'disposed_wood' set to 0,
          or None if `obs` already contains at least one row for this combination
    '''
    same_district = obs['fdist_id'] == district
    same_year = obs['year'] == year
    same_timeframe = obs['timeframe'] == timeframe
    same_ownership = obs['forest_ownership'] == forest_type

    # nothing to create if the combination has already been observed
    already_observed = (same_district & same_year & same_timeframe & same_ownership).any()
    if already_observed:
        return None

    new_row = dict(fdist_id=district, year=year, timeframe=timeframe, forest_ownership=forest_type)
    new_row['infested_wood'] = 0
    new_row['disposed_wood'] = 0
    return new_row


def zero_fill(obs=infestation_history, 
              districts_before_jul2013=districts_before_jul2013, 
              districts_jul2013_sep2014=districts_jul2013_sep2014, 
              districts_after_sep2014=districts_after_sep2014):
    '''
    Add an observation with zero infested and zero disposed wood for every valid combination of
    timeframe, forest ownership type, year and forestry district that has no observation yet.

    Which districts are valid depends on the period (see _districts_for_period()). No rows are added for
    the last quarter of 2020, which lies after the end of the observations.

    inputs:
        - obs: dataframe of observations (columns 'fdist_id', 'year', 'timeframe', 'forest_ownership',
          'infested_wood', 'disposed_wood'); it is not modified
        - districts_before_jul2013: districts valid before July 2013 (needs a 'fdist_id' column)
        - districts_jul2013_sep2014: districts valid from July 2013 until September 2014
        - districts_after_sep2014: districts valid from October 2014 onwards

    returns:
        - a new dataframe with the original observations followed by the added zero rows and a fresh index
    '''

    # print current number of rows
    print(f'Number of rows before zero_fill(): {obs.shape[0]}')

    # The new rows are collected in a list and attached in one go at the end: DataFrame.append() no longer
    # exists in pandas 2.x, and growing a dataframe row by row copies it on every step.
    # Every combination is generated exactly once, so checking against the original observations is sufficient.
    new_rows = []

    # loop through all unique months and quarters
    for timeframe in obs['timeframe'].unique():

        # loop through both types of forest (state owned - SW, private - NSW)
        for forest_type in obs['forest_ownership'].unique():

            # loop through all years
            # NOTE(review): the first year is 2005 — confirm that this is the intended start of the series
            for year in range(2005, 2021):

                valid_districts = _districts_for_period(
                    year, timeframe,
                    districts_before_jul2013, districts_jul2013_sep2014, districts_after_sep2014)

                # period after the end of the observations: nothing to fill
                if valid_districts is None:
                    continue

                for district in valid_districts['fdist_id'].unique():
                    # create_zero_row() returns None if the combination has already been observed
                    zero_row = create_zero_row(obs, district, year, timeframe, forest_type)
                    if zero_row is not None:
                        new_rows.append(zero_row)

    if new_rows:
        obs = pd.concat([obs, pd.DataFrame(new_rows)], ignore_index=True)
    else:
        obs = obs.reset_index(drop=True)

    # print new number of rows
    print(f'Number of rows after zero_fill(): {obs.shape[0]}')

    return obs


def _districts_for_period(year, timeframe, before_jul2013, jul2013_sep2014, after_sep2014):
    '''Return the district table that is valid in the given year/timeframe, or None for the last quarter of 2020.'''
    # up to and including June 2013
    if year < 2013 or (year == 2013 and timeframe in ['01 Januar-März', '04 April', '05 Mai', '06 Juni']):
        return before_jul2013

    # July 2013 up to and including September 2014
    if year == 2013 or (year == 2014 and not timeframe == '10 Oktober-Dezember'):
        return jul2013_sep2014

    # the observations end in September 2020
    if year == 2020 and timeframe == '10 Oktober-Dezember':
        return None

    # October 2014 onwards
    return after_sep2014

# TODO: replace the if statements with a mapping dictionary

In [24]:
# add the missing zero observations; the returned dataframe replaces the original one
infestation_history = zero_fill(infestation_history)

Number of rows before zero_fill(): 8008
Number of rows after zero_fill(): 13485


In [25]:
# Manual cross-check of the number of rows expected after zero_fill().
# NOTE(review): the factors are presumably (number of districts) * (number of timeframes) per period — confirm.
a = 53 * 8 * 12 # presumably 53 districts * 8 timeframes * 12 full years
d = 53 * 5 # presumably 53 districts * 5 timeframes (original note: 2013)
b = 54 * 11 # presumably 54 districts * 11 timeframes (original note: 2013/2014)
c = 53 * 7 # presumably 53 districts * 7 timeframes (original note: 2020)


# doubled (presumably for the two forest ownership types), plus one
# NOTE(review): this evaluates to 12637, while zero_fill() reported 13485 rows. The difference of 848 equals
# 2 * 53 * 8, i.e. exactly one more year of 8 timeframes for 53 districts — zero_fill() loops over
# range(2005, 2021). Confirm which first year is intended.
2*(d+a+b+c)+1

12637

In [26]:
# district attributes that are attached to every observation (all columns of districts except the geometry)
district_features = [
    'county_name', 'fdist_name', 'area_nse', 'area_nsne', 'area_se', 'area_sne', 'fdist_id',
    'centroid_xcoord', 'centroid_ycoord', 'area_fdist', 'endangered_forest_density',
]

# attach the district attributes to the observations in infestation_history via the district ID
# TODO: if the Excel file is not correct, this merge is wrong, because the leading-9 IDs from the old shape file
#       then do not match the leading-9 IDs in the Excel file
infestation_history = infestation_history.merge(districts[district_features], on='fdist_id')

# save geodataframe as shape file 
districts.to_file('forestry_districts.shp', encoding='utf-8')

### Climate Raster

In [27]:
def nansumwrapper(a, **kwargs):
    '''
    Sum the values of `a` while ignoring NaN, but return NaN if `a` consists of NaN only.

    np.nansum() alone would return 0 for an all-NaN input, which would be indistinguishable from a real total of 0.

    inputs:
        - a: array-like of numbers
        - **kwargs: passed on to np.nansum() (e.g. axis)

    returns:
        - np.nan if every element of `a` is NaN, otherwise the result of np.nansum(a, **kwargs)
    '''
    nothing_to_sum = np.isnan(a).all()
    return np.nan if nothing_to_sum else np.nansum(a, **kwargs)

In [28]:
def raster_mean(filename, polygons):
    '''
    Calculate the mean raster value inside every polygon for a single raster file.

    The raster is masked with one polygon at a time and the mean of the raster cells that remain unmasked is taken.

    inputs:
        - filename: path of the raster file, opened with rasterio
        - polygons: (geo)dataframe with a 'geometry' column holding one polygon per row

    returns:
        - a list with one mean value per row of `polygons`, in the same order;
          if the raster file cannot be opened, a list of NaN of the same length
    '''
    # TODO: remove this workaround once all files are available, or report which file is missing
    try:
        # NOTE(review): confirm that `nodata` has an effect when a file is opened for reading
        current_raster = rasterio.open(filename, nodata=-9999.0)
    except (OSError, rasterio.errors.RasterioIOError):
        # missing or unreadable file: best effort, report "no data" for every polygon
        # (only I/O errors are caught, so programming errors and interrupts are no longer swallowed)
        return [np.nan for _ in range(polygons.shape[0])]

    results = []

    # the context manager closes the dataset again, even if masking fails
    with current_raster:
        for polygon in polygons['geometry']:
            # mask raster with polygon and read in the relevant raster points
            masked, _ = mask(
                dataset=current_raster,
                shapes=[polygon],
                crop=True, # avoids loading in the whole raster
                filled=False, # return a masked array instead of filling outside values, so we can safely compute zonal stats
                all_touched=True # we can choose to overfill or underfill the polygon, in this case we overfill
            )
            results.append(np.ma.mean(masked))

    # one value per polygon; combining several files (months) of a timeframe is left to the caller
    return results
    

In [29]:
# specify location of raster files
raster_dir=r'data_raw/climate_monthly_1000/'

# the observations from April until September are gathered monthly while they are gathered quarterly from October till March
# create a dictionary that maps the timeframe values from infestation_history to the month numbers that are used in the raster file names 
timeframe_dict = {
'01 Januar-März': ['01', '02', '03'],
'04 April': ['04'],
'05 Mai': ['05'],
'06 Juni': ['06'],
'07 Juli': ['07'],
'08 August': ['08'],
'09 September': ['09'],
'10 Oktober-Dezember': ['10', '11', '12']
}

# create a dictionary of all meteorological parameter shorthands to calculate
# these shorthands match the notation used in the respective filenames
# they are mapped to the respective aggregation function that will be used if there are multiple months in the timeframe
# for example we want the average temperature but the total amount of sunshine hours
# (nansumwrapper is used for totals so that a timeframe without any data yields NaN instead of 0)
parameter_info = {
    'TX0' : np.nanmean, # maximum temperature of the day in degrees Celsius
    'TM0' : np.nanmean, # mean temperature of the day in degrees Celsius
    'TN0' : np.nanmean, # minimum temperature of the day in degrees Celsius
    'RF0' : np.nanmean, # mean relative humidity of the day in %
    'SD0' : nansumwrapper, # total sunshine duration of the day in h
    'RRU' : nansumwrapper, # total precipitation of the day in mm
    'RRK' : nansumwrapper, # corrected total precipitation of the day in mm (corrects systematic errors of the measuring device and installation location such as wetting/evaporation losses)
    'FF1' : np.nanmean, # mean wind velocity of the day 10 metres above ground in m*s-1
    'FF2' : np.nanmean, # mean wind velocity of the day 2 metres above ground in m*s-1
    'FFB' : np.nanmean, # wind speed of the day on the beaufort scale in bft
    'RGK' : nansumwrapper, # total global solar irradiation of the day in kWh*m-2
    'ETP' : nansumwrapper, # potential evaporation for the day in mm
    'GRV' : nansumwrapper, # potential evapotranspiration for the day in mm
    'KWU' : nansumwrapper, # TODO: undocumented — presumably the climatic water balance based on uncorrected precipitation (RRU); confirm with the ReKIS documentation
    'KWK' : nansumwrapper # TODO: undocumented — presumably the climatic water balance based on corrected precipitation (RRK); confirm with the ReKIS documentation
}



# since this will take a while we track the time it takes
start_time = time.time()

# we do not want to append to dataframes and thus use a list of lists 
# (one list for every parameter and a final list that will be used for merging)
# all lists are filled in the same order, so position i in every list belongs to the same year/timeframe/district
climate_res = [[] for _ in range(len(parameter_info) + 1)]


for current_year in np.sort(infestation_history['year'].unique()):
    
    # progress report in minutes since the start
    elapsed_time = round((time.time() - start_time)/60, 2)
    print(f'Starting with year {current_year}, elapsed time: {elapsed_time} min')
    
    # up to and including 2014 all districts (current and abolished ones) are evaluated, afterwards only the current ones
    polygons = districts[['fdist_id', 'geometry']] if current_year <= 2014 else districts_new[['fdist_id', 'geometry']]
    
    for current_timeframe in timeframe_dict:
        for idx, current_parameter in enumerate(parameter_info):
            
            # one raster file per month of the timeframe (three files for the quarterly timeframes)
            filenames = [fr'{raster_dir}GRID_1_Messungen_Tageswerte_2020_{current_parameter}_MW_{current_year}{current_month}00_utm.asc' for current_month in timeframe_dict.get(current_timeframe)]
             
            # per file: a list with one mean value per polygon (NaN for every polygon if the file could not be opened)
            aggregation_results = [raster_mean(filename, polygons) for filename in filenames]
            
            # zip(*...) regroups the values per polygon; the parameter's aggregation function then
            # combines the months of the timeframe into a single value per polygon
            results_after_dispatch = [parameter_info[current_parameter](x) for x in zip(*aggregation_results)]
            
            climate_res[idx].extend(results_after_dispatch)         
        
        
        # key '<year>-<timeframe>-<fdist_id>' for every polygon, in the same order as the values above;
        # used later to merge the climate values with the observations
        merge_dummies = [f'{current_year}-{current_timeframe}-{dist}' for dist in polygons['fdist_id']]
        climate_res[-1].extend(merge_dummies)




# the lists become the columns of the dataframe: build it with one row per list, then transpose
climate_res = pd.DataFrame(climate_res).T

# column names: the parameter shorthands (in dictionary order) followed by the merge key
climate_res.columns = [*parameter_info, 'merge_dummy']

print(f'Finished aggregation, total time: {round((time.time()-start_time)/60, 2)} min')

Starting with year 2005, elapsed time: 0.0 min
Starting with year 2006, elapsed time: 1.57 min
Starting with year 2007, elapsed time: 3.15 min
Starting with year 2008, elapsed time: 4.74 min
Starting with year 2009, elapsed time: 6.34 min
Starting with year 2010, elapsed time: 7.92 min
Starting with year 2011, elapsed time: 9.52 min
Starting with year 2012, elapsed time: 11.11 min
Starting with year 2013, elapsed time: 12.72 min
Starting with year 2014, elapsed time: 14.32 min
Starting with year 2015, elapsed time: 15.89 min
Starting with year 2016, elapsed time: 17.33 min
Starting with year 2017, elapsed time: 18.79 min
Starting with year 2018, elapsed time: 20.24 min
Starting with year 2019, elapsed time: 21.69 min
Starting with year 2020, elapsed time: 23.15 min
Finished aggregation, total time: 23.4 min


In [30]:
# build the same '<year>-<timeframe>-<fdist_id>' key that was created for the climate values
year_part = infestation_history['year'].map(str)
district_part = infestation_history['fdist_id'].astype(str)
infestation_history['merge_dummy'] = year_part + '-' + infestation_history['timeframe'] + '-' + district_part

# attach the climate values to the observations via this key, which is not needed afterwards
infestation_history = infestation_history.merge(climate_res, on='merge_dummy')
infestation_history = infestation_history.drop('merge_dummy', axis=1)

### County/District names, ID, timestamp - preparation for time series analysis

In [31]:
# The ID is now built from the district names, so replacing the leading 9 is no longer necessary
# def connect_districts(district):
#     district = str(district)
#     return int(
#         (
#             district[:2] + 
#             district[2].replace('9','0') + 
#             district[3]
#         ).replace('2108', '2198') # special case 2198 (Schwarzenberg), does not exist in new structure, so we leave it as is
#     )

# infestation_history['query'] = infestation_history['fdist_id'].apply(lambda x: connect_districts(x))
# infestation_history = pd.merge(infestation_history, districts[['county_name', 'fdist_name', 'fdist_id']], left_on='query', right_on='fdist_id', suffixes=('','_drop'))
# infestation_history.drop(['query', 'fdist_id_drop'], axis=1, inplace=True)


# for Meißen, logically connect the old forestry districts to the new ones that best approximate the location/shape
def connect_districts(fdist_name, fdist_id):
    '''
    Return the name of the current district that an observation of the given district is assigned to.

    Only the two abolished Meißen districts 2793 and 2791 are renamed; every other name is returned unchanged.

    inputs:
        - fdist_name: name of the forestry district
        - fdist_id: ID of the forestry district

    returns:
        - the district name to use when linking old and new districts
    '''
    renamed_old_districts = (2793, 2791)
    if fdist_id not in renamed_old_districts:
        return fdist_name

    # the order matters: 'Nord' has to be replaced before 'West' is turned into 'Nord'
    # what was M Nord is almost exactly M Ost in the new structure
    new_name = fdist_name.replace('Nord', 'Ost')
    # M West is best approximated by M Nord in the new structure
    new_name = new_name.replace('West', 'Nord')
    return new_name


In [32]:
# name of the district in the current structure for every observation
# (the columns are accessed by name: positional access such as row[0] on a label-indexed row is deprecated in pandas 2.x)
infestation_history['fdist_newname'] = [
    connect_districts(fdist_name, fdist_id)
    for fdist_name, fdist_id in zip(infestation_history['fdist_name'], infestation_history['fdist_id'])
]

# identifier of a time series: '<county name>-<new district name>-<forest ownership>'
infestation_history['id'] = (
    infestation_history['county_name'] + '-'
    + infestation_history['fdist_newname'] + '-'
    + infestation_history['forest_ownership']
)

In [33]:
# last day of every timeframe as a '-MM-DD' suffix that is appended to the year
end_of_timeframe = {
    '01 Januar-März': '-03-31',
    '04 April': '-04-30',
    '05 Mai': '-05-31',
    '06 Juni': '-06-30',
    '07 Juli': '-07-31',
    '08 August': '-08-31',
    '09 September': '-09-30',
    '10 Oktober-Dezember': '-12-31'
    }

# every observation is stamped with the last day of its timeframe
year_text = infestation_history['year'].astype(str)
day_suffix = infestation_history['timeframe'].map(end_of_timeframe.get)
infestation_history['timestamp'] = year_text + day_suffix
infestation_history['timestamp'] = pd.to_datetime(infestation_history['timestamp'])

### Accounting for previously disposed wood

In [34]:
from pandas.tseries.offsets import MonthEnd

# For every row, look up the observation that directly precedes it in time within the same
# 'id' and record that observation's 'infested_wood' and 'disposed_wood'.
#
# All timestamps are month ends (see end_of_timeframe). April-September are single-month
# timeframes, so their predecessor ends one month earlier; the two remaining timeframes
# span three months, so their predecessor ends three months earlier.
#
# A previous value is only taken when exactly one matching observation exists. With no match,
# or with several rows sharing the same (id, timestamp), the value is NaN.
#
# This is done with a single merge instead of filtering the whole frame once per row.

timestamps = infestation_history['timestamp']
is_single_month = timestamps.dt.month.isin(list(range(4, 10)))
previous_timestamp = (timestamps + MonthEnd(-1)).where(is_single_month, timestamps + MonthEnd(-3))

key_columns = ['id', 'timestamp']

# candidate "previous" observations: drop rows with a missing key (they can never be matched)
# and every (id, timestamp) pair that occurs more than once (ambiguous match)
unambiguous_observations = (
    infestation_history
    .dropna(subset=key_columns)
    .drop_duplicates(subset=key_columns, keep=False)
    [key_columns + ['infested_wood', 'disposed_wood']]
)

# one output row per input row, in the original row order
previous_observation = pd.DataFrame(
    {'id': infestation_history['id'], 'timestamp': previous_timestamp}
).merge(unambiguous_observations, on=key_columns, how='left', validate='many_to_one')

# the merge result carries a fresh 0..n-1 index, so assign by position
infestation_history['prev_infested_wood'] = previous_observation['infested_wood'].to_numpy()

# infested wood of the previous timeframe that had not been disposed of in that timeframe
infestation_history['delta_prev_inf_dis'] = (
    infestation_history['prev_infested_wood'] - previous_observation['disposed_wood'].to_numpy()
)

### Moving Averages of climate features for the last year

In [35]:
# _rollingyr
# Rolling aggregates over a window of 8 consecutive rows of every id's time series.
# NOTE(review): 8 rows equal one year only if every id has exactly one row for each of the
# 8 timeframes, without gaps - confirm.

# use the index of infestation_history itself so the results can be merged back on the index
# regardless of how the frame is currently indexed
rolling_df = pd.DataFrame(np.nan,
                          index=infestation_history.index,
                          columns=[*[name + '_rollyr' for name in parameter_info],
                                   'prev_infested_wood_rollyr'])

for ID in infestation_history['id'].unique():

    # extract relevant time series for the id, sorted by timestamps (row labels are kept)
    id_subset = infestation_history.loc[infestation_history['id'] == ID].sort_values('timestamp')

    # April-September timeframes cover one month (weight 1),
    # the remaining timeframes cover three months (weight 3)
    weight = np.where(id_subset['timestamp'].dt.month.isin(list(range(4, 10))), 1, 3)

    # calculate moving average for every meteorological parameter
    for current_parameter in parameter_info:

        # multiplying by a plain array keeps the original index from id_subset
        weighted_parameter = id_subset[current_parameter] * weight

        # weighted sum over the window divided by the 12 months it represents
        rolling_df.loc[id_subset.index, current_parameter + '_rollyr'] = (
            weighted_parameter.rolling(8).apply(lambda x: np.nansum(x) / 12)
        )

    # unweighted rolling sum of prev_infested_wood; indexed via id_subset so this does not
    # depend on a variable left over from the parameter loop
    rolling_df.loc[id_subset.index, 'prev_infested_wood_rollyr'] = (
        id_subset['prev_infested_wood'].rolling(8).sum()
    )

# since we kept the original indices, merge results on index
infestation_history = pd.merge(infestation_history, rolling_df, left_index=True, right_index=True)

In [36]:
# _rollingsr
# Rolling aggregates over a window of 8 consecutive rows of every id's time series, restricted
# to the single-month timeframes April-September.
# NOTE(review): the window contains exactly six such rows only if every id has one row for
# each of the 8 timeframes, without gaps - confirm.

# use the index of infestation_history itself so the results can be merged back on the index
# regardless of how the frame is currently indexed
rolling_df = pd.DataFrame(np.nan,
                          index=infestation_history.index,
                          columns=[*[name + '_rollsr' for name in parameter_info],
                                   'prev_infested_wood_rollsr'])

for ID in infestation_history['id'].unique():

    # extract relevant time series for the id, sorted by timestamps (row labels are kept)
    id_subset = infestation_history.loc[infestation_history['id'] == ID].sort_values('timestamp')

    # 1 for the April-September timeframes, 0 for the three-month timeframes
    weight = np.where(id_subset['timestamp'].dt.month.isin(list(range(4, 10))), 1, 0)

    # calculate moving average for every meteorological parameter
    for current_parameter in parameter_info:

        # multiplying by a plain array keeps the original index from id_subset
        weighted_parameter = id_subset[current_parameter] * weight

        # sum of the April-September values in the window divided by 6 months
        rolling_df.loc[id_subset.index, current_parameter + '_rollsr'] = (
            weighted_parameter.rolling(8).apply(lambda x: np.nansum(x) / 6)
        )

    # prev_infested_wood restricted to the same timeframes. Without the mask this column
    # would be identical to prev_infested_wood_rollyr.
    rolling_df.loc[id_subset.index, 'prev_infested_wood_rollsr'] = (
        (id_subset['prev_infested_wood'] * weight).rolling(8).sum()
    )

# since we kept the original indices, merge results on index
infestation_history = pd.merge(infestation_history, rolling_df, left_index=True, right_index=True)

In [37]:
# _rollingwr
# Rolling aggregates over a window of 8 consecutive rows of every id's time series, restricted
# to the two three-month timeframes (January-March and October-December).
# NOTE(review): the window contains exactly two such rows only if every id has one row for
# each of the 8 timeframes, without gaps - confirm.

# use the index of infestation_history itself so the results can be merged back on the index
# regardless of how the frame is currently indexed
rolling_df = pd.DataFrame(np.nan,
                          index=infestation_history.index,
                          columns=[*[name + '_rollwr' for name in parameter_info],
                                   'prev_infested_wood_rollwr'])

for ID in infestation_history['id'].unique():

    # extract relevant time series for the id, sorted by timestamps (row labels are kept)
    id_subset = infestation_history.loc[infestation_history['id'] == ID].sort_values('timestamp')

    # True for the April-September timeframes, False for the three-month timeframes
    is_single_month = id_subset['timestamp'].dt.month.isin(list(range(4, 10)))

    # Each three-month timeframe stands for three months, exactly as in the yearly rolling
    # average (weight 3, divided by the number of months). Weighting them with 1 while still
    # dividing by 6 months scaled the result down by a factor of three.
    # NOTE(review): this assumes the parameters are per-month values, as the yearly weighting
    # implies - confirm.
    weight = np.where(is_single_month, 0, 3)

    # plain 0/1 selector for quantities that are totals rather than per-month values
    in_winter = np.where(is_single_month, 0, 1)

    # calculate moving average for every meteorological parameter
    for current_parameter in parameter_info:

        # multiplying by a plain array keeps the original index from id_subset
        weighted_parameter = id_subset[current_parameter] * weight

        # weighted sum of the two three-month timeframes divided by the 6 months they cover
        rolling_df.loc[id_subset.index, current_parameter + '_rollwr'] = (
            weighted_parameter.rolling(8).apply(lambda x: np.nansum(x) / 6)
        )

    # prev_infested_wood restricted to the same timeframes. Without the mask this column
    # would be identical to prev_infested_wood_rollyr.
    rolling_df.loc[id_subset.index, 'prev_infested_wood_rollwr'] = (
        (id_subset['prev_infested_wood'] * in_winter).rolling(8).sum()
    )

# since we kept the original indices, merge results on index
infestation_history = pd.merge(infestation_history, rolling_df, left_index=True, right_index=True)

In [None]:
# endangered area for this id: 'area_nse' for rows with forest ownership 'NSW', 'area_se' otherwise.
# Columns are selected by label; integer keys on a label-indexed row (x[0], x[1], x[2]) only
# work through a positional fallback that pandas has deprecated.
# NOTE(review): `data` is not defined in the visible part of the notebook - presumably this is
# meant to be infestation_history; confirm.
data['area_endangered'] = np.where(
    data['forest_ownership'] == 'NSW',
    data['area_nse'],
    data['area_se']
)

# todo: endangered area for this fdist, other forest ownership

## Windfall and demolition wood 

Abiotic damage: breakage and windthrow (Bruch & Wurf) as well as snow, ice and storm damage (Schnee, Eis u. Sturm), combined.

In [30]:
# Overview of columns, dtypes and non-null counts of the demolition-wood table.
# NOTE(review): in the visible part of the notebook `demolition_history` is only created by the
# read_excel cell further down, and this cell's execution count (30) is higher than that
# cell's (28). On a fresh top-to-bottom run this call would come before the variable exists -
# move it below the loading cell.
demolition_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1945 entries, 0 to 1944
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   county_acronym            1945 non-null   object 
 1   county_nr                 1945 non-null   int64  
 2   fdist_nr                  1945 non-null   int64  
 3   fdist_id                  1945 non-null   int64  
 4   year                      1945 non-null   int64  
 5   timeframe                 1945 non-null   object 
 6   forest_ownership          1945 non-null   object 
 7   damaged_wood              1945 non-null   float64
 8   disposed_demolition_wood  1945 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 136.9+ KB


In [28]:
# Read the windfall / demolition-wood workbook and assign English column names.
demolition_history = pd.read_excel(
    r'data_raw/ML_WB_20201112.xlsx',
    names=[
        'county_acronym',
        'county_nr',
        'fdist_nr',
        'fdist_id',
        'year',
        'timeframe',
        'forest_ownership',
        'damaged_wood',
        'disposed_demolition_wood',
    ],
)


In [None]:
# use the fruits of our labor and merge fdist_ids on fdist_newname to logically connect old and new districts
# infestation_history holds one row per observation, so the (fdist_id, fdist_newname) pairs are
# de-duplicated first; merging the raw columns would repeat every demolition row once per
# infestation row of the same district. validate= makes the merge fail loudly if a district id
# ever maps to more than one new name.
demolition_history = pd.merge(
    demolition_history,
    infestation_history[['fdist_id', 'fdist_newname']].drop_duplicates(),
    on='fdist_id',
    validate='many_to_one'
)

# fill zero rows

# fill all other timeframes except April and September with the right values

### Saving

In [38]:
# Final overview of columns, dtypes and non-null counts before the data set is written to disk.
infestation_history.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13485 entries, 0 to 13484
Data columns (total 84 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   fdist_id                   13485 non-null  int64         
 1   year                       13485 non-null  int64         
 2   timeframe                  13485 non-null  object        
 3   forest_ownership           13485 non-null  object        
 4   infested_wood              13485 non-null  float64       
 5   disposed_wood              13485 non-null  float64       
 6   county_name                13485 non-null  object        
 7   fdist_name                 13485 non-null  object        
 8   area_nse                   13485 non-null  float64       
 9   area_nsne                  13485 non-null  float64       
 10  area_se                    13485 non-null  float64       
 11  area_sne                   13485 non-null  float64       
 12  cent

In [39]:
# Write the assembled frame to 'barkbeetle_dataset.csv' in the working directory;
# index=False leaves the row index out of the file.
infestation_history.to_csv('barkbeetle_dataset.csv', index=False)

In [40]:
#barkbeetle_dataset.dropna(inplace=True)