In [194]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import rasterio 
import rasterio.plot
from rasterio.mask import mask
from glob import glob
import time
import re

In [195]:
infestation_history = pd.read_excel(r'data_raw/ML_BDR_20201019.xlsx')

In [196]:
infestation_history.head()

Unnamed: 0,LK,LK-Nr,LK-Rev,REVUFBADR,Jahr,ZR,Eigentumsgruppe,Zugang,Abgang
0,BZ,25,1,2501,2007,06 Juni,SW,5.0,0.0
1,BZ,25,1,2501,2007,08 August,SW,12.0,12.0
2,BZ,25,1,2501,2007,10 Oktober-Dezember,SW,2.0,0.0
3,BZ,25,1,2501,2008,04 April,SW,1.0,0.0
4,BZ,25,1,2501,2008,06 Juni,SW,2.0,0.0


## Forestry Districts

The 'REVUFBADR' column contains a unique identifier for the forstry districts. The first two digits indicate the county (Landkreis) and the last two digits indicate the number of the district in this county. 

In some forestry districts the district number (last two digits) begins with a leading 9 instead of a leading 0:

In [197]:
# display all forestry district numbers
infestation_history.REVUFBADR.unique()

array([2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 1101,
       1201, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2191, 2192, 2193,
       2194, 2195, 2196, 2197, 2198, 2201, 2202, 2203, 2204, 2601, 2602,
       2603, 2604, 2605, 2606, 2691, 2901, 2902, 2701, 2702, 2703, 2704,
       2791, 2792, 2793, 2801, 2802, 2803, 2804, 2805, 3001, 3002, 3003,
       2301, 2302, 2303, 2304, 2305, 2306, 2401, 2402], dtype=int64)

During the observation timeframe, some of the districts (*Erzgebirgskreis* and *Meißen*) underwent a restructuring process. A leading 9 instead of a leading 0 signifies that the border of the district was different than it is today.  We can see when these changes happened with the following code:

In [198]:
infestation_history[infestation_history['LK-Rev'] >= 90].groupby('REVUFBADR').max()

Unnamed: 0_level_0,LK,LK-Nr,LK-Rev,Jahr,ZR,Eigentumsgruppe,Zugang,Abgang
REVUFBADR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2191,ERZ,21,91,2014,10 Oktober-Dezember,SW,4430.82,4701.61
2192,ERZ,21,92,2014,10 Oktober-Dezember,SW,2029.06,2185.31
2193,ERZ,21,93,2014,10 Oktober-Dezember,SW,231.0,238.0
2194,ERZ,21,94,2014,10 Oktober-Dezember,SW,445.0,460.0
2195,ERZ,21,95,2014,10 Oktober-Dezember,SW,1238.38,1219.88
2196,ERZ,21,96,2014,10 Oktober-Dezember,SW,1140.88,1157.92
2197,ERZ,21,97,2014,10 Oktober-Dezember,SW,1035.0,1268.0
2198,ERZ,21,98,2014,10 Oktober-Dezember,SW,175.75,164.35
2691,GR,26,91,2020,10 Oktober-Dezember,NSW,18000.0,15200.0
2791,MEI,27,91,2013,01 Januar-März,NSW,15.0,15.0


We were able to do this grouping by 'LK-Rev' because the two parts of 'REVUFBADR' also appear in the 'LK-Nr' and 'LK-Rev' columns seperately. This also means that they are redundant. We check if the information the three columns contain are really the same for every observation and then drop 'LK-Nr' and 'LK-Rev':

In [199]:
# first column as a string
lk_nr = infestation_history['LK-Nr'].astype(str) 
# second column as a string with leading zero
lk_rev = infestation_history['LK-Rev'].astype(str).apply(lambda x: x.zfill(2)) 

# concatenate these strings and check if they are identical to the 'REVUFBADR' column at every observation
(lk_nr + lk_rev == infestation_history['REVUFBADR'].astype(str)).all() 

True

In [200]:
# drop 'LK-Nr' and 'LK-Rev' columns because the information is also found in 'REVUFBADR'
infestation_history.drop(['LK-Nr', 'LK-Rev', 'LK'], axis=1, inplace=True)

# TODO: remove the following comments or keep 'LK'
# the 'LK' column is also redundant as it contains a string that matches its 'LK-Rev' column
# since we can use it in the EDA more intuitively than just the different 'REVUFBADR' numbers we will keep it for now

Now we continue examining the cases with leading 9s. The *Stadtwald Zittau* (REVUFBADR 2691) is a special case among those special cases. According to Sachsenforst the correct procedure is to just add the corresponding observations to the forestry district *Zittau* (REVUFBADR 2601).

In [201]:
# in column 'REVUFBADR' change all occurrences of 2691 to 2601
infestation_history['REVUFBADR'] = infestation_history['REVUFBADR'].replace(2691, 2601)

In [202]:
# aggregate the values by summing them together for the 'Zugang' and 'Abgang' columns if every other column value is the same
infestation_history['Zugang'] = infestation_history.groupby(['REVUFBADR', 'Jahr', 'ZR', 'Eigentumsgruppe'])['Zugang'].transform('sum')
infestation_history['Abgang'] = infestation_history.groupby(['REVUFBADR', 'Jahr', 'ZR', 'Eigentumsgruppe'])['Abgang'].transform('sum')

# Now drop the duplicated rows that were just created
infestation_history.drop_duplicates(inplace=True)

# reset the index
infestation_history.reset_index(inplace=True, drop=True)

In [203]:
infestation_history.shape

(8009, 6)

For the remaining forestry districts we need to distinguish between the old borders and the new ones. Sachsenforst supplied us with two shape files, one with all current district borders and one with only borders of old districts that were different than they are now. We only have to change the 'REVUFBADR' numbers for the abolished districts so they match the format with the leading 9s and then merge both geodataframes.

In [204]:
# load in the first shape file as a geopandas geodataframe
districts_new = gpd.read_file(r'data_raw/shape/ufb_rev_wald_teil.shp', encoding='utf-8')
districts_new.head(3)

Unnamed: 0,KREIS_NAME,REVUFB_NM,REVUFBADR,NSW_FI,NSW_SONST,SW_FI,SW_SONST,geometry
0,Mittelsachsen,Reinsberg,2203,1597.32,3274.630917,2706.18,2133.910411,"POLYGON ((386902.476 5656907.025, 386910.595 5..."
1,Mittelsachsen,Geringswalde,2201,841.61,3508.60581,196.15,1453.972847,"POLYGON ((332902.962 5650328.573, 332905.989 5..."
2,Leipzig,Leipziger Land,2902,401.71,8199.85385,615.51,5314.476829,"POLYGON ((332897.160 5650325.466, 332893.592 5..."


In [205]:
# load in the second shape file as a geopandas geodataframe
districts_old = gpd.read_file(r'data_raw/shape/ufb_rev_vorUmstrukturierungen.shp', encoding='utf-8')

districts_old.head(3)

Unnamed: 0,KREIS_NAME,REVUFB_NM,REVUFBADR,NSW_FI,NSW_SONST,SW_FI,SW_SONST,geometry
0,Meißen,Nord,2703,143.31,5780.407594,1.09,768.093453,"POLYGON ((418952.942 5692288.782, 418909.147 5..."
1,Meißen,West,2701,22.8,4255.041515,3.93,3650.063576,"POLYGON ((389635.997 5699901.234, 389648.747 5..."
2,Meißen,Süd,2702,411.13,4543.837549,381.83,1975.417673,"POLYGON ((378695.051 5678837.912, 378676.082 5..."


In [206]:
# add 90 to every 'REVUFBADR' in the districts_old dataframe to get the leading 9 notation for abolished forestry districts
districts_old['REVUFBADR'] = districts_old['REVUFBADR'].astype(int) + 90

In [207]:
# change 'REVUFBADR' of districts_new to type int
districts_new['REVUFBADR'] = districts_new['REVUFBADR'].astype(int)

# merge the geodataframes
districts = pd.merge(districts_new, districts_old, how ='outer') 

# shape should be 64x8 now
districts.shape

(64, 8)

The old and new district borders are now present as well as correctly labeled in both the geodata and the observations.

In [208]:
# TODO: Kommentieren
districts_before_jul2013 = pd.concat([districts_old, districts_new[(districts_new['KREIS_NAME'] != 'Erzgebirgskreis') & (districts_new['KREIS_NAME'] != 'Meißen')]], axis=0)
districts_jul2013_dec2014 = pd.concat([districts_old[districts_old['KREIS_NAME'] == 'Erzgebirgskreis'], districts_new[districts_new['KREIS_NAME'] != 'Erzgebirgskreis']], axis=0)
districts_after_dec2014 = districts_new

## Aggregating the different datasets

For this project there are three different data sources. 

The data sources are:
1. **The infestation history**
    * contains all observations for the amount of damaged wood (target variable)
    * also contains the timeframe for these observations, the respective forestry district, the type of forest (sepeartion by private/state owned) and the amount of refurbished wood in this time period
    * data supplied by Sachsenforst
    * already read in and stored in the infestation_history dataframe


2. **Information on the forestry districts (new and old)**
    * contains the geodata (polygons) of these districts
    * also for every district contains the area covered by forest, separated by private/state owned forest as well as endangered and safe forest area (endangered are only sections that consist predominantely of adult spruce trees)
    * data supplied by Sachsenforst
    * already read in and stored in the districts geodataframe


3. **Meteorological raster data**
    * contain certain climatic parameters such as the maximum, mean, minimum temperature, humidity, wind speeds etc. (15 variables total)
    * one raster file for every variable and every day of the covered time period (from January 2006 up to February 2020, so more than 80,000 files)
    * 5000mx5000m raster
    * supplied by ReKIS (*Regionales Klima-Informationssystem Sachsen, Sachsen-Anhalt und Thüringen*, https://rekis.hydro.tu-dresden.de/)

To make sense of the data we will have to aggregate this information into a single dataframe that can be used for an EDA and the modeling process. This will be done in the following sections.

The data aggreagtion will take place in a function that iterates over the rows (observations) of our infestation_history dataframe and supplements them with the information from the other data sources. The infestation_history dataframe was chosen as the skeleton on which information is added on because of the iterative nature of the data science life cycle. In case we later drop observations from the get go, create new synthetic observations or engineer our features differently, we need to ensure that this function still operates as expected. Thus the approach of taking infestation_history as the base and then specifying what to do with the rest of the data for every observation was chosen.

In [209]:
infestation_history.head()

Unnamed: 0,REVUFBADR,Jahr,ZR,Eigentumsgruppe,Zugang,Abgang
0,2501,2007,06 Juni,SW,5.0,0.0
1,2501,2007,08 August,SW,12.0,12.0
2,2501,2007,10 Oktober-Dezember,SW,2.0,0.0
3,2501,2008,04 April,SW,1.0,0.0
4,2501,2008,06 Juni,SW,2.0,0.0


In [210]:
def create_zero_row(obs, district, year, timeframe, forest_type):
    '''
    TODO: Add description
    '''
    # first check if there already is an observation for this combination of parameters    
    if not (
        (obs['REVUFBADR'] == district) & 
        (obs['Jahr'] == year) &
        (obs['ZR'] == timeframe) &
        (obs['Eigentumsgruppe'] == forest_type)
    ).any():
        
        # if there is no observation yet: create one with damaged wood (Zugang) and restored wood (Abgang) of 0
        return {
            'REVUFBADR': district, 
            'Jahr': year,
            'ZR': timeframe,
            'Eigentumsgruppe': forest_type,
            'Zugang': 0,
            'Abgang': 0
        } 


def zero_fill(obs=infestation_history, districts_before_jul2013=districts_before_jul2013, districts_jul2013_dec2014=districts_jul2013_dec2014, districts_after_dec2014=districts_after_dec2014):
    '''
    TODO: Add description
    '''
    
    # print current number of rows
    print(f'Number of rows before zero_fill(): {obs.shape[0]}')
    
    # create dataframe to collect the synthetic rows
    new_rows = pd.DataFrame()
    
    # to check every valid combination of timeframes, forest types, years and districts we use nested loops
    # loop through all unique months and quarters
    for timeframe in obs['ZR'].unique():
        
        # loop through both types of forest (state owned - SW, private - NSW)
        for forest_type in obs['Eigentumsgruppe'].unique():
            
            # loop through all years
            for year in obs['Jahr'].unique():
                
                # depending on the year there were different forestry districts
                # we check which year it is via an if-statement
                if year < 2013 or (year == 2013 and timeframe in ['01 Januar-März', '04 April', '05 Mai', '06 Juni']):
                    
                    # loop only through the old districts before July 2013
                    for district in districts_before_jul2013['REVUFBADR'].unique():
                    
                        # create new row if conditions are met by calling create_zero_rows()
                        new_rows = new_rows.append(
                            create_zero_row(obs, district, year, timeframe, forest_type),
                            ignore_index=True)
                    
                elif (year == 2013 and timeframe in ['07 Juli', '08 August', '09 September', '10 Oktober-Dezember']) or year == 2014:
                    
                    # loop only through the districts from July 2013 until December 2014
                    for district in districts_jul2013_dec2014['REVUFBADR'].unique():
                    
                        # create new row if conditions are met by calling create_zero_rows()
                        new_rows = new_rows.append(
                            create_zero_row(obs, district, year, timeframe, forest_type),
                            ignore_index=True)
                        
                elif year > 2014:

                    # additionial check to ensure we do not add rows after september 2020 (end of observations)
                    if not (year == 2020 and timeframe == '10 Oktober-Dezember'):
                        
                        # loop only through the new districts after 2014
                        for district in districts_after_dec2014['REVUFBADR'].unique():
                            
                            # create new row if conditions are met by calling create_zero_rows()
                            new_rows = new_rows.append(
                                create_zero_row(obs, district, year, timeframe, forest_type),
                                ignore_index=True)
    
    # concatenate new rows to original dataframe
    obs_filled = pd.concat([obs, new_rows], axis=0)   
    
    # print new number of rows
    print(f'Number of rows after zero_fill(): {obs_filled.shape[0]}')
          
    return obs_filled

# TODO: replace if statements with mapping dictionairy

In [211]:
infestation_history = zero_fill(infestation_history)

Number of rows before zero_fill(): 8009
Number of rows after zero_fill(): 12651


In [213]:
# Now drop the duplicated rows that were just created
infestation_history.drop_duplicates(subset=['ZR','Eigentumsgruppe', 'REVUFBADR', 'Jahr'], inplace=True)

# reset the index
infestation_history.reset_index(inplace=True, drop=True)

infestation_history.shape

(12651, 6)

In [217]:
a = 53 * 8 * 12 # 
d = 53 * 4 # 2013
b = 54 * 12 # 2013/2014
c = 53 * 7 #2020


2*(d+a+b+c)+1

12639

In [212]:
(53 * 13 * 2 * 8) + (53 * 1 * 2 * 7) + (54 * 1 * 2 * 8) + 1

12631

In [15]:
def raster_mean(filename, polygon):
    '''
    This function calculates the mean of a target meteorological parameter for a specific polygon over a given timeframe. 
    This is done by masking the rasters with the polygon and using those raster points to calculate our means.
    
    inputs:
        - raster_dir: directory where all meteorological raster files are stored
        - polygon: shape of the forestry district
        - parameter_name: the shorthand for the meteorological parameter (needs to match the shorthand in the raster file names)
        - year: the year of the obervation
        - timeframe: the timeframe of the observation, formatted in a way that the glob() function can identify the right files based on a pattern match (example: '0[1-3]' for january-march)
        
    returns:
        - the mean value of the meteorological parameter for the timeframe in the specified forestry district
    '''        
    current_raster = rasterio.open(filename, nodata=-9999.0)
        
    # mask raster with polygon and read in the relevant raster points
    masked, mask_transform = mask(
            dataset=current_raster, 
            shapes=[polygon], 
            crop=True, # avoids loading in the whole raster
            filled=False, # mask outside values with nodata, so we can safely compute our own stats if necessary
            all_touched=True # we can chose to overfill or underfill the polygon, in this case we overfill
    ) 
    
    # since we want to return the mean of the parameter over the whole timeframe we return the arithmetic mean of the list of daily values
    return masked.mean()
    

In [16]:
# create a function in which the data aggregation takes place

def data_aggregation(obs=infestation_history, forestry_districts=districts, raster_dir=r'data_raw/climate_33_months_1000/'):
    '''
    This function iterates over the rows (observations) of the obs dataframe (in our case infestation_history) and supplements them with the information from the other data sources.
    If we do feature engineering that requires meteorological data of a higher time resolution than the observation timeframe it is also done in this dunction.
    (for example: new feature that contains the number of days with a maximum temperature below 8 degrees Celsius)
    
    inputs:
        - obs: main dataframe containing observations of (among other things) the target variable 
        - forestry_districts: dataframe containing the geodata and further information on the forestry districts
        - raster_dir: directory where all meteorological raster files are stored
        
    returns:
        - a single dataframe with the aggregated information that can be used for the EDA and modeling process
    '''
    start_time = time.time()
    # create an empty dataframe in which we will store our new features
    new_features = pd.DataFrame()
    
    # create a list of all meteorological parameter shorthands that we want to calculate the mean for
    parameter_names = [
        'TX0', # maximum temperature of the day in degrees Celsius
        'TM0', # mean temperature of the day in degrees Celsius
        'TN0', # minimum temperature of the day in degrees Celsius
        'RF0', # mean relative humidity of the day in %
        'SD0', # total sunshine duration of the day in h
#        'PP0', # mean air pressure of the day in hPa
#        'DD0', # mean vapor pressure of the day in hPa
        'RRU', # total precipitation of the day in mm
        'RRK', # corrected total precipitation of the day in mm (corrects systematic errors of the measuring device and installation location such as wetting/evaporation losses)
        'FF1', # mean wind velocity of the day 10 metres above ground in m*s-1
        'FF2', # mean wind velocity of the day 2 metres above ground in m*s-1
        'FFB', # wind speed of the day on the beaufort scale in bft
        'RGK', # total global solar irradiation of the day in kWh*m-2
        'ETP', # potential evaporation for the day in mm
        'GRV', # potential evapotranspiration for the day in mm
        'KWU',
        'KWK'
    ]
    
    # the obervations from april till september are gathered monthly while they are gathered quarterly from october till march
    # create a dictionairy that maps the timeframe values from infestation_history to the pattern that is used in the raster file names 
    timeframe_dict = {
    '01 Januar-März': ['01', '02', '03'],
    '04 April': ['04'],
    '05 Mai': ['05'],
    '06 Juni': ['06'],
    '07 Juli': ['07'],
    '08 August': ['08'],
    '09 September': ['09'],
    '10 Oktober-Dezember': ['10', '11', '12']
    }
    
    
    # initiate for loop, as we do multiple calculations per row for every row
    for current_index, current_obs in obs.iterrows():
        
        # provide the current progress to user after every 500 rows
        if current_index % 500 == 0:
            print(f'currently at index {current_index}, elapsed time: {time.time()-start_time}')
        
        # create a dictionairy in which all features of the current iteration will be collected
        feature_dict = {}
        
        ###########################################################################################################
        # FEATURES 1-4: AREAS COVERED BY DIFFERENT TYPES OF FOREST
        # get respective forest areas from forestry_districts
        
        # area of non-stateowned, non-endangered forest 
        feature_dict['area_nsne'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'NSW_SONST'].item()
        # area of non-stateowned, endangered forest 
        feature_dict['area_nse'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'NSW_FI'].item()
        # area of stateowned, non-endangered forest 
        feature_dict['area_sne'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'SW_SONST'].item()
        # area of stateowned, endangered forest 
        feature_dict['area_se'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'SW_FI'].item()
        
        ###########################################################################################################
        # FEATURE 5: GEOGRAPHICAL SHAPES
        # probably not usesd a feature in model, but definitely useful for EDA
        # get shapes from forestry_districts
        # WARNING: Too big to include in the file, increases size by x1000 
        #feature_dict['geometry'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'geometry'].item()
        
        ###########################################################################################################
        # FEATURES 6-X: MEANS OF THE DIFFERENT METEOROLOGICAL PARAMETERS DURING THE OBSERVATION TIMEFRAME
        # even if we later do more sophisticated feature enginnering, the mean for every meteorological parameters will serve as a decent starting point for the analysis
        
        # the raster_mean() function is already defined, we just need to pass it the specifics of the current observation
        current_polygon = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'geometry'].item()
        # year is in current_obs
        current_year = current_obs['Jahr']
        # to get the correct timeframe use timeframe_dict
        current_timeframe = timeframe_dict.get(current_obs['ZR'])
        
        missing_files = []
        for current_parameter in parameter_names:
            filenames = [fr'{raster_dir}GRID_1_Messungen_Tageswerte_2020_{current_parameter}_MW_{current_year}{current_month}00_utm.asc' for current_month in current_timeframe]
            try:
                feature_dict[f'{current_parameter}_mean'] = np.mean([raster_mean(filename, current_polygon) for filename in filenames])
            except Exception as e:
                missing_files.append(str(e))
                #print(str(e))
  
        
        
        ###########################################################################################################
        # STORE ALL FEATURES OF CURRENT OBSERVATION IN DATAFRAME
        new_features = new_features.append(feature_dict, ignore_index=True)
    
    
    #print(missing_files)
    
    
    # return concatenation of infestation_history and new_features
    return pd.concat([obs, new_features], axis=1)

In [17]:
barkbeetle_dataset = data_aggregation()

currently at index 0, elapsed time: 0.0010013580322265625


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 500, elapsed time: 158.99884343147278


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 1000, elapsed time: 314.1144995689392


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)


currently at index 1500, elapsed time: 510.90320563316345


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)


currently at index 2000, elapsed time: 709.3966364860535


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 2500, elapsed time: 877.9910225868225


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 3000, elapsed time: 1066.147920846939


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 3500, elapsed time: 1314.5138783454895


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 4000, elapsed time: 1536.9628455638885


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 4500, elapsed time: 1679.4064359664917


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 5000, elapsed time: 1903.0907073020935


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 5500, elapsed time: 2111.4705715179443


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 6000, elapsed time: 2321.5641009807587


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 6500, elapsed time: 2544.1840550899506


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 7000, elapsed time: 2780.215840578079


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 7500, elapsed time: 3014.5095357894897


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return

currently at index 8000, elapsed time: 3255.6194739341736


In [21]:
barkbeetle_dataset.to_csv('barkbeetle_dataset.csv')

In [5]:
barkbeetle_dataset = pd.read_csv('barkbeetle_dataset.csv')

In [6]:
barkbeetle_dataset.tail()

Unnamed: 0.1,Unnamed: 0,LK,REVUFBADR,Jahr,ZR,Eigentumsgruppe,Zugang,Abgang,ETP_mean,FF1_mean,...,RGK_mean,RRK_mean,RRU_mean,SD0_mean,TM0_mean,TX0_mean,area_nse,area_nsne,area_se,area_sne
8004,8004,Z,2402,2020,07 Juli,SW,0.0,6.0,,,...,,,,,,,1794.78,6175.259947,196.48,236.623276
8005,8005,Z,2402,2020,08 August,NSW,4278.0,4408.0,,,...,,,,,,,1794.78,6175.259947,196.48,236.623276
8006,8006,Z,2402,2020,08 August,SW,70.0,70.0,,,...,,,,,,,1794.78,6175.259947,196.48,236.623276
8007,8007,Z,2402,2020,09 September,NSW,3115.0,4249.0,,,...,,,,,,,1794.78,6175.259947,196.48,236.623276
8008,8008,Z,2402,2020,09 September,SW,0.0,340.0,,,...,,,,,,,1794.78,6175.259947,196.48,236.623276


In [7]:
barkbeetle_dataset.dropna(inplace=True)

In [8]:
barkbeetle_dataset.shape

(6598, 26)