In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import rasterio 
import rasterio.plot
from rasterio.mask import mask
from glob import glob
import time
import re

In [2]:
infestation_history = pd.read_excel(r'data_raw/ML_BDR_20201019.xlsx')

In [3]:
infestation_history.head()

Unnamed: 0,LK,LK-Nr,LK-Rev,REVUFBADR,Jahr,ZR,Eigentumsgruppe,Zugang,Abgang
0,BZ,25,1,2501,2007,06 Juni,SW,5.0,0.0
1,BZ,25,1,2501,2007,08 August,SW,12.0,12.0
2,BZ,25,1,2501,2007,10 Oktober-Dezember,SW,2.0,0.0
3,BZ,25,1,2501,2008,04 April,SW,1.0,0.0
4,BZ,25,1,2501,2008,06 Juni,SW,2.0,0.0


## Forestry Districts

The 'REVUFBADR' column contains a unique identifier for the forstry districts. The first two digits indicate the county (Landkreis) and the last two digits indicate the number of the district in this county. 

In some forestry districts the district number (last two digits) begins with a leading 9 instead of a leading 0:

In [4]:
# display all forestry district numbers
infestation_history.REVUFBADR.unique()

array([2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509, 2510, 1101,
       1201, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2191, 2192, 2193,
       2194, 2195, 2196, 2197, 2198, 2201, 2202, 2203, 2204, 2601, 2602,
       2603, 2604, 2605, 2606, 2691, 2901, 2902, 2701, 2702, 2703, 2704,
       2791, 2792, 2793, 2801, 2802, 2803, 2804, 2805, 3001, 3002, 3003,
       2301, 2302, 2303, 2304, 2305, 2306, 2401, 2402], dtype=int64)

During the observation timeframe, some of the districts (*Erzgebirgskreis* and *Meißen*) underwent a restructuring process. A leading 9 instead of a leading 0 signifies that the border of the district was different than it is today.  We can see when these changes happened with the following code:

In [5]:
infestation_history[infestation_history['LK-Rev'] >= 90].groupby('REVUFBADR').max()

Unnamed: 0_level_0,LK,LK-Nr,LK-Rev,Jahr,ZR,Eigentumsgruppe,Zugang,Abgang
REVUFBADR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2191,ERZ,21,91,2014,10 Oktober-Dezember,SW,4430.82,4701.61
2192,ERZ,21,92,2014,10 Oktober-Dezember,SW,2029.06,2185.31
2193,ERZ,21,93,2014,10 Oktober-Dezember,SW,231.0,238.0
2194,ERZ,21,94,2014,10 Oktober-Dezember,SW,445.0,460.0
2195,ERZ,21,95,2014,10 Oktober-Dezember,SW,1238.38,1219.88
2196,ERZ,21,96,2014,10 Oktober-Dezember,SW,1140.88,1157.92
2197,ERZ,21,97,2014,10 Oktober-Dezember,SW,1035.0,1268.0
2198,ERZ,21,98,2014,10 Oktober-Dezember,SW,175.75,164.35
2691,GR,26,91,2020,10 Oktober-Dezember,NSW,18000.0,15200.0
2791,MEI,27,91,2013,01 Januar-März,NSW,15.0,15.0


We could do this grouping by 'LK-Rev' because the two parts of 'REVUFBADR' also appear in the 'LK-Nr' and 'LK-Rev' columns seperately. This also means that they are redundant. We check if the information the three columns contain are really the same for every observation and then drop 'LK-Nr' and 'LK-Rev':

In [6]:
# first column as a string
lk_nr = infestation_history['LK-Nr'].astype(str) 
# second column as a string with leading zero
lk_rev = infestation_history['LK-Rev'].astype(str).apply(lambda x: x.zfill(2)) 

# concatenate these strings and check if they are identical to the 'REVUFBADR' column at every observation
(lk_nr + lk_rev == infestation_history['REVUFBADR'].astype(str)).all() 

True

In [7]:
# drop 'LK-Nr' and 'LK-Rev' columns because the information is also found in 'REVUFBADR'
infestation_history.drop(['LK-Nr', 'LK-Rev'], axis=1, inplace=True)

# the 'LK' column is also redundant as it contains a string that matches its 'LK-Rev' column
# since we can use it in the EDA more intuitively than just the different 'REVUFBADR' numbers we will keep it for now

Now we continue examining the cases with leading 9s. The *Stadtwald Zittau* (REVUFBADR 2691) is a special case among those special cases. According to Sachsenforst the correct procedure is to just add the corresponding observations to the forestry district *Zittau* (REVUFBADR 2601).

In [8]:
# in column 'REVUFBADR' change all occurrences of 2691 to 2601
infestation_history['REVUFBADR'] = infestation_history['REVUFBADR'].replace(2691, 2601)

In [9]:
# aggregate the values by summing them together for the 'Zugang' and 'Abgang' columns if every other column value is the same
infestation_history['Zugang'] = infestation_history.groupby(['REVUFBADR', 'Jahr', 'ZR', 'Eigentumsgruppe'])['Zugang'].transform('sum')
infestation_history['Abgang'] = infestation_history.groupby(['REVUFBADR', 'Jahr', 'ZR', 'Eigentumsgruppe'])['Abgang'].transform('sum')

# Now drop the duplicated rows that were just created
infestation_history.drop_duplicates(inplace=True)

# reset the index
infestation_history.reset_index(inplace=True, drop=True)

In [10]:
infestation_history.shape

(8009, 7)

For the remaining forestry districts we need to distinguish between the old borders and the new ones. Sachsenforst supplied us with two shape files, one with all current district borders and one with only borders of old districts that were different than they are now. We only have to change the 'REVUFBADR' numbers for the abolished districts so they match the format with the leading 9s and then merge both geodataframes.

In [11]:
# load in the first shape file as a geopandas geodataframe
districts_new = gpd.read_file(r'data_raw/shape/ufb_rev_wald_teil.shp', encoding='utf-8')
districts_new.head(3)

Unnamed: 0,KREIS_NAME,REVUFB_NM,REVUFBADR,NSW_FI,NSW_SONST,SW_FI,SW_SONST,geometry
0,Mittelsachsen,Reinsberg,2203,1597.32,3274.630917,2706.18,2133.910411,"POLYGON ((386902.476 5656907.025, 386910.595 5..."
1,Mittelsachsen,Geringswalde,2201,841.61,3508.60581,196.15,1453.972847,"POLYGON ((332902.962 5650328.573, 332905.989 5..."
2,Leipzig,Leipziger Land,2902,401.71,8199.85385,615.51,5314.476829,"POLYGON ((332897.160 5650325.466, 332893.592 5..."


In [12]:
# load in the second shape file as a geopandas geodataframe
districts_old = gpd.read_file(r'data_raw/shape/ufb_rev_vorUmstrukturierungen.shp', encoding='utf-8')

districts_old.head(3)

Unnamed: 0,KREIS_NAME,REVUFB_NM,REVUFBADR,NSW_FI,NSW_SONST,SW_FI,SW_SONST,geometry
0,Meißen,Nord,2703,143.31,5780.407594,1.09,768.093453,"POLYGON ((418952.942 5692288.782, 418909.147 5..."
1,Meißen,West,2701,22.8,4255.041515,3.93,3650.063576,"POLYGON ((389635.997 5699901.234, 389648.747 5..."
2,Meißen,Süd,2702,411.13,4543.837549,381.83,1975.417673,"POLYGON ((378695.051 5678837.912, 378676.082 5..."


In [13]:
# add 90 to every 'REVUFBADR' in the districts_old dataframe to get the leading 9 notation for abolished forestry districts
districts_old['REVUFBADR'] = districts_old['REVUFBADR'].astype(int) + 90

In [14]:
# change 'REVUFBADR' of districts_new to type int
districts_new['REVUFBADR'] = districts_new['REVUFBADR'].astype(int)

# merge the geodataframes
districts = pd.merge(districts_new, districts_old, how ='outer') 

# shape should be 64x8 now
districts.shape

(64, 8)

The old and new district borders are now present as well as correctly labeled in the geodata and the observations.

## Aggregating the different datasets

For this project there are three different data sources. 

Our data sources are:
1. **The infestation history**
    * contains all observations for the amount of damaged wood (target variable)
    * also contains the timeframe for these observations, the respective forestry district, the type of forest (sepeartion by private/state owned) and the amount of refurbished wood in this time period
    * data supplied by Sachsenforst
    * already read in and stored in the infestation_history dataframe


2. **Information on the forestry districts (new and old)**
    * contains the geodata (polygons) of these districts
    * also for every district contains the area covered by forest, separated by private/state owned forest as well as endangered and safe forest area (endangered are only sections that consist predominantely of adult spruce trees)
    * data supplied by Sachsenforst
    * already read in and stored in the districts geodataframe


3. **Meteorological raster data**
    * contain certain climatic parameters such as the maximum, mean, minimum temperature, humidity, wind speeds etc. (15 variables total)
    * one raster file for every variable and every day of the covered time period (from January 2006 up to February 2020, so more than 80,000 files)
    * 5000mx5000m raster
    * supplied by ReKIS (*Regionales Klima-Informationssystem Sachsen, Sachsen-Anhalt und Thüringen*, https://rekis.hydro.tu-dresden.de/)

To make sense of the data we will have to aggregate this information into a single dataframe that can be used for an EDA and the modeling process. This will be done in the following sections.

The data aggreagtion will take place in a function that iterates over the rows (observations) of our infestation_history dataframe and supplements them with the information from the other data sources. The infestation_history dataframe was chosen as the skeleton on which information is added on because of the iterative nature of the data science life cycle. In case we later drop observations from the get go, create new synthetic observations or engineer our features differently, we need to ensure that this function still operates as expected. Thus the approach of taking infestation_history as the base and then specifying what to do with the rest of the data for every observation was chosen.

In [15]:
#from numba import cuda
#print(cuda.gpus)

In [16]:
import fnmatch
fnmatch.translate('*GRID_1_Messungen_Tageswerte_2020_PN_TW_YTF[0-9][0-9]_utm.asc' )

'(?s:.*GRID_1_Messungen_Tageswerte_2020_PN_TW_YTF[0-9][0-9]_utm\\.asc)\\Z'

In [17]:
def raster_mean(parameter_name, polygon, raster_files):
    '''
    This function calculates the mean of a target meteorological parameter for a specific polygon over a given timeframe. 
    This is done by masking the rasters with the polygon and using those raster points to calculate our means.
    
    inputs:
        - raster_dir: directory where all meteorological raster files are stored
        - polygon: shape of the forestry district
        - parameter_name: the shorthand for the meteorological parameter (needs to match the shorthand in the raster file names)
        - year: the year of the obervation
        - timeframe: the timeframe of the observation, formatted in a way that the glob() function can identify the right files based on a pattern match (example: '0[1-3]' for january-march)
        
    returns:
        - the mean value of the meteorological parameter for the timeframe in the specified forestry district
    ''' 
    # create empty list for saving daily values
    daily_means = []
    
    # get a list of all files whose names match the pattern we specify with our inputs
    #matching_files = glob(fr'{raster_dir}GRID_1_Messungen_Tageswerte_2020_{parameter_name}_TW_{year}{timeframe}[0-9][0-9]_utm.asc')
    
    # Allgemeine Darstellung des gewollten Namesnmusters      
    pattern = re.compile(f'.*{parameter_name}.*')

    # Durchsuchen von raster_files nach Dateinamen, die gewolltem Muster entsprechen und Sammeln dieser Namen in validlist
    matching_files = list(filter(pattern.match, raster_files))
        
    # read in those files in a for loop, calculate the means, append to list
    for file in matching_files:
        # create opened dataset object
        current_raster = rasterio.open(file, nodata=-9999.0)
        
        # mask raster with polygon and read in the relevant raster points
        masked, mask_transform = mask(
            dataset=current_raster, 
            shapes=[polygon], 
            crop=True, # avoids loading in the whole raster
            filled=False, # mask outside values with nodata, so we can safely compute our own stats if necessary
            all_touched=True # we can chose to overfill or underfill the polygon, in this case we overfill
        ) 
        
        # calculate the mean for the day and append it to the daily_means list
        daily_means.append(masked.mean())
    
    # since we want to return the mean of the parameter over the whole timeframe we return the arithmetic mean of the list of daily values
    return np.mean(daily_means)
    

In [18]:
# create a function in which the data aggregation takes place

def data_aggregation(obs=infestation_history, forestry_districts=districts, raster_dir=r'data_raw/climate/'):
    '''
    This function iterates over the rows (observations) of the obs dataframe (in our case infestation_history) and supplements them with the information from the other data sources.
    If we do feature engineering that requires meteorological data of a higher time resolution than the observation timeframe it is also done in this dunction.
    (for example: new feature that contains the number of days with a maximum temperature below 8 degrees Celsius)
    
    inputs:
        - obs: main dataframe containing observations of (among other things) the target variable 
        - forestry_districts: dataframe containing the geodata and further information on the forestry districts
        - raster_dir: directory where all meteorological raster files are stored
        
    returns:
        - a single dataframe with the aggregated information that can be used for the EDA and modeling process
    '''
    start_time = time.time()
    # create an empty dataframe in which we will store our new features
    new_features = pd.DataFrame()
    
    # create a list of all meteorological parameter shorthands that we want to calculate the mean for
    parameter_names = [
        'TX0', # maximum temperature of the day in degrees Celsius
        'TM0', # mean temperature of the day in degrees Celsius
#        'TN0', # minimum temperature of the day in degrees Celsius
        'RF0', # mean relative humidity of the day in %
        'SD0', # total sunshine duration of the day in h
#        'PP0', # mean air pressure of the day in hPa
#        'DD0', # mean vapor pressure of the day in hPa
#        'RRU', # total precipitation of the day in mm
        'RRK', # corrected total precipitation of the day in mm (corrects systematic errors of the measuring device and installation location such as wetting/evaporation losses)
        'FF1', # mean wind velocity of the day 10 metres above ground in m*s-1
#        'FF2', # mean wind velocity of the day 2 metres above ground in m*s-1
        'FFB', # wind speed of the day on the beaufort scale in bft
        'RGK', # total global solar irradiation of the day in kWh*m-2
        'ETP', # potential evaporation for the day in mm
        'GRV'  # potential evapotranspiration for the day in mm
    ]
    
    # the obervations from april till september are gathered monthly while they are gathered quarterly from october till march
    # create a dictionairy that maps the timeframe values from infestation_history to the pattern that is used in the raster file names 
    timeframe_dict = {
    '01 Januar-März': '0[1-3]',
    '04 April': '04',
    '05 Mai': '05',
    '06 Juni': '06',
    '07 Juli': '07',
    '08 August': '08',
    '09 September': '09',
    '10 Oktober-Dezember': '1[0-2]'
    }
    
    
    # initiate for loop, as we do multiple calculations per row for every row
    for current_index, current_obs in obs.iterrows():
        
        # provide the current progress to user after every 500 rows
        if current_index % 250 == 0:
            print(f'currently at index {current_index}, elapsed time: {time.time()-start_time}')
        
        # create a dictionairy in which all features of the current iteration will be collected
        feature_dict = {}
        
        ###########################################################################################################
        # FEATURES 1-4: AREAS COVERED BY DIFFERENT TYPES OF FOREST
        # get respective forest areas from forestry_districts
        
        # area of non-stateowned, non-endangered forest 
        feature_dict['area_nsne'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'NSW_SONST'].item()
        # area of non-stateowned, endangered forest 
        feature_dict['area_nse'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'NSW_FI'].item()
        # area of stateowned, non-endangered forest 
        feature_dict['area_sne'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'SW_SONST'].item()
        # area of stateowned, endangered forest 
        feature_dict['area_se'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'SW_FI'].item()
        
        ###########################################################################################################
        # FEATURE 5: GEOGRAPHICAL SHAPES
        # probably not usesd a feature in model, but definitely useful for EDA
        # get shapes from forestry_districts
        feature_dict['geometry'] = forestry_districts.loc[forestry_districts['REVUFBADR'] == current_obs['REVUFBADR'], 'geometry'].item()
        
        ###########################################################################################################
        # FEATURES 6-X: MEANS OF THE DIFFERENT METEOROLOGICAL PARAMETERS DURING THE OBSERVATION TIMEFRAME
        # even if we later do more sophisticated feature enginnering, the mean for every meteorological parameters will serve as a decent starting point for the analysis
        
        # the raster_mean() function is already defined, we just need to pass it the specifics of the current observation
        # already got the polygon as a feature
        current_polygon = feature_dict.get('geometry')
        # year is in current_obs
        current_year = current_obs['Jahr']
        # to get the correct timeframe use timeframe_dict
        current_timeframe = timeframe_dict.get(current_obs['ZR'])
        
        # TODO: hieraus regex machen, aus * 3 buchstaben, dann regex auch am Anfang machen
        raster_files = glob(fr'{raster_dir}GRID_1_Messungen_Tageswerte_2020_???_TW_{current_year}{current_timeframe}[0-9][0-9]_utm.asc')
        
        # calculate means for all parameters in parameter_names list
        for parameter_name in parameter_names:
            feature_dict[f'{parameter_name}_mean'] = raster_mean(
                parameter_name=parameter_name, 
                polygon=current_polygon, 
                raster_files=raster_files
            )
        
        ###########################################################################################################
        # STORE ALL FEATURES OF CURRENT OBSERVATION IN DATAFRAME
        new_features = new_features.append(feature_dict, ignore_index=True)
    
    # return concatenation of infestation_history and new_features
    return pd.concat([obs, new_features], axis=1)

In [None]:
barkbeetle_dataset = data_aggregation()

currently at index 0, elapsed time: 0.0


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret

currently at index 250, elapsed time: 2169.3231098651886


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=o

currently at index 500, elapsed time: 4486.225106477737


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order

currently at index 750, elapsed time: 6758.987464189529


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=

currently at index 1000, elapsed time: 9108.615070343018


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)


currently at index 1250, elapsed time: 11441.03863120079


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret 

currently at index 1500, elapsed time: 14097.921556949615


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **k

currently at index 1750, elapsed time: 16702.785815238953


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  return array(a, dtype, copy=False, order=order, subok=True)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out,

currently at index 2000, elapsed time: 18981.69925236702


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)


currently at index 2250, elapsed time: 20925.95864391327


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)


currently at index 2500, elapsed time: 22994.028823375702


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)


currently at index 2750, elapsed time: 25176.24635195732


  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)
  return array(a, dtype, copy=False, order=order, subok=True)


currently at index 3000, elapsed time: 27716.014973163605


In [None]:
barkbeetle_dataset.to_csv('barkbeetle_dataset.csv')