In [None]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so local library changes do not require a kernel restart.
%load_ext autoreload
%autoreload 2

# Covertype
## Load and transform the dataset


In [None]:
# Source:
# =======
# https://www.kaggle.com/uciml/forest-cover-type-dataset
#
# Dataset info:
# =============
# elevation                               quantitative    meters                       elevation in meters
# aspect                                  quantitative    azimuth                      aspect in degrees azimuth
# slope                                   quantitative    degrees                      slope in degrees
# horizontal_distance_to_hydrology        quantitative    meters                       horz dist to nearest surface water features
# vertical_distance_to_hydrology          quantitative    meters                       vert dist to nearest surface water features
# horizontal_distance_to_roadways         quantitative    meters                       horz dist to nearest roadway
# hillshade_9am                           quantitative    0 to 255 index               hillshade index at 9am, summer solstice
# hillshade_noon                          quantitative    0 to 255 index               hillshade index at noon, summer solstice
# hillshade_3pm                           quantitative    0 to 255 index               hillshade index at 3pm, summer solstice
# horizontal_distance_to_fire_points      quantitative    meters                       horz dist to nearest wildfire ignition points
# wilderness_area (4 binary columns)      qualitative     0 (absence) or 1 (presence)  wilderness area designation
# soil_type (40 binary columns)           qualitative     0 (absence) or 1 (presence)  soil type designation
# cover_type (7 types)                    integer         1 to 7                       forest cover type designation
#
#                              id    name                  count   encoding 1     encoding 2
# Forest Cover Type Classes:    1 -- Spruce/Fir           211840   needles   0    other 0
#                               2 -- Lodgepole Pine       283301   needles   0    lpine 1
#                               3 -- Ponderosa Pine        35754   needles   0    other 0
#                               4 -- Cottonwood/Willow      2747   broadleaf 1    other 0
#                               5 -- Aspen                  9493   broadleaf 1    other 0
#                               6 -- Douglas-fir           17367   needles   0    other 0
#                               7 -- Krummholz             20510   needles   0    other 0

import io, os, sys, timeit
import pandas as pd
import zipfile
import numpy as np

# Working directory that holds the downloaded archive and the prepared HDF5 files.
workdir = "/tmp/experiments/covertype"

# Raw archive; it must be downloaded manually into `workdir` (see the source URL above).
source_data_zip = "covtype.csv.zip"
source_data_zip_path = os.path.join(workdir, source_data_zip)

# Prepared binary task 1: lodgepole pine vs. every other cover type.
source_data_h5_1 = "covtype-lodgepole-vs-rest.h5"
source_data_h5_1_path = os.path.join(workdir, source_data_h5_1)

# Prepared binary task 2: broadleaf vs. needle-bearing cover types.
source_data_h5_2 = "covtype-broadleaf-vs-rest.h5"
source_data_h5_2_path = os.path.join(workdir, source_data_h5_2)

# Key under which each DataFrame is stored inside its HDF5 file.
h5_key = "dataset"

bitboost_path = ".."

# exist_ok avoids the check-then-create race of a separate isdir() test and
# makes this cell safe to re-run.
os.makedirs(workdir, exist_ok=True)

In [None]:
# Prep the "lodgepole vs rest" HDF5 file if it does not exist yet.
# The guard makes this cell cheap to re-run once the file has been written.
if not os.path.isfile(source_data_h5_1_path):
    if not os.path.isfile(source_data_zip_path):
        raise FileNotFoundError(f"download source data file to {workdir}")

    # Open the archive member in the same `with` so its stream is closed as
    # soon as the CSV has been parsed (read_csv does not close handles it is given).
    with zipfile.ZipFile(source_data_zip_path) as zf, zf.open('covtype.csv') as csv_file:
        df = pd.read_csv(csv_file, dtype=np.float32)

    # Map multiclass classification problem to lodgepole vs rest (balanced):
    # cover type 2 (Lodgepole Pine) becomes 1, every other type becomes 0.
    df_lodgepole = df.copy()
    df_lodgepole["Cover_Type"] = df["Cover_Type"].map(
        {1: 0, 2: 1, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}
    )

    # Write files as HDF5. `key` is passed by keyword: positional use is
    # deprecated in pandas 2.x and keyword-only from pandas 3.0.
    df_lodgepole.to_hdf(source_data_h5_1_path, key=h5_key, complevel=9)

In [None]:
# Prep the "broadleaf vs rest" HDF5 file if it does not exist yet.
# The guard makes this cell cheap to re-run once the file has been written.
if not os.path.isfile(source_data_h5_2_path):
    if not os.path.isfile(source_data_zip_path):
        raise FileNotFoundError(f"download source data file to {workdir}")

    # Open the archive member in the same `with` so its stream is closed as
    # soon as the CSV has been parsed (read_csv does not close handles it is given).
    with zipfile.ZipFile(source_data_zip_path) as zf, zf.open('covtype.csv') as csv_file:
        df = pd.read_csv(csv_file, dtype=np.float32)

    # Map multiclass classification problem to broadleaf vs needles (unbalanced):
    # cover types 4 (Cottonwood/Willow) and 5 (Aspen) become 1, the rest become 0.
    df_broadleaf = df.copy()
    df_broadleaf["Cover_Type"] = df["Cover_Type"].map(
        {1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 0, 7: 0}
    )

    # Write files as HDF5. `key` is passed by keyword: positional use is
    # deprecated in pandas 2.x and keyword-only from pandas 3.0.
    df_broadleaf.to_hdf(source_data_h5_2_path, key=h5_key, complevel=9)