In [None]:
%load_ext autoreload
%autoreload 2

# Allstate Claims Severity dataset

In [None]:
# Source:
# =======
# https://www.kaggle.com/c/allstate-claims-severity/data
#
# Column layout after the Kaggle id column is dropped:
# categorical 0:116 (end-exclusive), numerical 116:130, target: 130

import io, os, sys, timeit
import pandas as pd
import zipfile
import numpy as np
import sklearn.preprocessing as pre

# Working directory that holds the downloaded Kaggle archive and the prepped
# HDF5 cache derived from it.
workdir = "/tmp/experiments/allstate-claims"
source_data_zip = "allstate-claims-severity.zip"
source_data_zip_path = os.path.join(workdir, source_data_zip)
source_data_h5 = "allstate-claims.h5"
source_data_h5_path = os.path.join(workdir, source_data_h5)
h5_key = "dataset"  # key the dataset is stored under inside the HDF5 file

bitboost_path = ".."  # presumably the BitBoost checkout relative to this notebook -- TODO confirm

# exist_ok=True makes this idempotent and avoids the check-then-create race of
# a separate os.path.isdir() test.
os.makedirs(workdir, exist_ok=True)

In [None]:
# Prep the source data file if the prepped file does not exist.
#
# Steps: read train.csv from the downloaded zip, drop the Kaggle id column,
# ordinal-encode the categorical columns, cast the remaining columns to
# float32, replace the target (last column) by its natural log, and cache the
# combined frame as a compressed HDF5 file.
if not os.path.isfile(source_data_h5_path):
    if not os.path.isfile(source_data_zip_path):
        # FileNotFoundError is a subclass of Exception, so any broad handler
        # written against the previous bare Exception keeps working.
        raise FileNotFoundError(f"download source data file to {workdir}")

    # Columns [0, 116) are categorical once the id column has been dropped.
    n_categorical = 116

    # Close both the archive and the member stream deterministically.
    with zipfile.ZipFile(source_data_zip_path) as zf, zf.open('train.csv') as train_csv:
        df = pd.read_csv(train_csv, index_col=False)
    df = df.drop(columns=df.columns[0])  # drop Kaggle index

    display(df.shape)

    # Ordinal-encode all categorical columns in one call. The encoder learns
    # the categories of each column independently, so the result matches
    # encoding the columns one at a time and stitching them back together.
    enc = pre.OrdinalEncoder(dtype=np.uint32)
    cat_nparray = enc.fit_transform(df.iloc[:, :n_categorical])
    cat_df = pd.DataFrame(cat_nparray, columns=df.columns[:n_categorical])

    # astype() returns a new frame, so assigning into num_df below does not
    # write through to df.
    num_df = df.iloc[:, n_categorical:].astype(np.float32)
    num_df['loss'] = np.log(num_df.iloc[:, -1])  # log transform target

    dfull = pd.concat([cat_df, num_df], axis=1)

    # Write file to HDF5. `key` is passed by keyword: pandas 2.1 deprecated
    # positional arguments after the path and pandas 3.0 makes them keyword-only.
    dfull.to_hdf(source_data_h5_path, key=h5_key, complevel=9)