In [110]:
from scipy.io import arff
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from helpers import *

# source directory (raw data) and target directory (experiment data)
sdir, tdir = 'data/raw', 'data/experiments'

# build the path to the seismic-bumps ARFF file with the helpers function
seismic_file = get_abspath('seismic-bumps.arff', sdir)

# loadarff returns a (data, meta) tuple; element 0 is the record array,
# which is loaded into a DataFrame
rawdata = arff.loadarff(seismic_file)
df = pd.DataFrame(data=rawdata[0])

# apply one-hot encoding to categorical features using Pandas get_dummies
cat_cols = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
cats = df[cat_cols]

# Cast the indicator columns to int explicitly: depending on the pandas
# version get_dummies yields uint8 or bool columns, and on bool columns a
# replace of 0 by -1 may not match anything, which would silently leave the
# encoding as True/False instead of -1/+1.
onehot_cols = pd.get_dummies(cats, prefix=cat_cols).astype(int)

# map {0, 1} -> {-1, +1} (0s become -1s) to improve NN performance
onehot_cols = onehot_cols * 2 - 1

# drop original categorical columns and put the one-hot encoded columns
# in front of the remaining columns
df = pd.concat((onehot_cols, df.drop(columns=cat_cols)), axis=1)

# remove constant features: a column with exactly one distinct value adds no
# information, so collect all such columns first and drop them in one call
constant_cols = [
    name for name in df.columns if len(np.unique(df[name])) == 1
]
df.drop(columns=constant_cols, inplace=True)

# cast class column as integer
df['class'] = df['class'].astype(int)

# split out X data (all columns except 'class') and y (the 'class' column)
# as NumPy arrays; these two lines do not perform any scaling themselves.
# NOTE: as_matrix() was deprecated in pandas 0.23 and removed in pandas 1.0;
# .values returns the same NumPy representation on old and new versions.
X = df.drop(columns='class').values
# 'class' was already cast to int above, so no second cast is needed
y = df['class'].values