In [None]:
"""Read CSV into data frame."""

import pandas as pd

# Load the space-delimited file. header=None tells pandas that no line holds
# column names, so the columns are labelled with integers 0..N-1.
df = pd.read_csv('../data/Sensorless_drive_diagnosis.csv', header=None, sep=' ')

# Show the frame followed by its (rows, columns) shape.
print(df, df.shape, sep="\n")

In [None]:
"""Check missing values by summing over isna'd rows.
value>0 indicates missing value(s).
"""

# Count the missing entries in each column; a count above zero flags a column
# that contains at least one missing value.
missing_per_column = df.isna().sum(axis=0)
print(missing_per_column)

In [None]:
"""Remove missing values."""

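# Currently disabled. Uncomment to drop every row that contains a missing value
# and preview the first ten remaining rows.
# df = df.dropna()
# print(df.head(10))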

In [None]:
"""Split data into input features x and target labels y."""

# Shuffle
#df = df.sample(
#    frac=1,  # All rows
#)

# Positional split: every column except the last is an input feature and the
# last column is the target label. .iloc selects by position, so the split is
# correct however the columns are labelled (a label-based .loc slice computed
# from df.shape is only equivalent while the labels are exactly 0..N-1).
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

# x is a DataFrame of features, y a Series of labels.
print("x type: {}".format(type(x)))
print("x shape: {}".format(x.shape))

print("y type: {}".format(type(y)))
print("y shape: {}".format(y.shape))

In [None]:
"""Print x statistics"""

# Summary statistics for each feature column.
x_summary = x.describe()
print(x_summary)

In [None]:
"""Check labels for imbalanced distribution of target labels."""

# Number of rows per distinct target label; strongly uneven counts would
# indicate class imbalance.
label_counts = df.groupby(y).size()
print(label_counts)

In [None]:
"""Normalize x values via global scaling."""

import sklearn.preprocessing

# Rescale the features into [0, 1]. MinMaxScaler works column by column: each
# feature is mapped using its own minimum and maximum, not one global range.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/val/test
# split performed later in this notebook, so held-out rows influence the scaling.
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1))
x_transformed = scaler.fit(x).transform(x)

# Wrap the scaled values in a DataFrame again for inspection.
x_df = pd.DataFrame(data=x_transformed)

print(
    x_df,
    "x_df type: {}".format(type(x_df)),
    "x_df shape: {}".format(x_df.shape),
    sep="\n",
)

In [None]:
"""Convert y into dummy/indicator variables.
(i.e. Y -> one-hot encoding)
"""

# One indicator column per distinct label value found in y.
# NOTE(review): pandas >= 2.0 returns boolean indicator columns by default;
# confirm the downstream consumer accepts that dtype.
y_df = pd.get_dummies(data=y)

# Alternative for string-valued targets (kept disabled): encode the strings as
# integers first, then one-hot encode the integers.
# encoder = LabelEncoder()
# encoder.fit(Y)
# Y = encoder.transform(Y)
# Y = np_utils.to_categorical(Y)

print(
    y_df,
    "y_df type: {}".format(type(y_df)),
    "y_df shape: {}".format(y_df.shape),
    sep="\n",
)

In [None]:
"""Drop axes labels from data frames."""

# Strip the row/column labels: keep only the underlying arrays of the scaled
# features and the one-hot encoded targets.
x_ar, y_ar = x_df.values, y_df.values

print(
    "x_ar type: {}".format(type(x_ar)),
    "x_ar shape: {}".format(x_ar.shape),
    "y_ar type: {}".format(type(y_ar)),
    "y_ar shape: {}".format(y_ar.shape),
    sep="\n",
)

In [None]:
"""Split x and y into train, val and test sets."""

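# Superseded manual split, kept for reference only (the sklearn-based split
# in this cell is what actually runs). NOTE: as written, the test slice is
# taken from the front of the array and can overlap the training slice.
#val_size = 5000
#test_size = 5000

#x_val = x_ar[-val_size:]
#x_test = x_ar[val_size:val_size+test_size]
#x_train = x_ar[:-(val_size+test_size)]

#y_val = y_ar[-val_size:]
#y_test = y_ar[val_size:val_size+test_size]
#y_train = y_ar[:-(val_size+test_size)]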

import sklearn.model_selection

# Target proportions: 60% train, 20% validation, 20% test.

# Step 1: hold out 20% of all rows as the test set. Stratifying on the targets
# keeps the class proportions of the full data in both parts (this only matters
# when the classes are imbalanced).
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x_ar, y_ar, test_size=0.2, random_state=1, stratify=y_ar,
)

# Step 2: carve the validation set out of the remaining 80%. A quarter of that
# remainder equals 20% of the original data, leaving 60% for training.
x_train, x_val, y_train, y_val = sklearn.model_selection.train_test_split(
    x_train, y_train, test_size=0.25, random_state=1, stratify=y_train,
)

# Report the shape of the full arrays and of every subset.
print(
    "x_ar shape: {}".format(x_ar.shape),
    "x_val shape: {}".format(x_val.shape),
    "x_test shape: {}".format(x_test.shape),
    "x_train shape: {}".format(x_train.shape),
    "y_ar shape: {}".format(y_ar.shape),
    "y_val shape: {}".format(y_val.shape),
    "y_test shape: {}".format(y_test.shape),
    "y_train shape: {}".format(y_train.shape),
    sep="\n",
)