In [None]:
import numpy as np  # for np.ndarray
from enum import Enum

The data is stored in `3` files.
1. `train.csv` numbers `18,304` samples and contains the training data including features and labels. 
2. `test.csv` numbers `4,580` samples and contains the testing features without labels.
3. `sample.csv` numbers `4,580` samples and contains the testing labels without features.

Every file has table headers and ends in an empty line.

In [None]:
# configuration constants
# directory that holds the data files;  the trailing slash lets a
# filename be appended directly
DATA_DIR = r'dataset-in/'
# maps each kind of data to the name of the file that stores it
DATA_FILENAMES = {
    r'trainXy': r'train.csv',
    r'testX': r'test.csv',
    r'test_y': r'sample.csv',
}
# separates the values on each line of the files in DATA_FILENAMES
DELIMITER = r','

# axes of numpy arrays
class Axis(Enum):
    r'''
     Names for the two axis indices of a 2-dimensional numpy array,
     meant to be passed (via `.value`) as an `axis` argument.

     NOTE(review): in numpy, index 0 is the row index and index 1 is the
     column index.  Presumably each member is named for the unit that an
     operation along that axis cuts up (splitting along axis 1 divides
     each row between its columns) -- confirm before reusing elsewhere.
     '''
    COLS = 0    # numpy axis 0
    ROWS = 1    # numpy axis 1
# class Axis(Enum)

Read the files.

In [None]:
def putDataFromFiles(srcFilenames, dest):
    r'''
     Loads every data file named in `srcFilenames` and stores the
     resulting array in `dest` under the same key.

     The keys are not interpreted here;  they are expected to name the
     kind of data (trainXy, testX, test_y) held by the file to which
     each key maps.

     @param srcFilenames : dict<TKey,str> = maps each key to the name
         of a file inside `DATA_DIR`
     @param dest : dict<? super TKey,np.ndarray> = dictionary that
         receives one array per key
     @return `dest`
     '''
    for key, filename in srcFilenames.items():
        # filenames are relative to the data directory
        path = f'{DATA_DIR}{filename}'
        # parse every value as float64, skipping the 1 header line
        dest[key] = np.genfromtxt(path,
                                  dtype=np.float64,
                                  delimiter=DELIMITER,
                                  skip_header=1)
    # for key, filename in srcFilenames.items()
    return dest
# def putDataFromFiles(srcFilenames, dest)

# if main module, load the files named by the default DATA_FILENAMES
# into a fresh dictionary kept in the module-level `data`
if __name__ == "__main__":
    data = putDataFromFiles(srcFilenames=DATA_FILENAMES, dest={})

Let's count the number of samples as a sanity check.

In [None]:
def countSamples(data, callback):
    r'''
     Reports the number of samples (rows) in each array of the data.

     For every entry, the callback receives 1 line of text:  the type
     of data padded to 8 characters, a tab, and the length of the array
     right-aligned in 8 characters.

     @param data : dict<str,np.ndarray> = data of which to count samples
     @param callback : function(str) = called once per entry with the
         formatted line
     '''
    row_format = '{:8}\t{:8}'   # name, tab, count
    for (name, samples) in data.items():
        callback(row_format.format(name, len(samples)))
    # for (name, samples) in data.items()
# def countSamples(data, callback)

# if main module, print 1 line per array loaded by `putDataFromFiles`,
# giving the number of samples it holds
if __name__ == "__main__":
    countSamples(data=data, callback=print)

trainXy 	   18304
testX   	    4580
test_y  	    4580


We can assume that the IDs are consecutive and in ascending order; let's verify that assumption.

In [None]:
def isEachArrayIdSorted(data, isEachSorted):
    r'''
     Checks every array in the data for sorting by ID and records the
     result of each check under the key of that array.
     @param data : dict<TKey,np.ndarray> = data to check for sorting
     @param isEachSorted : dict<? super TKey,bool> = dictionary that
         receives, for each key, whether its array is sorted
     @return `isEachSorted`
     '''
    for datatype in data:
        # delegate the check of a single array
        isEachSorted[datatype] = isArrayIdSorted(data[datatype])
    # for datatype in data
    return isEachSorted
# def isEachArrayIdSorted(data, isEachSorted)

def isArrayIdSorted(array):
    r'''
     Returns whether an array is sorted by ID, that is, whether the IDs
     in its first column are consecutive and ascending.

     Each ID is truncated toward 0 (as `int()` would) before comparing.
     An array with no rows or with a single row has no row out of
     order and so counts as sorted.

     @param array : np.ndarray = 2-dimensional array to check for
         sorting;  the IDs are in column 0
     @return true if each row ID of the array is 1 greater than the
     previous;  false otherwise, including when an ID is NaN
     '''
    # the IDs of every row, truncated to whole numbers
    ids = np.trunc(array[:, 0])
    # differences between each ID and the previous;  empty if fewer
    # than 2 rows, in which case `np.all` is vacuously true
    steps = np.diff(ids)
    # convert from np.bool_ so that callers get a plain bool
    return bool(np.all(steps == 1))
# def isArrayIdSorted(array)

# if main module, print a dictionary from each type of data to whether
# its array is sorted by ID
if __name__ == "__main__":
    print(isEachArrayIdSorted(data=data, isEachSorted={}))

{'trainXy': True, 'testX': True, 'test_y': True}


Let's split `trainXy` into features and labels resembling `testX` and `test_y`.

In [None]:
def splitFeaturesLabels(dataset, removeIds = False, splitLabels = True):
    r'''
     Divides the dataset into features and labels.
     @syntax (features, labels)
         = splitFeaturesLabels(dataset, removeIds, splitLabels)
     @param dataset : np.ndarray = the 2-dimensional dataset to divide,
         1 sample per row
     @param removeIds : bool = whether to leave the first column (an
         initial ID column) out of the features
     @param splitLabels : bool = whether the last column holds the
         labels and is to be separated from the features
     @return a tuple containing the feature array and the labels.  If
     `splitLabels`, the labels are a vector with 1 element per row;
     otherwise, they are an empty array of shape `(num_rows, 0)`.
     @throws ValueError if the dataset is not 2-dimensional
     '''
    # fail early with a message that names the actual problem
    if (dataset.ndim != 2):
        raise ValueError(
            'dataset must be 2-dimensional, but has '
            f'{dataset.ndim} dimension(s)')
    # if (dataset.ndim != 2)
    # get the number of rows and columns
    (num_rows, num_cols) = dataset.shape
    # the features start after the ID column, if removing IDs
    first_feature = (1 if removeIds else 0)
    # the features stop 1 column early, if splitting labels
    end_features = (num_cols - (1 if splitLabels else 0))
    features = dataset[:, first_feature:end_features]
    # whatever columns follow the features (none unless splitting)
    labels = dataset[:, end_features:]
    # convert the label column to a vector
    if (splitLabels):
        labels = labels.reshape((num_rows,))
    # if (splitLabels)
    return (features, labels)
# def splitFeaturesLabels(dataset, removeIds, splitLabels)

# if main module, show the shapes that result from each combination of
# the options of `splitFeaturesLabels`
if __name__ == "__main__":
    # the shape of trainXy before any splitting
    print(data['trainXy'].shape)
    # defaults:  keep the IDs, separate the labels
    print([part.shape for part in splitFeaturesLabels(
        data['trainXy'], removeIds=False, splitLabels=True)])
    # keep the IDs, do not separate the labels
    print([part.shape for part in splitFeaturesLabels(
        data['trainXy'], removeIds=False, splitLabels=False)])
    # remove the IDs, separate the labels
    print([part.shape for part in splitFeaturesLabels(
        data['trainXy'], removeIds=True, splitLabels=True)])
    # remove the IDs, do not separate the labels
    print([part.shape for part in splitFeaturesLabels(
        data['trainXy'], removeIds=True, splitLabels=False)])
# if __name__ == "__main__"

(18304, 12)
[(18304, 11), (18304,)]
[(18304, 12), (18304, 0)]
[(18304, 10), (18304,)]
[(18304, 11), (18304, 0)]


Altogether we have read and split the data.

In [None]:
def main():
    r'''
     Reads the data files named by `DATA_FILENAMES` and splits the
     arrays into features and labels, removing the initial ID column.
     @return a tuple (trainX, train_y, testX, test_y)
     '''
    # read in the data from the data files
    arrays = putDataFromFiles(DATA_FILENAMES, {})
    # training data:  drop the IDs and separate the last column
    trainX, train_y = splitFeaturesLabels(arrays['trainXy'], removeIds=True)
    # testing features:  drop the IDs and keep every other column
    testX = splitFeaturesLabels(
        arrays['testX'], removeIds=True, splitLabels=False)[0]
    # testing labels:  drop the IDs and keep only the last column
    test_y = splitFeaturesLabels(arrays['test_y'], removeIds=True)[1]
    return (trainX, train_y, testX, test_y)

# if main module, run `main` and print the shape of each array that it
# returns (trainX, train_y, testX, test_y)
if __name__ == "__main__":
    print([part.shape for part in main()])

[(18304, 10), (18304,), (4580, 10), (4580,)]
