# Overview

This notebook splits the training data set from Kaggle's digit recognizer challenge into a new, smaller training set and a validation set. These new data sets will be used for hyper-parameter tuning and CNN architecture exploration, with a view to finding the optimal CNN architecture for the problem.

In [None]:
import pandas as pd
import numpy as np
import random
import math

In [None]:
# read the original training CSV into a DataFrame; every later cell works from `data`
# (the cells below rely on it having a `label` column)
data = pd.read_csv('datasets/train.csv')

In [None]:
# show how many rows of the input data set carry each label value
data['label'].value_counts()

In [None]:
# group the row indices by label: entry d of indarray is the list of all indices
# in the input training set whose label equals d, for d = 0..9
indarray = [data.index[data.label == digit].tolist() for digit in range(10)]

In [None]:
# sanity check: print how many indices were collected for each label
for label_indices in indarray:
    print(len(label_indices))

In [None]:
# Split every per-label index list in indarray into a validation part and a
# training part, so that each label contributes the same share to both subsets.
# NOTE: the split is positional, not random - the first VALIDATION_FRACTION of
# each label's indices go to validation and the remainder go to training.
VALIDATION_FRACTION = 0.1  # share of each label's rows held out for validation

trainind = []       # one list of training indices per entry of indarray
validationind = []  # one list of validation indices per entry of indarray
for label_indices in indarray:
    # round up so that any non-empty label yields at least one validation index
    n_validation = math.ceil(len(label_indices) * VALIDATION_FRACTION)
    validationind.append(label_indices[:n_validation])
    trainind.append(label_indices[n_validation:])

In [None]:
# form the training data set
# flatten the per-label index lists into a single 1-D array of row indices
idx = np.concatenate(trainind)
# shuffle the indices in place so that image labels are not grouped in the output set;
# idx is a NumPy array, so NumPy's shuffle is used instead of the stdlib random.shuffle
np.random.shuffle(idx)
# the values in idx come from data.index, so rows are selected by index label (.loc)
# rather than by position; both agree for the default index created by read_csv
train = data.loc[idx]

In [None]:
# print a concise summary of the training DataFrame (index range, column dtypes, memory usage)
train.info()

In [None]:
# form the validation data set
# flatten the per-label index lists into a single 1-D array of row indices
idx = np.concatenate(validationind)
# shuffle the indices in place so that image labels are not grouped in the output set;
# idx is a NumPy array, so NumPy's shuffle is used instead of the stdlib random.shuffle
np.random.shuffle(idx)
# the values in idx come from data.index, so rows are selected by index label (.loc)
# rather than by position; both agree for the default index created by read_csv
validation = data.loc[idx]

In [None]:
# print a concise summary of the validation DataFrame (index range, column dtypes, memory usage)
validation.info()

In [None]:
# save both subsets as CSV files; index=False keeps the DataFrame index out of the files
for frame, path in (
    (train, 'datasets/train-exploration.csv'),
    (validation, 'datasets/validation-exploration.csv'),
):
    frame.to_csv(path, index=False)