In [1]:
from random import shuffle, seed 

In [2]:
def readFile(file='ccnc.txt'):
    '''Read the data lines of a text file, skipping its first (header) line.

    Paras:
        file: filepath. Defaults to 'ccnc.txt'. Please
            ensure this file is within the current directory
            when relying on the default.

    Returns:
        list of str: every line after the header, in file order, with
            line terminators kept as read. An empty file (or one that
            holds only a header) yields an empty list.
    '''
    # Open the path that was actually passed in, and let the context
    # manager close the handle even if reading fails.
    with open(file, 'r') as f:
        # Discard the header; the default keeps an empty file from
        # raising StopIteration.
        next(f, None)
        return list(f)


def train_dev_test_split(data, train=0.6, dev=0.2, test=0.2, seed_idx=5):
    '''
    Shuffle a list of examples and split it into train, dev and test sets
    with a predefined ratio.

    Paras:
        data: list of examples to split. The list itself is left untouched;
            a shuffled copy is split instead.
        train, dev, test: respective ratio for the train, dev and test sets.
            Default to 0.6, 0.2, 0.2 respectively. Only train and dev are
            used to compute the boundaries; the test set is whatever
            remains, so the test value itself is not read.
        seed_idx: Int. Defaults to 5 (a random picked seed). It is passed
            to random.seed, so the module-level generator is re-seeded.

    Returns:
        tuple of three lists: (train_ds, dev_ds, test_ds).
    '''

    # Work on a copy: shuffling the caller's list in place would make a
    # second call on the same list return a different split even with
    # the same seed.
    shuffled = list(data)
    seed(seed_idx)
    shuffle(shuffled)

    length = len(shuffled)
    boundary1 = round(length * train)
    boundary2 = round(length * (train + dev))

    # return the train_ds, dev_ds, test_ds
    return shuffled[:boundary1], shuffled[boundary1:boundary2], shuffled[boundary2:]


def fileWriter(data, file_name):
    '''Write a list of name examples back into a txt file and save in
    the current directory if the full path is not given in the file_name.

    A header line is written first, followed by one example per line.
    The parent directory of the target file is created when it does not
    exist yet.

    Paras:
        data: list
            a list of name examples; each example contains
            last name, first name, full name and gender
        file_name: str
            target path; '.txt' is appended when it is missing.
    '''
    import os  # local import so the function does not rely on other cells

    file_name = file_name if file_name.endswith('.txt') else file_name + '.txt'

    # Make sure the output folder exists so a fresh checkout can run this.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    header = '{}\t{}\t{}\t{}\n'.format('last name', 'first name', 'full name', 'gender')
    with open(file_name, 'w') as f:
        f.write(header)
        # Terminate every example with a newline. A record read from the
        # last line of a file may lack one, and after shuffling it would
        # otherwise be fused with the record that follows it.
        f.writelines(
            line if line.endswith('\n') else line + '\n'
            for line in data
        )

In [3]:
# Load the examples and cut them into the three subsets.
data = readFile()
train_ds, dev_ds, test_ds = train_dev_test_split(data)
# Sanity check: the three subsets together must account for every example.
print(sum(map(len, (train_ds, dev_ds, test_ds))) == len(data))

True


In [4]:
# Persist each subset to its own file, in train / dev / test order.
for subset, out_path in (
    (train_ds, 'train_dev_test/train_ds.txt'),
    (dev_ds, 'train_dev_test/dev_ds.txt'),
    (test_ds, 'train_dev_test/test_ds.txt'),
):
    fileWriter(subset, out_path)