In [87]:
# Path of the csv file to import.
filename = 'c:/data/mrh/axaf_mrh_tot_freq_tot.csv'

# Names of the csv columns used as features.
# Currently disabled columns: 'POL_mtcapass', 'POL_tr_tx_objv'.
features = [
    'ddea_quant_cm_10', 'HAB_nb_pieces', 'HAB_qual', 'HAB_anclg', 'CLI_age',
    'HAB_habit', 'CLI_sex', 'POL_fract', 'annee', 'CLI_nb_enfant',
]

# Name of the csv column holding the exposure.
exposure = 'anpol'

# Names of the csv columns used as targets.
targets = ['nbsinDDE']

In [109]:
import csv
import time

import numpy as np

In [24]:
def detect_csv_separator(filename):
    """Guess the separator character of a csv file from its first line.

    Args:
        filename: Path of the csv file to inspect.

    Returns:
        The delimiter reported by csv.Sniffer for the first line of the file.
    """
    with open(filename) as handle:
        header = handle.readline()
    dialect = csv.Sniffer().sniff(header)
    return dialect.delimiter

In [358]:
def create_data_file(out_filename, dtype, shape):
    """Create a binary file and map it in memory as a numpy array.

    The file is opened in 'wb+' mode, so an existing file with the same name
    is truncated.

    Args:
        out_filename: Path of the binary file to create.
        dtype: The type of the numpy array.
        shape: The shape of the numpy array (an int or a tuple of ints).

    Returns:
        The np.memmap array backed by the file. Call flush() on it to make
        sure the changes are written to disk.
    """
    # The memory map keeps its own handle on the file, so the Python file
    # object can (and should) be closed as soon as the map exists.
    with open(out_filename, 'wb+') as out_file:
        return np.memmap(out_file, dtype=dtype, shape=shape)

        
def save_data_file(file, filename):
    """Flush a memory-mapped array and print its size.

    Args:
        file: The array to flush; it must provide flush() and nbytes.
        filename: The name shown in the printed message (it is only used
            for display).
    """
    file.flush()
    print('written %s : %.3f MB' % (filename, file.nbytes / float(1024 ** 2)))

    
def create_data_file_from_list(lst, out_filename, dtype):
    """Dump a list into a binary file as a one-dimensional numpy array.

    The file is opened in 'wb+' mode, so an existing file with the same name
    is truncated. The size of the written array is printed at the end.

    Args:
        lst: The list that will be written in the file.
        out_filename: Path of the binary file to create.
        dtype: The type of the numpy array.
    """
    nb_items = len(lst)
    with open(out_filename, 'wb+') as handle:
        mapped = np.memmap(handle, dtype=dtype, shape=(nb_items,))
        mapped[:] = lst[:]
        mapped.flush()
    megabytes = float(mapped.nbytes) / (1024 ** 2)
    print('written %s : %.3f MB' % (out_filename, megabytes))

def load_data(file_path, dtype='int32', shape=None):
    """Map an existing binary file in memory as a numpy array.

    Args:
        file_path: Path of the binary file to map.
        dtype: The type of the numpy array stored in the file.
        shape: The shape of the array; when None, numpy infers a
            one-dimensional array from the file size.

    Returns:
        The np.memmap array backed by the file.
    """
    mapped = np.memmap(file_path, dtype=dtype, shape=shape)
    return mapped

In [100]:
from itertools import (takewhile,repeat)

def count_line(filename):
    """Count the newline characters of a file, reading it by 1 MB chunks.

    Every b'\\n' byte is counted, so a header line is included in the result
    and a last line that is not terminated by a newline is not.

    Args:
        filename: Path of the file to scan.

    Returns:
        The number of b'\\n' bytes found in the file.
    """
    # The file is read in binary mode and closed as soon as the count is done.
    with open(filename, 'rb') as f:
        # Unbuffered raw reads; an empty chunk marks the end of the file.
        bufgen = takewhile(lambda x: x, (f.raw.read(1024 * 1024) for _ in repeat(None)))
        return sum(buf.count(b'\n') for buf in bufgen)

In [349]:
# Timestamp of the last call made with iteration == 0; used to compute the elapsed time.
start_time = 0


def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    """Draw a one-line progress bar in the terminal; meant to be called in a loop.

    A call with iteration == 0 (re)starts the clock used for the elapsed time,
    and a call with iteration == total ends the line.

    Args:
        iteration: Current iteration (int).
        total: Total number of iterations (int).
        prefix: Text printed before the bar.
        suffix: Text printed after the percentage.
        decimals: Number of decimals of the percentage.
        length: Width of the bar, in characters.
        fill: Character used for the completed part of the bar.
    """
    global start_time
    if iteration == 0:
        start_time = time.time()

    # Percentage and bar.
    ratio = iteration / float(total)
    percent = '{0:.{1}f}'.format(100 * ratio, decimals)
    nb_filled = int(length * iteration // total)
    bar = fill * nb_filled + '-' * (length - nb_filled)

    # Elapsed time since the last call with iteration == 0, as mm / ss.
    minutes, seconds = divmod(int(time.time() - start_time), 60)
    m = str(minutes).zfill(2)
    s = str(seconds).zfill(2)

    print('\r%s |%s| %s%% %s in %sm%ss' % (prefix, bar, percent, suffix, m, s), end='\r')
    if iteration == total:
        # Move to the next line once the bar is complete.
        print()

In [352]:
def transform(csv_filename, data_filename, features, targets, exposure,
              exposure_filename='./data/exposure.dat',
              targets_filename='./data/targets.dat'):
    """Encode a csv file into three binary numpy files.

    The feature columns are written as unsigned bytes: for each feature, every
    distinct raw value is replaced by an integer code assigned in order of
    first appearance. The value-to-code dictionaries are local to this
    function; they are neither returned nor saved. The exposure and target
    columns are written as float32.

    The columns of the observations (and targets) arrays follow the order in
    which the selected columns appear in the csv header, which is not
    necessarily the order of the `features` (or `targets`) list.

    Args:
        csv_filename: Path of the csv file to import; its first line must be
            a header.
        data_filename: Path of the binary file receiving the encoded features.
        features: Names of the csv columns used as features.
        targets: Names of the csv columns used as targets.
        exposure: Name of the csv column holding the exposure.
        exposure_filename: Path of the binary file receiving the exposure.
        targets_filename: Path of the binary file receiving the targets.

    Raises:
        Exception: If a feature, target or exposure column is missing from the
            header, if a line does not have the same number of fields as the
            header, or if a feature has too many distinct values.
    """
    print('Starting data importation.')
    sep = detect_csv_separator(csv_filename)
    # Count the lines of the file being imported, not of the notebook-level `filename`.
    nb_lines = count_line(csv_filename)
    print("Importing", '{:,}'.format(nb_lines).replace(',', ' '), "lines.")
    nb_features = len(features)
    # NOTE(review): the arrays get one row per counted line. If the count
    # includes the header line, the last row is never written and stays
    # zero-filled - confirm whether downstream code relies on this size.
    observations = create_data_file(data_filename, np.dtype('u1'), (nb_lines, nb_features))
    exposure_data = create_data_file(exposure_filename, np.dtype('float32'), (nb_lines,))
    targets_data = create_data_file(targets_filename, np.dtype('float32'), (nb_lines, len(targets)))
    with open(csv_filename) as csv_file:
        # The header gives the position of every requested column.
        fields = [s.strip() for s in csv_file.readline().split(sep)]
        nb_fields = len(fields)
        features_index = [i for i in range(nb_fields) if fields[i] in features]
        if len(features_index) != nb_features:
            raise Exception("Invalid features")
        # One dictionary {raw value: code} per feature.
        features_mapping = [{} for i in range(nb_features)]
        exposure_index = [i for i in range(nb_fields) if fields[i] == exposure]
        if len(exposure_index) != 1:
            raise Exception("Invalid Exposure field.")
        exposure_index = exposure_index[0]
        targets_index = [i for i in range(nb_fields) if fields[i] in targets]
        if len(targets_index) != len(targets):
            raise Exception("Invalid targets")

        for i, line in enumerate(csv_file):
            values = line.split(sep)
            if len(values) != nb_fields:
                raise Exception("Inconsistent number of fields", len(values),  "in line", i + 1, "expecting", len(fields))
            for j, index in enumerate(features_index):
                v = values[index]
                # A value seen for the first time gets the next free code.
                a = features_mapping[j].setdefault(v, len(features_mapping[j]))
                # Codes 0..200 pass, i.e. up to 201 distinct values per feature.
                if a > 200:
                    # `index` is the csv column of the feature; `j` is only its
                    # position among the selected features.
                    raise Exception("Feature", fields[index], "has too many modalities ( more than 200).")
                observations[i, j] = a
            targets_data[i, :] = [values[index] for index in targets_index]
            exposure_data[i] = values[exposure_index]
            if i % 1000 == 0 or i == nb_lines - 2:
                printProgressBar(i, nb_lines - 2, prefix = 'Progress:', suffix = 'Complete', length = 50)
    # Flush every array and report it under the name of the file it was written to.
    save_data_file(observations, data_filename)
    save_data_file(exposure_data, exposure_filename)
    save_data_file(targets_data, targets_filename)

In [353]:
# Run the import with the target columns listed in `targets`.
transform(filename, './data/observations.dat', features, targets, exposure)

Starting data importation.
Importing 8 420 946 lines.
Progress: |██████████████████████████████████████████████████| 100.0% Complete in 04m17s
written c:/data/mrh/axaf_mrh_tot_freq_tot.csv : 80.308 MB
written ./data/exposure.dat : 32.123 MB
written ./data/targets.dat : 32.123 MB


In [363]:
# Map the three binary files back in memory. No shape is given, so each
# array is one-dimensional.
observations = np.memmap('./data/observations.dat', dtype=np.dtype('u1'))
exposure_data = np.memmap('./data/exposure.dat', dtype=np.dtype('float32'))
targets_data = np.memmap('./data/targets.dat', dtype=np.dtype('float32'))

In [388]:
import sys
import struct

# Sanity check: read the first 100 4-byte records of the targets file directly
# and print, for each one, its index, its raw bytes, the bytes decoded with
# struct and the value seen through the memory-mapped array.
with open('./data/targets.dat', 'rb') as tf:
    for i in range(100):
        b = tf.read(4)
        # 'f' decodes the 4 bytes as a float in native byte order.
        print(i, b, struct.unpack('f', b)[0] , targets_data[i])

0 b'\x00\x00\x80?' 1.0 1.0
1 b'\x00\x00\x00\x00' 0.0 0.0
2 b'\x00\x00\x00\x00' 0.0 0.0
3 b'\x00\x00\x80?' 1.0 1.0
4 b'\x00\x00\x00\x00' 0.0 0.0
5 b'\x00\x00\x00\x00' 0.0 0.0
6 b'\x00\x00\x00\x00' 0.0 0.0
7 b'\x00\x00\x00\x00' 0.0 0.0
8 b'\x00\x00\x00\x00' 0.0 0.0
9 b'\x00\x00\x80?' 1.0 1.0
10 b'\x00\x00\x00\x00' 0.0 0.0
11 b'\x00\x00\x00\x00' 0.0 0.0
12 b'\x00\x00\x00\x00' 0.0 0.0
13 b'\x00\x00\x00\x00' 0.0 0.0
14 b'\x00\x00\x00\x00' 0.0 0.0
15 b'\x00\x00\x00\x00' 0.0 0.0
16 b'\x00\x00\x00\x00' 0.0 0.0
17 b'\x00\x00\x00\x00' 0.0 0.0
18 b'\x00\x00\x00\x00' 0.0 0.0
19 b'\x00\x00\x00\x00' 0.0 0.0
20 b'\x00\x00\x00\x00' 0.0 0.0
21 b'\x00\x00\x80?' 1.0 1.0
22 b'\x00\x00\x00\x00' 0.0 0.0
23 b'\x00\x00\x00\x00' 0.0 0.0
24 b'\x00\x00\x00\x00' 0.0 0.0
25 b'\x00\x00\x00\x00' 0.0 0.0
26 b'\x00\x00\x80?' 1.0 1.0
27 b'\x00\x00\x00\x00' 0.0 0.0
28 b'\x00\x00\x00\x00' 0.0 0.0
29 b'\x00\x00\x00\x00' 0.0 0.0
30 b'\x00\x00\x00\x00' 0.0 0.0
31 b'\x00\x00\x00\x00' 0.0 0.0
32 b'\x00\x00\x00\x00' 0.0 0.0
33