In [23]:
import contextlib
import mmap
import os
from struct import Struct

import pandas as pd
import numpy as np

# Source CSV handed to Dataset(...) in the conversion cell below.
in_file = 'c:/data/axaf_mrh_appt.csv'
# Binary file read back by get_column(...). NOTE(review): convert_csv_to_binary
# derives its output path from the input filename rather than from this
# constant, so the two only agree while both share the same base name.
out_file = 'c:/data/axaf_mrh_appt.bml'

In [24]:
class Feature:
    """Describe one CSV column and how it is encoded in a binary record.

    The column kind is derived from the pandas dtype name of a sample:
    ``float*`` -> float, ``int*`` -> int, ``object`` -> str (categorical).
    Numeric columns record the sample's min/max; string columns record the
    number of distinct values seen in the sample.

    Attributes:
        name: Column name.
        dtype_name: ``array_sample.dtype.name`` at construction time.
        values: Cache mapping a raw CSV token to its encoded value. For
            string columns this is also the category -> integer code table.
        min_value, max_value: Sample bounds (numeric columns only).
        count: Number of distinct sample values (string columns only).
    """

    name = ''
    min_value = 0
    max_value = 0
    count = 0

    def __init__(self, name, array_sample):
        """Build the feature from a sample of the column.

        Args:
            name: Column name.
            array_sample: pandas Series (or compatible object exposing
                ``dtype.name``, ``unique()``, ``min()`` and ``max()``).

        Raises:
            Exception: If the dtype is neither float, int nor object.
        """
        self.name = name
        self.dtype_name = array_sample.dtype.name
        self.values = {}
        if self.get_type() == str:
            self.count = len(array_sample.unique())
        else:
            self.min_value = array_sample.min()
            # The original assigned the maximum to min_value, leaving
            # max_value at 0 and corrupting the width selection below.
            self.max_value = array_sample.max()

    def __repr__(self):
        return str((self.name, str(self.get_type()), self.count, self.values, self.min_value, self.max_value))

    def get_type(self):
        """Return the Python type (float, int or str) matching the dtype.

        Raises:
            Exception: If the dtype name matches none of the known kinds.
        """
        if 'float' in self.dtype_name:
            return float
        elif 'int' in self.dtype_name:
            return int
        elif 'object' in self.dtype_name:
            return str
        else:
            raise Exception("Unknown type for feature: " + str(self.name))

    def get_binary_format(self):
        """Return the ``struct`` format character used to pack this column.

        Floats use ``'f'``. Integers use the narrowest of ``'b'``/``'h'``/``'i'``
        whose range comfortably contains the sampled min and max. String
        columns are packed as their integer category code.

        Raises:
            Exception: If the dtype is not float, int or object.
        """
        if 'float' in self.dtype_name:
            return 'f'
        elif 'int' in self.dtype_name:
            if abs(self.min_value) < 100 and abs(self.max_value) < 100:
                return 'b'
            elif abs(self.min_value) < 10000 and abs(self.max_value) < 10000:
                return 'h'
            return 'i'
        elif 'object' in self.dtype_name:
            # Codes run from 0 to count - 1 and 'b' is a *signed* byte
            # (max 127), so anything with 128+ categories needs 'h'.
            if self.count < 128:
                return 'b'
            elif self.count < 10000:
                return 'h'
            else:
                return 'i'
        else:
            raise Exception("incompatible type for feature: " + self.name)

    def should_skip(self):
        """Return True when the column has too many categories to encode."""
        return self.count > 1000

    def cast_float(self, value):
        """Convert a CSV token to float, memoising the result per token."""
        result = self.values.get(value)
        if result is None:
            result = float(value)
            self.values[value] = result
        return result

    def cast_int(self, value):
        """Convert a CSV token to int, memoising the result per token.

        Goes through ``float`` first so tokens such as ``'3.0'`` are accepted.
        """
        result = self.values.get(value)
        if result is None:
            result = int(float(value))
            self.values[value] = result
        return result

    def cast_object(self, value):
        """Return the integer code of a category, assigning the next free
        code (in first-seen order) when the token is new."""
        result = self.values.get(value)
        if result is None:
            result = len(self.values)
            self.values[value] = result
        return result

    def get_cast(self):
        """Return the bound cast method matching this column's dtype.

        Raises:
            Exception: If the dtype is not float, int or object.
        """
        if 'float' in self.dtype_name:
            return self.cast_float
        elif 'int' in self.dtype_name:
            return self.cast_int  # TODO clean this
        elif 'object' in self.dtype_name:
            return self.cast_object
        else:
            raise Exception("Unknown type for feature: " + str(self.name))
            
    
class Dataset:
    """Schema of a CSV file plus a converter to fixed-width binary records.

    The schema is a list of ``Feature`` objects, one per CSV column and in
    column order, inferred from the first 10000 rows of the file. Columns
    whose feature reports ``should_skip()`` stay in the list (so positions
    keep matching CSV columns) but are left out of the binary record.
    """

    filename = ''

    def __init__(self, filename):
        """Infer the schema from the head of ``filename``.

        Args:
            filename: Path of the CSV file (first line is the header).
        """
        self.filename = filename
        # Per-instance list: a class-level list would be shared, so building
        # a second Dataset would append to the first one's schema.
        self.features = []
        sample = pd.read_csv(filename, nrows=10000)
        for name in list(sample):
            self.add(Feature(name, sample[name]))

    def __repr__(self):
        return '\n'.join(map(str, self.features))

    def add(self, feature):
        """Append a feature, announcing it when it will be skipped.

        Skipped features are still stored so that list positions keep
        matching CSV column positions.
        """
        if feature.should_skip():
            print('Skip feature: ', feature.name)
        self.features.append(feature)

    def get_binary_format(self):
        """Return the little-endian ``struct`` format of one binary record."""
        return '<' + ''.join([f.get_binary_format() for f in self.features if not f.should_skip()])

    def convert_csv_to_binary(self):
        """Stream the CSV into a ``.bml`` file next to it, one record per row.

        The output path is the input path with its extension replaced by
        ``.bml``. Rows are split on plain commas (quoted fields containing
        commas are not supported). On the first row that cannot be converted
        the error and the offending fields are printed and conversion stops;
        rows already written are kept.
        """
        record_format = self.get_binary_format()
        binary_io = Struct(record_format)
        # (CSV column index, column name, cast function) for every kept column.
        kept = [(col, f.name, f.get_cast())
                for col, f in enumerate(self.features) if not f.should_skip()]
        out_path = os.path.splitext(self.filename)[0] + '.bml'
        nb_lines = 0
        with open(self.filename, 'r') as csv_file, open(out_path, 'wb') as data_file:
            csv_file.readline()  # skip header
            for line in csv_file:
                # Strip only the newline: slicing off the last character
                # would eat real data when the file lacks a final newline.
                raw = line.rstrip('\n').split(',')
                try:
                    values = [cast(raw[col]) for col, _, cast in kept]
                    data_file.write(binary_io.pack(*values))
                except Exception as e:
                    print(e)
                    # Show each kept column beside its own raw token; a short
                    # row yields None for the missing fields.
                    shown = [raw[col] if col < len(raw) else None for col, _, _ in kept]
                    names = [name for _, name, _ in kept]
                    print(nb_lines, list(zip(names, record_format[1:], shown)))
                    break
                nb_lines += 1
                if nb_lines % 100000 == 0:
                    print(nb_lines, ' lines converted')
            print("nb lines: ", nb_lines)

In [25]:
# Build the column schema (Dataset.__init__ samples up to 10000 CSV rows),
# then stream the whole CSV into the binary file. The converter prints a
# progress line every 100000 rows.
dataset = Dataset(in_file)
#print(dataset)
dataset.convert_csv_to_binary()

Skip feature:  newid
100000  lines converted
200000  lines converted
300000  lines converted
400000  lines converted
500000  lines converted
600000  lines converted
700000  lines converted
800000  lines converted
900000  lines converted
1000000  lines converted
1100000  lines converted
1200000  lines converted
1300000  lines converted
1400000  lines converted
1500000  lines converted
1600000  lines converted
1700000  lines converted
1800000  lines converted
1900000  lines converted
2000000  lines converted
2100000  lines converted
2200000  lines converted
2300000  lines converted
2400000  lines converted
2500000  lines converted
2600000  lines converted
2700000  lines converted
2800000  lines converted
2900000  lines converted
3000000  lines converted
3100000  lines converted
3200000  lines converted
3300000  lines converted
3400000  lines converted
3500000  lines converted
3600000  lines converted
3700000  lines converted
3800000  lines converted
3900000  lines converted
4000000  line

In [16]:
def get_column(file_name, binary_format, binary_line_format, offset=0):
    """Read one field from every fixed-width record of a binary file.

    Args:
        file_name: Path of the binary file.
        binary_format: ``struct`` format of the field to extract; only the
            first unpacked value is used.
        binary_line_format: ``struct`` format of a whole record, used only
            to compute the record size in bytes.
        offset: Byte offset of the field inside each record.

    Returns:
        A float32 numpy array with one value per complete record. Trailing
        bytes that do not fill a record are ignored. An empty file (or one
        shorter than a single record) yields an empty array.
    """
    field_io = Struct(binary_format)
    line_size = Struct(binary_line_format).size
    with open(file_name, 'rb') as data_file:
        file_size = os.fstat(data_file.fileno()).st_size
        nb_rows = file_size // line_size
        result = np.zeros(nb_rows, dtype='float32')
        if nb_rows == 0:
            # mmap raises ValueError on an empty file; nothing to read anyway.
            return result
        with contextlib.closing(mmap.mmap(data_file.fileno(), 0, access=mmap.ACCESS_READ)) as m:
            unpack_from = field_io.unpack_from  # hoisted out of the hot loop
            for row in range(nb_rows):
                result[row] = unpack_from(m, row * line_size + offset)[0]
    return result

In [17]:
# Record layout of the binary file, rebuilt from the same CSV that was
# converted. `in_file` is the path defined in the configuration cell; the
# previous `mhr_dataset_filename` is not defined anywhere in this notebook.
binary_line_format = Dataset(in_file).get_binary_format()

Skip feature:  newid


In [18]:
# Read a little-endian 4-byte float at offset 0 of every record.
# NOTE(review): assumes the first packed field is the float column meant by
# the name "density" - confirm against the record layout.
density = get_column(out_file, '<f', binary_line_format)

In [22]:
# Largest value read from the column; displayed as the cell's output.
np.max(density)

26253.76