In [1]:
import os
import numpy as np
import pandas as pd
import Utils

In [2]:
# Directory holding the input parquet files (relative to the notebook).
DATA_PATH = '../data/'
# Export directory; not referenced in the visible cells.
EXPORT_PATH = '/export/'

# Parquet files to load, grouped under a category key. Each path is the data
# directory followed by the file name.
files = {
    'data': [
        f'{DATA_PATH}{file_name}'
        for file_name in (
            'baskets.parquet',
            'coupon_index.parquet',
            'coupons.parquet',
            # 'prepare.parquet',  # currently disabled
        )
    ]
}

In [3]:
# todo: rename to Utils and break inheritance
class Helper:
    """
    Expose utility methods: load parquet data sets, shrink their memory
    footprint, and access attributes with dict-like syntax.
    """

    # Signed integer dtypes tried in order of increasing width when downcasting.
    _INT_DTYPES = ("int8", "int16", "int32")

    def __init__(self):
        """
        Attributes:
            data: (dict) contains the loaded data sets, keyed by file name
                without extension; empty until load() has been called
            mappings: (dict) todo: what kind of mappings?
        """
        self.data = {}
        self.mappings = {}

    # read parquet files from disk and optimize memory consumption
    # ----------------------------------------------------------------------------------
    def load(self, files: dict):
        """
        use:
            - reads every parquet file listed in `files`, reduces its memory
              usage and stores the resulting dataframes in self.data

        input:
            - files: dict
                - maps a group key to a list of parquet file paths; the group
                  key itself is not used

        return: None
            - self.data is replaced by {file name without extension: DataFrame};
              two paths with the same base name overwrite each other
        """
        data_frames = {}
        for paths in files.values():
            for path in paths:
                name = os.path.splitext(os.path.basename(path))[0]
                data_frames[name] = self.reduce_data_size(pd.read_parquet(path))
        self.data = data_frames

    # reduce data size
    # ----------------------------------------------------------------------------------
    def reduce_data_size(self, df):
        """
        use:
            - reduces memory usage of dataframes by converting integers to the
              narrowest signed integer dtype that holds all values and floats
              wider than 32 bit to float32 (this loses float precision)
            - a column is only converted when the result is strictly smaller;
              integer columns that do not fit into int32, empty integer columns
              and non-numpy (extension) dtypes are left unchanged

        input:
            - df: pd.DataFrame
                - dataframe with high memory usage; modified in place

        return: pd.DataFrame
            - the same dataframe object with lower memory usage
        """
        for column, dtype in df.dtypes.items():
            # pandas extension dtypes (categorical, nullable, string, ...) cannot
            # be interpreted by np.issubdtype, so they are skipped.
            if not isinstance(dtype, np.dtype):
                continue

            if np.issubdtype(dtype, np.integer):
                series = df[column]
                # min()/max() of an empty column are NaN, nothing to decide on
                if series.empty:
                    continue
                # Python ints avoid overflow when comparing against the bounds
                target = self._smallest_int_dtype(int(series.min()), int(series.max()))
                if target is not None and np.dtype(target).itemsize < dtype.itemsize:
                    df[column] = series.astype(target)
            elif np.issubdtype(dtype, np.floating):
                # never widen float16/float32 columns
                if dtype.itemsize > np.dtype("float32").itemsize:
                    df[column] = df[column].astype("float32")
        return df

    @classmethod
    def _smallest_int_dtype(cls, low, high):
        """Return the narrowest dtype name in _INT_DTYPES holding [low, high], else None."""
        for name in cls._INT_DTYPES:
            bounds = np.iinfo(name)
            if bounds.min <= low and high <= bounds.max:
                return name
        return None

    # convenience functions
    # ----------------------------------------------------------------------------------
    def __getitem__(self, item):
        """
        use:
            - dict-like behaviour: self['item'] <-> self.item
            - allows to get attributes using a str

        raises:
            - AttributeError if no attribute named `item` exists
        """
        # getattr instead of eval: the key is treated as an attribute name only
        # and is never executed as code.
        return getattr(self, item)

    def __setitem__(self, item, value):
        """
        use:
            - dict-like behaviour: self['item'] = value <-> self.item = value
            - allows to set attribute values using a str
        """
        # setattr instead of exec: the key is never executed as code.
        setattr(self, item, value)

    def _format_time(self, seconds):
        """
        use:
            - formats seconds into str time-format: 'mm:ss'
            - the value is rounded to whole seconds before it is split, so the
              seconds part never shows '60'
        """
        minutes, remainder = divmod(int(round(seconds)), 60)
        return "{:02d}:{:02d}".format(minutes, remainder)

In [4]:
# Create the helper and read every parquet file listed in `files`; the loaded
# frames are stored on myHelper.data.
# NOTE(review): load() has no return statement, so the dict shown as this
# cell's captured output presumably stems from an earlier version of the
# code; confirm by re-running on a fresh kernel.
myHelper = Helper()
myHelper.load(files)

{'baskets':           week  shopper  product  price
 0            0        0       71    629
 1            0        0       91    605
 2            0        0      116    715
 3            0        0      123    483
 4            0        0      157    592
 ...        ...      ...      ...    ...
 68841593    89    99999      143    470
 68841594    89    99999      158    566
 68841595    89    99999      186    499
 68841596    89    99999      204    496
 68841597    89    99999      225    602
 
 [68841598 rows x 4 columns],
 'coupon_index':       week  shopper  coupon
 0       90        0       0
 2000    90        0       1
 4000    90        0       2
 6000    90        0       3
 8000    90        0       4
 ...    ...      ...     ...
 1999    90     1999       0
 3999    90     1999       1
 5999    90     1999       2
 7999    90     1999       3
 9999    90     1999       4
 
 [10000 rows x 3 columns],
 'coupons':           week  shopper  product  discount
 0            0  

In [5]:
# Display the 'baskets' frame stored by load() (columns per the captured
# output: week, shopper, product, price).
myHelper.data['baskets']

Unnamed: 0,week,shopper,product,price
0,0,0,71,629
1,0,0,91,605
2,0,0,116,715
3,0,0,123,483
4,0,0,157,592
...,...,...,...,...
68841593,89,99999,143,470
68841594,89,99999,158,566
68841595,89,99999,186,499
68841596,89,99999,204,496


In [8]:
# Run the project-local Utils.reduce_mem_usage on the 'coupons' frame.
# NOTE(review): the captured output reports a 0.0% decrease, presumably because
# Helper.load() has already downcast the columns; confirm.
# NOTE(review): the execution count jumps from In [5] to In [8]; verify the
# notebook still runs top to bottom on a fresh kernel.
Utils.reduce_mem_usage(myHelper.data['coupons'])

Memory usage of dataframe is 686.65 MB
Memory usage after optimization is: 686.65 MB
Decreased by 0.0%


Unnamed: 0,week,shopper,product,discount
0,0,0,35,35
1,0,0,193,40
2,0,0,27,30
3,0,0,177,35
4,0,0,5,30
...,...,...,...,...
44999995,89,99999,62,40
44999996,89,99999,110,10
44999997,89,99999,37,40
44999998,89,99999,155,25
