In [1]:
#| default_exp data

# Dataset

> A simple adaptor class that converts data into the required format so it can be easily consumed and processed by the models.
> The key aspects are to make the dataset adaptable and fast to process, reduce the data size, make splits and add any processing if required.

In [2]:
#| export
from fastcore.utils import *
import numpy as np
import pandas as pd
import gc, copy

In [3]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [4]:
#| hide
# Invokes nbdev's export step; the `#| hide` directive above keeps this cell out of the rendered docs.
# NOTE(review): presumably this writes the `#| export` cells to the module named by `default_exp` — confirm against the nbdev docs.
import nbdev; nbdev.nbdev_export()

In [5]:
#| export
class Dataset:
    """
    Dataset Adaptor Class

    Converts a pandas DataFrame into a compact representation (`self.data`) that
    models can consume. Numeric columns are downcast to the smallest dtype that
    holds their values and the result is stored as a NumPy structured array with
    one field per column. If such an array cannot be built, `self.data` falls
    back to a DataFrame holding the downcast columns.

    Per-column metadata is kept in `self._dataranges[col]` with the keys
    `hasna` (True when the original column contained missing values), `min`,
    `max` and `inferredtype` (the dtype used in `self.data`). For columns stored
    as integers, `min` is one below the observed minimum: that value is the
    sentinel which replaces missing values.

    Only `digits` and `copy_data` influence the processing implemented here; the
    remaining options are stored as attributes and are not read by this class.
    """
    def __init__(self, df:pd.DataFrame,      # Dataframe that needs to be converted
                reduce_datatype:bool = True, # Shall we try to reduce datatype to make it smaller
                encode_category:str = None,  # Do encoding of categories; defaults to None, meaning no encoding
                add_intercept:bool = False,  # Add a constant value intercept to data. This might be needed for Model based Trees.
                na_treatment:str = 'allow',  # How to work with NAs. Default: 'allow'
                copy_data:bool = False,      # Keep a reference to the original dataframe on `self.df`
                digits:int = None,           # Round floats to this many digits. Default: None means no rounding
                n_category:Union[int, float, None] = None  # How many distinct levels should be treated as a category. A value below one is read as a fraction of total rows
                ) -> None:
        self.df, self.reduce_datatype = df, reduce_datatype
        self.encode_category, self.add_intercept = encode_category, add_intercept
        self.na_treatment, self.copy,self.digits = na_treatment, copy_data, digits
        self.n_category = n_category
        self.split_ratio = None
        # Snapshot the frame's description now: `self.df` may be dropped after processing.
        self.features_ = self.df.columns.tolist()
        self._shape = self.df.shape
        self._dtypes = self.df.dtypes
        self._category_lbl_dict = {}
        self._dataranges = {}

        self._process_data()

    def _reduce_size(self):
        "Downcast every column of `self.df`, record its metadata and store the result in `self.data`."
        reduced = {}
        for col, dtype in self._dtypes.items():
            values = self.df[col]
            has_na = bool(values.isna().any())

            # Non-numeric columns (object, string, category, datetime, ...) are passed through.
            # todo : add logic of making a big object column into a small category based on definition
            if not pd.api.types.is_numeric_dtype(dtype):
                # A structured array can only hold NumPy dtypes, so extension dtypes become object.
                column = values if isinstance(dtype, np.dtype) else values.astype(object)
                reduced[col] = column
                self._dataranges[col] = {'hasna':has_na,'min':None,'max':None,'inferredtype':column.dtype}
                continue

            max_, min_ = values.max(), values.min()

            # Any integer width counts, not only the platform default `int`.
            is_int = pd.api.types.is_integer_dtype(dtype)
            if has_na and not is_int:
                # Missing values force pandas to store integers as floats. Treat the column as
                # integer only if every present value is whole (inf fails this check as well).
                is_int = bool(values.fillna(0).mod(1).eq(0).all())

            if is_int:
                # Make Integer/unsigned Integer datatypes.
                # Work with a Python scalar so `- 1` cannot wrap around on unsigned numpy scalars.
                low = min_.item() if isinstance(min_, np.generic) else min_
                sentinel = low - 1
                if has_na:
                    encoded, lowest = values.fillna(sentinel), sentinel
                else:
                    encoded, lowest = values, low
                # The sentinel is part of the data, so it decides whether unsigned is possible.
                target = "unsigned" if (pd.notna(lowest) and lowest >= 0) else "integer"
                column = pd.to_numeric(encoded, downcast=target, errors='coerce')
                self._dataranges[col] = {'hasna':has_na,'min':sentinel,'max':max_,'inferredtype':column.dtype}
            else:
                # Make float datatypes 32 bit.
                # todo make this more advanced with rounding and more
                if self.digits is None:
                    column = pd.to_numeric(values, downcast="float", errors='coerce')
                else:
                    column = pd.to_numeric(np.round(values, self.digits), downcast="float", errors='coerce')
                    # Report the range of the rounded, downcast values.
                    max_, min_ = column.max(), column.min()
                self._dataranges[col] = {'hasna':has_na,'min':min_,'max':max_,'inferredtype':column.dtype}
            reduced[col] = column

        try:
            # Fill the structured array field by field instead of building one tuple per row.
            fields = [(name, column.dtype) for name, column in reduced.items()]
            data = np.empty(len(self.df), dtype=fields)
            for name, column in reduced.items():
                data[name] = column.to_numpy()
            self.data = data
        except Exception as err:
            # Best effort: keep the downcast columns as a DataFrame, but say why.
            import warnings
            warnings.warn(
                f"Could not build a structured array ({err!r}); falling back to a pandas DataFrame.",
                RuntimeWarning, stacklevel=2)
            self.data = pd.DataFrame(reduced, index=self.df.index).copy(deep=True)

    def _process_data(self):
        "Run the size reduction and drop the reference to the source frame unless `copy_data` was set."
        self._reduce_size()
        if not self.copy:
            self.df = None
            gc.collect()

    def dtypes(self,actual=False):
        """
        Return a one-column DataFrame (`data_type`) indexed by column name.

        With `actual=True` the dtypes of the original dataframe are reported,
        otherwise the reduced dtypes used in `self.data`.
        """
        if actual:
            source = dict(self._dtypes.items())
        else:
            source = {k:v['inferredtype'] for k,v in self._dataranges.items()}
        return pd.DataFrame({'data_type': pd.Series(source, dtype=object)})

    @property
    def shape(self):
        "Shape of the dataframe passed to the constructor."
        return self._shape

    def __str__(self) -> str:
        # Public attributes only; the frame, feature list and data payload are left out.
        hidden = ('df','features_','data')
        s = f"Dataset(df=Shape({self.shape}"
        for k,v in self.__dict__.items():
            if k not in hidden and not k.startswith('_'): s += f", {k}={v}"
        s += ')'
        return s

    __repr__ = __str__

    
    


# How to work on data


Creating random sample data:

In [6]:
# Reproducible one-million-row sample frame with integer, float and string columns.
n_size = 1000000
rnd = np.random.RandomState(1234)

# The draws share one generator, so their order below fixes the values produced.
numeric_full = rnd.randint(1, 500, size=n_size)
unit_int = rnd.randint(1, 25, size=n_size)
float_full = rnd.random_sample(size=n_size)
fraction_part = rnd.random_sample(size=n_size)
whole_part = rnd.randint(1, 25, size=n_size)
category_obj = rnd.choice(list('abcd'), size=n_size)
string_obj = rnd.choice([chr(code) for code in range(97, 123)], size=n_size)  # 'a'..'z'

dummy_data = pd.DataFrame({
    'numericfull': numeric_full,
    'unitint': unit_int,
    'floatfull': float_full,
    'floatsmall': np.round(fraction_part + whole_part, 2),
    'categoryobj': category_obj,
    'stringobj': string_obj,
})
dummy_data.head()

Unnamed: 0,numericfull,unitint,floatfull,floatsmall,categoryobj,stringobj
0,304,1,0.651859,11.42,a,f
1,212,1,0.906869,23.28,d,v
2,295,23,0.933262,21.79,d,t
3,54,19,0.919103,9.24,d,s
4,205,9,0.262066,16.69,a,l


Pass it to `Dataset`  and let it do its magic

In [7]:
# Build the adaptor with its default options; the constructor processes `dummy_data` immediately.
dataset = Dataset(df=dummy_data)
# Bare last expression: displays the Dataset repr.
dataset

Dataset(df=Shape((1000000, 6), reduce_datatype=True, encode_category=None, add_intercept=False, na_treatment=allow, copy=False, digits=None, n_category=None, split_ratio=None)

To access the raw processed data:

In [8]:
# `data` holds the processed records; show only the first five.
dataset.data[:5]

array([(304,  1, 0.65185905, 11.42, 'a', 'f'),
       (212,  1, 0.90686905, 23.28, 'd', 'v'),
       (295, 23, 0.9332624 , 21.79, 'd', 't'),
       ( 54, 19, 0.9191031 ,  9.24, 'd', 's'),
       (205,  9, 0.2620663 , 16.69, 'a', 'l')],
      dtype=[('numericfull', '<u2'), ('unitint', 'u1'), ('floatfull', '<f4'), ('floatsmall', '<f4'), ('categoryobj', 'O'), ('stringobj', 'O')])

***Note:*** This is a [structured array](https://numpy.org/doc/stable/user/basics.rec.html), not a simple NumPy array or pandas DataFrame.

Size reduction is as follows:

In [9]:
# Both figures are byte counts scaled to megabytes (1 MB = 1e6 bytes).
# NOTE: `memory_usage(deep=True)` also counts the Python string objects, whereas `nbytes`
# only counts the 8-byte references held in the object ('O') fields, so the gap
# overstates the saving on the two string columns.
print('Pandas Data Frame        : ',np.round(dummy_data.memory_usage(deep=True).sum()*1e-6,2),'MB')
print('Dataset Structured Array : ',np.round(dataset.data.nbytes*1e-6,2),'MB')

Pandas Data Frame        :  148.0 MB
Dataset Structured Array :  27.0 MB


In [10]:
# `info()` prints its report itself and returns None, so it is not wrapped in print().
dummy_data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   numericfull  1000000 non-null  int64  
 1   unitint      1000000 non-null  int64  
 2   floatfull    1000000 non-null  float64
 3   floatsmall   1000000 non-null  float64
 4   categoryobj  1000000 non-null  object 
 5   stringobj    1000000 non-null  object 
dtypes: float64(2), int64(2), object(2)
memory usage: 141.1 MB
None


# Further reduction in data size

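Float columns can additionally be rounded with the `digits` parameter. Note that in this example rounding alone does not shrink the structured array any further: the float columns are already stored as 32-bit values, so the reported size stays the same.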

In [11]:
# Rebuild the adaptor, this time rounding float columns to two digits.
dataset = Dataset(df=dummy_data, digits=2)
# Both figures are byte counts scaled to megabytes (1 MB = 1e6 bytes).
print('Pandas Data Frame        : ',np.round(dummy_data.memory_usage(deep=True).sum()*1e-6,2),'MB')
print('Dataset Structured Array : ',np.round(dataset.data.nbytes*1e-6,2),'MB')

Pandas Data Frame        :  148.0 MB
Dataset Structured Array :  27.0 MB


In [12]:
# First five records of the current `dataset`; compare the float fields with the earlier preview.
dataset.data[:5]

array([(304,  1, 0.65, 11.42, 'a', 'f'), (212,  1, 0.91, 23.28, 'd', 'v'),
       (295, 23, 0.93, 21.79, 'd', 't'), ( 54, 19, 0.92,  9.24, 'd', 's'),
       (205,  9, 0.26, 16.69, 'a', 'l')],
      dtype=[('numericfull', '<u2'), ('unitint', 'u1'), ('floatfull', '<f4'), ('floatsmall', '<f4'), ('categoryobj', 'O'), ('stringobj', 'O')])

In [13]:
# data.features_
# dummy_data.to_dict?
# data._dtypes = data.df.dtypes.to_dict()
# data._dtypes
# dummy_data.rename?
# {k:v==object for k,v in data._dtypes.items()