# Data provider
> A module that gets the raw data

In [None]:
#| default_exp data_provider

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
# Setting up autoreload and plotting aesthetics
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%reload_ext rpy2.ipython
import seaborn as sns
from matplotlib import pyplot as plt
custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="white", font_scale = 1.3, rc=custom_params)

In [None]:
#| export
import os
from pathlib import Path

import numpy as np
import pandas as pd
from fastcore.foundation import patch

import biuR.wrapper # Note that biuR was pip installed

In [None]:
import time

## DataProvider
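The task of the `DataProvider` is to ensure that the data folder has the expected structure: it exposes the `raw`, `external`, `interim` and `processed` sub-folders as attributes and creates any that are missing.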

In [None]:
#| export
class DataProvider():
    """
    Gives access to the standard sub-folders of a project's data folder.

    On construction the sub-folders `raw`, `external`, `interim` and `processed`
    are exposed as `pathlib.Path` attributes and created on disk if they are missing.
    """
    def __init__(self, data_folder_path):
        """
        data_folder_path: path (str or Path) of the data folder that holds the sub-folders.
        """
        # Creating some convenience variables
        self.data_folder_path = Path(data_folder_path)
        self.raw = self.data_folder_path / 'raw'
        self.external = self.data_folder_path / 'external'
        self.interim = self.data_folder_path / 'interim'
        self.processed = self.data_folder_path / 'processed'
        # Making sure the folders exist on disk
        self._ensure_directories_exist()

    def _ensure_directories_exist(self):
        """Create every sub-folder (and any missing parents); existing folders are left untouched."""
        for path in (self.raw, self.interim, self.processed, self.external):
            path.mkdir(parents=True, exist_ok=True)

In [None]:
#| export
def get_efficiently(func):
    """
    This decorator wraps around functions that get data and handles data storage.
    If the output from the function hasn't been stored yet, it stores it in "[path_to_interim]/[function_name_without_get].parquet"
    If the output from the function has been stored already, it loads the stored file instead of running the function (unless update is specified as True)

    The interim path is read from the `interim` attribute of the first positional argument
    (the `DataProvider` instance the method is called on).

    The returned wrapper accepts these extra keyword arguments:
    update:  if truthy, run the function again and overwrite the stored file.
    columns: optional list of column names to return; it is passed to `pd.read_parquet` when
             loading the stored file and used to subset a freshly computed result, so both
             paths return the same columns.
    path:    accepted for backward compatibility; currently not used.

    All remaining positional and keyword arguments are forwarded to the wrapped function.
    Note that the stored file depends only on the function name, not on these arguments.
    """
    # Only strip a *leading* prefix: a plain str.replace would also remove "get_" from the
    # middle of a name (e.g. "get_widget_data" would become "widdata").
    var_name = func.__name__
    for prefix in ('__get_', 'get_'):
        if var_name.startswith(prefix):
            var_name = var_name[len(prefix):]
            break

    def w(*args, update = False, columns = None, path = None, **kw):
        _self = args[0] # Getting self to grab interim path from DataProvider
        file_path = os.path.join(_self.interim, "%s.parquet"%var_name)
        if os.path.exists(file_path) and not update:
            return pd.read_parquet(file_path, columns = columns)
        print("Preparing %s"%var_name)
        result = func(*args, **kw)
        result.to_parquet(file_path) # Always store the full result, even if only some columns are requested
        if columns is not None:
            result = result[columns]
        return result
    w.__wrapped__ = func # Specifying the wrapped function for inspection
    w.__doc__ = func.__doc__
    w.__name__ = func.__name__
    w.__annotations__ = {'cls':DataProvider, 'as_prop':False} # Adding parameters to make this work with @patch
    return w

## Example: getting movisens data
This function uses the raw data folder path from the data provider to read "mov.csv".

In [None]:
@patch
@get_efficiently
def get_mov_data(self:DataProvider):
    """Read the semicolon-separated file "mov.csv" from the raw data folder and return it as a DataFrame."""
    time.sleep(3) # pretending this function takes time.
    # The location of the csv is remembered on the provider as `mov_path`.
    self.mov_path = self.raw / "mov.csv"
    return pd.read_csv(self.mov_path, sep = ';')

The first time we get the data, `mov_data.parquet` is created in the interim folder:

In [None]:
%%time
# NOTE(review): absolute, machine-specific path; DataProvider creates any missing sub-folders under it.
dp = DataProvider('/Users/hilmarzech/Projects/a02/data')
# Runs get_mov_data and stores mov_data.parquet in the interim folder, unless that file already exists.
mov = dp.get_mov_data()

CPU times: user 52.5 ms, sys: 14.4 ms, total: 67 ms
Wall time: 66.3 ms


The next time we get the data, it is read from the parquet file, which speeds up the process:

In [None]:
%%time
dp = DataProvider('/Users/hilmarzech/Projects/a02/data')
# With mov_data.parquet present in the interim folder, this reads the stored file instead of running the function.
mov = dp.get_mov_data()

CPU times: user 22.2 ms, sys: 2.16 ms, total: 24.3 ms
Wall time: 23.4 ms


If we want to run the function again to refresh the data, we set `update` to `True`:

In [None]:
%%time
dp = DataProvider('/Users/hilmarzech/Projects/a02/data')
# update = True runs the function again and overwrites the stored parquet file.
# The result is not assigned, so the returned frame is displayed as the cell output.
dp.get_mov_data(update = True)

Preparing mov_data
CPU times: user 128 ms, sys: 8.43 ms, total: 137 ms
Wall time: 3.14 s


Unnamed: 0,Participant,Trigger,Trigger_date,Trigger_time,Trigger_counter,Form,Form_start_date,Form_start_time,Form_finish_date,Form_finish_time,...,item_726,item_731,item_737,item_742,item_748,item_753,item_820,item_828,item_658,item_680
0,1,Initial,2024-04-10,13:54:36,1,Informed Consent,2024-04-10,13:54:36,2024-04-10,13:56:36,...,,,,,,,,,,
1,1,Initial,2024-04-10,13:54:36,1,Setup,2024-04-10,13:56:36,2024-04-10,13:58:04,...,,,,,,,,,,
2,1,Initial,2024-04-10,13:54:36,1,Demographics,2024-04-10,13:58:05,2024-04-10,13:58:09,...,,,,,,,,,1.0,25.0
3,1,Initial,2024-04-10,13:54:36,1,Bottle Game,2024-04-10,13:58:10,2024-04-10,14:03:39,...,,,,,,,,,,
4,1,Initial,2024-04-10,13:54:36,1,Sushi Game Initial,2024-04-10,14:03:41,2024-04-10,14:36:22,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590,78,Initial,2024-05-13,00:09:17,1,Setup,2024-05-13,00:09:31,2024-05-13,00:10:08,...,,,,,,,,,,
591,78,Initial,2024-05-13,00:09:17,1,Demographics,2024-05-13,00:10:11,2024-05-13,00:10:17,...,,,,,,,,,2.0,38.0
592,78,Initial,2024-05-13,00:09:17,1,Bottle Game,2024-05-13,00:10:17,2024-05-13,00:15:51,...,,,,,,,,,,
593,78,Initial,2024-05-13,00:09:17,1,Sushi Game Initial,2024-05-13,00:15:54,2024-05-13,00:49:58,...,,,,,,,,,,


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()