# Optimizing the generator object with Numba

In [1]:
from numba import jit
import pandas as pd
from timeit import timeit
from time import time
from pandas.core.frame import DataFrame, Series
import numpy as np
from numpy.core import ndarray, int64, float64
from typing import Dict, Tuple, List, Union
import os
import sys
import inspect
# Make the directory above the notebook importable so the project-local
# `utils` and `config` modules resolve.
# The working directory is used as the anchor: a notebook cell has no real
# source file, so `inspect.getfile(inspect.currentframe())` yields a
# kernel-specific pseudo path (a temp file on recent ipykernel releases),
# whose parent is not the project root.
# NOTE(review): assumes the kernel was started in the notebook's own directory.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:  # keep re-runs of this cell from stacking duplicates
    sys.path.insert(0, parent_dir)
import utils
import config
basedir: str = os.getcwd()

In [14]:
# Print the installed Numba version so the timings in this notebook can be
# tied to a specific release.
import numba
print(numba.__version__)

0.50.1


In [2]:
# Dataset paths for the 'tsa' entry of the project configuration.
datapaths: Dict[str, str] = config.datapaths['tsa']
train: DataFrame = pd.read_csv(datapaths['train_1_clean'], index_col=0)

# Store the descriptor columns as pandas categoricals.
categorical_cols: List[str] = ['sub_url', 'access', 'agent']
train[categorical_cols] = train[categorical_cols].astype('category')

# Every column except the last four is filled along each row: forward first,
# then backward for gaps that sit at the start of a row.
train.iloc[:, :-4] = train.iloc[:, :-4].ffill(axis=1).bfill(axis=1)

# Rows that still contain a missing value are discarded.
train = train.dropna(axis=0)
train

Unnamed: 0,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,2015-07-10,...,2016-12-26,2016-12-27,2016-12-28,2016-12-29,2016-12-30,2016-12-31,subject,sub_url,access,agent
0,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,24.0,...,14.0,20.0,22.0,19.0,18.0,20.0,2NE1,zh.wikipedia.org,all-access,spider
1,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,4.0,...,9.0,30.0,52.0,45.0,26.0,20.0,2PM,zh.wikipedia.org,all-access,spider
2,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,4.0,...,4.0,4.0,6.0,3.0,4.0,17.0,3C,zh.wikipedia.org,all-access,spider
3,35.0,13.0,10.0,94.0,4.0,26.0,14.0,9.0,11.0,16.0,...,16.0,11.0,17.0,19.0,10.0,11.0,4minute,zh.wikipedia.org,all-access,spider
4,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,38.0,...,3.0,11.0,27.0,13.0,36.0,10.0,52 Hz I Love You,zh.wikipedia.org,all-access,spider
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145054,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,...,17.0,7.0,13.0,12.0,31.0,11.0,Skam (serie de televisión),es.wikipedia.org,all-access,spider
145055,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,...,3.0,4.0,2.0,4.0,4.0,3.0,Legión (serie de televisión),es.wikipedia.org,all-access,spider
145056,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,...,21.0,21.0,21.0,21.0,21.0,51.0,Doble tentación,es.wikipedia.org,all-access,spider
145057,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Mi adorable maldición,es.wikipedia.org,all-access,spider


## With Numba

In [3]:
@jit(nopython=True)
def lagged_features_numba(features_tensor: ndarray, num_lags: int, fill_value: Union[int, float] = np.nan) -> ndarray:
    '''Shift a 3-D feature tensor by `num_lags` steps along axis 1 (Numba-compiled).

    `num_lags` slices of `fill_value` are prepended along axis 1 and the
    result is cut back to the original length of that axis, so position `t`
    of the output holds position `t - num_lags` of the input (or `fill_value`
    when `t < num_lags`).

    Args:
        features_tensor: 3-D array; its three dimensions are read as (N, T, D).
        num_lags: Number of padding steps inserted at the front of axis 1.
        fill_value: Value used for the padding.

    Returns:
        A slice with the same shape as `features_tensor`.
    '''
    n_series = features_tensor.shape[0]
    n_periods = features_tensor.shape[1]
    n_features = features_tensor.shape[2]
    padding = np.full((n_series, num_lags, n_features), fill_value)
    shifted = np.concatenate((padding, features_tensor), axis=1)
    # Keep only the first T steps so the output length matches the input.
    return shifted[:, :n_periods]

In [4]:
@jit(nopython=True)
def single_autocorr_numba(series: ndarray, num_lags: int) -> float64:
    '''Autocorrelation of a 1-D series at the given lag (Numba-compiled).

    The series is correlated with a copy of itself shifted by `num_lags`
    positions. Means and sums are computed with the NaN-ignoring NumPy
    reductions.

    Args:
        series: 1-D array of observations.
        num_lags: Lag in positions. Only its magnitude is used, since the
            autocorrelation is symmetric in the lag. A lag of 0 correlates
            the series with itself.

    Returns:
        The correlation coefficient, or 0.0 when the lag leaves no overlap
        (`|num_lags| >= len(series)`) or when either window has zero spread.
    '''
    n = series.shape[0]
    lag = abs(num_lags)
    if lag >= n:
        # No overlapping observations between the two windows.
        return 0.0

    x0 = series[lag:]
    # `series[:-lag]` would be empty for lag == 0, so the stop index is
    # computed explicitly.
    x1 = series[:n - lag]

    mu0 = np.nanmean(x0)
    mu1 = np.nanmean(x1)
    dx0 = x0 - mu0
    dx1 = x1 - mu1
    sigma0 = np.sqrt(np.nansum(dx0 * dx0))
    sigma1 = np.sqrt(np.nansum(dx1 * dx1))

    denominator = sigma0 * sigma1
    if denominator == 0:
        # Constant (or all-NaN) window: the coefficient is undefined, report 0.
        return 0.0
    return np.nansum(dx0 * dx1) / denominator

In [5]:
@jit(nopython=True)
def batch_autocorr_numba(time_series: ndarray, num_lags: int, lookback_periods: int) -> ndarray:
    '''Per-row autocorrelation, broadcast over the look-back window (Numba-compiled).

    Args:
        time_series: 2-D array with one series per row.
        num_lags: Lag forwarded to `single_autocorr_numba`.
        lookback_periods: Number of times each coefficient is repeated.

    Returns:
        Array of shape (N, lookback_periods, 1) in which every entry of row
        `i` equals the autocorrelation of `time_series[i]`.
    '''
    n_series = time_series.shape[0]

    coefficients = []
    for row in range(n_series):
        coefficients.append(single_autocorr_numba(time_series[row, :], num_lags))

    # One coefficient per series, shape (N,).
    per_series = np.array(coefficients)
    # Repeat each coefficient consecutively, then fold the repeats into columns:
    # (N * lookback_periods,) -> (N, lookback_periods).
    tiled = np.repeat(per_series, lookback_periods).reshape(-1, lookback_periods)
    # Trailing feature axis: (N, lookback_periods, 1).
    return np.expand_dims(tiled, -1)

## Without Numba

In [6]:
def lagged_features(features_tensor: ndarray, num_lags: int, fill_value: Union[int, float] = np.nan) -> ndarray:
    '''Shift a 3-D feature tensor by `num_lags` steps along axis 1.

    `num_lags` slices of `fill_value` are prepended along axis 1 and the
    result is cut back to the original length of that axis, so position `t`
    of the output holds position `t - num_lags` of the input (or `fill_value`
    when `t < num_lags`).

    Args:
        features_tensor: 3-D array unpacked as (N, T, D).
        num_lags: Number of padding steps inserted at the front of axis 1.
        fill_value: Value used for the padding.

    Returns:
        A slice with the same shape as `features_tensor`.
    '''
    n_series, n_periods, n_features = features_tensor.shape
    padding = np.full((n_series, num_lags, n_features), fill_value)
    # Pad at the front, then drop the overflow at the back.
    return np.concatenate((padding, features_tensor), axis=1)[:, :n_periods]

In [7]:
def single_autocorr(series: ndarray, num_lags: int) -> float64:
    '''Autocorrelation of a 1-D series at the given lag.

    The series is correlated with a copy of itself shifted by `num_lags`
    positions. Means and sums are computed with the NaN-ignoring NumPy
    reductions.

    Args:
        series: 1-D array of observations.
        num_lags: Lag in positions. Only its magnitude is used, since the
            autocorrelation is symmetric in the lag. A lag of 0 correlates
            the series with itself.

    Returns:
        The correlation coefficient, or 0.0 when the lag leaves no overlap
        (`|num_lags| >= len(series)`) or when either window has zero spread.
    '''
    n = series.shape[0]
    lag = abs(num_lags)
    if lag >= n:
        # No overlapping observations between the two windows.
        return 0.0

    x0 = series[lag:]
    # `series[:-lag]` would be empty for lag == 0, so the stop index is
    # computed explicitly.
    x1 = series[:n - lag]

    mu0 = np.nanmean(x0)
    mu1 = np.nanmean(x1)
    dx0 = x0 - mu0
    dx1 = x1 - mu1
    sigma0 = np.sqrt(np.nansum(dx0 * dx0))
    sigma1 = np.sqrt(np.nansum(dx1 * dx1))

    denominator = sigma0 * sigma1
    if denominator == 0:
        # Constant (or all-NaN) window: the coefficient is undefined, report 0.
        return 0.0
    return np.nansum(dx0 * dx1) / denominator

In [8]:
def batch_autocorr(time_series: ndarray, num_lags: int, lookback_periods: int, verbose: bool = True) -> ndarray:
    '''Per-row autocorrelation, broadcast over the look-back window.

    Args:
        time_series: 2-D array with one series per row.
        num_lags: Lag forwarded to `single_autocorr`.
        lookback_periods: Number of times each coefficient is repeated.
        verbose: When true, a progress line is printed for every tenth row.

    Returns:
        Array of shape (N, lookback_periods, 1) in which every entry of row
        `i` equals the autocorrelation of `time_series[i]`.
    '''
    n_series, _ = time_series.shape

    coefficients = []
    for idx in range(n_series):
        if verbose and idx % 10 == 0:
            print(f"Series {idx}/{n_series}")
        coefficients.append(single_autocorr(time_series[idx, :], num_lags))

    # Column vector with a trailing feature axis: (N,) -> (N, 1) -> (N, 1, 1).
    column = np.expand_dims(np.array(coefficients).reshape(-1, 1), -1)

    # Copy the coefficient along the time axis: (N, lookback_periods, 1).
    return np.repeat(column, lookback_periods, axis=1)

In [None]:
def one_hot_and_dims(series: Series, lookback_periods: int) -> ndarray:
    '''One-hot encode a feature and repeat it along a new time axis.

    The encoding produced by `one_hot_encode_series` receives a new axis at
    position 1, along which it is repeated `lookback_periods` times.
    NOTE(review): presumably the encoding is (N, categories), which would make
    the result (N, lookback_periods, categories) - confirm against the helper.
    '''
    one_hot: ndarray = one_hot_encode_series(series)
    with_time_axis: ndarray = np.expand_dims(one_hot, 1)
    return np.repeat(with_time_axis, lookback_periods, axis=1)

In [None]:
def create_medians_tensor(time_series_matrix: ndarray, lookback_periods: int):
    '''Median of every row, broadcast over the look-back window.

    Args:
        time_series_matrix: Array whose axis 1 is reduced with the median;
            for a 2-D (N, T) input the medians have shape (N,).
        lookback_periods: Number of repeats along axis 1 of the result.

    Returns:
        For a 2-D input, an array of shape (N, lookback_periods, 1) in which
        every entry of row `i` is the median of row `i`.
    '''
    row_medians: ndarray = np.median(time_series_matrix, axis=1)
    # Two trailing singleton axes: (N,) -> (N, 1, 1).
    column = row_medians[..., np.newaxis, np.newaxis]
    # Stretch the time axis to the window length: (N, lookback_periods, 1).
    return np.repeat(column, lookback_periods, axis=1)

In [None]:
def get_batch(time_series: DataFrame, 
              global_features: DataFrame = None, 
              start: int = 0, 
              lookback_periods: int = 100, 
              lags: tuple = None,
              show_runtime: bool = True) -> tuple:
    '''Build one feature tensor and its target from a window of the series.

    The window covers columns `start` .. `start + lookback_periods - 1` of
    `time_series`; the target is the `log1p` of the column right after it.
    Feature blocks are stacked along the last axis in this order:

    1. `log1p` of the window values,
    2. one-hot encoded weekdays of the window,
    3. for every lag in `lags`: the lagged log-series (NaN-padded) followed
       by the per-series autocorrelation of the raw window,
    4. for every column of `global_features`: its one-hot encoding,
    5. the per-series median of the raw window.

    Args:
        time_series: Frame with one series per row and one period per column.
        global_features: Optional frame of per-series features; must have as
            many rows as `time_series`.
        start: Positional index of the first column of the window.
        lookback_periods: Number of columns in the window.
        lags: Optional iterable of lags for the lagged/autocorrelation blocks.
        show_runtime: When true, the elapsed time is reported through
            `utils.show_elapsed_time`.

    Returns:
        `(batch, target)`: the stacked feature tensor and the array of targets.

    Raises:
        AssertionError: if the target column lies outside `time_series`, if
            `global_features` is a Series, or if its row count differs.
    '''
    if show_runtime:
        start_time: float = time()

    # N is the number of samples
    # T is the number of periods
    N, T = time_series.shape

    end: int = start + lookback_periods
    # Column `end` is read as the target, so it must exist: strictly `end < T`.
    assert end < T, (f"Target period out of bounds. Target index: {end}, "
                     f"but your time series has only {T} periods.")

    time_series_matrix: ndarray = time_series.iloc[:, start:end].to_numpy()
    target: ndarray = np.log1p(time_series.iloc[:, end].to_numpy())

    log_series: ndarray = np.log1p(time_series_matrix)  # Shape: (N, lookback_periods)
    log_series = np.expand_dims(log_series, axis=-1)  # Shape: (N, lookback_periods, 1)

    # Weekday shapes below follow the original author's notes; they depend on
    # helpers that are not defined in this part of the notebook.
    weekdays: List[str] = get_weekdays(time_series, start, end)
    days_one_hot: ndarray = one_hot_encode_series(weekdays)  # Shape: (lookback_periods, 7)
    days_one_hot = np.expand_dims(days_one_hot, 0)  # Shape: (1, lookback_periods, 7)
    days: ndarray = np.repeat(days_one_hot, repeats=N, axis=0)  # Shape: (N, lookback_periods, 7)

    # Blocks are collected first and concatenated once at the end, instead of
    # re-copying the growing batch for every added feature.
    feature_blocks: List[ndarray] = [log_series, days]

    if lags is not None:
        for lag in lags:
            feature_blocks.append(lagged_features(log_series, lag, np.nan))
            # TODO(original author): why the raw series here rather than log_series?
            feature_blocks.append(batch_autocorr(time_series_matrix, lag, lookback_periods, verbose=False))

    if global_features is not None:
        assert not isinstance(global_features, Series), "You passed a 'pandas.Series' object instead of a 'pandas.DataFrame'"

        N_, _ = global_features.shape
        assert N == N_, ("'time_series' and 'global_features' must have same number of samples."
                         + f" You gave {N} samples for 'time_series', but {N_} for 'global_features'.")

        for feature in global_features.columns:
            feature_blocks.append(one_hot_and_dims(global_features[feature], lookback_periods))

    # TODO(original author): why the raw series here rather than log_series?
    medians: ndarray = create_medians_tensor(time_series_matrix, lookback_periods)  # Shape (N, lookback_periods, 1)
    feature_blocks.append(medians)

    batch: ndarray = np.concatenate(feature_blocks, axis=2)

    if show_runtime:
        end_time: float = time()
        utils.show_elapsed_time(start_time, end_time)

    return batch, target

## Performance comparison

In [9]:
# Benchmark input: every column of `train` except the last four, as a NumPy array.
samples: ndarray = train.iloc[:, :-4].to_numpy()
# Passed as `lookback_periods` to the batch functions timed below.
lb: int = 100
# Passed as `num_lags` to the batch functions timed below.
lag: int = 365

In [10]:
# Baseline: wall-clock time of one `batch_autocorr` call (progress output disabled).
start: float = time()
batch_autocorr(samples, lag, lb, verbose=False)
time_batch_autocorr: float = time() - start
print(f"Elapsed time: {time_batch_autocorr}")

Elapsed time: 14.279760837554932


In [11]:
# First timed call of the Numba version; as the printed label says, this
# figure is meant to include the JIT compilation.
start: float = time()
batch_autocorr_numba(samples, lag, lb)
time_batch_autocorr_numba_compil: float = time() - start
print(f"Elapsed time (with compilation): {time_batch_autocorr_numba_compil}")

Elapsed time (with compilation): 2.0739457607269287


In [12]:
# Same call repeated with identical arguments; the printed label marks this
# as the run without compilation.
start: float = time()
batch_autocorr_numba(samples, lag, lb)
time_batch_autocorr_numba: float = time() - start
print(f"Elapsed time (without compilation): {time_batch_autocorr_numba}")

Elapsed time (without compilation): 0.4977879524230957


In [13]:
# Speed-up factor: baseline time divided by the time of the second
# (already compiled) Numba call.
print(f"Numba makes function {time_batch_autocorr/time_batch_autocorr_numba} times much faster.")

Numba makes function 28.686433185144313 times much faster.
