In [1]:
%%html
<style>
.not_clean {
    border: 7px solid #F3FF70;
    padding: 20px;
}

.comment {
    border: 7px solid lightgreen;
    padding: 20px;
}
</style>

# Del 15: Procesiranje velikih datasetov - hitrost

In [2]:
import pandas as pd
import numpy as np

## CPU Bound Programs

### Bounds

In [20]:
import pandas as pd

In [21]:
df = pd.read_csv('data/demand_profile.csv')

In [22]:
df.head()

Unnamed: 0,date_time,energy_kwh
0,1/1/13 0:00,0.586
1,1/1/13 1:00,0.58
2,1/1/13 2:00,0.572
3,1/1/13 3:00,0.596
4,1/1/13 4:00,0.592


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date_time   8760 non-null   object 
 1   energy_kwh  8760 non-null   float64
dtypes: float64(1), object(1)
memory usage: 137.0+ KB


In [25]:
# The timestamps in this file are day-first (d/m/yy H:MM). Without
# `dayfirst=True`, ambiguous values such as '2/1/13' are read as February 1st,
# and values such as '13/1/13' can be rejected once a month-first format has
# been inferred from the first row.
df['date_time'] = pd.to_datetime(df['date_time'], dayfirst=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date_time   8760 non-null   datetime64[ns]
 1   energy_kwh  8760 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 137.0 KB


In [26]:
def convert(df, column_name):
    """Parse a column of date strings into datetimes without an explicit format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to parse; it is not modified.
    column_name : str
        Name of the column containing the date strings.

    Returns
    -------
    pandas.Series
        The parsed column, as returned by ``pd.to_datetime``.

    Notes
    -----
    The notebook's timestamps are written day-first (the explicit-format
    variant parses them with ``'%d/%m/%y %H:%M'``), so ``dayfirst=True`` is
    passed here. Without it, ambiguous strings such as ``'2/1/13'`` would be
    interpreted month-first.
    """
    return pd.to_datetime(df[column_name], dayfirst=True)

df = pd.read_csv('data/demand_profile.csv')
df_coverted = df.copy()

In [27]:
%%timeit -r 3 -n 10
df_coverted['date_time'] = convert(df, 'date_time')

443 ms ± 5.63 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)


In [31]:
def convert_with_format(df, column_name, date_format='%d/%m/%y %H:%M'):
    """Parse a column of date strings into datetimes using an explicit format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to parse; it is not modified.
    column_name : str
        Name of the column containing the date strings.
    date_format : str, optional
        ``strftime``-style format passed to ``pd.to_datetime``. Defaults to
        the day-first ``'%d/%m/%y %H:%M'`` layout used by this notebook's data.

    Returns
    -------
    pandas.Series
        The parsed column, as returned by ``pd.to_datetime``.
    """
    return pd.to_datetime(df[column_name], format=date_format)

In [32]:
%%timeit -r 3 -n 10
df_coverted['date_time'] = convert_with_format(df, 'date_time')

16.9 ms ± 512 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


<table class="table table-hover">
<thead>
<tr>
<th>Tariff Type</th>
<th>Cents per kWh</th>
<th>Time Range</th>
</tr>
</thead>
<tbody>
<tr>
<td>Peak</td>
<td>28</td>
<td>17:00 to 24:00</td>
</tr>
<tr>
<td>Shoulder</td>
<td>20</td>
<td>7:00 to 17:00</td>
</tr>
<tr>
<td>Off-Peak</td>
<td>12</td>
<td>0:00 to 7:00</td>
</tr>
</tbody>
</table>

In [36]:
def apply_tariff(kwh, hour):
    """Return the electricity cost (in cents) of `kwh` consumed during `hour`.

    Tariff bands, in cents per kWh:
      - hours 0-6   -> 12 (off-peak)
      - hours 7-16  -> 20 (shoulder)
      - hours 17-23 -> 28 (peak)

    Raises ValueError when `hour` is outside the half-open range [0, 24).
    """
    # Reject anything outside a valid hour of day up front.
    if not 0 <= hour < 24:
        raise ValueError(f'Invalid hour: {hour}')

    # `hour` is known to be in [0, 24), so upper bounds alone pick the band.
    if hour < 7:
        cents_per_kwh = 12
    elif hour < 17:
        cents_per_kwh = 20
    else:
        cents_per_kwh = 28
    return cents_per_kwh * kwh

### Simple Looping Over Pandas Data

In [37]:
# NOTE: Don't do this!
def apply_tariff_loop(df):
    """Compute the cost of every row by positional indexing, one row at a time.

    For each position the row is fetched with `df.iloc`, its 'energy_kwh'
    value and the hour of its 'date_time' value are passed to `apply_tariff`,
    and the results are stored in a new 'cost_cents' column.

    Modifies `df` in place and returns None.
    """
    costs = [
        apply_tariff(df.iloc[pos]['energy_kwh'], df.iloc[pos]['date_time'].hour)
        for pos in range(len(df))
    ]
    df['cost_cents'] = costs

In [38]:
%%timeit -r 3 -n 10
apply_tariff_loop(df_coverted)

941 ms ± 618 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


### Looping with .itertuples() and .iterrows()

In [39]:
def apply_tariff_iterrows(df):
    """Compute the cost of every row by walking the frame with `.iterrows()`.

    Each row's 'energy_kwh' value and the hour of its 'date_time' value are
    passed to `apply_tariff`; the results are written to a 'cost_cents'
    column. Modifies `df` in place and returns None.
    """
    costs = []
    # The index label yielded by iterrows() is not needed here.
    for _, record in df.iterrows():
        kwh, hour = record['energy_kwh'], record['date_time'].hour
        costs.append(apply_tariff(kwh, hour))
    df['cost_cents'] = costs

In [40]:
%%timeit -r 3 -n 10
apply_tariff_iterrows(df_coverted)

248 ms ± 5.02 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)


### Pandas’ .apply()

In [41]:
def apply_tariff_withapply(df):
    """Compute the cost of every row with a row-wise `DataFrame.apply`.

    Writes the results to a 'cost_cents' column. Modifies `df` in place and
    returns None.
    """
    def row_cost(row):
        # Price one row from its consumption and the hour of its timestamp.
        return apply_tariff(kwh=row['energy_kwh'], hour=row['date_time'].hour)

    # axis=1 hands each row (not each column) to `row_cost`.
    df['cost_cents'] = df.apply(row_cost, axis=1)

In [42]:
%%timeit -r 3 -n 10
apply_tariff_withapply(df_coverted)

51 ms ± 154 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


### Selecting Data With .isin()

In [43]:
# Rebuild the working frame from the raw data: parse the timestamps and use
# them as the index, leaving `df` itself untouched.
df_coverted = (
    df.assign(date_time=convert_with_format(df, 'date_time'))
      .set_index('date_time')
)

In [44]:
df_coverted.head()

Unnamed: 0_level_0,energy_kwh
date_time,Unnamed: 1_level_1
2013-01-01 00:00:00,0.586
2013-01-01 01:00:00,0.58
2013-01-01 02:00:00,0.572
2013-01-01 03:00:00,0.596
2013-01-01 04:00:00,0.592


In [45]:
def apply_tariff_isin(df):
    """Compute costs per tariff band using boolean masks built with `.isin()`.

    Reads the hour from `df.index` and the consumption from the 'energy_kwh'
    column, then writes 'cost_cents' one band at a time. Modifies `df` in
    place and returns None.
    """
    hour_of_day = df.index.hour

    # (hours covered by the band, cents per kWh), applied in this order:
    # peak, shoulder, off-peak.
    tariff_bands = (
        (range(17, 24), 28),
        (range(7, 17), 20),
        (range(0, 7), 12),
    )
    for hours, cents_per_kwh in tariff_bands:
        in_band = hour_of_day.isin(hours)
        df.loc[in_band, 'cost_cents'] = df.loc[in_band, 'energy_kwh'] * cents_per_kwh

In [46]:
%%timeit -r 3 -n 10
apply_tariff_isin(df_coverted)

2.74 ms ± 354 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


### Using NumPy

> **[numpy.digitize](https://docs.scipy.org/doc/numpy/reference/generated/numpy.digitize.html)**
- `numpy.digitize(x, bins, right=False)`
- Return the indices of the bins to which each value in input array belongs.

In [47]:
import numpy as np

In [48]:
# Rebuild the working frame from the raw data: parse the timestamps and use
# them as the index, leaving `df` itself untouched.
df_coverted = (
    df.assign(date_time=convert_with_format(df, 'date_time'))
      .set_index('date_time')
)

In [49]:
np.digitize(df_coverted.index.hour.values, bins=[7, 17, 24])

array([0, 0, 0, ..., 2, 2, 2], dtype=int64)

In [50]:
def apply_tariff_digitize(df):
    """Compute costs by binning the hour of day with `np.digitize`.

    Reads the hour from `df.index` and the consumption from the 'energy_kwh'
    column, then writes the result to 'cost_cents'. Modifies `df` in place and
    returns None.
    """
    hour_of_day = df.index.hour.values
    # Bin edges 7, 17, 24 map hours 0-6 -> 0, 7-16 -> 1, 17-23 -> 2.
    band_index = np.digitize(hour_of_day, bins=[7, 17, 24])
    # Look up the cents-per-kWh rate of each row's band.
    cents_per_kwh = np.array([12, 20, 28])[band_index]
    df['cost_cents'] = cents_per_kwh * df['energy_kwh'].values

In [51]:
%%timeit -r 3 -n 10
apply_tariff_digitize(df_coverted)

511 µs ± 58.2 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


### Prevent Reprocessing with HDFStore

In [56]:
df_coverted.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8760 entries, 2013-01-01 00:00:00 to 2013-12-31 23:00:00
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   energy_kwh  8760 non-null   float64
 1   cost_cents  8760 non-null   float64
dtypes: float64(2)
memory usage: 205.3 KB


In [57]:
# Open the HDF5 store `data/OUT_processed_data.h5`. The `with` block closes
# the file handle even if the write below raises.
with pd.HDFStore('data/OUT_processed_data.h5') as data_store:
    # Put the DataFrame into the store under the key 'preprocessed_df'
    data_store['preprocessed_df'] = df_coverted

In [58]:
# Re-open the HDF5 store. The `with` block closes the file handle even if
# the read below raises (for example when the key is missing).
with pd.HDFStore('data/OUT_processed_data.h5') as data_store:
    # Retrieve data using key
    preprocessed_df = data_store['preprocessed_df']

In [59]:
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8760 entries, 2013-01-01 00:00:00 to 2013-12-31 23:00:00
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   energy_kwh  8760 non-null   float64
 1   cost_cents  8760 non-null   float64
dtypes: float64(2)
memory usage: 205.3 KB


---

## Drugi nasveti

### [Dask](https://www.dask.org/)

https://docs.dask.org/en/stable/install.html

### [Numba](https://numba.pydata.org/)

Numba translates Python functions to optimized machine code at runtime using the industry-standard LLVM compiler library. Numba-compiled numerical algorithms in Python can approach the speeds of C or FORTRAN.

### pandas.eval() for Efficient Operations

[Dokumentacija](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.eval.html)

[High-Performance Pandas: eval() and query()](https://jakevdp.github.io/PythonDataScienceHandbook/03.12-performance-eval-and-query.html#pandas.eval()-for-Efficient-Operations)

As of version 0.13 (released January 2014), Pandas includes some experimental tools that allow you to directly access C-speed operations without costly allocation of intermediate arrays. These are the eval() and query() functions, which rely on the Numexpr package.