# Del 15: Procesiranje velikih datasetov - hitrost

In [1]:
import pandas as pd
import numpy as np

## CPU Bound Programs

### Bounds vs Limitations

<img alt="I/O bounds" src="images/CPU+and+I_O+bounds.png">

### Primer optimizacije

In [3]:
import numpy as np

# Define a basic Haversine distance formula
def haversine(lat1, lon1, lat2, lon2):
    MILES = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    total_miles = MILES * c
    return total_miles

In [4]:
df = pd.read_csv('data/new_york_hotels.csv')

#### Crude looping over DataFrame rows using indices

In [5]:
# Define a function to manually loop over all rows and return a series of distances
def haversine_looping(df):
    distance_list = []
    for i in range(0, len(df)):
        d = haversine(40.671, -73.985, df.iloc[i]['latitude'], df.iloc[i]['longitude'])
        distance_list.append(d)
    return distance_list

In [6]:
%%timeit
# Run the haversine looping function
df['distance'] = haversine_looping(df)

1.13 s ± 6.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Looping with iterrows()

In [7]:
%%timeit
# Haversine applied on rows via iteration
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))
df['distance'] = haversine_series

346 ms ± 55.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Looping with apply()

In [8]:
%%timeit

# Timing apply on the Haversine function
df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

162 ms ± 10.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Vectorization with Pandas series

In [9]:
%%timeit 
# Vectorized implementation of Haversine applied on Pandas series
df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

5.05 ms ± 151 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


####  Vectorization with NumPy arrays

In [10]:
%%timeit
# Vectorized implementation of Haversine applied on NumPy arrays
df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

742 µs ± 1.38 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## I/O Bound Programs

### I/O Bounds

<img src="./images/report_assembly.png">

<img src="./images/report_assembly_bidir.png">

### Profiling an I/O bound task

In [11]:
query = '''
SELECT DISTINCT teamID 
FROM Teams 
INNER JOIN TeamsFranchises ON Teams.franchID == TeamsFranchises.franchID 
WHERE TeamsFranchises.active = 'Y';
'''

In [12]:
import cProfile
import sqlite3

conn = sqlite3.connect("data/lahman2015.sqlite")

cur = conn.cursor()
teams = [row[0] for row in cur.execute(query).fetchall()]

In [13]:
print(teams)

['BSN', 'CHN', 'CN2', 'PT1', 'SL4', 'NY1', 'PHI', 'BR3', 'PIT', 'BRO', 'CIN', 'SLN', 'BLA', 'BOS', 'CHA', 'CLE', 'DET', 'MLA', 'PHA', 'WS1', 'SLA', 'NYA', 'ML1', 'BAL', 'KC1', 'LAN', 'SFN', 'LAA', 'MIN', 'WS2', 'HOU', 'NYN', 'CAL', 'ATL', 'OAK', 'KCA', 'SE1', 'MON', 'SDN', 'ML4', 'TEX', 'SEA', 'TOR', 'COL', 'FLO', 'ANA', 'TBA', 'ARI', 'MIL', 'WAS', 'MIA']


In [14]:
import cProfile
import sqlite3

query = "SELECT SUM(HR) FROM Batting WHERE teamId=?"
conn = sqlite3.connect("data/lahman2015.sqlite")
cur = conn.cursor()

def calculate_runs(teams):
    home_runs = []
    for team in teams:
        runs = cur.execute(query, [team]).fetchall()
        runs = runs[0][0]
        home_runs.append(runs)
    return home_runs

In [15]:
%%timeit
home_runs = calculate_runs(teams)

219 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
profile_string = "home_runs = calculate_runs(teams)"

In [17]:
cProfile.run(profile_string)

         157 function calls in 0.223 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.223    0.223 2346532923.py:8(calculate_runs)
        1    0.000    0.000    0.223    0.223 <string>:1(<module>)
        1    0.000    0.000    0.223    0.223 {built-in method builtins.exec}
       51    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
       51    0.221    0.004    0.221    0.004 {method 'execute' of 'sqlite3.Cursor' objects}
       51    0.001    0.000    0.001    0.000 {method 'fetchall' of 'sqlite3.Cursor' objects}




### Blocking Tasks

```python
conn = sqlite3.connect(':memory:')
```

In [18]:
import sqlite3

# Create an in memory database.
memory = sqlite3.connect(':memory:')

# Connect to our disk database.
disk = sqlite3.connect('data/lahman2015.sqlite')

# Create a query that will read the contents of the disk database 
# into another database.
dump = ''.join(line for line in disk.iterdump())

# Run the query to copy the database from disk into memory.
memory.executescript(dump)

cur = memory.cursor()

In [19]:
dump[:1000]

'BEGIN TRANSACTION;CREATE TABLE AllstarFull (\nplayerID TEXT,\nyearID INTEGER,\ngameNum INTEGER,\ngameID TEXT,\nteamID TEXT,\nlgID TEXT,\nGP INTEGER,\nstartingPos INTEGER\n);INSERT INTO "AllstarFull" VALUES(\'aaronha01\',1955,0,\'NLS195507120\',\'ML1\',\'NL\',1,NULL);INSERT INTO "AllstarFull" VALUES(\'aaronha01\',1956,0,\'ALS195607100\',\'ML1\',\'NL\',1,NULL);INSERT INTO "AllstarFull" VALUES(\'aaronha01\',1957,0,\'NLS195707090\',\'ML1\',\'NL\',1,9);INSERT INTO "AllstarFull" VALUES(\'aaronha01\',1958,0,\'ALS195807080\',\'ML1\',\'NL\',1,9);INSERT INTO "AllstarFull" VALUES(\'aaronha01\',1959,1,\'NLS195907070\',\'ML1\',\'NL\',1,9);INSERT INTO "AllstarFull" VALUES(\'aaronha01\',1959,2,\'NLS195908030\',\'ML1\',\'NL\',1,9);INSERT INTO "AllstarFull" VALUES(\'aaronha01\',1960,1,\'ALS196007110\',\'ML1\',\'NL\',1,9);INSERT INTO "AllstarFull" VALUES(\'aaronha01\',1960,2,\'ALS196007130\',\'ML1\',\'NL\',1,9);INSERT INTO "AllstarFull" VALUES(\'aaronha01\',1961,1,\'NLS196107110\',\'ML1\',\'NL\',1,NULL

In [20]:
import cProfile
import sqlite3

query = "SELECT SUM(HR) FROM Batting WHERE teamId=?"

def calculate_runs(teams):
    home_runs = []
    for team in teams:
        runs = cur.execute(query, [team]).fetchall()
        runs = runs[0][0]
        home_runs.append(runs)
    return home_runs

In [21]:
profile_string = "home_runs = calculate_runs(teams)"
cProfile.run(profile_string)

         157 function calls in 0.055 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.054    0.054 3437437996.py:6(calculate_runs)
        1    0.000    0.000    0.055    0.055 <string>:1(<module>)
        1    0.000    0.000    0.055    0.055 {built-in method builtins.exec}
       51    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
       51    0.054    0.001    0.054    0.001 {method 'execute' of 'sqlite3.Cursor' objects}
       51    0.000    0.000    0.000    0.000 {method 'fetchall' of 'sqlite3.Cursor' objects}




### Parallel Execution

<img src="./images/single_threaded.png">

<img src="./images/multi_threaded.png">

In [23]:
def task(team):
    print(team)

In [24]:
import threading

thread = threading.Thread(target=task, args=(teams,))
thread.start()

['BSN', 'CHN', 'CN2', 'PT1', 'SL4', 'NY1', 'PHI', 'BR3', 'PIT', 'BRO', 'CIN', 'SLN', 'BLA', 'BOS', 'CHA', 'CLE', 'DET', 'MLA', 'PHA', 'WS1', 'SLA', 'NYA', 'ML1', 'BAL', 'KC1', 'LAN', 'SFN', 'LAA', 'MIN', 'WS2', 'HOU', 'NYN', 'CAL', 'ATL', 'OAK', 'KCA', 'SE1', 'MON', 'SDN', 'ML4', 'TEX', 'SEA', 'TOR', 'COL', 'FLO', 'ANA', 'TBA', 'ARI', 'MIL', 'WAS', 'MIA']


In [25]:
def task(team):
    print(team)
    

for n, team in enumerate(teams):
    thread = threading.Thread(target=task, args=(team,))
    thread.start()
    print('Started task', n)

BSN
Started task 0
CHN
Started task 1
CN2
Started task 2
PT1
Started task 3
SL4
Started task 4
NY1
Started task 5
PHIStarted task
 6
BR3
Started task 7
PITStarted task 8

BRO
Started task 9
CIN
Started task 10
SLN
Started task 11
BLA
Started task 12
BOS
Started task 13
CHA
Started task 14
CLEStarted task
 15
DET
Started task 16
MLA
Started task 17
PHAStarted task 18
WS1
Started task 19
SLA
Started task 20
NYA
Started task 
21
ML1
Started task 22
BALStarted task
 23
KC1Started task
 24
LANStarted task 25

SFN
Started task 26
LAA
Started task 27
MINStarted task 28

WS2
Started task 29
HOU
Started task 30
NYN
Started task 31
CAL
Started task 32
ATLStarted task 33

OAK
Started task 34
KCA
Started task 35
SE1
Started task 36
MON
Started task 37
SDNStarted task 38
ML4

Started task 39
TEX
Started task 40
SEAStarted task 41

TOR
Started task 42
COL
Started task 43
FLO
Started task 44
ANA
Started task 45
TBA
Started task 46
ARI
Started task 47
MIL
Started task 48
WAS
Started task 49
MIA
Starte

### Thread Blocking

<img src="./images/three_threads.svg">

In [29]:
import threading
import time

def task(team):
    time.sleep(3)
    print(team)
    
for n, team in enumerate(teams):
    thread = threading.Thread(target=task, args=(team,))
    thread.start()
    #task(team)
    print('Started task', n)

Started task 0
Started task 1
Started task 2
Started task 3
Started task 4
Started task 5
Started task 6
Started task 7
Started task 8
Started task 9
Started task 10
Started task 11
Started task 12
Started task 13
Started task 14
Started task 15
Started task 16
Started task 17
Started task 18
Started task 19
Started task 20
Started task 21
Started task 22
Started task 23
Started task 24
Started task 25
Started task 26
Started task 27
Started task 28
Started task 29
Started task 30
Started task 31
Started task 32
Started task 33
Started task 34
Started task 35
Started task 36
Started task 37
Started task 38
Started task 39
Started task 40
Started task 41
Started task 42
Started task 43
Started task 44
Started task 45
Started task 46
Started task 47
Started task 48
Started task 49
Started task 50
BSN
CHN
CN2
PT1
PHI
NY1
SL4
PIT
BR3
BRO
CIN
CHA
BLA
SLN
BOS
CLE
DET
MLA
PHA
ML1
SLA
NYA
WS1
LAA
SFN
LAN
BAL
OAK
NYN
CAL
WS2
ATL
HOU
SDN
KCA
KC1
SE1
MIN
MON
ML4
TEX
FLO
COL
TOR
SEA
ANA
TBA
MIL
WA

### Joining Threads

```python
t1 = threading.Thread(target=task, args=(team,))
t2 = threading.Thread(target=task, args=(team,))
t3 = threading.Thread(target=task, args=(team,))

# Start the first three threads
t1.start()
t2.start()
t3.start()

t1.join() # Wait until t1 finishes.
t2.join() # Wait until t2 finishes.  If it already finished, then keep going.
t3.join() # Wait until t3 finishes.  If it already finished, then keep going.
```

<img src="./images/Screenshot from 2019-06-29 16-40-25.png">

In [34]:
import time

def task(team):
    time.sleep(0.5)
    print(team)

for i in range(11):
    team_names = teams[i*5: (i+1) * 5]
    threads = []
    for team in team_names:
        thread = threading.Thread(target=task, args=(team,))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    print("Finished batch {}".format(i)) 

print("Done")

BSN
CHN
CN2
SL4PT1

Finished batch 0
NY1PHI
PIT
BR3
BRO

Finished batch 1
CINSLN

BOS
BLA
CHA
Finished batch 2
CLE
DET
PHA
MLA
WS1
Finished batch 3
SLA
ML1
BALNYA
KC1

Finished batch 4
LAN
LAA
WS2
SFN
MIN
Finished batch 5
HOU
NYN
CAL
OAK
ATL
Finished batch 6
KCA
SE1
MON
ML4
SDN
Finished batch 7
TEX
SEA
FLO
TOR
COL
Finished batch 8
ANA
ARITBA

WAS
MIL
Finished batch 9
MIA
Finished batch 10
Done


### Locking

```python

lock = threading.Lock()

def task(team):
    lock.acquire()
    # This code cannot be executed until a thread acquires the lock.
    print(team)
    lock.release()

t1 = threading.Thread(target=task, args=(team,))
t2 = threading.Thread(target=task, args=(team,))

t1.start()
t2.start()

```

In [35]:
import threading
import time
import sys

lock = threading.Lock()

def task(team):
    lock.acquire()
    print(team)
    sys.stdout.flush()
    lock.release()
    
for i in range(11):
    team_names = teams[i*5: (i+1) * 5]
    threads = []
    for team in team_names:
        thread = threading.Thread(target=task, args=(team,))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
    print("Finished batch {}".format(i))   

BSN
CHN
CN2
PT1
SL4
Finished batch 0
NY1
PHI
BR3
PIT
BRO
Finished batch 1
CIN
SLN
BLA
BOS
CHA
Finished batch 2
CLE
DET
MLA
PHA
WS1
Finished batch 3
SLA
NYA
ML1
BAL
KC1
Finished batch 4
LAN
SFN
LAA
MIN
WS2
Finished batch 5
HOU
NYN
CAL
ATL
OAK
Finished batch 6
KCA
SE1
MON
SDN
ML4
Finished batch 7
TEX
SEA
TOR
COL
FLO
Finished batch 8
ANA
TBA
ARI
MIL
WAS
Finished batch 9
MIA
Finished batch 10


### Thread Safety

In [36]:
import cProfile
import sqlite3
import threading
import sys

In [37]:
query = "SELECT DISTINCT teamID from Teams inner join TeamsFranchises on Teams.franchID == TeamsFranchises.franchID where TeamsFranchises.active = 'Y';"

In [38]:
conn = sqlite3.connect("data/lahman2015.sqlite", check_same_thread=False)

In [39]:
cur = conn.cursor()
teams = [row[0] for row in cur.execute(query).fetchall()]

query = "SELECT SUM(HR) FROM Batting WHERE teamId=?"
lock = threading.Lock()

In [40]:
def calculate_runs(team):
    cur = conn.cursor()
    runs = cur.execute(query, [team]).fetchall()
    runs = runs[0][0]
    lock.acquire()
    print(team, ':', runs)
    sys.stdout.flush()
    lock.release()
    return runs


threads = []

for team in teams:
    thread = threading.Thread(target=calculate_runs, args=(team,))
    thread.start()
    threads.append(thread)
    
for thread in threads:
    thread.join()

CN2 : 267
PT1 : 54
CHN : 13530
BSN : 3424
SL4 : 305
PHI : 12503
NY1 : 5777
BR3 : 143
PIT : 10878
SLN : 11157
CLE : 12333
PHA : 3502
BRO : 4336
BLA : 57
BOS : 12883
CIN : 12383
MLA : 26
DET : 13160
CHA : 10792
SLA : 3014
WS1 : 2786
NYA : 15218
ML1 : 2230
LAN : 7601
SFN : 8348
LAA : 2276
HOU : 6536
MIN : 7393
BAL : 9592
NYN : 6817
KC1 : 1480
CAL : 3912
WS2 : 1387
KCA : 5613
ATL : 7535
OAK : 7438
SE1 : 125
MON : 4381
TEX : 7055
ML4 : 3664
SEA : 5976
FLO : 2816
COL : 4120
SDN : 5648
TOR : 6415
ANA : 1324
TBA : 2823
ARI : 2987
MIL : 3160
WAS : 2002
MIA : 474


## Dask

https://github.com/dask/dask-tutorial

## Optimizing Python Code with pandas

### Basic Looping

### Select columns and rows efficiently


In [41]:
data = pd.read_csv('data/school.csv')
data.head(3)

Unnamed: 0,School ID,School Name,Building Code,Street Address,City,State,Zip Code
0,02M260,Clinton School Writers and Artists,M933,425 West 33rd Street,Manhattan,NY,10001
1,06M211,Inwood Early College for Health and Informatio...,M052,650 Academy Street,Manhattan,NY,10002
2,01M539,"New Explorations into Science, Technology and ...",M022,111 Columbia Street,Manhattan,NY,10002


In [42]:
data['City'].value_counts().head(10)

Brooklyn               121
Bronx                  118
Manhattan              106
Jamaica                 13
Long Island City        12
Staten Island           10
Flushing                 8
Astoria                  6
Elmhurst                 5
Springfield Gardens      4
Name: City, dtype: int64

In [43]:
# save the top cities in a list
top_cities = ['Brooklyn','Bronx','Manhattan','Jamaica','Long Island City']

In [44]:
%%timeit
data.loc[(data['City'].isin(top_cities) == False),'City'] = 'Others'

745 µs ± 26.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [45]:
data.City.value_counts()

Brooklyn            121
Bronx               118
Manhattan           106
Others               65
Jamaica              13
Long Island City     12
Name: City, dtype: int64

In [46]:
data = pd.read_csv('data/school.csv')

In [47]:
%%timeit
data['City'][(data['City'].isin(top_cities) == False)] = 'Others'
# salba praksa

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


3.67 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [48]:
data.City.value_counts()

Brooklyn            121
Bronx               118
Manhattan           106
Others               65
Jamaica              13
Long Island City     12
Name: City, dtype: int64

### Uporaba biult-in funkciji

### Joining on indexes is faster than joining on columns

In [49]:
n = 100000

i1 = np.arange(n)
np.random.shuffle(i1)
df1 = pd.DataFrame({'i': i1,
                    'j': np.random.randint(1,1000,n),
                    'k': np.random.randint(1,1000,n)})

i2 = np.arange(n)
np.random.shuffle(i1)
df2 = pd.DataFrame({'i': i2,
                    'm': np.random.randint(1,1000,n),
                    'n': np.random.randint(1,1000,n)})

In [50]:
df1.head()

Unnamed: 0,i,j,k
0,29410,891,545
1,77749,249,782
2,30569,887,385
3,49768,391,672
4,87913,689,237


In [51]:
df2.head()

Unnamed: 0,i,m,n
0,0,877,130
1,1,480,764
2,2,831,381
3,3,586,36
4,4,623,754


In [52]:
%%timeit
df1.merge(df2, on="i")

27.6 ms ± 560 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [53]:
df1 = df1.set_index('i')
df2 = df2.set_index('i')

In [54]:
%%timeit
df1.merge(df2, left_index=True, right_index=True)

15.9 ms ± 81 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## PRIMER: Pohitritev pandas kode

### Naloga

### Priprava podatkov

In [55]:
import pandas as pd

In [56]:
df = pd.read_csv('data/demand_profile.csv')

In [57]:
df.head()

Unnamed: 0,date_time,energy_kwh
0,1/1/13 0:00,0.586
1,1/1/13 1:00,0.58
2,1/1/13 2:00,0.572
3,1/1/13 3:00,0.596
4,1/1/13 4:00,0.592


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date_time   8760 non-null   object 
 1   energy_kwh  8760 non-null   float64
dtypes: float64(1), object(1)
memory usage: 137.0+ KB


In [59]:
df.dtypes

date_time      object
energy_kwh    float64
dtype: object

In [60]:
type(df.iloc[0, 0])

str

In [61]:
df['date_time'] = pd.to_datetime(df['date_time'])
df['date_time'].dtype

dtype('<M8[ns]')

In [62]:
df.head()

Unnamed: 0,date_time,energy_kwh
0,2013-01-01 00:00:00,0.586
1,2013-01-01 01:00:00,0.58
2,2013-01-01 02:00:00,0.572
3,2013-01-01 03:00:00,0.596
4,2013-01-01 04:00:00,0.592


In [63]:
def convert(df, column_name):
    return pd.to_datetime(df[column_name])

df = pd.read_csv('data/demand_profile.csv')
df_coverted = df.copy()

In [64]:
%%timeit -r 3 -n 10
df_coverted['date_time'] = convert(df, 'date_time')

2.08 s ± 2.57 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)


In [65]:
def convert_with_format(df, column_name):
    return pd.to_datetime(df[column_name], format='%d/%m/%y %H:%M')

In [66]:
%%timeit -r 3 -n 10
df_coverted['date_time'] = convert_with_format(df, 'date_time')

60.3 ms ± 392 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


In [67]:
2000/60

33.333333333333336

> <div class="alert alert-primary" role="alert"><p><strong>Note</strong>: Pandas’ <a href="https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html"><code>read_csv()</code></a> also allows you to parse dates as a part of the file I/O step. See the <code>parse_dates</code>, <code>infer_datetime_format</code>, and <code>date_parser</code> parameters.</p>
</div>

In [68]:
df_coverted.head()

Unnamed: 0,date_time,energy_kwh
0,2013-01-01 00:00:00,0.586
1,2013-01-01 01:00:00,0.58
2,2013-01-01 02:00:00,0.572
3,2013-01-01 03:00:00,0.596
4,2013-01-01 04:00:00,0.592


In [69]:
df_coverted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date_time   8760 non-null   datetime64[ns]
 1   energy_kwh  8760 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 137.0 KB


### 1) Simple Looping Over Pandas Data

<table class="table table-hover">
<thead>
<tr>
<th>Tariff Type</th>
<th>Cents per kWh</th>
<th>Time Range</th>
</tr>
</thead>
<tbody>
<tr>
<td>Peak</td>
<td>28</td>
<td>17:00 to 24:00</td>
</tr>
<tr>
<td>Shoulder</td>
<td>20</td>
<td>7:00 to 17:00</td>
</tr>
<tr>
<td>Off-Peak</td>
<td>12</td>
<td>0:00 to 7:00</td>
</tr>
</tbody>
</table>

In [70]:
df_test = df_coverted.copy()
df_test['cost_cents'] = df['energy_kwh'] * 28

In [71]:
df_test.head()

Unnamed: 0,date_time,energy_kwh,cost_cents
0,2013-01-01 00:00:00,0.586,16.408
1,2013-01-01 01:00:00,0.58,16.24
2,2013-01-01 02:00:00,0.572,16.016
3,2013-01-01 03:00:00,0.596,16.688
4,2013-01-01 04:00:00,0.592,16.576


In [74]:
def apply_tariff(kwh, hour):
    """Calculates cost of electricity for given hour."""    
    if 0 <= hour < 7:
        rate = 12
    elif 7 <= hour < 17:
        rate = 20
    elif 17 <= hour < 24:
        rate = 28
    else:
        raise ValueError(f'Invalid hour: {hour}')
    return rate * kwh

In [75]:
# NOTE: Don't do this!
def apply_tariff_loop(df):
    """Calculate costs in loop.  Modifies `df` inplace."""
    energy_cost_list = []
    for i in range(len(df)):
        # Get electricity used and hour of day
        energy_used = df.iloc[i]['energy_kwh']
        hour = df.iloc[i]['date_time'].hour
        energy_cost = apply_tariff(energy_used, hour)
        energy_cost_list.append(energy_cost)
    df['cost_cents'] = energy_cost_list

In [76]:
%%timeit
apply_tariff_loop(df_coverted)

3.4 s ± 92.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### 2) Looping with .itertuples() and .iterrows()

In [77]:
for index, row in df[:5].iterrows():
    print(index)
    print(row)
    print('energy_kwh' ,row['energy_kwh'])
    print('-------')

0
date_time     1/1/13 0:00
energy_kwh          0.586
Name: 0, dtype: object
energy_kwh 0.586
-------
1
date_time     1/1/13 1:00
energy_kwh           0.58
Name: 1, dtype: object
energy_kwh 0.58
-------
2
date_time     1/1/13 2:00
energy_kwh          0.572
Name: 2, dtype: object
energy_kwh 0.572
-------
3
date_time     1/1/13 3:00
energy_kwh          0.596
Name: 3, dtype: object
energy_kwh 0.596
-------
4
date_time     1/1/13 4:00
energy_kwh          0.592
Name: 4, dtype: object
energy_kwh 0.592
-------


In [78]:
def apply_tariff_iterrows(df):
    energy_cost_list = []
    for index, row in df.iterrows():
        # Get electricity used and hour of day
        energy_used = row['energy_kwh']
        hour = row['date_time'].hour
        # Append cost list
        energy_cost = apply_tariff(energy_used, hour)
        energy_cost_list.append(energy_cost)
    df['cost_cents'] = energy_cost_list

In [79]:
%%timeit
apply_tariff_iterrows(df_coverted)

833 ms ± 21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### 3) Pandas’ .apply()

In [80]:
def apply_tariff_withapply(df):
    df['cost_cents'] = df.apply(lambda row: apply_tariff(
        kwh=row['energy_kwh'],
        hour=row['date_time'].hour), axis=1)

In [81]:
%%timeit
apply_tariff_withapply(df_coverted)

231 ms ± 1.08 ms per loop (mean ± std. dev. of 3 runs, 10 loops each)


### 4) Selecting Data With .isin()

In [82]:
df_coverted = df.copy()
df_coverted['date_time'] = convert_with_format(df, 'date_time')
df_coverted.set_index('date_time', inplace=True)

In [83]:
df_coverted.head()

Unnamed: 0_level_0,energy_kwh
date_time,Unnamed: 1_level_1
2013-01-01 00:00:00,0.586
2013-01-01 01:00:00,0.58
2013-01-01 02:00:00,0.572
2013-01-01 03:00:00,0.596
2013-01-01 04:00:00,0.592


In [84]:
def apply_tariff_isin(df):
    # Define hour range Boolean arrays
    peak_hours = df.index.hour.isin(range(17, 24))
    shoulder_hours = df.index.hour.isin(range(7, 17))
    off_peak_hours = df.index.hour.isin(range(0, 7))
    
    # Apply tariffs to hour ranges
    df.loc[peak_hours, 'cost_cents'] = df.loc[peak_hours, 'energy_kwh'] * 28
    df.loc[shoulder_hours,'cost_cents'] = df.loc[shoulder_hours, 'energy_kwh'] * 20
    df.loc[off_peak_hours,'cost_cents'] = df.loc[off_peak_hours, 'energy_kwh'] * 12

In [85]:
%%timeit -r 3 -n 10
apply_tariff_isin(df_coverted)

12.9 ms ± 619 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


In [None]:
203/6.5

### 5) Pandas’ pd.cut() function

In [86]:
df_coverted = df.copy()
df_coverted['date_time'] = convert_with_format(df, 'date_time')
df_coverted.set_index('date_time', inplace=True)

> **[pandas.cut](https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.cut.html)**
- `pandas.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise')`
- Bin values into discrete intervals.
- Use cut when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable. For example, cut could convert ages to groups of age ranges. Supports binning into an equal number of bins, or a pre-specified array of bins.

In [88]:
data = pd.cut(x=df_coverted.index.hour,
      bins=[0,7,17,24],
      include_lowest=True,
      labels=[12, 20, 28])

In [91]:
data[:10]

[12, 12, 12, 12, 12, 12, 12, 12, 20, 20]
Categories (3, int64): [12 < 20 < 28]

In [92]:
def apply_tariff_cut(df):
    cents_per_kwh = pd.cut(x=df_coverted.index.hour,
      bins=[0,7,17,24],
      include_lowest=True,
      labels=[12, 20, 28]).astype(int)
    df['cost_cents'] = cents_per_kwh * df['energy_kwh']

In [93]:
%%timeit -r 3 -n 10
apply_tariff_cut(df_coverted)

4.56 ms ± 246 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


### 6) Using NumPy

In [94]:
import numpy as np

In [95]:
df_coverted = df.copy()
df_coverted['date_time'] = convert_with_format(df, 'date_time')
df_coverted.set_index('date_time', inplace=True)

> **[numpy.digitize](https://docs.scipy.org/doc/numpy/reference/generated/numpy.digitize.html)**
- `numpy.digitize(x, bins, right=False)`
- Return the indices of the bins to which each value in input array belongs.

In [96]:
np.digitize(df_coverted.index.hour.values, bins=[7, 17, 24])

array([0, 0, 0, ..., 2, 2, 2])

In [97]:
def apply_tariff_digitize(df):
    prices = np.array([12, 20, 28])
    bins = np.digitize(df.index.hour.values, bins=[7, 17, 24])
    df['cost_cents'] = prices[bins] * df['energy_kwh'].values

In [98]:
%%timeit -r 3 -n 10
apply_tariff_digitize(df_coverted)

2.62 ms ± 100 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


### Prevent Reprocessing with HDFStore

[What is HDF5](https://portal.hdfgroup.org/display/knowledge/What+is+HDF5)

In [99]:
df_coverted.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8760 entries, 2013-01-01 00:00:00 to 2013-12-31 23:00:00
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   energy_kwh  8760 non-null   float64
 1   cost_cents  8760 non-null   float64
dtypes: float64(2)
memory usage: 205.3 KB


In [100]:
# Create storage object with filename `processed_data`
data_store = pd.HDFStore('data/OUT_processed_data.h5')

# Put DataFrame into the object setting the key as 'preprocessed_df'
data_store['preprocessed_df'] = df_coverted
data_store.close()

In [101]:
data_store = pd.HDFStore('data/OUT_processed_data.h5')

# Retrieve data using key
preprocessed_df = data_store['preprocessed_df']
data_store.close()

In [102]:
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8760 entries, 2013-01-01 00:00:00 to 2013-12-31 23:00:00
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   energy_kwh  8760 non-null   float64
 1   cost_cents  8760 non-null   float64
dtypes: float64(2)
memory usage: 205.3 KB


### Povzetek

## Drugi nasveti

###  [Numba](https://numba.pydata.org/)

Numba translates Python functions to optimized machine code at runtime using the industry-standard LLVM compiler library. Numba-compiled numerical algorithms in Python can approach the speeds of C or FORTRAN.

### pandas.eval() for Efficient Operations

[Dokumentacija](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.eval.html)

[High-Performance Pandas: eval() and query()](https://jakevdp.github.io/PythonDataScienceHandbook/03.12-performance-eval-and-query.html#pandas.eval()-for-Efficient-Operations)