# Del 3: Optimizacija kode za velike datasete

Pripravimo datasete:

In [1]:
# Shell escape: extract (-x) the xz-compressed (-J) archive into the ./data/ directory (-C).
!tar -xJf data/data_del_06.tar.xz -C ./data/

In [2]:
import pandas as pd
import numpy as np

## CPU Bound Programs

### Bounds vs Limitations

<img alt="I/O bounds" src="images/CPU+and+I_O+bounds.png">

### Primer optimizacije

In [2]:
import numpy as np

# Define a basic Haversine distance formula
def haversine(lat1, lon1, lat2, lon2, radius=3959):
    """Return the great-circle distance between two points on a sphere.

    All coordinates are given in decimal degrees. Each argument may be a
    scalar, a NumPy array or a pandas Series; array-like inputs are combined
    element-wise, so a whole column can be processed in a single call.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : latitude and longitude of the second point(s), in degrees.
    radius : sphere radius; the result is expressed in the same unit.
        Defaults to 3959, the Earth radius in miles used by this notebook.

    Returns
    -------
    The distance(s) in the unit of ``radius`` (miles by default).
    """
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN.
    # Clamp with ufuncs so scalars, arrays and Series are all handled.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

In [3]:
# Load the New York hotels CSV; its 'latitude' and 'longitude' columns feed the distance examples below.
df = pd.read_csv('data/new_york_hotels.csv')

#### Crude looping over DataFrame rows using indices

In [4]:
# Deliberately naive baseline: visit every row by position and collect the distances
def haversine_looping(df):
    """Compute the distance from the fixed origin (40.671, -73.985) to every row.

    Rows are accessed one at a time by integer position, and each coordinate
    triggers its own ``df.iloc`` lookup. Returns a plain Python list with one
    distance per row, in row order.
    """
    origin_lat, origin_lon = 40.671, -73.985
    distances = []
    for row_pos in range(len(df)):
        # Two separate positional lookups per row, exactly like the crude original.
        hotel_lat = df.iloc[row_pos]['latitude']
        hotel_lon = df.iloc[row_pos]['longitude']
        distances.append(haversine(origin_lat, origin_lon, hotel_lat, hotel_lon))
    return distances

In [17]:
%%timeit
# Baseline timing: run the index-based loop (haversine_looping) over the whole
# DataFrame and store the resulting list in a 'distance' column.
df['distance'] = haversine_looping(df)

786 ms ± 161 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Looping with iterrows()

In [18]:
%%timeit
# Haversine applied row by row: iterrows() yields (index, row) pairs, so the
# coordinates are read from each row object instead of via df.iloc lookups.
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))
df['distance'] = haversine_series

148 ms ± 25.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Looping with apply()

In [5]:
%%timeit

# Timing DataFrame.apply with axis=1: the lambda is invoked once per row and
# the returned values are assigned to the 'distance' column.
df['distance'] = df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

81 ms ± 20.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Vectorization with Pandas series

In [6]:
%%timeit 
# Vectorized: pass the whole 'latitude' / 'longitude' Series to haversine so the
# NumPy functions inside it operate on all rows in a single call (no Python loop).
df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

2 ms ± 349 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


#### Vectorization with NumPy arrays

In [7]:
%%timeit
# Vectorized on raw NumPy arrays: .values hands haversine the underlying
# ndarrays instead of pandas Series objects.
df['distance'] = haversine(40.671, -73.985, df['latitude'].values, df['longitude'].values)

241 µs ± 24.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## I/O Bound Programs

### I/O Bounds

<img src="./images/report_assembly.png">

<img src="./images/report_assembly_bidir.png">

### Profiling an I/O bound task

In [9]:
# SQL text: distinct team IDs from Teams, joined to TeamsFranchises on franchID,
# restricted to franchises whose 'active' column equals 'Y'.
query = '''
SELECT DISTINCT teamID 
FROM Teams 
INNER JOIN TeamsFranchises ON Teams.franchID == TeamsFranchises.franchID 
WHERE TeamsFranchises.active = 'Y';
'''

In [11]:
import cProfile
import sqlite3

# Open a connection to the SQLite database file on disk.
conn = sqlite3.connect("data/lahman2015.sqlite")



In [44]:
import cProfile
import sqlite3

# Parameterized query: sums the HR column of Batting for one team; the `?`
# placeholder is filled with a team ID when the query is executed.
# NOTE: this rebinds the `query` and `conn` names defined in earlier cells.
query = "SELECT SUM(HR) FROM Batting WHERE teamId=?"
conn = sqlite3.connect("data/lahman2015.sqlite")
# Cursor through which statements are executed on this connection.
cur = conn.cursor()

def calculate_runs(teams):


### Blocking Tasks

In [18]:
import sqlite3

# Create an in-memory database: the special ':memory:' name keeps it in RAM only.
memory = sqlite3.connect(':memory:')

# Connect to our database file on disk.
disk = sqlite3.connect('data/lahman2015.sqlite')




In [20]:
import cProfile
import sqlite3



## Optimizing Python Code with pandas

### Basic Looping

### Select columns and rows efficiently


In [13]:
# Load the schools CSV and preview its first three rows.
data = pd.read_csv('data/school.csv')
data.head(3)

Unnamed: 0,School ID,School Name,Building Code,Street Address,City,State,Zip Code
0,02M260,Clinton School Writers and Artists,M933,425 West 33rd Street,Manhattan,NY,10001
1,06M211,Inwood Early College for Health and Informatio...,M052,650 Academy Street,Manhattan,NY,10002
2,01M539,"New Explorations into Science, Technology and ...",M022,111 Columbia Street,Manhattan,NY,10002


### Uporaba built-in funkcij

### Joining on indexes is faster than joining on columns

Construct some sample data:

In [3]:
# Number of rows in each sample frame.
n = 100000

# df1: join key 'i' holds 0..n-1 in random order, plus two columns of random
# integers drawn from [1, 1000).
i1 = np.arange(n)
np.random.shuffle(i1)
df1 = pd.DataFrame({'i': i1,
                    'j': np.random.randint(1,1000,n),
                    'k': np.random.randint(1,1000,n)})

# df2: the same key set, shuffled independently of df1.
# (Previously i1 was shuffled a second time here, so i2 stayed sorted.)
i2 = np.arange(n)
np.random.shuffle(i2)
df2 = pd.DataFrame({'i': i2,
                    'm': np.random.randint(1,1000,n),
                    'n': np.random.randint(1,1000,n)})

## PRIMER: Pohitritev pandas kode

### Naloga

### Priprava podatkov

In [1]:
import pandas as pd

In [37]:
# Load the demand profile CSV (columns: date_time, energy_kwh).
df = pd.read_csv('data/demand_profile.csv')

In [38]:
# Preview the first five rows.
df.head()

Unnamed: 0,date_time,energy_kwh
0,1/1/13 0:00,0.586
1,1/1/13 1:00,0.58
2,1/1/13 2:00,0.572
3,1/1/13 3:00,0.596
4,1/1/13 4:00,0.592


In [4]:
# Summarize the frame: row count, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 2 columns):
date_time     8760 non-null object
energy_kwh    8760 non-null float64
dtypes: float64(1), object(1)
memory usage: 137.0+ KB


In [5]:
# Show each column's dtype; date_time is still `object` here, not a datetime type.
df.dtypes

date_time      object
energy_kwh    float64
dtype: object

### 1) Simple Looping Over Pandas Data

In [19]:
def apply_tariff(kwh, hour):
    """Return the electricity cost for `kwh` consumed during the given hour.

    The rate depends on the hour of day:
      * 0 <= hour < 7   -> 12
      * 7 <= hour < 17  -> 20
      * 17 <= hour < 24 -> 28

    Raises ValueError if `hour` is not within [0, 24).
    """
    # Reject anything outside a single day up front.
    if not (0 <= hour < 24):
        raise ValueError(f'Invalid hour: {hour}')
    # The hour is now known to be in range, so upper bounds alone pick the band.
    if hour < 7:
        return 12 * kwh
    if hour < 17:
        return 20 * kwh
    return 28 * kwh

### 2) Looping with .itertuples() and .iterrows()

### 3) Pandas’ .apply()

### 4) Selecting Data With .isin()

### 5) Pandas’ pd.cut() function

### 6) Using NumPy