In [3]:
import pandas as pd
import numpy as np
import gc
import timeit

# Load the fines dataset that the statements below operate on.
# The path is relative, so it is resolved against the current working directory.
# The code further down reads the columns 'Fines', 'Refund', 'Year' and 'CarNumber'.
file_path = "../ex04/fines.csv"
df = pd.read_csv(file_path)

def measure_time(stmt, globals=None, number=5):
    """Return the total time, in seconds, spent executing ``stmt`` ``number`` times.

    Thin wrapper around ``timeit.timeit``.

    Args:
        stmt: Statement to time, as accepted by ``timeit.timeit`` (source
            string or zero-argument callable).
        globals: Namespace in which ``stmt`` is executed; ``None`` lets
            ``timeit`` use its own namespace. The name mirrors the
            ``timeit.timeit`` keyword and is kept for existing keyword callers.
        number: How many times ``stmt`` is executed. Defaults to 5, the
            count that used to be hard-coded.

    Returns:
        The cumulative (not per-run) elapsed time as a float.
    """
    return timeit.timeit(stmt, globals=globals, number=number)


def calc_fines_range(df):
    """Compute ``Fines / Refund * Year`` for every row via positional lookups.

    Rows are visited by integer position and each value is fetched with a
    separate ``df.iloc[...]`` access; this access pattern is what the
    "range + iloc" benchmark measures, so it is kept as is.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns.

    Returns:
        A list with one computed value per row, in row order.
    """
    return [
        df.iloc[pos]['Fines'] / df.iloc[pos]['Refund'] * df.iloc[pos]['Year']
        for pos in range(len(df))
    ]


# Benchmark the range + iloc implementation. The timed statement also stores
# the result in df['Calc_Range']; the printed figure is the total time over the
# repeated runs that measure_time performs.
print("Execution time for range + iloc:", measure_time("df['Calc_Range'] = calc_fines_range(df)", globals=globals()))
def calc_fines_iterrows(df):
    """Compute ``Fines / Refund * Year`` for every row using ``DataFrame.iterrows``.

    Args:
        df: DataFrame with 'Fines', 'Refund' and 'Year' columns.

    Returns:
        A list with one computed value per row, in iteration order.
    """
    # The index label yielded by iterrows() is not needed for the calculation.
    return [
        record['Fines'] / record['Refund'] * record['Year']
        for _, record in df.iterrows()
    ]


# Each statement below is passed as a string and executed against this
# module's globals, so the timed assignments really add columns to df.
# Row-by-row iteration with DataFrame.iterrows().
print("Execution time for iterrows:", measure_time("df['Calc_Iterrows'] = calc_fines_iterrows(df)", globals=globals()))

# Row-wise apply (axis=1) with a lambda performing the same arithmetic.
print("Execution time for apply + lambda:", measure_time("df['Calc_Apply'] = df.apply(lambda row: row['Fines'] / row['Refund'] * row['Year'], axis=1)", globals=globals()))

# Whole-column arithmetic on pandas Series.
print("Execution time for Series operations:", measure_time("df['Calc_Series'] = (df['Fines'] / df['Refund']) * df['Year']", globals=globals()))

# The same arithmetic on the underlying arrays obtained through .values.
print("Execution time for NumPy operations:", measure_time("df['Calc_Values'] = (df['Fines'].values / df['Refund'].values) * df['Year'].values", globals=globals()))

# Lookup of one car number through a boolean mask on the 'CarNumber' column.
print("Execution time for df.loc[df['CarNumber'] == 'O136HO197RUS']:", measure_time("df.loc[df['CarNumber'] == 'O136HO197RUS']", globals=globals()))

# Move 'CarNumber' into the index (in place) so the next lookup can go by label.
# From here on 'CarNumber' is no longer a regular column of df.
df.set_index('CarNumber', inplace=True)

# Lookup of the same car number by index label, for comparison with the mask lookup.
print("Execution time for df.loc['O136HO197RUS']:", measure_time("df.loc['O136HO197RUS']", globals=globals()))

# Memory optimization
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Initial memory usage:")
df.info(memory_usage='deep')

# Work on a copy so the original frame stays untouched until it is dropped below.
df_optimized = df.copy()

# Downcast numeric columns one at a time. Assigning a single column replaces
# it outright, so the narrower dtype is what ends up stored in the frame.
for col in df_optimized.select_dtypes(include=['float64']).columns:
    df_optimized[col] = df_optimized[col].astype(np.float32)
for col in df_optimized.select_dtypes(include=['int64']).columns:
    # downcast='integer' picks the smallest integer dtype that fits the values.
    df_optimized[col] = pd.to_numeric(df_optimized[col], downcast='integer')

print("Optimized memory usage:")
df_optimized.info(memory_usage='deep')

# Object columns are converted to the category dtype.
for col in df_optimized.select_dtypes(include=['object']).columns:
    df_optimized[col] = df_optimized[col].astype('category')

print("Memory usage after converting object columns to category:")
df_optimized.info(memory_usage='deep')

# Drop the original frame and trigger a collection pass; gc.collect() stays
# the final expression so a notebook still displays its return value.
del df
gc.collect()

Execution time for range + iloc: 0.1557165830163285
Execution time for iterrows: 0.05300716700730845
Execution time for apply + lambda: 0.019199042057152838
Execution time for Series operations: 0.0005019169766455889
Execution time for NumPy operations: 0.00021654198644682765
Execution time for df.loc[df['CarNumber'] == 'O136HO197RUS']: 0.000710874970536679
Execution time for df.loc['O136HO197RUS']: 0.00017962499987334013
Initial memory usage:
<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to E005
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Refund         925 non-null    float64
 1   Fines          930 non-null    float64
 2   Make           930 non-null    object 
 3   Model          919 non-null    object 
 4   Year           930 non-null    int64  
 5   Calc_Range     925 non-null    float64
 6   Calc_Iterrows  925 non-null    float64
 7   Calc_Apply     925 non-null    float64


1573