# Importing Libraries

In [20]:
import time
import pandas as pd
import polars as pl
import plotly.graph_objects as go

## Settings and Functions

In [21]:
def plot_comparison(sizes, pandas, polars, title):
    # Convertendo os tempos de leitura para texto com 2 casas decimais
    pandas_times_text = [f"{time:.2f}" for time in pandas]
    polars_times_text = [f"{time:.2f}" for time in polars]

    # Criar um gráfico de barras com Plotly
    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=[str(size) for size in sizes],
        y=pandas,
        name='Pandas',
        marker_color='blue',
        text=pandas_times_text,
        textposition='auto'
    ))

    fig.add_trace(go.Bar(
        x=[str(size) for size in sizes],
        y=polars,
        name='Polars',
        marker_color='brown',
        text=polars_times_text,
        textposition='auto'
    ))

    fig.update_layout(
        title={
            'text': f'Comparação de Tempo de {title}: Pandas vs Polars',
            'y':0.9,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {'size': 24}
        },
        xaxis_title={
            'text': 'Tamanho do Arquivo',
            'font': {'size': 18}
        },
        yaxis_title={
            'text': f'Tempo de {title} (s)',
            'font': {'size': 18}
        },
        barmode='group',
        font=dict(
            family="Arial, sans-serif",
            size=16,
            #color="RebeccaPurple"
        ),
        legend=dict(
            font=dict(
                size=18
            )
        )
    )

    fig.show(width=1600, height=600)


# Defining the size of files to be test

In [22]:
# Tamanhos dos arquivos
csv_sizes = [50000, 100000, 250000, 500000, 1000000, 2500000, 5000000]
xlsx_sizes = [50000, 100000, 250000, 500000, 1000000]

# Testing the read method

## csv file

In [23]:
# Funções de benchmarking para Pandas
def benchmark_pandas_read_csv(file_path, size):
    start = time.time()
    globals()[f'df_mock_data_{size}_pandas_csv'] = pd.read_csv(file_path)
    end = time.time()
    return end - start


# Funções de benchmarking para Polars
def benchmark_polars_read_csv(file_path, size):
    start = time.time()
    globals()[f'df_mock_data_{size}_polars_csv'] = pl.read_csv(file_path)
    end = time.time()
    return end - start

In [24]:
# Listas para armazenar os tempos de leitura
pandas_read_csv_times = []
polars_read_csv_times = []

# Comparação de desempenho
for size in csv_sizes:
    csv_path = f"mock_data/mock_data_{size}.csv"
    
    # Pandas
    pandas_read_csv_time = benchmark_pandas_read_csv(csv_path, size)
    pandas_read_csv_times.append(pandas_read_csv_time)
    
    # Polars
    polars_read_csv_time = benchmark_polars_read_csv(csv_path, size)
    polars_read_csv_times.append(polars_read_csv_time)

# Formatando os tempos para ter 2 casas decimais
pandas_read_csv_times_text = [f"{time:.2f}" for time in pandas_read_csv_times]
polars_read_csv_times_text = [f"{time:.2f}" for time in polars_read_csv_times]

plot_comparison(csv_sizes, pandas_read_csv_times, polars_read_csv_times, 'Leitura (csv)')

In [25]:
# Criando um DataFrame
df_read_csv = pd.DataFrame({
    'CSV_Size': csv_sizes,
    'Pandas_Read_Time': pandas_read_csv_times,
    'Polars_Read_Time': polars_read_csv_times
})

# Calculando a porcentagem de melhoria
df_read_csv['Improvement (%)'] = (df_read_csv['Pandas_Read_Time'] - df_read_csv['Polars_Read_Time']) / df_read_csv['Pandas_Read_Time'] * 100

df_read_csv

Unnamed: 0,CSV_Size,Pandas_Read_Time,Polars_Read_Time,Improvement (%)
0,50000,0.225736,0.047731,78.855436
1,100000,0.237007,0.019061,91.95748
2,250000,0.574337,0.039208,93.173274
3,500000,1.033592,0.097246,90.591486
4,1000000,2.009939,0.161764,91.9518
5,2500000,4.876081,0.640126,86.872128
6,5000000,25.276014,1.595681,93.686977


## xlsx file

In [26]:
# Funções de benchmarking para Pandas
def benchmark_pandas_read_xlsx(file_path, size):
    start = time.time()
    globals()[f'df_mock_data_{size}_pandas_xlsx'] = pd.read_excel(file_path)
    end = time.time()
    return end - start


# Funções de benchmarking para Polars
def benchmark_polars_read_xlsx(file_path, size):
    start = time.time()
    globals()[f'df_mock_data_{size}_polars_xlsx'] = pl.read_excel(file_path)
    end = time.time()
    return end - start

In [27]:
pandas_read_xlsx_times = []
polars_read_xlsx_times = []

# Comparação de desempenho
for size in xlsx_sizes:
    xlsx_path = f"mock_data/mock_data_{size}.xlsx"
    
    # Pandas
    pandas_read_xlsx_time = benchmark_pandas_read_xlsx(xlsx_path, size)
    pandas_read_xlsx_times.append(pandas_read_xlsx_time)
    
    # Polars
    polars_read_xlsx_time = benchmark_polars_read_xlsx(xlsx_path, size)
    polars_read_xlsx_times.append(polars_read_xlsx_time)

plot_comparison(xlsx_sizes, pandas_read_xlsx_times, polars_read_xlsx_times, 'Leitura (xlsx)')

        O xlsx foi limitado a 1kk de linhas porque é o máximo que o Excel suporta.

In [28]:
# Criando um DataFrame
df_read_xlsx = pd.DataFrame({
    'XLSX_Size': xlsx_sizes,
    'Pandas_Read_Time': pandas_read_xlsx_times,
    'Polars_Read_Time': polars_read_xlsx_times
})

# Calculando a porcentagem de melhoria
df_read_xlsx['Improvement (%)'] = (df_read_xlsx['Pandas_Read_Time'] - df_read_xlsx['Polars_Read_Time']) / df_read_xlsx['Pandas_Read_Time'] * 100

df_read_xlsx

Unnamed: 0,XLSX_Size,Pandas_Read_Time,Polars_Read_Time,Improvement (%)
0,50000,12.993335,7.246907,44.225971
1,100000,22.272125,6.493893,70.842957
2,250000,16.086137,10.978991,31.74874
3,500000,105.407522,28.196191,73.250305
4,1000000,196.482837,88.794861,54.807828


# Testing the filter (loc) method

In [29]:
# Funções de benchmarking para Pandas
def benchmark_pandas_filter(size):
    start = time.time()
    df = globals()[f'df_mock_data_{size}_pandas_csv']
    df_filtered = df.loc[df['B'] > 0.5]
    end = time.time()
    return end - start


# Funções de benchmarking para Polars
def benchmark_polars_filter(size):
    start = time.time()
    df = globals()[f'df_mock_data_{size}_polars_csv']
    df_filtered = df.filter(df['B'] > 0.5)
    end = time.time()
    return end - start

In [30]:
pandas_filter_times = []
polars_filter_times = []

# Comparação de desempenho
for size in csv_sizes:
    
    # Pandas
    pandas_filter_time = benchmark_pandas_filter(size)
    pandas_filter_times.append(pandas_filter_time)
    
    # Polars
    polars_filter_time = benchmark_polars_filter(size)
    polars_filter_times.append(polars_filter_time)

# Formatando os tempos para ter 2 casas decimais
pandas_filter_times_text = [f"{time:.2f}" for time in pandas_filter_times]
polars_filter_times_text = [f"{time:.2f}" for time in polars_filter_times]

plot_comparison(csv_sizes, pandas_filter_times, polars_filter_times, 'Filtro')

In [31]:
# Criando um DataFrame
df_filter = pd.DataFrame({
    'Size': csv_sizes,
    'Pandas_Filter_Time': pandas_filter_times,
    'Polars_Filter_Time': polars_filter_times
})

# Calculando a porcentagem de melhoria
df_filter['Improvement (%)'] = (df_filter['Pandas_Filter_Time'] - df_filter['Polars_Filter_Time']) / df_filter['Pandas_Filter_Time'] * 100

df_filter

Unnamed: 0,Size,Pandas_Filter_Time,Polars_Filter_Time,Improvement (%)
0,50000,0.058119,0.051646,11.136044
1,100000,0.109464,0.057716,47.274169
2,250000,0.212103,0.049244,76.782828
3,500000,0.645479,0.162131,74.88208
4,1000000,0.959949,0.310631,67.640873
5,2500000,3.962259,1.327868,66.487092
6,5000000,7.28929,2.91108,60.063594


# Testing the calculation method

In [32]:
# Funções de benchmarking para Pandas
def benchmark_pandas_calc(size):
    start = time.time()
    df = globals()[f'df_mock_data_{size}_pandas_csv']
    df['G'] = df['B'] * df['C']
    end = time.time()
    return end - start


# Funções de benchmarking para Polars
def benchmark_polars_calc(size):
    start = time.time()
    df = globals()[f'df_mock_data_{size}_polars_csv']
    new_col = (df['B'] * df['C']).alias('G')
    df = df.hstack([new_col])
    end = time.time()
    return end - start

In [70]:
pandas_calc_times = []
polars_calc_times = []

# Comparação de desempenho
for size in csv_sizes:
    
    # Pandas
    pandas_calc_time = benchmark_pandas_calc(size)
    pandas_calc_times.append(pandas_calc_time)
    
    # Polars
    polars_calc_time = benchmark_polars_calc(size)
    polars_calc_times.append(polars_calc_time)

# Formatando os tempos para ter 2 casas decimais
pandas_calc_times_text = [f"{time:.2f}" for time in pandas_calc_times]
polars_calc_times_text = [f"{time:.2f}" for time in polars_calc_times]

plot_comparison(csv_sizes, pandas_calc_times, polars_calc_times, 'Cálculo')

In [71]:
# Criando um DataFrame
df_calc = pd.DataFrame({
    'Size': csv_sizes,
    'Pandas_Calc_Time': pandas_calc_times,
    'Polars_Calc_Time': polars_calc_times
})

# Calculando a porcentagem de melhoria
df_calc['Improvement (%)'] = (df_calc['Pandas_Calc_Time'] - df_calc['Polars_Calc_Time']) / df_calc['Pandas_Calc_Time'] * 100

df_calc

Unnamed: 0,Size,Pandas_Calc_Time,Polars_Calc_Time,Improvement (%)
0,50000,0.002536,0.0,100.0
1,100000,0.002019,0.000998,50.602125
2,250000,0.003,0.001,66.666667
3,500000,0.007543,0.000998,86.761917
4,1000000,0.007518,0.003,60.086899
5,2500000,0.023045,0.015904,30.986178
6,5000000,0.040806,0.025291,38.020228


# Testing the iteration (for) method

In [35]:
# Funções de benchmarking para Pandas
def benchmark_pandas_iter(size):
    start = time.time()
    df = globals()[f'df_mock_data_{size}_pandas_csv']
    for index, row in df.iterrows():
        result = row['C'] * row['D']
    end = time.time()
    return end - start

# Funções de benchmarking para Polars
def benchmark_polars_iter(size):
    start = time.time()
    df = globals()[f'df_mock_data_{size}_polars_csv']
    for row in range(len(df)):
        result = df.select('C')[row] * df.select('D')[row]
    end = time.time()
    return end - start

In [72]:
pandas_iter_times = []
polars_iter_times = []

# Comparação de desempenho
for size in csv_sizes:
    
    # Pandas
    pandas_iter_time = benchmark_pandas_iter(size)
    pandas_iter_times.append(pandas_iter_time)
    
    # Polars
    polars_iter_time = benchmark_polars_iter(size)
    polars_iter_times.append(polars_iter_time)

# Formatando os tempos para ter 2 casas decimais
pandas_iter_times_text = [f"{time:.2f}" for time in pandas_iter_times]
polars_iter_times_text = [f"{time:.2f}" for time in polars_iter_times]

plot_comparison(csv_sizes, pandas_iter_times, polars_iter_times, 'Interação (for)')

In [37]:
# Criando um DataFrame
df_iter = pd.DataFrame({
    'Size': csv_sizes,
    'Pandas_Iter_Time': pandas_iter_times,
    'Polars_Iter_Time': polars_iter_times
})

# Calculando a porcentagem de melhoria
df_iter['Improvement (%)'] = (df_iter['Pandas_Iter_Time'] - df_iter['Polars_Iter_Time']) / df_iter['Pandas_Iter_Time'] * 100

df_iter

Unnamed: 0,Size,Pandas_Iter_Time,Polars_Iter_Time,Improvement (%)
0,50000,6.455223,6.218842,3.661868
1,100000,12.537319,12.085164,3.606477
2,250000,30.438864,29.575767,2.835509
3,500000,57.985548,24.542773,57.674327
4,1000000,118.454758,98.809536,16.584578
5,2500000,181.469234,302.884228,-66.906655
6,5000000,342.304652,606.756559,-77.256299
