# Importing Libraries

In [1]:
import time
import pandas as pd
import polars as pl
import plotly.graph_objects as go

# Defining the sizes of the files to be tested

In [2]:
# File sizes used by the benchmarks (each value is interpolated into a
# mock-data file name). The XLSX list stops at 1,000,000.
csv_sizes = [
    50_000,
    100_000,
    250_000,
    500_000,
    1_000_000,
    2_500_000,
    5_000_000,
]
xlsx_sizes = [
    50_000,
    100_000,
    250_000,
    500_000,
    1_000_000,
]

# Testing the read method

In [3]:
# Benchmarking functions for Pandas
def benchmark_pandas_read(file_path) -> float:
    """Time a single ``pd.read_csv`` call on ``file_path``.

    Args:
        file_path: Anything accepted by ``pd.read_csv`` (forwarded unchanged).

    Returns:
        Elapsed time of the read, in seconds.
    """
    # perf_counter() is monotonic and high resolution; time.time() is a wall
    # clock that can jump (e.g. NTP adjustment) and skew or negate the result.
    start = time.perf_counter()
    # Keep the frame bound to a local so its deallocation happens after the
    # second clock reading and is not counted as read time.
    _frame = pd.read_csv(file_path)
    end = time.perf_counter()
    return end - start


# Benchmarking functions for Polars
def benchmark_polars_read(file_path) -> float:
    """Time a single ``pl.read_csv`` call on ``file_path``.

    Args:
        file_path: Anything accepted by ``pl.read_csv`` (forwarded unchanged).

    Returns:
        Elapsed time of the read, in seconds.
    """
    # perf_counter() is monotonic and high resolution; time.time() is a wall
    # clock that can jump (e.g. NTP adjustment) and skew or negate the result.
    start = time.perf_counter()
    # Keep the frame bound to a local so its deallocation happens after the
    # second clock reading and is not counted as read time.
    _frame = pl.read_csv(file_path)
    end = time.perf_counter()
    return end - start

## csv file

In [4]:
# Accumulators for the measured read times (seconds), one entry per file size
pandas_read_times = []
polars_read_times = []

# Performance comparison: for every CSV size, time Pandas first, then Polars
for size in csv_sizes:
    csv_path = f"mock_data/mock_data_{size}.csv"
    pandas_read_times.append(benchmark_pandas_read(csv_path))
    polars_read_times.append(benchmark_polars_read(csv_path))

# Build a grouped bar chart with Plotly: one bar series per library,
# sharing the same file-size labels on the x axis.
size_labels = list(map(str, csv_sizes))

fig = go.Figure()

for library, read_times, bar_color in (
    ('Pandas', pandas_read_times, 'blue'),
    ('Polars', polars_read_times, 'green'),
):
    fig.add_trace(
        go.Bar(x=size_labels, y=read_times, name=library, marker_color=bar_color)
    )

fig.update_layout(
    barmode='group',
    title='Comparação de Tempo de Leitura: Pandas vs Polars',
    xaxis_title='Tamanho do Arquivo',
    yaxis_title='Tempo de Leitura (s)',
)

fig.show()

## xlsx file

# Testing the filter method

In [None]:
# Benchmarking functions for Pandas
def benchmark_pandas_filter(df) -> float:
    """Time a boolean-mask filter keeping rows where column ``'B'`` > 0.5.

    Args:
        df: Frame indexed as ``df[df['B'] > 0.5]``; it is not modified.

    Returns:
        Elapsed time of the filter, in seconds.
    """
    # perf_counter() is monotonic and high resolution; time.time() is a wall
    # clock that can jump (e.g. NTP adjustment) and skew or negate the result.
    start = time.perf_counter()
    # Keep the result bound to a local so its deallocation happens after the
    # second clock reading and is not counted as filter time.
    _filtered = df[df['B'] > 0.5]
    end = time.perf_counter()
    return end - start


# Benchmarking functions for Polars
def benchmark_polars_filter(df) -> float:
    """Time ``df.filter(...)`` keeping rows where column ``'B'`` > 0.5.

    Args:
        df: Frame supporting ``df['B']`` and ``df.filter(mask)``.

    Returns:
        Elapsed time of the filter, in seconds.
    """
    # perf_counter() is monotonic and high resolution; time.time() is a wall
    # clock that can jump (e.g. NTP adjustment) and skew or negate the result.
    start = time.perf_counter()
    # Keep the result bound to a local so its deallocation happens after the
    # second clock reading and is not counted as filter time.
    _filtered = df.filter(df['B'] > 0.5)
    end = time.perf_counter()
    return end - start

## csv file