# Pandas VS Polars

## Environment set up

Change the working directory to be able to work with the source-code.

In [1]:
import os
from pathlib import Path

WORKING_DIRECTORY = Path.cwd().parents[0]
os.chdir(WORKING_DIRECTORY)

## Imports

In [2]:
import pandas as pd
import polars as pl
from typing import List, Union
from src.measurements import drop_faulty_sensor_data
from src.signals import get_resolution
from loguru import logger
from src.read import read_nasa_vibration_file
from datetime import datetime

## Functions

In [3]:
def read_pandas_nasa_vibration_file(file_path: Path, sensors: List[str],
                             signal_resolution: Union[int, float],
                             acceptable_sensor_range: Union[float, None]=None) -> pd.DataFrame:
    """
    Read one vibration file from the IMS Bearing dataset obtained from NASAs acoustics and vibrations datasets.
    According to its documentation, the channels belong to the following bearings:
    Check the "Readme Document for IMS Bearing Data.pdf" before loading the data since the channel or sensor
    settings are different depending on the test (1st, 2nd, or 3rd).

    :param file_path: path to the location of the vibration file
    :param sensors: name of the channels or sensors to be used. Example:
    ['channel_1', 'channel_2', 'channel_3', 'channel_4', 'channel_5', 'channel_6', 'channel_7', 'channel_8']
    :param signal_resolution: resolution of the signal in seconds
    :param acceptable_sensor_range: if provided, sensors with a value range below this threshold will be set to pd.NA
    :return: Pandas DataFrame containing the vibration data for different channels or sensors
    """
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    df = pd.read_csv(file_path, sep='\t', header=None, names=sensors)
    df['measurement_time_in_seconds'] = df.index * signal_resolution

    if acceptable_sensor_range is not None:
        df = drop_faulty_sensor_data(df=df, sensors=sensors, acceptable_range=acceptable_sensor_range)

    return df

## Inputs

The inputs have been obtained from the NASA bearings documentation:

In [4]:
INPUTS = {
    '1st_test': {'data_path': 'data/nasa_ims_bearing_dataset/1st_test',
                  'column_names': ['channel_1', 'channel_2', 'channel_3', 'channel_4',
                                   'channel_5', 'channel_6', 'channel_7', 'channel_8'],
                 'faulty_channels': ['channel_5', 'channel_6', 'channel_7', 'channel_8']},
    '2nd_test': {'data_path': 'data/nasa_ims_bearing_dataset/2nd_test',
                 'column_names': ['channel_1', 'channel_2', 'channel_3', 'channel_4'],
                 'faulty_channels': ['channel_1']},
    '3rd_test': {'data_path': 'data/nasa_ims_bearing_dataset/3rd_test/4th_test/txt',
                 'column_names': ['channel_1', 'channel_2', 'channel_3', 'channel_4'],
                 'faulty_channels': ['channel_3']}
          }

In [5]:
SAMPLING_RATE_IN_HERTZ = 20000
MEASUREMENT_DURATION_IN_SECONDS = 1
ACCEPTABLE_SENSOR_RANGE = 0.01

In [6]:
signal_resolution = get_resolution(sampling_frequency=SAMPLING_RATE_IN_HERTZ)
print(f"Signal resolution: {signal_resolution} seconds")

Signal resolution: 5e-05 seconds


## Read the data

In [7]:
complete_data_path_per_test = {}

for test, inputs_per_test in INPUTS.items():
    for key, values in inputs_per_test.items():
        data_path = inputs_per_test['data_path']
        complete_path = WORKING_DIRECTORY.joinpath(data_path)
        complete_data_path_per_test[test] = complete_path

### As pandas.DataFrame

In [8]:
pandas_df_list = {}
for test, folder_path in complete_data_path_per_test.items():
    logger.info(f'test: {test}')
    list_of_files =  os.listdir(folder_path)
    first_file = list_of_files[0]
    file_path = folder_path.joinpath(first_file)
    logger.info(f'file_path: {file_path}')
    column_names = INPUTS[test]['column_names']
    start = datetime.now()
    pandas_data = read_pandas_nasa_vibration_file(file_path=file_path,
                                                  sensors=column_names,
                                                  signal_resolution=signal_resolution,
                                                  acceptable_sensor_range=ACCEPTABLE_SENSOR_RANGE)
    end = datetime.now()
    reading_time = (end - start).total_seconds()
    logger.info(f'Time taken to read file in Pandas: {reading_time} seconds')
    pandas_df_list[test] = pandas_data

[32m2026-01-02 16:59:02.497[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mtest: 1st_test[0m
[32m2026-01-02 16:59:02.498[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mfile_path: /home/idloea/python/nasa-bearings/data/nasa_ims_bearing_dataset/1st_test/2003.11.15.02.28.46[0m
[32m2026-01-02 16:59:02.518[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mTime taken to read file in Pandas: 0.019032 seconds[0m
[32m2026-01-02 16:59:02.518[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mtest: 2nd_test[0m
[32m2026-01-02 16:59:02.519[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mfile_path: /home/idloea/python/nasa-bearings/data/nasa_ims_bearing_dataset/2nd_test/2004.02.13.20.42.39[0m
[32m2026-01-02 16:59:02.530[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mTime taken to read file in Pandas: 0.010858 se

### As polars.DataFrame

In [9]:
polars_df_list = {}
for test, folder_path in complete_data_path_per_test.items():
    logger.info(f'test: {test}')
    list_of_files =  os.listdir(folder_path)
    first_file = list_of_files[0]
    file_path = folder_path.joinpath(first_file)
    logger.info(f'file_path: {file_path}')
    column_names = INPUTS[test]['column_names']
    start = datetime.now()
    polars_data = read_nasa_vibration_file(file_path=file_path, 
                                           sensors=column_names,
                                           signal_resolution=signal_resolution,
                                           acceptable_sensor_range=ACCEPTABLE_SENSOR_RANGE)
    end = datetime.now()
    reading_time = (end - start).total_seconds()
    logger.info(f'Time taken to read file in Polars: {reading_time} seconds')
    polars_df_list[test] = polars_data

[32m2026-01-02 16:59:02.544[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mtest: 1st_test[0m
[32m2026-01-02 16:59:02.545[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mfile_path: /home/idloea/python/nasa-bearings/data/nasa_ims_bearing_dataset/1st_test/2003.11.15.02.28.46[0m
[32m2026-01-02 16:59:02.550[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mTime taken to read file in Polars: 0.004433 seconds[0m
[32m2026-01-02 16:59:02.550[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mtest: 2nd_test[0m
[32m2026-01-02 16:59:02.551[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mfile_path: /home/idloea/python/nasa-bearings/data/nasa_ims_bearing_dataset/2nd_test/2004.02.13.20.42.39[0m
[32m2026-01-02 16:59:02.552[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mTime taken to read file in Polars: 0.001078 se

## Checks
Check that the obtained values in Pandas and Polars are the same:

In [10]:
for test_name in pandas_df_list.keys():
    logger.info(f'Test Name: {test_name}')
    pandas_df = pandas_df_list[test_name]
    polars_df = polars_df_list[test_name]
    print(f'Pandas dtypes: {pandas_df.dtypes}')
    print(f'Polars dtypes: {polars_df.dtypes}')
    print('Pandas dataframe description:')
    display(pandas_df.describe())
    print('Polars dataframe description:')
    display(polars_df.describe())

[32m2026-01-02 16:59:02.559[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTest Name: 1st_test[0m


Pandas dtypes: channel_1                      float64
channel_2                      float64
channel_3                      float64
channel_4                      float64
channel_5                      float64
channel_6                      float64
channel_7                      float64
channel_8                      float64
measurement_time_in_seconds    float64
dtype: object
Polars dtypes: [Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64]
Pandas dataframe description:


Unnamed: 0,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,channel_8,measurement_time_in_seconds
count,20480.0,20480.0,20480.0,20480.0,20480.0,20480.0,20480.0,20480.0,20480.0
mean,-0.117869,-0.117813,-0.118122,-0.118088,-0.114136,-0.114551,-0.114537,-0.111024,0.511975
std,0.099014,0.089247,0.09136,0.076483,0.114354,0.118455,0.074492,0.080365,0.295611
min,-0.708,-0.569,-0.75,-0.547,-0.784,-0.789,-0.432,-0.515,0.0
25%,-0.181,-0.176,-0.181,-0.168,-0.188,-0.19,-0.161,-0.161,0.255988
50%,-0.117,-0.117,-0.117,-0.117,-0.115,-0.115,-0.115,-0.11,0.511975
75%,-0.054,-0.061,-0.056,-0.068,-0.039,-0.037,-0.068,-0.061,0.767962
max,0.491,0.579,0.229,0.259,0.593,0.469,0.325,0.366,1.02395


Polars dataframe description:


statistic,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,channel_8,measurement_time_in_seconds
str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",20480.0,20480.0,20480.0,20480.0,20480.0,20480.0,20480.0,20480.0,20480.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",-0.117869,-0.117813,-0.118122,-0.118088,-0.114136,-0.114551,-0.114537,-0.111024,0.511975
"""std""",0.099014,0.089247,0.09136,0.076483,0.114354,0.118455,0.074492,0.080365,0.295611
"""min""",-0.708,-0.569,-0.75,-0.547,-0.784,-0.789,-0.432,-0.515,0.0
"""25%""",-0.181,-0.176,-0.181,-0.168,-0.188,-0.19,-0.161,-0.161,0.256
"""50%""",-0.117,-0.117,-0.117,-0.117,-0.115,-0.115,-0.115,-0.11,0.512
"""75%""",-0.054,-0.061,-0.056,-0.068,-0.039,-0.037,-0.068,-0.061,0.76795
"""max""",0.491,0.579,0.229,0.259,0.593,0.469,0.325,0.366,1.02395


[32m2026-01-02 16:59:02.582[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTest Name: 2nd_test[0m


Pandas dtypes: channel_1                      float64
channel_2                      float64
channel_3                      float64
channel_4                      float64
measurement_time_in_seconds    float64
dtype: object
Polars dtypes: [Float64, Float64, Float64, Float64, Float64]
Pandas dataframe description:


Unnamed: 0,channel_1,channel_2,channel_3,channel_4,measurement_time_in_seconds
count,20480.0,20480.0,20480.0,20480.0,20480.0
mean,-0.002153,-0.001498,-0.001489,-0.002907,0.511975
std,0.075615,0.094318,0.100196,0.054808,0.295611
min,-0.5,-0.449,-0.569,-0.239,0.0
25%,-0.051,-0.066,-0.063,-0.039,0.255988
50%,-0.002,-0.002,-0.002,-0.002,0.511975
75%,0.046,0.061,0.061,0.034,0.767962
max,0.403,0.41,0.774,0.229,1.02395


Polars dataframe description:


statistic,channel_1,channel_2,channel_3,channel_4,measurement_time_in_seconds
str,f64,f64,f64,f64,f64
"""count""",20480.0,20480.0,20480.0,20480.0,20480.0
"""null_count""",0.0,0.0,0.0,0.0,0.0
"""mean""",-0.002153,-0.001498,-0.001489,-0.002907,0.511975
"""std""",0.075615,0.094318,0.100196,0.054808,0.295611
"""min""",-0.5,-0.449,-0.569,-0.239,0.0
"""25%""",-0.051,-0.066,-0.063,-0.039,0.256
"""50%""",-0.002,-0.002,-0.002,-0.002,0.512
"""75%""",0.046,0.061,0.061,0.034,0.76795
"""max""",0.403,0.41,0.774,0.229,1.02395


[32m2026-01-02 16:59:02.600[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mTest Name: 3rd_test[0m


Pandas dtypes: channel_1                      float64
channel_2                      float64
channel_3                      float64
channel_4                      float64
measurement_time_in_seconds    float64
dtype: object
Polars dtypes: [Float64, Float64, Float64, Float64, Float64]
Pandas dataframe description:


Unnamed: 0,channel_1,channel_2,channel_3,channel_4,measurement_time_in_seconds
count,20480.0,20480.0,20480.0,20480.0,20480.0
mean,-0.002517,-0.001575,-0.001742,-0.002836,0.511975
std,0.073897,0.082452,0.066064,0.050828,0.295611
min,-0.364,-0.408,-0.276,-0.249,0.0
25%,-0.051,-0.059,-0.046,-0.037,0.255988
50%,-0.002,-0.002,-0.002,-0.002,0.511975
75%,0.046,0.054,0.044,0.032,0.767962
max,0.447,0.452,0.31,0.22,1.02395


Polars dataframe description:


statistic,channel_1,channel_2,channel_3,channel_4,measurement_time_in_seconds
str,f64,f64,f64,f64,f64
"""count""",20480.0,20480.0,20480.0,20480.0,20480.0
"""null_count""",0.0,0.0,0.0,0.0,0.0
"""mean""",-0.002517,-0.001575,-0.001742,-0.002836,0.511975
"""std""",0.073897,0.082452,0.066064,0.050828,0.295611
"""min""",-0.364,-0.408,-0.276,-0.249,0.0
"""25%""",-0.051,-0.059,-0.046,-0.037,0.256
"""50%""",-0.002,-0.002,-0.002,-0.002,0.512
"""75%""",0.046,0.054,0.044,0.032,0.76795
"""max""",0.447,0.452,0.31,0.22,1.02395
