In [0]:
pip install pytest

Python interpreter will be restarted.
Collecting pytest
  Downloading pytest-8.2.2-py3-none-any.whl (339 kB)
Collecting pluggy<2.0,>=1.5
  Downloading pluggy-1.5.0-py3-none-any.whl (20 kB)
Collecting iniconfig
  Downloading iniconfig-2.0.0-py3-none-any.whl (5.9 kB)
Collecting exceptiongroup>=1.0.0rc8
  Downloading exceptiongroup-1.2.2-py3-none-any.whl (16 kB)
Installing collected packages: pluggy, iniconfig, exceptiongroup, pytest
Successfully installed exceptiongroup-1.2.2 iniconfig-2.0.0 pluggy-1.5.0 pytest-8.2.2
Python interpreter will be restarted.


In [0]:
import os

Unit Testing

In [0]:
from pyspark.sql import SparkSession
import pytest

# Build (or reuse) a local SparkSession named "Test" for the tests below
spark = (
    SparkSession.builder
    .master("local")
    .appName("Test")
    .getOrCreate()
)

# Sample function under test
def process_data(df):
    """Keep only the rows of ``df`` whose ``value`` column is greater than 10.

    Args:
        df: An object supporting ``df['value']`` column lookup and
            ``df.filter(condition)`` -- a Spark DataFrame in this notebook.

    Returns:
        Whatever ``df.filter`` returns for the condition ``df['value'] > 10``.
    """
    exceeds_threshold = df['value'] > 10
    return df.filter(exceeds_threshold)
def test_process_data():
    sample_data = [(1, 5), (2, 15), (3, 25)]
    columns = ['id', 'value']
    df = spark.createDataFrame(sample_data, columns)
    result = process_data(df)
    
    # Detailed assertions with messages
    try:
        assert result.count() == 2, f"Expected 2 rows, but got {result.count()}"
        assert result.collect()[0]['value'] == 15, f"Expected first row value to be 15, but got {result.collect()[0]['value']}"
        assert result.collect()[1]['value'] == 25, f"Expected second row value to be 25, but got {result.collect()[1]['value']}"
        print("Test passed")
    except AssertionError as e:
        print("Test failed")
        print(e)


In [0]:
# Run the unit test; it prints "Test passed", or "Test failed" followed by
# the assertion message.
test_process_data()

Test passed


Integration Testing

In [0]:
import pytest
from pyspark.sql import SparkSession

# Initialize SparkSession for testing.
# NOTE(review): this repeats the builder call from the unit-testing cell;
# getOrCreate() is documented to return an already-running session if one
# exists, so this is presumably here only to keep the cell self-contained.
spark = SparkSession.builder.master("local").appName("Test").getOrCreate()

# Sample function under test.
# NOTE: this rebinds the name ``process_data`` that the unit-testing cell
# defined earlier (that version filters on ``value`` > 10).
def process_data(df):
    """Keep only the rows of ``df`` whose ``age`` column is greater than 25.

    Args:
        df: An object supporting ``df['age']`` column lookup and
            ``df.filter(condition)`` -- a Spark DataFrame in this notebook.

    Returns:
        Whatever ``df.filter`` returns for the condition ``df['age'] > 25``.
    """
    older_than_25 = df['age'] > 25
    return df.filter(older_than_25)

def test_integration():
    # Mock data setup
    sample_data = [(1, 'Alice', 23), (2, 'Bob', 30), (3, 'Charlie', 28)]
    df = spark.createDataFrame(sample_data, ['id', 'name', 'age'])
    
    # Example integration process
    processed_df = process_data(df)
    
    # Print the processed DataFrame for inspection
    print("Processed DataFrame:")
    processed_df.show()
    
    try:
        assert processed_df.count() == 2, f"Expected 2 rows, but got {processed_df.count()}"
        assert processed_df.collect()[0]['name'] == 'Bob', f"Expected name 'Bob', but got {processed_df.collect()[0]['name']}"
        print("Test passed")
    except AssertionError as e:
        print("Test failed")
        print(e)

# Run the integration test when this code executes as the main module,
# which includes running this cell in the notebook (see the output below).
if __name__ == "__main__":
    test_integration()


Processed DataFrame:
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|    Bob| 30|
|  3|Charlie| 28|
+---+-------+---+

Test passed


End-to-End Testing

In [0]:
# Five-row (id, value) fixture for the end-to-end run
columns = ['id', 'value']
test_data = [(1, 5), (2, 15), (3, 25), (4, 8), (5, 20)]
df_test = spark.createDataFrame(test_data, schema=columns)

In [0]:
# Run the notebook named 'Testing', passing the test DataFrame as a CSV string
# under the 'input' parameter. Replace 'Testing' with your own notebook's path.
# NOTE(review): to_csv() is called with its defaults, which in pandas also
# writes the index as an extra leading column -- confirm the target notebook
# expects that.
notebook_params = {'input': df_test.toPandas().to_csv()}  # Convert DataFrame to CSV string for input
# NOTE(review): the second argument (60) is presumably the timeout in seconds,
# per the dbutils.notebook.run signature -- verify.
dbutils.notebook.run('Testing', 60, notebook_params)

Test a Notebook Using Pytest and nbval

In [0]:
pip install pytest nbval
# test_notebook.py

import pytest

@pytest.mark.parametrize('notebook_path', ['Testing.ipynb'])
def test_notebook(notebook_path):
    """
    Basic test to validate a notebook using nbval.

    nbval is a pytest plugin that is enabled with the ``--nbval`` command-line
    flag rather than called as a Python function, so the notebook is checked
    by running pytest on it in a child process.

    Args:
        notebook_path: Path of the ``.ipynb`` file to validate.

    Raises:
        AssertionError: If the child pytest run exits with a non-zero status;
            the message includes its captured stdout and stderr.
    """
    import subprocess
    import sys

    # Use the current interpreter so the child run sees the same installed
    # packages (pytest, nbval) as this test.
    completed = subprocess.run(
        [sys.executable, "-m", "pytest", "--nbval", notebook_path],
        capture_output=True,
        text=True,
    )
    assert completed.returncode == 0, (
        f"nbval validation failed for {notebook_path}:\n"
        f"{completed.stdout}\n{completed.stderr}"
    )
