In [2]:
# findspark locates a local Spark installation so that pyspark becomes importable
import findspark

# Must run before the pyspark imports in the next cell: init() puts pyspark on
# sys.path. It does not start a Spark session by itself.
findspark.init()

# jupyter_black is a notebook extension that formats code cells with Black
import jupyter_black

# Enable automatic formatting of code cells for the rest of this notebook session
jupyter_black.load()

In [3]:
# importing required libraries

import pandas as pd

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, count, isnan, when

from common import *

In [5]:
def get_data_types(data_df: DataFrame, columns: list) -> pd.DataFrame:
    """
    Report the Spark data type of each requested column.

    Parameters:
        data_df (DataFrame): Spark DataFrame to inspect.
        columns (list): Names of the columns whose types are wanted.

    Returns:
        pd.DataFrame: One row per requested column, with the columns
            "column_names" and "data_types" (in that order).
    """
    # Project first so the dtypes come back in the same order as `columns`.
    projected_dtypes = data_df.select(columns).dtypes
    type_names = [type_name for _column, type_name in projected_dtypes]

    return pd.DataFrame({"column_names": columns, "data_types": type_names})

In [None]:
def get_null_counts(data_df: DataFrame, columns: list) -> pd.DataFrame:
    """
    Get counts of null and NaN values for specified columns in the DataFrame.

    Every requested column is reported. Nulls are counted for all data types;
    the NaN test is added only for "float", "double" and "string" columns,
    because Spark's isnan() rejects other types such as timestamp, date or
    boolean with an analysis error. For string columns Spark casts the value
    implicitly, so NaN detection depends on that cast.

    Parameters:
        data_df (DataFrame): The input DataFrame.
        columns (list): List of column names.

    Returns:
        pd.DataFrame: DataFrame with the columns "column_names" and "num_null",
            one row per requested column, in the order given. Empty (but with
            both columns) when `columns` is empty.
    """
    columns = list(columns)
    if not columns:
        # Nothing to aggregate; avoid asking Spark for an empty projection.
        return pd.DataFrame({"column_names": [], "num_null": []})

    # Look the types up once instead of building one projection per column.
    declared_types = dict(data_df.dtypes)
    nan_capable_types = ("float", "double", "string")

    aggregations = []
    for position, name in enumerate(columns):
        dtype = declared_types.get(name)
        if dtype is None:
            # Not a plain top-level name (e.g. a nested field); let Spark resolve it.
            dtype = data_df.select(name).dtypes[0][1]

        is_missing = col(name).isNull()
        if dtype in nan_capable_types:
            is_missing = is_missing | isnan(col(name))

        # when() yields a non-null literal only for missing rows, and count()
        # skips nulls, so this counts exactly the missing rows. Positional
        # aliases keep the result unambiguous even with repeated names.
        aggregations.append(
            count(when(is_missing, True)).alias(f"missing_{position}")
        )

    # A global aggregate always returns exactly one row.
    totals = data_df.agg(*aggregations).first()

    return pd.DataFrame({"column_names": columns, "num_null": list(totals)})

In [7]:
def get_space_and_blank_counts(data_df: DataFrame, columns: list) -> pd.DataFrame:
    """
    Get counts of white spaces and blanks for specified columns in the DataFrame.

    All counts are computed in a single aggregation over the data instead of
    running two separate filter-and-count jobs per column.

    Parameters:
        data_df (DataFrame): The input DataFrame.
        columns (list): List of column names.

    Returns:
        pd.DataFrame: DataFrame with the columns "column_names", "num_spaces"
            (values consisting only of whitespace) and "num_blank" (values
            equal to the empty string), one row per requested column.
    """
    columns = list(columns)
    if not columns:
        # Nothing to aggregate; avoid asking Spark for an empty projection.
        return pd.DataFrame({"column_names": [], "num_spaces": [], "num_blank": []})

    aggregations = []
    for position, name in enumerate(columns):
        only_whitespace = col(name).rlike("^\\s+$")
        is_blank = col(name) == ""
        # when() yields a non-null literal only where the condition is true and
        # count() skips nulls, which matches where(condition).count().
        aggregations.append(
            count(when(only_whitespace, True)).alias(f"spaces_{position}")
        )
        aggregations.append(count(when(is_blank, True)).alias(f"blank_{position}"))

    # A global aggregate always returns exactly one row.
    totals = data_df.agg(*aggregations).first()

    num_spaces = [totals[f"spaces_{i}"] for i in range(len(columns))]
    num_blank = [totals[f"blank_{i}"] for i in range(len(columns))]

    return pd.DataFrame(
        {"column_names": columns, "num_spaces": num_spaces, "num_blank": num_blank}
    )

In [None]:
def get_descriptive_stats(data_df: DataFrame, columns: list = None) -> pd.DataFrame:
    """
    Get descriptive statistics for columns of the DataFrame.

    Parameters:
        data_df (DataFrame): The input DataFrame.
        columns (list, optional): Names of the columns to describe. When
            omitted or empty, describe() is called without arguments and
            Spark chooses the columns.

    Returns:
        pd.DataFrame: One row per described column with the columns
            "column_names", "count", "mean" and "stddev". The statistics are
            returned exactly as Spark's describe() reports them.
    """
    summary_pdf = data_df.describe(*(columns or [])).toPandas()

    # The first column of describe() holds the statistic names and the rest
    # hold one described column each. Work positionally so a data column that
    # happens to share the label column's name cannot confuse the reshaping.
    stat_labels = summary_pdf.iloc[:, 0].tolist()
    stats_pdf = summary_pdf.iloc[:, 1:].transpose()
    stats_pdf.columns = stat_labels

    # Name the index before resetting it, rather than patching the immutable
    # column Index in place afterwards.
    stats_pdf.index.name = "column_names"
    stats_pdf = stats_pdf.reset_index()

    return stats_pdf[["column_names", "count", "mean", "stddev"]]

In [None]:
def get_distinct_value_counts(data_df: DataFrame, columns: list) -> pd.DataFrame:
    """
    Count how many distinct values each requested column holds.

    Parameters:
        data_df (DataFrame): Spark DataFrame to inspect.
        columns (list): Names of the columns to count.

    Returns:
        pd.DataFrame: One row per requested column, with the columns
            "column_names" and "num_distinct".
    """
    distinct_totals = []
    for column_name in columns:
        # One de-duplicated single-column projection per requested column.
        unique_rows = data_df.select(column_name).distinct()
        distinct_totals.append(unique_rows.count())

    return pd.DataFrame({"column_names": columns, "num_distinct": distinct_totals})

In [None]:
def get_most_frequent_values(data_df: DataFrame, columns: list) -> pd.DataFrame:
    """
    Get the most frequently occurring value and its count for specified columns in the DataFrame.

    When several values share the highest count, which one is reported is left
    to Spark's sort; no tie-break is applied. A column with no rows at all is
    reported with value None and count 0 instead of raising.

    Parameters:
        data_df (DataFrame): The input DataFrame.
        columns (list): List of column names.

    Returns:
        pd.DataFrame: DataFrame with the columns "column_names",
            "most_freq_value" and "most_freq_value_count", one row per
            requested column.
    """
    top_rows = []
    for name in columns:
        top_pdf = (
            data_df.groupBy(name)
            .count()
            .sort("count", ascending=False)
            .limit(1)
            .toPandas()
        )
        if top_pdf.empty:
            # No rows means no groups; report "no value, zero occurrences".
            top_rows.append([None, 0])
        else:
            top_rows.append(top_pdf.iloc[0].tolist())

    most_freq_values_df = pd.DataFrame(
        top_rows, columns=["most_freq_value", "most_freq_value_count"]
    )
    most_freq_values_df.insert(0, "column_names", list(columns))
    return most_freq_values_df

In [None]:
def get_least_frequent_values(data_df: DataFrame, columns: list) -> pd.DataFrame:
    """
    Get the least frequently occurring value and its count for specified columns in the DataFrame.

    When several values share the lowest count, which one is reported is left
    to Spark's sort; no tie-break is applied. A column with no rows at all is
    reported with value None and count 0 instead of raising.

    Parameters:
        data_df (DataFrame): The input DataFrame.
        columns (list): List of column names.

    Returns:
        pd.DataFrame: DataFrame with the columns "column_names",
            "least_freq_value" and "least_freq_value_count", one row per
            requested column.
    """
    bottom_rows = []
    for name in columns:
        bottom_pdf = (
            data_df.groupBy(name)
            .count()
            .sort("count", ascending=True)
            .limit(1)
            .toPandas()
        )
        if bottom_pdf.empty:
            # No rows means no groups; report "no value, zero occurrences".
            bottom_rows.append([None, 0])
        else:
            bottom_rows.append(bottom_pdf.iloc[0].tolist())

    least_freq_values_df = pd.DataFrame(
        bottom_rows, columns=["least_freq_value", "least_freq_value_count"]
    )
    least_freq_values_df.insert(0, "column_names", list(columns))
    return least_freq_values_df

In [8]:
def is_categorical_column(
    column: "DataFrame | pd.Series", distinct_threshold: int
) -> bool:
    """
    Determine if a column is categorical based on the distinct value threshold.

    Accepts either a pandas object exposing nunique() (e.g. a Series) or a
    single-column Spark DataFrame such as data_df.select("name"). Note that
    the two back ends treat missing values differently: pandas' nunique()
    ignores them by default, whereas Spark's distinct() keeps null as a value
    of its own.

    Parameters:
        column (DataFrame | pd.Series): The input column to be evaluated.
        distinct_threshold (int): The threshold to consider a column as categorical.

    Returns:
        bool: True if the number of distinct values does not exceed the
            threshold, False otherwise.
    """
    # Spark DataFrames have no nunique(); probing for it (rather than calling
    # it blindly) lets both pandas and Spark inputs work.
    nunique = getattr(column, "nunique", None)
    if callable(nunique):
        num_distinct = nunique()
    else:
        num_distinct = column.distinct().count()
    return num_distinct <= distinct_threshold

In [4]:
def profile_column_values(data_df: DataFrame, column_name: str):
    """
    Tabulate how often each value of a column occurs, most common first.

    Parameters:
        data_df (DataFrame): Spark DataFrame to inspect.
        column_name (str): Name of the column whose values are counted.

    Returns:
        A pandas DataFrame holding one row per grouped value of the column
        together with its "count", ordered by descending count.
    """
    occurrences = data_df.groupBy(column_name).count()
    ranked = occurrences.sort(col("count").desc())
    return ranked.toPandas()