In [9]:
# findspark is the helper used here to locate the Spark installation
import findspark

# Initialize Spark; presumably this has to run before the pyspark imports in the
# cells below so that they can be resolved - keep this cell first
findspark.init()

# jupyter_black is a notebook extension that formats code cells
import jupyter_black

# Turn the formatter on for the rest of this notebook session
jupyter_black.load()

In [10]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import (
    add_months,
    avg,
    col,
    current_date,
    date_sub,
    desc,
    max,
    sum,
    year,
)
from pyspark.sql.window import Window

In [11]:
def get_top_n_categories_by_numeric_column(
    df: DataFrame, category_column: str, numeric_column: str, n: int
) -> DataFrame:
    """
    Rank categories by the summed value of a numeric column and keep the first N.

    Args:
        df (DataFrame): Source data.
        category_column (str): Column whose distinct values form the groups.
        numeric_column (str): Column that is summed within each group.
        n (int): How many of the highest-ranked categories to keep.

    Returns:
        DataFrame: At most ``n`` rows holding ``category_column`` and
            ``"Total_" + numeric_column``, largest total first.
    """
    total_column = "Total_" + numeric_column

    # One row per category carrying the summed numeric column.
    # `sum` is the pyspark.sql.functions aggregate imported by this notebook,
    # not the Python builtin.
    totals_per_category = df.groupBy(category_column).agg(
        sum(numeric_column).alias(total_column)
    )

    # Largest totals first, then cut the result off after N rows
    return totals_per_category.orderBy(desc(total_column)).limit(n)

In [12]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import round, avg, stddev, min, max


def get_numeric_column_distribution_by_category(
    df: DataFrame, num_col: str, cat_col: str
) -> DataFrame:
    """
    Summarise how a numeric column is distributed within each category.

    Args:
        df (DataFrame): Source data containing both columns.
        num_col (str): Numeric column to summarise.
        cat_col (str): Categorical column that defines the groups.

    Returns:
        DataFrame: One row per category with the average, standard deviation,
            minimum and maximum of ``num_col``, each rounded to two decimals and
            named ``num_col`` plus "_Avg", "_StdDev", "_Min" and "_Max".
    """
    # (column suffix, aggregate) pairs, listed in output column order.
    # `min`, `max` and `round` are the pyspark.sql.functions versions imported
    # by this notebook, not the Python builtins.
    statistics = (
        ("_Avg", avg),
        ("_StdDev", stddev),
        ("_Min", min),
        ("_Max", max),
    )

    rounded_statistics = [
        round(aggregate(num_col), 2).alias(num_col + suffix)
        for suffix, aggregate in statistics
    ]

    return df.groupBy(cat_col).agg(*rounded_statistics)

In [13]:
def get_highest_salary_per_agency(df: DataFrame) -> DataFrame:
    """
    Get the highest salary records within each agency.

    Every row whose "annualsalaryto" equals the maximum of its agency is kept,
    so an agency with tied top salaries contributes more than one row.

    Args:
        df (DataFrame): Input DataFrame with job records. It must provide the
            columns "job_id", "agency" and "annualsalaryto".

    Returns:
        DataFrame: The columns "job_id", "agency" and "annualsalaryto" of the
            top-paid records per agency, sorted by salary in descending order.
    """
    # Use one spelling of the salary column everywhere so the function does not
    # depend on Spark resolving column names case-insensitively
    # (spark.sql.caseSensitive=false).
    salary_column = "annualsalaryto"

    # Window covering all rows that share the same agency
    agency_window = Window.partitionBy("agency")

    # Attach the agency-wide maximum salary to every row.
    # `max` is the pyspark.sql.functions aggregate, not the Python builtin.
    df_with_max_salary = df.withColumn(
        "MaxSalary", max(col(salary_column)).over(agency_window)
    )

    # Keep only the rows that reach their agency's maximum
    top_paid = df_with_max_salary.filter(col(salary_column) == col("MaxSalary"))

    # Drop the helper column and everything else that is not part of the result
    final_result = top_paid.select("job_id", "agency", salary_column)

    return final_result.orderBy(col(salary_column).desc())

In [None]:
def calculate_average_salary_per_agency_last_n_years(
    df: DataFrame, n_years: int = 2
) -> DataFrame:
    """
    Calculate the average annual salary for each agency for job postings in the last N years.

    The cutoff is today's date moved back by ``n_years`` whole calendar years;
    postings dated on or after the cutoff are included.

    Args:
        df (DataFrame): Input DataFrame with job records. It must provide the
            columns "Posting_Date", "Agency" and "AnnualSalaryTo".
        n_years (int): Number of years to consider. Default is 2.

    Returns:
        DataFrame: One row per agency with the "Agency" column and the average
            of "AnnualSalaryTo". The aggregate is not aliased, so Spark names
            the column "avg(AnnualSalaryTo)".
    """
    # Cast "Posting_Date" to a date so it can be compared with the cutoff.
    # NOTE(review): values the cast cannot parse become null and are dropped by
    # the filter below - confirm the column's format against the data source.
    postings = df.withColumn("Posting_Date", col("Posting_Date").cast("date"))

    # Step back in calendar months rather than 365-day blocks, which would drift
    # by one day for every leap day inside the window
    cutoff_date = add_months(current_date(), -12 * n_years)

    # Keep only postings made on or after the cutoff
    recent_postings = postings.filter(col("Posting_Date") >= cutoff_date)

    # Average annual salary per agency
    return recent_postings.groupBy("Agency").agg(avg("AnnualSalaryTo"))

In [14]:
from pyspark.sql.functions import col, split, explode, round, avg


def calculate_average_salary_by_ngram(
    df: DataFrame, ngram_column: str, salary_column: str
) -> DataFrame:
    """
    Average the salary over every ngram found in a comma-separated column.

    Args:
        df (DataFrame): Input DataFrame with job records.
        ngram_column (str): Column holding the ngrams as comma-separated text.
        salary_column (str): Column holding the salary to average.
    Returns:
        DataFrame: Columns "ngram" and "avg_salary" (rounded to 0 decimals),
            highest average first. Tokens are used exactly as produced by
            splitting on ","; surrounding whitespace is not trimmed.
    """
    # Turn the comma-separated text into an array column
    with_ngram_array = df.withColumn("ngrams_list", split(col(ngram_column), ","))

    # One output row per array element, paired with that row's salary
    salary_per_ngram = with_ngram_array.select(
        explode("ngrams_list").alias("ngram"), col(salary_column)
    )

    # `round` is the pyspark.sql.functions version, not the Python builtin
    mean_salary = round(avg(col(salary_column)), 0).alias("avg_salary")

    # Aggregate per ngram and put the best-paid ngrams first
    return (
        salary_per_ngram.groupBy("ngram")
        .agg(mean_salary)
        .orderBy(col("avg_salary").desc())
    )