In [1]:
%pip install -r ./requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
import boto3

In [3]:
def create_spark_session():
        """
        Create and return a Spark session configured for AWS S3 access.
        Returns:
            SparkSession: Configured Spark session object.
        """
        aws_profile_name = "default"

        spark = SparkSession.builder \
                            .appName("SplitColumnExample") \
                            .config(
                                "spark.jars.packages",
                                "org.apache.hadoop:hadoop-aws:3.3.4,"
                                "com.amazonaws:aws-java-sdk-bundle:1.12.262"
                            ) \
                            .config(
                                "spark.hadoop.fs.s3a.aws.credentials.provider",
                                "com.amazonaws.auth.profile.ProfileCredentialsProvider"
                            ) \
                            .config("spark.hadoop.fs.s3a.profile", aws_profile_name) \
                            .getOrCreate()
        
        return spark

In [4]:
def get_s3_data():
    """
    Function to list and print S3 object keys in a specified bucket and prefix.
    Returns:
        str: S3 path of the last object found in the specified bucket and prefix.
    """

    s3 = boto3.client('s3')
    bucket_name = "data-challenge-loadsmart-stg"
    bucket_prefix = "data_challenge/"

    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=bucket_prefix)

    for page in page_iterator:
        if 'Contents' in page:
            for obj in page['Contents']:
                print(obj['Key'])
                data_path = f"s3a://{bucket_name}/{obj['Key']}"

    return data_path

In [5]:
spark = create_spark_session()
data_path = get_s3_data()

:: loading settings :: url = jar:file:/home/leonardooliveira/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/leonardooliveira/.ivy2/cache
The jars for the packages stored in: /home/leonardooliveira/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bb28617b-bf72-49a9-a1cb-404f3d8990d1;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 195ms :: artifacts dl 7ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number|

data_challenge/_SUCCESS
data_challenge/loading_date=2025-11-30/part-00000-b4cbcec7-c430-4556-a19f-7103b742e0da.c000.snappy.parquet


In [6]:
df = spark.read.parquet(data_path)
df.show()

25/11/30 07:33:28 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/11/30 07:33:33 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+----------+------------+--------+-------+--------------+--------------+----------------+-----------+------------------------+---------------+------------+-------------------------+---------------------------+-----------------------+-----------------------+-------------------------+-------------------------+-------------------------+-----------------------+----------------+---------------+------------------------+-------------------------+------------------+
|loadsmart_id|                lane|         quote_date|          book_date|        source_date|        pickup_date|      delivery_date|book_price|source_price|     pnl|mileage|equipment_type|carrier_rating|sourcing_channel|vip_carrier|carrier_dropped_us_count|   carrier_name|shipper_name|carrier_on_time_to_pickup|carrier_on_time_to_delivery|carrier_on_time_overall|pickup_appointment_time|delivery_app

In [7]:
df.count()

                                                                                

5361

In [8]:
def split_column(df, select_columns, column_to_split, delimiter):
    """
    Split a column in a DataFrame into multiple columns based on a delimiter.
    Args:
        df (DataFrame): Input DataFrame.
        select_columns (list): List of columns to select from the DataFrame.
        column_to_split (str): Name of the column to split.
        delimiter (str): Delimiter used for splitting the column.
    Returns:
        DataFrame: DataFrame with the split columns.
    """

    split_df = df.select(*select_columns).withColumn("split_column",split(df[column_to_split], delimiter))

    split_df = split_df.withColumn("origin", split_df["split_column"].getItem(0)) \
                         .withColumn("destination", split_df["split_column"].getItem(1))
    
    split_df = split_df.withColumn("pickup_city", split(split_df["origin"], ",").getItem(0)) \
                         .withColumn("pickup_state", split(split_df["origin"], ",").getItem(1)) \
                         .withColumn("delivery_city", split(split_df["destination"], ",").getItem(0)) \
                         .withColumn("delivery_state", split(split_df["destination"], ",").getItem(1))
    return split_df

In [9]:
lanes = split_column(
    df,
    select_columns=["loadsmart_id", "lane"],
    column_to_split="lane",
    delimiter=" -> "
)

In [10]:
lanes.show(truncate=False)

[Stage 5:>                                                          (0 + 1) / 1]

+------------+----------------------------------+----------------------------------+-------------+-----------------+-----------+------------+--------------+--------------+
|loadsmart_id|lane                              |split_column                      |origin       |destination      |pickup_city|pickup_state|delivery_city |delivery_state|
+------------+----------------------------------+----------------------------------+-------------+-----------------+-----------+------------+--------------+--------------+
|206431033   |Hood River,OR -> Upper Marlboro,MD|[Hood River,OR, Upper Marlboro,MD]|Hood River,OR|Upper Marlboro,MD|Hood River |OR          |Upper Marlboro|MD            |
|206521177   |Etowah,TN -> Reno,NV              |[Etowah,TN, Reno,NV]              |Etowah,TN    |Reno,NV          |Etowah     |TN          |Reno          |NV            |
|206694049   |Salinas,CA -> Upper Marlboro,MD   |[Salinas,CA, Upper Marlboro,MD]   |Salinas,CA   |Upper Marlboro,MD|Salinas    |CA          

                                                                                

In [11]:
result = df.alias("origin") \
    .join(
        lanes.alias("lanes"),
        on="loadsmart_id",
        how="inner"
    ) \
    .select(
        "origin.*",
        "pickup_city",
        "pickup_state",
        "delivery_city",
        "delivery_state"
    )

result.show(truncate=False)

25/11/30 07:33:43 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
[Stage 7:>                                                          (0 + 1) / 1]

+------------+----------------------------------+-------------------+-------------------+-------------------+-------------------+-------------------+----------+------------+--------+-------+--------------+--------------+----------------+-----------+------------------------+---------------+------------+-------------------------+---------------------------+-----------------------+-----------------------+-------------------------+-------------------------+-------------------------+-----------------------+----------------+---------------+------------------------+-------------------------+------------------+-----------+------------+--------------+--------------+
|loadsmart_id|lane                              |quote_date         |book_date          |source_date        |pickup_date        |delivery_date      |book_price|source_price|pnl     |mileage|equipment_type|carrier_rating|sourcing_channel|vip_carrier|carrier_dropped_us_count|carrier_name   |shipper_name|carrier_on_time_to_pickup|carri

                                                                                

In [12]:
result.count()

                                                                                

5369