# Analysis of v_load_summary_hourly Table

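This notebook analyzes the schema and contents of the `v_load_summary_hourly` table, which is defined in the AWS Glue Data Catalog. The analysis runs against a local Parquet export of the table rather than querying the catalog directly.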

In [None]:
# Import required libraries
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the parquet file
# Reads the whole file into a single in-memory DataFrame (`df`) that every
# later cell uses, then prints its (rows, columns) shape.
# NOTE(review): `file_path` is an absolute path under one user's home directory,
# so this cell only runs on a machine with the same layout — consider making it
# configurable.
file_path = '/local/home/admsia/parquet_analysis/load_summary.parquet'
df = pd.read_parquet(file_path)
print(f"Dataset shape: {df.shape}")

## Schema Overview

The table contains 116 columns with various data about vehicle routes, schedules, and logistics metrics.

In [None]:
# List every column name with its 1-based position in the DataFrame
print("Column Names:")
columns = list(df.columns)
for position, column_name in enumerate(columns, start=1):
    print(f"{position}. {column_name}")

In [None]:
# Show data types
# `df.dtypes` is a Series indexed by column name holding each column's pandas
# dtype; as the cell's last expression it is rendered by the notebook.
df.dtypes

## Data Sample

Here's a sample of the data to understand its structure:

In [None]:
# Display a sample of the data
# `head(5)` returns the first five rows in stored order — it is not a random
# sample, so it may not be representative of the whole file.
df.head(5)

## Key Field Analysis

Let's examine the distribution of key fields in the dataset.

In [None]:
# Frequency tables for the main categorical fields.
# Each entry is (heading to print, column name, how many of the most frequent
# values to show — None means show every distinct value).
field_summaries = [
    ("Program code distribution:", 'program_code', 10),
    ("\nShipment mode distribution:", 'shipment_mode', None),
    ("\nEquipment type distribution:", 'equipment_type', 10),
    ("\nVehicle execution status distribution:", 'vehicle_execution_status', None),
]

for heading, field, top_n in field_summaries:
    print(heading)
    # value_counts() orders values from most to least frequent
    counts = df[field].value_counts()
    print(counts if top_n is None else counts.head(top_n))

## Transit Time Analysis

Analysis of scheduled vs. actual transit times.

In [None]:
# Transit time statistics
# Summary table (as produced by DataFrame.describe) for actual vs. scheduled
# transit hours. The result is printed explicitly: a notebook cell only
# auto-displays its *last* expression, and more statements follow here, so a
# bare `describe()` expression would be computed and silently discarded.
print("Transit time statistics:")
print(df[['transit_hours_actual', 'scheduled_transit_hours']].describe())

# Late arrival analysis
# Mean and median of the "hours late" columns at origin and destination.
# pandas skips missing values in mean()/median() by default, so rows without a
# recorded lateness do not affect these figures.
print("\nLate arrival statistics:")
print(f"Origin arrival late hours - average: {df['origin_arrival_late_hrs'].mean()}")
print(f"Origin arrival late hours - median: {df['origin_arrival_late_hrs'].median()}")
print(f"Destination arrival late hours - average: {df['dest_arrival_late_hrs'].mean()}")
print(f"Destination arrival late hours - median: {df['dest_arrival_late_hrs'].median()}")

## Route Analysis

Examining characteristics of the routes.

In [None]:
# Histogram of route distance (50 bins), drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['miles'], bins=50, ax=ax)
ax.set_title('Distribution of Route Miles')
ax.set_xlabel('Miles')
ax.set_ylabel('Count')
plt.show()

# The ten most frequent stop counts, most common first
print("Stop count distribution:")
stop_count_freq = df['stop_count'].value_counts()
print(stop_count_freq.head(10))

## SQL-Style Schema Definition

Below is an SQL representation of the schema with descriptions of each column:

In [None]:
# Print the SQL schema definition
# `sql_schema` is a hand-written, SQL-style CREATE TABLE statement kept as a
# plain string for reference. It is only printed below — it is never executed
# and is not derived from `df` — so the column list, types, and trailing "--"
# descriptions are documentation rather than a validated schema.
# NOTE(review): confirm the types and descriptions against the Glue catalog
# definition and `df.dtypes` before relying on them.
sql_schema = """
CREATE TABLE v_load_summary_hourly (
  vrid VARCHAR,                         -- Unique vehicle route ID
  unused_region_id VARCHAR,             -- Unused region identifier
  unused_report_day TIMESTAMP,          -- Unused report day timestamp
  report_week VARCHAR,                  -- Week of the report (e.g., W27)
  report_month VARCHAR,                 -- Month of the report (e.g., M07)
  program_code VARCHAR,                 -- Purpose code (EMPTY, XFER, SCIB, AMZL, etc.)
  carrier_manager VARCHAR,              -- Carrier manager
  tp_id VARCHAR,                        -- Transport plan ID
  tour_id VARCHAR,                      -- Tour identifier
  scac VARCHAR,                         -- Standard Carrier Alpha Code
  carrier_name VARCHAR,                 -- Name of the carrier
  subcarrier VARCHAR,                   -- Subcarrier code
  carrier_group VARCHAR,                -- Carrier group (e.g., ATS_BROKERAGE, ATS_DEDICATED)
  lane VARCHAR,                         -- Origin to destination route (e.g., DCL4->CLE2)
  stop_count INT,                       -- Number of stops
  account_id VARCHAR,                   -- Business purpose identifier
  shipment_mode VARCHAR,                -- Mode of shipment (TRUCKLOAD, INTERMODAL, LESS_THAN_TRUCKLOAD)
  miles INT,                            -- Distance in miles
  cpt TIMESTAMP,                        -- Critical pull time/commitment
  adhoc_load VARCHAR,                   -- Whether the load is adhoc
  equipment_type VARCHAR,               -- Type of equipment/vehicle
  transit_operator_type VARCHAR,        -- Type of driver operation (e.g., SINGLE_DRIVER)
  tr_id VARCHAR,                        -- Transport record ID
  crid VARCHAR,                         -- Customer reference ID
  canceled_load VARCHAR,                -- Whether load was canceled (TRUE/FALSE)
  canceled_date TIMESTAMP,              -- Date and time of cancellation
  cancelation_reason VARCHAR,           -- Reason for cancellation
  origin VARCHAR,                       -- Origin facility code
  origin_zip VARCHAR,                   -- Origin ZIP code
  origin_city VARCHAR,                  -- Origin city
  origin_state VARCHAR,                 -- Origin state
  origin_country VARCHAR,               -- Origin country
  origin_type VARCHAR,                  -- Type of origin facility
  origin_local_timezone VARCHAR,        -- Local timezone of origin
  final_destination VARCHAR,            -- Destination facility code
  dest_zip VARCHAR,                     -- Destination ZIP code
  dest_city VARCHAR,                    -- Destination city
  dest_state VARCHAR,                   -- Destination state
  dest_country VARCHAR,                 -- Destination country
  destination_type VARCHAR,             -- Type of destination facility
  dest_local_timezone VARCHAR,          -- Local timezone of destination
  manifest_base DECIMAL,                -- Base manifest cost
  manifest_fuel DECIMAL,                -- Fuel manifest cost
  manifest_total DECIMAL,               -- Total manifest cost
  total_invoice_amount DECIMAL,         -- Total invoice amount
  total_paid_amount DECIMAL,            -- Total paid amount
  total_accessorials DECIMAL,           -- Total accessorial charges
  estimated_cost_accrual DECIMAL,       -- Estimated cost accrual
  accrual_cost_source VARCHAR,          -- Source of accrual cost
  tour_day_rate DECIMAL,                -- Day rate for tour
  total_pkg_unit_count INT,             -- Total package/unit count
  total_cube DECIMAL,                   -- Total cubic volume
  pallet_count INT,                     -- Count of pallets
  gaylord_count INT,                    -- Count of gaylord containers
  cube_target_cubic_ft DECIMAL,         -- Target cubic feet
  global_dea_pkgs VARCHAR,              -- Global DEA packages
  transit_hours_actual DECIMAL,         -- Actual transit hours
  scheduled_transit_hours DECIMAL,      -- Scheduled transit hours
  origin_scheduled_arrival TIMESTAMP,   -- Scheduled arrival at origin
  origin_calc_arrival TIMESTAMP,        -- Calculated/actual arrival at origin
  origin_calc_arrival_source VARCHAR,   -- Source of origin arrival calculation
  origin_begin_loading_time TIMESTAMP,  -- Beginning of loading at origin
  origin_finish_loading_time TIMESTAMP, -- End of loading at origin
  origin_arrival_late_group VARCHAR,    -- Grouping for origin arrival lateness
  origin_arrival_late_hrs DECIMAL,      -- Hours late at origin arrival (negative is early)
  origin_responsible VARCHAR,           -- Responsible party at origin
  origin_arrival_reason VARCHAR,        -- Reason for arrival timing at origin
  origin_arrival_note VARCHAR,          -- Notes regarding origin arrival
  origin_scheduled_depart TIMESTAMP,    -- Scheduled departure from origin
  origin_calc_depart TIMESTAMP,         -- Calculated/actual departure from origin
  origin_calc_depart_source VARCHAR,    -- Source of origin departure calculation
  origin_departure_late_group VARCHAR,  -- Grouping for origin departure lateness
  origin_depart_late_hrs DECIMAL,       -- Hours late at origin departure (negative is early)
  origin_fc_delay_hours DECIMAL,        -- Hours of delay at origin facility
  dest_scheduled_arrival TIMESTAMP,     -- Scheduled arrival at destination
  dest_calc_arrival TIMESTAMP,          -- Calculated/actual arrival at destination
  dest_calc_arrival_source VARCHAR,     -- Source of destination arrival calculation
  dest_begin_unloading_time TIMESTAMP,  -- Beginning of unloading at destination
  dest_finish_unloading_time TIMESTAMP, -- End of unloading at destination
  late_to_destination_per_calc VARCHAR, -- Whether late to destination per calculation
  dest_arrival_late_group VARCHAR,      -- Grouping for destination arrival lateness
  dest_arrival_late_hrs DECIMAL,        -- Hours late at destination arrival (negative is early)
  dest_responsible VARCHAR,             -- Responsible party at destination
  dest_arrival_note VARCHAR,            -- Notes regarding destination arrival
  dest_arrival_reason VARCHAR,          -- Reason for arrival timing at destination
  trailer_id VARCHAR,                   -- Trailer identifier
  bobtail_trailer_id VARCHAR,           -- Bobtail trailer identifier
  driver_id VARCHAR,                    -- Driver identifier
  driver_id_2 VARCHAR,                  -- Secondary driver identifier
  arc_type VARCHAR,                     -- ARC type
  wims_load VARCHAR,                    -- Whether it's a WIMS load
  enrichment_flag VARCHAR,              -- Enrichment flag
  tem_owned VARCHAR,                    -- TEM owned indicator
  run_structure_id VARCHAR,             -- Run structure identifier
  oneday_core_pkgs VARCHAR,             -- One-day core packages
  trailer_ready_time TIMESTAMP,         -- Time trailer is ready
  rate_type VARCHAR,                    -- Rate type (PER_LOAD, PER_TRIP)
  origin_load_type VARCHAR,             -- Type of loading at origin
  dest_unload_type VARCHAR,             -- Type of unloading at destination
  drop_trailer_time TIMESTAMP,          -- Time of trailer drop
  resource_block_id VARCHAR,            -- Resource block identifier
  operator_id VARCHAR,                  -- Operator identifier
  container_program VARCHAR,            -- Container program
  containerized_pkgs DECIMAL,           -- Containerized packages
  gl_account VARCHAR,                   -- General ledger account
  vr_create_date TIMESTAMP,             -- Vehicle route creation date
  rlb_load VARCHAR,                     -- RLB load indicator
  plan_type VARCHAR,                    -- Plan type
  movement_type VARCHAR,                -- Movement type
  power_id VARCHAR,                     -- Power unit identifier
  tem_rsp_region VARCHAR,               -- TEM responsible region
  is_customer_facing VARCHAR,           -- Customer facing indicator
  vehicle_execution_status VARCHAR,     -- Execution status (COMPLETED, CANCELLED, etc.)
  facility_sequence VARCHAR,            -- Facility sequence
  dest_planned_arrival TIMESTAMP,       -- Planned arrival at destination
  region_id VARCHAR                     -- Region identifier
);
"""

# Emit the reference DDL as plain text in the cell output
print(sql_schema)

## Data Completeness

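Checking for missing values across all columns. The table below lists the 20 columns with the highest percentage of missing values; columns with no missing values are omitted.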

In [None]:
# Per-column count of nulls, and that count as a percentage of all rows
missing_data = df.isna().sum()
missing_percent = missing_data.div(len(df)).mul(100)

# One row per column; drop fully populated columns and rank the rest from the
# highest to the lowest percentage of missing values
missing_info = (
    pd.DataFrame({'Missing Values': missing_data, 'Missing Percent': missing_percent})
    .loc[lambda frame: frame['Missing Values'] > 0]
    .sort_values('Missing Percent', ascending=False)
)

# Show the 20 columns with the most missing data
missing_info.head(20)