# Joining and Feature Selection
This notebook is used for joining internal driving data with external data (weather, crashes and car accidents from insurance) and performing feature selection. 

## Environmental SetUp


In [0]:
# import libraries
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import col, count, mean, stddev, min, max, when, isnan, countDistinct, lit, to_timestamp, to_date, hour, avg, sum, date_sub, current_date, datediff, floor, row_number, date_format, weekofyear, year, round, substring, concat, regexp_replace, max as Fmax, lag, last, rank, unix_timestamp, to_timestamp, expr


from pyspark.sql.functions import max as spark_max
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.sql.types import DoubleType
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, NumericType, DateType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.sql.window import Window
import matplotlib.pyplot  as plt 


In [0]:
app_name = "feature-selection"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()

sc = spark.sparkContext

## Load Data


In [0]:
def clean_column_name(name):
    # Replace invalid characters
    invalid_chars = ",;{}()\n\t="
    clean_name = name
    for char in invalid_chars:
        clean_name = clean_name.replace(char, "_")
    return clean_name

In [0]:
# load internal data 
internal_data = spark.read.parquet("dbfs:/FileStore/tables/features_data_cleaned.parquet")
insurance_data = spark.read.csv("dbfs:/FileStore/tables/combined_accident.csv", inferSchema=True, header=True)

# convert driver_id to matched datatype with internal_data
insurance_data = insurance_data.withColumn("driver_id", F.col("driver_id").cast(DoubleType()))

# drop group_id from internal data
internal_data = internal_data.drop("group_id")

In [0]:
# 1) Ensure proper data types and column names 

driver_trips = internal_data.select("driver_id", "trip_date").distinct().withColumn("trip_date", F.col("trip_date").cast("date"))
insurance_data = insurance_data.withColumn("Date", F.col("Date").cast("date"))


In [0]:
# 2) Join *from* insurance_data -> driver_trips, matching on driver_id.
joined_df = (
    insurance_data.alias("ins")
    .join(
        driver_trips.alias("i"),
        on="driver_id",
        how="left"
    )
    .withColumn(
        "date_diff",
        F.abs(F.datediff(F.col("ins.Date"), F.col("i.trip_date")))
    )
)

# 3) For every accident row (driver_id + Date), we want the single internal_data trip_date
#    that has the smallest date_diff (and is < 15 days).
#    Define a window partitioned by (driver_id, Date) and order by date_diff ascending.
w = Window.partitionBy("ins.driver_id", "ins.Date").orderBy(F.col("date_diff").asc())

# 4) Find the single best match by picking row_number=1 within each partition 
#    and filter out any rows that exceed 30 days.
best_match_df = (
    joined_df
    .withColumn("rownum", F.row_number().over(w))
    .filter((F.col("rownum") == 1) & (F.col("date_diff") < 30))
)

# 5) best_match_df now contains at most one best match row in internal_data
#    for every row in insurance_data (within 30 days).
#    This is effectively the *unique* mapping from each accident row to a single trip_date.
#    We'll pick out the columns needed to map back to internal_data.
best_match_df = best_match_df.select(
    F.col("ins.driver_id").alias("accident_driver_id"),
    F.col("ins.Date").alias("accident_date"),
    F.col("i.trip_date")
)

# create column is_accident = 1
best_match_df = best_match_df.withColumn("is_accident", F.lit(1))

In [0]:
# 6) Now we want to mark those matched internal_data rows as is_accident=1.
#    So we can left-join best_match back to internal_data on (driver_id, trip_date).
final_df = (
    driver_trips.alias("i")
    .join(
       best_match_df.alias("b"),
       on=[
          F.col("i.driver_id") == F.col("b.accident_driver_id"),
          F.col("i.trip_date") == F.col("b.trip_date")
       ],
       how="left"
    )
    .withColumn("is_accident",
        when(F.col("b.accident_driver_id").isNotNull(), 1).otherwise(0)
    )
    .select("i.*", "is_accident")
)

# remove duplication
final_df = final_df.dropDuplicates()

In [0]:
# 7) calculate if the driver previously had an accident 
accident_window = Window.partitionBy("driver_id").orderBy("trip_date").rowsBetween(Window.unboundedPreceding, -1)

final_df = final_df.withColumn(
    "has_prev_accidents",
    when(max(col("is_accident")).over(accident_window) == 1, lit(1)).otherwise(lit(0))
)

In [0]:

accident_count_in_internal = final_df.filter(F.col("is_accident") == 1)

# Compare with the row count in insurance_data to verify that
#    every row in insurance_data found exactly one match. 
num_insurance_rows = insurance_data.count()
print(f"Num of matched driver_trips rows = {accident_count_in_internal.count()}")
print(f"Num of insurance_data rows = {num_insurance_rows}")

Num of matched driver_trips rows = 336
Num of insurance_data rows = 364


(**) Unassigned driver causes not being to matched 100 percent when does the accident happen. Evenwith an 30 days lenient, the matched is only 92%. Because of this, I am going to create another columns in `internal_data` called `has_accidents` basically what is trying to is based on the driver id and Date from insurance data. Given a driver, we can then has that driver commit incidents in the past based on trip_date

In [0]:
final_df.groupBy('is_accident').count().show()

+-----------+------+
|is_accident| count|
+-----------+------+
|          1|   336|
|          0|201498|
+-----------+------+



In [0]:
all_columns = internal_data.columns

# Create a list of columns to drop
columns_to_drop = ['change_in_distance', 'change_in_minutes']

# Add columns containing '7' or '30' to the drop list
for col in all_columns:
    if '7' in col or '30' in col:
        columns_to_drop.append(col)

# Print columns that will be dropped for verification
print(f"Dropping {len(columns_to_drop)} columns:")
for col in columns_to_drop:
    print(f"  - {col}")

# Create a new DataFrame without the dropped columns
internal_data = internal_data.drop(*columns_to_drop)

Dropping 135 columns:
  - change_in_distance
  - change_in_minutes
  - rolling_7day_trip_count
  - rolling_30day_trip_count
  - rolling_7trip_avg_speed_mph
  - rolling_30trip_avg_speed_mph
  - rolling_7day_total_distance
  - rolling_7day_total_minutes
  - rolling_30day_total_distance
  - rolling_30day_total_minutes
  - vehicle_rolling_7trip_num_inspect
  - vehicle_rolling_7trip_num_issues
  - vehicle_rolling_7trip_sum_travel_dist
  - vehicle_rolling_7trip_sum_minutes_driving
  - vehicle_rolling_7trip_num_driving_events
  - vehicle_rolling_30trip_num_inspect
  - vehicle_rolling_30trip_num_issues
  - vehicle_rolling_30trip_sum_travel_dist
  - vehicle_rolling_30trip_sum_minutes_driving
  - vehicle_rolling_30trip_num_driving_events
  - vehicle_rolling_7trip_insp_distance
  - vehicle_rolling_7trip_insp_minutes
  - vehicle_rolling_7trip_insp_events
  - vehicle_rolling_7trip_issues_distance
  - vehicle_rolling_7trip_issues_minutes
  - vehicle_rolling_7trip_issues_events
  - vehicle_rolling_30

In [0]:
final_combined_df.columns

['driver_id', 'trip_date', 'is_accident', 'has_prev_accidents']

In [0]:
# join with internal_data 
complete_driver_df = internal_data.join(final_df, on=["driver_id", "trip_date"], how="left")

# check to see if join correctly 
print(complete_driver_df.count())
print(internal_data.select("driver_id", "vehicle_id", "trip_date").distinct().count())
print(complete_driver_df.select("driver_id","vehicle_id", "trip_date").distinct().count())

216942
216942
216942


In [0]:
# load external data
external_data = spark.read.csv("dbfs:/FileStore/tables/location_data_feature_engineering.csv", 
                               inferSchema=True, 
                               header=True)

In [0]:
# Remove Extraneous Columns + Feature Selection
# A lot of the aggregated variables capture the same information 
removed_columns = [
    "ID","Brand","Region","State",
    "City",
    "Address",
    "Latitude",
    "Longitude",
    "County",
    "Country",
    "local_month_first_day",
    "timezone",
    "Crash Year","Crash Month",
    "mean_precipitation", "median_precipitation", "stddev_precipitation", 
    "q3_precipitation", "q1_precipitation", "min_precipitation", "max_precipitation", "iqr_precipitation",
    "total_crashes","total_fatalities","total_injuries","total_vehicles",
    "year_month", "first_day_of_month", 
    "adjusted_date",
    "date",
    "period_end_utc",
    "period_start_utc",
    "period_end_eastern",
    "period_start_eastern",
    "collection_date_utc",
    "collection_date_eastern", 
    "stddev_precipitation_prev_1d_avg",
    "q3_precipitation_prev_1d_avg",
    "q1_precipitation_prev_1d_avg",
    "min_precipitation_prev_1d_avg",
    "max_precipitation_prev_1d_avg",
    "iqr_precipitation_prev_1d_avg",

    "stddev_precipitation_prev_2d_avg",
    "q3_precipitation_prev_2d_avg",
    "q1_precipitation_prev_2d_avg",
    "min_precipitation_prev_2d_avg",
    "max_precipitation_prev_2d_avg",
    "iqr_precipitation_prev_2d_avg",

    "stddev_precipitation_prev_3d_avg",
    "q3_precipitation_prev_3d_avg",
    "q1_precipitation_prev_3d_avg",
    "min_precipitation_prev_3d_avg",
    "max_precipitation_prev_3d_avg",
    "iqr_precipitation_prev_3d_avg",

    "total_crashes_prev_4m_avg",
    "total_crashes_prev_5m_avg",
    "total_crashes_prev_6m_avg",

    "total_injuries_prev_4m_avg",
    "total_injuries_prev_5m_avg",
    "total_injuries_prev_6m_avg",

    "total_vehicles_prev_4m_avg",
    "total_vehicles_prev_5m_avg",
    "total_vehicles_prev_6m_avg",
    
    "total_fatalities_prev_4m_avg",
    "total_fatalities_prev_5m_avg",
    "total_fatalities_prev_6m_avg",
    
    "mean_precipitation_prev_1d_avg", 
    "mean_precipitation_prev_2d_avg",
    "mean_precipitation_prev_3d_avg",
    "mean_precipitation_prev_2d_avg",
    "stddev_precipitation_prev_1d_avg",
    
    "max_precipitation_prev_2d_avg", 
    "iqr_precipitation_prev_2d_avg",
    "stddev_precipitation_prev_3d_avg",
    "q3_precipitation_prev_3d_avg", 
    "q1_precipitation_prev_3d_avg", 
    "min_precipitation_prev_3d_avg",
    "max_precipitation_prev_3d_avg", 
    "iqr_precipitation_prev_3d_avg",
    "total_crashes_prev_1m_avg", 
    "total_crashes_prev_2m_avg", 
    "total_crashes_prev_4m_avg", "total_crashes_prev_5m_avg", "total_crashes_prev_6m_avg",
    "total_fatalities_prev_1m_avg", "total_fatalities_prev_2m_avg",
    "total_fatalities_prev_4m_avg", "total_fatalities_prev_5m_avg", "total_fatalities_prev_6m_avg", 
    "total_injuries_prev_1m_avg", "total_injuries_prev_2m_avg", 
    "total_injuries_prev_4m_avg", "total_injuries_prev_5m_avg", "total_injuries_prev_6m_avg",
    "total_vehicles_prev_1m_avg", "total_vehicles_prev_2m_avg", 
    "total_vehicles_prev_4m_avg", "total_vehicles_prev_5m_avg", "total_vehicles_prev_6m_avg"
    
]
external_data= external_data.drop(*removed_columns).cache()

In [0]:
final_df.printSchema()

root
 |-- driver_id: double (nullable = true)
 |-- trip_date: date (nullable = true)
 |-- is_accident: integer (nullable = false)
 |-- has_prev_accidents: integer (nullable = false)



In [0]:
# convert numerical to integer 
# List of string columns that are actually numeric and should be cast
string_columns_to_convert = [
    "median_precipitation_prev_1d_avg", 
     "median_precipitation_prev_2d_avg",
     "median_precipitation_prev_3d_avg", 
    "total_crashes_prev_3m_avg",
    "total_fatalities_prev_3m_avg",
    "total_injuries_prev_3m_avg",
    "total_vehicles_prev_3m_avg"
]

# Cast each string column to double
for col_name in string_columns_to_convert:
    if col_name in external_data.columns:
        print(col_name)
        external_data = external_data.withColumn(col_name, F.col(col_name).cast(DoubleType()))

median_precipitation_prev_1d_avg
median_precipitation_prev_2d_avg
median_precipitation_prev_3d_avg
total_crashes_prev_3m_avg
total_fatalities_prev_3m_avg
total_injuries_prev_3m_avg
total_vehicles_prev_3m_avg


In [0]:
# Identify numeric columns
numeric_cols = [field.name for field in external_data.schema.fields if isinstance(field.dataType, NumericType)]

# Fill nulls in numeric columns with 0 (or another value if preferred)
external_data = external_data.fillna(0, subset=numeric_cols)
external_data= external_data.withColumnRenamed("local_date", "trip_date")
external_data= external_data.withColumnRenamed("Zip", "zipcode")

In [0]:
# join between those 2 together on zipcode and local_date
joined_data = complete_driver_df.join(external_data, on = ['zipcode',"trip_date"], how = 'left').cache()

In [0]:
joined_data.groupBy('is_accident').count().show()

+-----------+------+
|is_accident| count|
+-----------+------+
|          1|   368|
|          0|216574|
+-----------+------+



In [0]:
schema = joined_data.schema

# find non-numerical columns
numerical_cols = [field.name for field in schema if isinstance(field.dataType, NumericType)]
# remove is_accident column
numerical_cols = [col for col in numerical_cols if col != "is_accident"]
non_numerical_cols = [field.name for field in schema if not isinstance(field.dataType, NumericType)]

# Show results
print("Non-numerical columns:", non_numerical_cols)

Non-numerical columns: ['trip_date']


In [0]:
# check for null values
# Count nulls per column
null_counts = joined_data.select([
    sum(F.col(c).isNull().cast("int")).alias(c)
    for c in joined_data.columns
])

null_counts.show()



## Data Splitting 

Our task is binary classfication of whether an accidents will occur. We can't use random train-test splitting because this method would leak future information into the past, which violates the temporal order (trip_date). This is why I am going to do time-based (chronological split) using `trip_date` columns 

In [0]:
# Filter for accidents only
accident_df = joined_data.filter(F.col("is_accident") == 1)

# Sort by date and get the earliest accident
earliest_accident = accident_df.orderBy(F.col("trip_date").asc()).limit(1)

# Show the earliest accident|
print("Earliest accident:")
earliest_accident.select("trip_date").show(truncate=False)

Earliest accident:
+----------+
|trip_date |
+----------+
|2023-03-01|
+----------+



In [0]:
non_accident_df = joined_data.filter(F.col("is_accident") == 0)
non_accident_df_cnt = non_accident_df.count()
accident_count = accident_df.count()


In [0]:
print(non_accident_df_cnt)
print(accident_count)
print(accident_count/non_accident_df_cnt)

216574
368
0.0016991882682131742


In [0]:
from pyspark.sql.functions import col, year, month, count

# Create month and year columns from trip_date
df_with_date_parts = joined_data.withColumn("year", year(col("trip_date"))) \
                             .withColumn("month", month(col("trip_date")))

# Group by year and month to get accident counts
accidents_by_month = df_with_date_parts.filter(col("is_accident") == 1) \
                                       .groupBy("year", "month") \
                                       .count() \
                                       .orderBy("year", "month")

# Show the results
accidents_by_month.show(100)  

+----+-----+-----+
|year|month|count|
+----+-----+-----+
|2023|    3|    5|
|2023|    4|    2|
|2023|    5|    4|
|2023|    6|    3|
|2023|    7|    2|
|2023|    8|   10|
|2023|    9|    8|
|2023|   10|   14|
|2023|   11|    5|
|2023|   12|    6|
|2024|    1|    8|
|2024|    2|   12|
|2024|    3|   10|
|2024|    4|   15|
|2024|    5|   19|
|2024|    6|   31|
|2024|    7|   18|
|2024|    8|   23|
|2024|    9|   33|
|2024|   10|   32|
|2024|   11|   22|
|2024|   12|   35|
|2025|    1|   24|
|2025|    2|   27|
+----+-----+-----+



In [0]:
# Excluding data before 3/2023 as Motive API adoption was still low and there are not of accidents for our model to 
# learn from in this period.
cutoff_date = earliest_accident.select("trip_date").first()["trip_date"]

# Split chronologically
data_for_model = joined_data.filter(F.col("trip_date") >= cutoff_date)

# Convert driver id and vehicle id for one hot encoding 
vehicle_indexer = StringIndexer(inputCol="vehicle_id", outputCol="vehicle_id_index", handleInvalid="keep")
driver_indexer = StringIndexer(inputCol="driver_id", outputCol="driver_id_index", handleInvalid="keep")

# OneHotEncoders for the indexed columns
vehicle_encoder = OneHotEncoder(inputCols=["vehicle_id_index"], outputCols=["vehicle_id_encoded"])
driver_encoder = OneHotEncoder(inputCols=["driver_id_index"], outputCols=["driver_id_encoded"])

#  Create and fit the pipeline
encoding_pipeline = Pipeline(stages=[vehicle_indexer, driver_indexer, vehicle_encoder, driver_encoder])

# Apply transformations to data_for_model
encoded_data = encoding_pipeline.fit(data_for_model).transform(data_for_model)
# drop vehicle_id and driver_id
encoded_data = encoded_data.drop("vehicle_id", "driver_id", "vehicle_id_index", "driver_id_index")


In [0]:
encoded_data.printSchema()

root
 |-- zipcode: integer (nullable = true)
 |-- trip_date: date (nullable = true)
 |-- days_since_last_trip: integer (nullable = true)
 |-- driving_year_since_first_trip: double (nullable = true)
 |-- previous_trip_date_avg_speed_mph: double (nullable = true)
 |-- rolling_15trip_avg_speed_mph: double (nullable = true)
 |-- prev_trip_date_distance: double (nullable = true)
 |-- prev_trip_date_minutes: double (nullable = true)
 |-- rolling_15day_total_distance: double (nullable = true)
 |-- rolling_15day_total_minutes: double (nullable = true)
 |-- last_has_issues: integer (nullable = true)
 |-- vehicle_inspection_rate: double (nullable = true)
 |-- vehicle_cum_issues: long (nullable = true)
 |-- vehicle_rolling_15trip_num_inspect: long (nullable = true)
 |-- vehicle_rolling_15trip_num_issues: long (nullable = true)
 |-- vehicle_rolling_15trip_sum_travel_dist: double (nullable = true)
 |-- vehicle_rolling_15trip_sum_minutes_driving: double (nullable = true)
 |-- vehicle_rolling_15trip_

In [0]:
# remove information that depends on current value
columns_exclude=["driver_total_trip_count", "driver_log_trip_count", "vehicle_cum_distance", "vehicle_cum_inspections", "vehicle_inspection_rate"
                 "vehicle_cum_issues", "vehicle_issues_rate", ""]
encoded_data= encoded_data.drop(*columns_exclude)

In [0]:
# Choose a split date 
split_date = "2024-11-01"

# Split chronologically
train_data  = encoded_data.filter(F.col("trip_date") < split_date)
test_data = encoded_data.filter(F.col("trip_date") >= split_date)

# Cache if reused
#train_df = train_data.cache()
#test_df = test_data.cache()

# Verify sizes
print("Train count:", train_data.count())
print("Test count:", test_data.count())

print("Ratio of train ", train_data.count()/encoded_data.count())
print("ratio of test ", test_data.count()/encoded_data.count())

Train count: 167079
Test count: 45271
Ratio of train  0.7868095125971274
ratio of test  0.2131904874028726


In [0]:
train_data.groupBy("is_accident").count().show()

+-----------+------+
|is_accident| count|
+-----------+------+
|          1|   260|
|          0|166819|
+-----------+------+



In [0]:
test_data.groupBy("is_accident").count().show()

+-----------+-----+
|is_accident|count|
+-----------+-----+
|          1|  108|
|          0|45163|
+-----------+-----+



In [0]:
null_counts = train_data.select([
    sum(F.col(c).isNull().cast("int")).alias(c)
    for c in train_data.columns
])
null_counts.show()

+-------+---------+--------------------+-----------------------------+--------------------------------+----------------------------+-----------------------+----------------------+----------------------------+---------------------------+---------------+-----------------------+------------------+----------------------------------+---------------------------------+--------------------------------------+------------------------------------------+-----------------------------------------+------------------------------------+-----------------------------------+----------------------------------+--------------------------------------+-------------------------------------+------------------------------------+--------------------------------------+----------------------------------------+--------------------------+------------------------+------------------------+-----------------------+-------------------------+----------------------------------+----------------------------------+-------------

In [0]:
null_counts = test_data.select([
    sum(F.col(c).isNull().cast("int")).alias(c)
    for c in test_data.columns
])
null_counts.show()

+-------+---------+--------------------+-----------------------------+--------------------------------+----------------------------+-----------------------+----------------------+----------------------------+---------------------------+---------------+-----------------------+------------------+----------------------------------+---------------------------------+--------------------------------------+------------------------------------------+-----------------------------------------+------------------------------------+-----------------------------------+----------------------------------+--------------------------------------+-------------------------------------+------------------------------------+--------------------------------------+----------------------------------------+--------------------------+------------------------+------------------------+-----------------------+-------------------------+----------------------------------+----------------------------------+-------------

In [0]:
dbutils.fs.rm("dbfs:/FileStore/tables/train_data3.parquet", recurse=True)
dbutils.fs.rm("dbfs:/FileStore/tables/test_data3.parquet", recurse=True)



True

In [0]:
train_data.coalesce(1).write.parquet("dbfs:/FileStore/tables/train_data3.parquet")
test_data.coalesce(1).write.parquet("dbfs:/FileStore/tables/test_data3.parquet")

In [0]:
train_data.printSchema()

root
 |-- zipcode: integer (nullable = true)
 |-- trip_date: date (nullable = true)
 |-- days_since_last_trip: integer (nullable = true)
 |-- driving_year_since_first_trip: double (nullable = true)
 |-- previous_trip_date_avg_speed_mph: double (nullable = true)
 |-- rolling_15trip_avg_speed_mph: double (nullable = true)
 |-- prev_trip_date_distance: double (nullable = true)
 |-- prev_trip_date_minutes: double (nullable = true)
 |-- rolling_15day_total_distance: double (nullable = true)
 |-- rolling_15day_total_minutes: double (nullable = true)
 |-- last_has_issues: integer (nullable = true)
 |-- vehicle_inspection_rate: double (nullable = true)
 |-- vehicle_cum_issues: long (nullable = true)
 |-- vehicle_rolling_15trip_num_inspect: long (nullable = true)
 |-- vehicle_rolling_15trip_num_issues: long (nullable = true)
 |-- vehicle_rolling_15trip_sum_travel_dist: double (nullable = true)
 |-- vehicle_rolling_15trip_sum_minutes_driving: double (nullable = true)
 |-- vehicle_rolling_15trip_

In [0]:
loaded_test= spark.read.parquet("dbfs:/FileStore/tables/test_data.parquet")
loaded_test.printSchema()

root
 |-- zipcode: integer (nullable = true)
 |-- trip_date: date (nullable = true)
 |-- days_since_last_trip: integer (nullable = true)
 |-- driving_year_since_first_trip: double (nullable = true)
 |-- previous_trip_date_avg_speed_mph: double (nullable = true)
 |-- rolling_15trip_avg_speed_mph: double (nullable = true)
 |-- prev_trip_date_distance: double (nullable = true)
 |-- prev_trip_date_minutes: double (nullable = true)
 |-- rolling_15day_total_distance: double (nullable = true)
 |-- rolling_15day_total_minutes: double (nullable = true)
 |-- last_has_issues: integer (nullable = true)
 |-- vehicle_inspection_rate: double (nullable = true)
 |-- vehicle_cum_issues: long (nullable = true)
 |-- vehicle_rolling_15trip_num_inspect: long (nullable = true)
 |-- vehicle_rolling_15trip_num_issues: long (nullable = true)
 |-- vehicle_rolling_15trip_sum_travel_dist: double (nullable = true)
 |-- vehicle_rolling_15trip_sum_minutes_driving: double (nullable = true)
 |-- vehicle_rolling_15trip_