# State Crash Data Raw Data to Input Data

This data was sourced from [Federal Motor Carrier Safety Administration](https://ai.fmcsa.dot.gov/CrashStatistics/?tab=Summary&type=&report_id=1&crash_type_id=4&datasource_id=1&time_period_id=2&report_date=2025&vehicle_type=1&state=AllStates&domicile=US&measure_id=1&operation_id=null) and reports data on all large truck and bus crashes between 2018 and 2025.

The data was available for download as a .xlsx files so we uploaded it as a folder to the Local Filesystem and then copied it to GCS for persistent storage.

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, month, year, avg, count
from pyspark.sql.types import StructType, StructField, StringType
import os
import pandas as pd
from functools import reduce
from pyspark.sql import DataFrame
import re
from pyspark.sql.functions import input_file_name, regexp_extract, lit
import os
import glob
import os
from pyspark.sql.functions import lit, col, to_date, month
from pyspark.sql.functions import col, count, sum, when, isnan, mean, corr, month, year, desc
from pyspark.sql.types import IntegerType, DoubleType, FloatType, DateType

In [3]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("CSV Processing by Year") \
    .getOrCreate()

In [4]:
data_path = "/data/state_crash_data"

In [18]:
# Set paths
local_data_path = "/data/state_crash_data"
gcs_bucket_path = "gs://ds5460-arvelo-bucket0311/notebooks/jupyter/state_crash_data"

In [30]:
# Find all CSV files in the local directory and subdirectories
csv_files = glob.glob(f"{local_data_path}/**/*.csv", recursive=True)
print(f"Found {len(csv_files)} CSV files to upload")

# Upload each CSV file to GCS
for csv_file in csv_files:
    # Get the relative path to maintain directory structure
    relative_path = os.path.relpath(csv_file, local_data_path)
    # Construct the destination path
    destination = os.path.join(gcs_bucket_path, relative_path)
    # Upload the file
    print(f"Uploading {csv_file} to {destination}")
    !gsutil cp {csv_file} {destination}

print("Upload complete!")

Found 150 CSV files to upload
Uploading /data/state_crash_data/KY_2025_crash.csv to gs://ds5460-arvelo-bucket0311/notebooks/jupyter/state_crash_data/KY_2025_crash.csv
Copying file:///data/state_crash_data/KY_2025_crash.csv [Content-Type=text/csv]...
/ [1 files][132.8 KiB/132.8 KiB]                                                
Operation completed over 1 objects/132.8 KiB.                                    
Uploading /data/state_crash_data/NY_2024_crash.csv to gs://ds5460-arvelo-bucket0311/notebooks/jupyter/state_crash_data/NY_2024_crash.csv
Copying file:///data/state_crash_data/NY_2024_crash.csv [Content-Type=text/csv]...
/ [1 files][  2.4 MiB/  2.4 MiB]                                                
Operation completed over 1 objects/2.4 MiB.                                      
Uploading /data/state_crash_data/VA_2025_crash.csv to gs://ds5460-arvelo-bucket0311/notebooks/jupyter/state_crash_data/VA_2025_crash.csv
Copying file:///data/state_crash_data/VA_2025_crash.csv [Content-Ty

In [10]:
# Create the local directory structure
local_dir = f"{local_data_path}/yearly_state_crash_data"
os.makedirs(local_dir, exist_ok=True)

In [19]:
def process_csv_files(local_data_path, gcs_bucket_path):
    # Find all CSV files using Python's glob
    csv_files = glob.glob(f"{local_data_path}/**/*.csv", recursive=True)
    print(f"Found {len(csv_files)} CSV files")
    
    # Group files by year
    files_by_year = {}
    for file_path in csv_files:
        # Extract year from filename or path
        match = re.search(r'(\d{4})', os.path.basename(file_path))
        if match:
            year = match.group(1)
            if year not in files_by_year:
                files_by_year[year] = []
            files_by_year[year].append(file_path)
    
    # Process each year's files
    for year, file_list in files_by_year.items():
        print(f"Processing {len(file_list)} files for year {year}")
        
        # Create empty list to store dataframes
        dfs = []
        
        # Read and concatenate all files for this year using pandas
        for file_path in file_list:
            try:
                # Read CSV file with pandas
                df = pd.read_csv(file_path)
                # Add source filename as a column
                df['source_file'] = os.path.basename(file_path)
                # Append to list
                dfs.append(df)
                print(f"  Added {file_path} with {len(df)} rows")
            except Exception as e:
                print(f"  Error reading {file_path}: {e}")
        
        if dfs:
            # Concatenate all dataframes for this year
            combined_df = pd.concat(dfs, ignore_index=True)
            print(f"  Created combined dataset with {len(combined_df)} rows")
            
            # Define output paths
            output_dir = f"{local_data_path}/yearly_state_crash_data"
            local_output_path = f"{output_dir}/state_crash_data_{year}.csv"
            gcs_output_path = f"{gcs_bucket_path}/yearly_state_crash_data/state_crash_data_{year}.csv"
            
            # Create directory if it doesn't exist
            os.makedirs(output_dir, exist_ok=True)
            
            # Save locally using pandas
            combined_df.to_csv(local_output_path, index=False)
            print(f"  Saved to {local_output_path}")
            
            # Upload to GCS
            os.system(f"gsutil cp {local_output_path} {gcs_output_path}")
            print(f"  Uploaded to {gcs_output_path}")
        else:
            print(f"  No valid files found for year {year}")
    
    print("Processing complete!")

# Call the function with your paths

In [20]:
process_csv_files(local_data_path, gcs_bucket_path)

Found 150 CSV files
Processing 50 files for year 2025
  Added /data/state_crash_data/KY_2025_crash.csv with 240 rows
  Added /data/state_crash_data/VA_2025_crash.csv with 309 rows
  Added /data/state_crash_data/UT_2025_crash.csv with 49 rows
  Added /data/state_crash_data/NE_2025_crash.csv with 67 rows
  Added /data/state_crash_data/OH_2025_crash.csv with 460 rows
  Added /data/state_crash_data/WA_2025_crash.csv with 138 rows
  Added /data/state_crash_data/MI_2025_crash.csv with 193 rows
  Added /data/state_crash_data/MD_2025_crash.csv with 0 rows
  Added /data/state_crash_data/ME_2025_crash.csv with 55 rows
  Added /data/state_crash_data/NV_2025_crash.csv with 78 rows
  Added /data/state_crash_data/FL_2025_crash.csv with 506 rows
  Added /data/state_crash_data/OR_2025_crash.csv with 90 rows
  Added /data/state_crash_data/SD_2025_crash.csv with 2 rows
  Added /data/state_crash_data/CO_2025_crash.csv with 51 rows
  Added /data/state_crash_data/LA_2025_crash.csv with 160 rows
  Added /da