In [17]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
df_composite_data = pd.read_csv('/content/drive/MyDrive/Fleet Insight Dashboard/Initial Data/composite-data-for-fleet-dna-csv-1.csv')
df_composite_data

Unnamed: 0.1,Unnamed: 0,vid,did,pid,class_id,voc_id,type_id,drive_id,fuel_id,day_id,...,spd_cat_7_distance,spd_cat_7_mean_speed,spd_cat_7_std_speed,spd_cat_7_ttl,spd_cat_7_zero_speed,spd_cat_8_distance,spd_cat_8_mean_speed,spd_cat_8_std_speed,spd_cat_8_ttl,spd_cat_8_zero_speed
0,0,236,25,17,7,10,5,0,1,260,...,0.0,,,0,0,0.0,,,0,0
1,1,236,25,17,7,10,5,0,1,262,...,0.0,,,0,0,0.0,,,0,0
2,2,238,25,17,7,10,5,0,1,77,...,0.0,,,0,0,0.0,,,0,0
3,3,236,25,17,7,10,5,0,1,263,...,0.0,,,0,0,0.0,,,0,0
4,4,236,25,17,7,10,5,0,1,264,...,0.0,,,0,0,0.0,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4700,4700,236,25,17,7,10,5,0,1,251,...,0.0,,,0,0,0.0,,,0,0
4701,4701,236,25,17,7,10,5,0,1,252,...,0.0,,,0,0,0.0,,,0,0
4702,4702,236,25,17,7,10,5,0,1,253,...,0.0,,,0,0,0.0,,,0,0
4703,4703,236,25,17,7,10,5,0,1,258,...,0.0,,,0,0,0.0,,,0,0


In [20]:
def cleaning_beta(df_composite_data):
    # Step 1: Extract relevant ID columns
    new = df_composite_data[['vid', 'pid', 'class_id', 'voc_id', 'type_id', 'drive_id', 'fuel_id', 'day_id']].copy()

    # Step 2: Mapping dictionaries
    vocation_map = {
        1: 'Telecom', 2: 'Beverage Delivery', 3: 'Warehouse Delivery', 4: 'Parcel Delivery',
        5: 'School Bus', 6: 'Linen Delivery', 7: 'Refuse Pickup', 8: 'Long Haul', 10: 'Mass Transit',
        11: 'Towing', 12: 'Grocery Delivery', 13: 'Port Drayage', 14: 'Food Delivery', 15: 'Snow Plow',
        16: 'Utility', 18: 'Local Delivery'
    }

    vehicle_type_map = {
        1: 'Beverage', 2: 'Bucket Truck', 3: 'Cement Mixer', 4: 'City Delivery', 5: 'City Transit Bus',
        6: 'Conventional Van', 7: 'Crew Size Pickup', 8: 'Dump', 9: 'Fire Truck', 10: 'Fuel',
        11: 'Full Size Pickup', 12: 'Furniture', 13: 'Heavy Semi Tractor', 14: 'High Profile Semi',
        15: 'Home Fuel', 16: 'Landscape Utility', 17: 'Medium Semi Tractor', 18: 'Mini Bus',
        20: 'Mini Pickup', 21: 'Minivan', 23: 'Rack', 24: 'Refrigerated Van', 25: 'Refuse Truck',
        26: 'School Bus', 27: 'Semi Sleeper', 28: 'Service Van', 29: 'Single Axle Van',
        30: 'Stake Body', 31: 'Step Van', 32: 'Straight Truck', 33: 'SUV', 34: 'Tour Bus', 35: 'Tow',
        36: 'Tractor', 37: 'Type C', 38: 'Utility Van', 39: 'Walk In'
    }

    drivetrain_map = {
        0: 'Conventional', 1: 'Parallel Hybrid', 2: 'Hydraulic Hybrid', 3: 'Series Hybrid',
        4: 'Hybrid', 5: 'Electric', 6: 'Hybrid Electric'
    }

    fuel_map = {
        0: 'Gasoline', 1: 'Diesel', 2: 'Electricity', 3: 'Compressed Natural Gas'
    }

    # Step 3: Apply mapping
    new['vocation'] = new['voc_id'].map(vocation_map)
    new['vehicle_type'] = new['type_id'].map(vehicle_type_map)
    new['drivetrain_type'] = new['drive_id'].map(drivetrain_map)
    new['fuel_type'] = new['fuel_id'].map(fuel_map)
    new.rename(columns={'class_id': 'vehicle_class'}, inplace=True)
    new.drop(columns=['voc_id', 'type_id', 'drive_id', 'fuel_id'], inplace=True)

    # Step 4: Filter out unrealistic vehicle types
    before_filtering = new.shape[0]
    new = new[~new['vehicle_type'].isin(['City Transit Bus', 'Refuse Truck', 'School Bus'])]
    after_filtering = new.shape[0]
    print(f"Rows removed due to vehicle_type filtering: {before_filtering - after_filtering}")

    # Step 5: Add remaining columns efficiently
    columns_to_add = df_composite_data.drop(columns=['vid', 'pid', 'class_id', 'voc_id', 'type_id', 'drive_id', 'fuel_id', 'day_id'])
    new = pd.concat([new, columns_to_add], axis=1)

    # Step 6: Fill missing values
    new.fillna(0, inplace=True)

    return new.reset_index(drop=True)


def seperate_vehicle_data_beta(df_cleaned):
    vehicle_df = df_cleaned[['vid', 'vehicle_class', 'vocation', 'vehicle_type', 'fuel_type', 'drivetrain_type']].copy()
    vehicle_df = vehicle_df.drop_duplicates().sort_values(by='vid').reset_index(drop=True)
    return vehicle_df


def filter_valid_vehicle_rows(df_cleaned):
    # Keep only rows with valid (non-zero and non-null) values in critical columns
    return df_cleaned[
        df_cleaned['vocation'].notna() & (df_cleaned['vocation'] != 0) &
        df_cleaned['vehicle_type'].notna() & (df_cleaned['vehicle_type'] != 0) &
        df_cleaned['fuel_type'].notna() & (df_cleaned['fuel_type'] != 0) &
        df_cleaned['drivetrain_type'].notna() & (df_cleaned['drivetrain_type'] != 0)
    ].copy()


In [21]:
df_cleaned = cleaning_beta(df_composite_data)  # Clean and map values
df_filtered = filter_valid_vehicle_rows(df_cleaned)  # Filter valid vehicle rows
vehicle_df = seperate_vehicle_data_beta(df_filtered)  # Get distinct vehicle info

# Preview results
df_filtered

Rows removed due to vehicle_type filtering: 1716


Unnamed: 0.1,vid,pid,vehicle_class,day_id,vocation,vehicle_type,drivetrain_type,fuel_type,Unnamed: 0,did,...,spd_cat_7_distance,spd_cat_7_mean_speed,spd_cat_7_std_speed,spd_cat_7_ttl,spd_cat_7_zero_speed,spd_cat_8_distance,spd_cat_8_mean_speed,spd_cat_8_std_speed,spd_cat_8_ttl,spd_cat_8_zero_speed
0,37.0,3.0,7.0,14.0,Parcel Delivery,Straight Truck,Conventional,Diesel,76,40,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
1,10.0,3.0,4.0,1.0,Parcel Delivery,Walk In,Parallel Hybrid,Diesel,147,1,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
2,10.0,3.0,4.0,2.0,Parcel Delivery,Walk In,Parallel Hybrid,Diesel,148,1,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
3,10.0,3.0,4.0,3.0,Parcel Delivery,Walk In,Parallel Hybrid,Diesel,149,1,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
4,10.0,3.0,4.0,6.0,Parcel Delivery,Walk In,Parallel Hybrid,Diesel,150,1,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2984,239.0,17.0,7.0,129.0,Local Delivery,Tractor,Conventional,Diesel,4510,26,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
2985,239.0,17.0,7.0,130.0,Local Delivery,Tractor,Conventional,Diesel,4511,26,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
2986,239.0,17.0,7.0,131.0,Local Delivery,Tractor,Conventional,Diesel,4512,26,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0
2987,239.0,17.0,7.0,132.0,Local Delivery,Tractor,Conventional,Diesel,4513,26,...,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0


In [22]:
vehicle_df

Unnamed: 0,vid,vehicle_class,vocation,vehicle_type,fuel_type,drivetrain_type
0,1.0,7.0,Beverage Delivery,Tractor,Diesel,Conventional
1,2.0,7.0,Beverage Delivery,Tractor,Diesel,Conventional
2,3.0,8.0,Beverage Delivery,Tractor,Diesel,Parallel Hybrid
3,4.0,8.0,Beverage Delivery,Tractor,Diesel,Parallel Hybrid
4,5.0,8.0,Beverage Delivery,Tractor,Diesel,Conventional
...,...,...,...,...,...,...
219,575.0,3.0,Food Delivery,Straight Truck,Diesel,Conventional
220,576.0,4.0,Parcel Delivery,Walk In,Diesel,Parallel Hybrid
221,577.0,4.0,Parcel Delivery,Walk In,Diesel,Parallel Hybrid
222,578.0,4.0,Parcel Delivery,Walk In,Diesel,Conventional


In [23]:
df_filtered.to_csv('/content/drive/MyDrive/Fleet Insight Dashboard/Cleaned Data/cleaned_data.csv', index=False)
vehicle_df.to_csv('/content/drive/MyDrive/Fleet Insight Dashboard/Cleaned Data/vehicle_info.csv', index=False)