# Car Type Demo (Data pre-processing)

This notebook preprocesses the EPA vehicle dataset for the demo notebook `engine_car_type_demo.ipynb`.

data source: https://www.fueleconomy.gov/feg/ws/index.shtml#vehicle

In [1]:
import pandas as pd
import numpy as np
import random

# Globally silence pandas' SettingWithCopyWarning for this notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Load the raw vehicles file. low_memory=False turns off chunked parsing so each
# column's dtype is inferred from the whole file rather than piece by piece.
df = pd.read_csv('../data/vehicles.csv', low_memory=False)

In [3]:
# List every column present in the raw file before selecting a subset.
df.columns

Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'guzzler',
       'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2', 'rangeA',
       'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr',
       'createdOn', 'modifiedOn'],
      dtype='object')

In [4]:
# Raw columns carried forward into the processed dataset
# (order matters: the readable names are assigned positionally later on).
cols = [
    'city08', 'drive', 'fuelType', 'highway08',
    'hlv', 'hpv', 'lv2', 'lv4',
    'make', 'model',
    'pv2', 'pv4',
    'year', 'VClass',
]

In [5]:
# Readable names for the selected raw columns, listed in the same order as `cols`.
col_names = ['CityMPG', 'DriveType', 'FuelType', 'HighwayMPG',
       'HatchbackLuggageVolume', 'HatchbackPassengerVolume',
       'TwoDoorLuggageVolume', 'FourDoorLuggageVolume', 'Make', 'Model',
       'TwoDoorPassengerVolume', 'FourDoorPassengerVolume', 'Year',
       'VehicleClass']

# Take an explicit copy of the selection: later cells add columns to this frame,
# and a bare `df[cols]` is a slice of the raw frame, so those writes would be
# chained assignments (normally flagged by SettingWithCopyWarning).
processed_df = df[cols].copy()
# Positional rename; raises ValueError if the two lists differ in length.
processed_df.columns = col_names

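NOTE:

> (8) Interior volume dimensions are not required for two-seater passenger cars or any vehicle classified as truck which includes vans, pickups, special purpose vehicles, minivan and sport utility vehicles.

Vehicles without these measurements show up with volumes of 0 (see the `Two Seaters` and `Vans` rows in the preview below); rows with zero passenger or luggage volume are filtered out later in this notebook.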

In [6]:
# Preview the first rows of the renamed column subset.
processed_df.head()

Unnamed: 0,CityMPG,DriveType,FuelType,HighwayMPG,HatchbackLuggageVolume,HatchbackPassengerVolume,TwoDoorLuggageVolume,FourDoorLuggageVolume,Make,Model,TwoDoorPassengerVolume,FourDoorPassengerVolume,Year,VehicleClass
0,19,Rear-Wheel Drive,Regular,25,0,0,0,0,Alfa Romeo,Spider Veloce 2000,0,0,1985,Two Seaters
1,9,Rear-Wheel Drive,Regular,14,0,0,0,0,Ferrari,Testarossa,0,0,1985,Two Seaters
2,23,Front-Wheel Drive,Regular,33,19,77,0,0,Dodge,Charger,0,0,1985,Subcompact Cars
3,10,Rear-Wheel Drive,Regular,12,0,0,0,0,Dodge,B150/B250 Wagon 2WD,0,0,1985,Vans
4,17,4-Wheel or All-Wheel Drive,Premium,23,0,0,0,14,Subaru,Legacy AWD Turbo,0,90,1993,Compact Cars


In [7]:
# (rows, columns) of the renamed subset, before any rows are filtered out.
processed_df.shape

(43550, 14)

In [8]:
# Rebind `df` to the processed subset. This is an alias, not a copy, and the raw
# frame loaded from CSV is no longer reachable under the name `df` after this cell.
df = processed_df

In [9]:
# NOTE: rows containing missing values are kept here; no dropna() is applied.
target_column = 'VehicleClass'

# Per-body-style measurements that get collapsed into one column each.
passenger_cols = ['HatchbackPassengerVolume', 'TwoDoorPassengerVolume', 'FourDoorPassengerVolume']
luggage_cols = ['HatchbackLuggageVolume', 'TwoDoorLuggageVolume', 'FourDoorLuggageVolume']

# Row-wise maximum across the body-style columns, computed with the vectorized
# DataFrame.max instead of a Python-level apply over every row. `assign`
# returns a new frame, so the frame `df` was aliased to is left untouched.
df = df.assign(
    PassengerVolume=df[passenger_cols].max(axis=1),
    LuggageVolume=df[luggage_cols].max(axis=1),
)

# Keep only vehicles where both consolidated volumes are non-zero.
df = df[(df['LuggageVolume'] != 0) & (df['PassengerVolume'] != 0)]

# The per-body-style columns are redundant once consolidated.
df = df.drop(columns=passenger_cols + luggage_cols)

In [10]:
# Count missing values per column.
df.isna().sum()

CityMPG               0
DriveType          1121
FuelType              0
HighwayMPG            0
Make                  0
Model                 0
Year                  0
VehicleClass          0
PassengerVolume       0
LuggageVolume         0
dtype: int64

In [11]:
# (rows, columns) remaining after the zero-volume filter and column consolidation.
df.shape

(23606, 10)

In [12]:
# Move the label column to the last position. Uses `target_column` (defined
# earlier in this notebook) instead of repeating the column name as a literal,
# so the label is named in exactly one place.
cols = df.columns.drop(target_column).tolist() + [target_column]
df = df[cols]

In [13]:
# Show the final column order that will be written out.
cols

['CityMPG',
 'DriveType',
 'FuelType',
 'HighwayMPG',
 'Make',
 'Model',
 'Year',
 'PassengerVolume',
 'LuggageVolume',
 'VehicleClass']

In [14]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
df.to_csv('../data/vehicle_processed.csv', index=False)