# One-Hot Encoding

In [1]:
# Imports

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the pre-encoding dataset into the working DataFrame `df`.
df = pd.read_csv('../data/pre_encoded_price_categorized.csv')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,body_type,city,city_fuel_economy,daysonmarket,engine_displacement,engine_type,frame_damaged,fuel_type,has_accidents,highway_fuel_economy,...,model_name,owner_count,price,theft_title,transmission_display,trim_name,wheel_system,year,torque,price_category
0,SUV / Crossover,Venice,14.0,68,4000.0,I6,False,Gasoline,True,18.0,...,Wrangler,3.0,16499.0,False,5-Speed Manual,Rubicon,4WD,2004,235,0
1,SUV / Crossover,Delaware,15.0,159,3700.0,V6,False,Gasoline,True,21.0,...,Nitro,7.0,2465.0,False,Automatic,SXT 4WD,4WD,2007,235,0
2,Sedan,Montclair,25.0,247,2400.0,I4,False,Gasoline,False,36.0,...,Sonata,1.0,12499.0,False,Automatic,SE FWD,FWD,2017,178,0
3,Sedan,Pawling,27.0,80,1500.0,I4,False,Gasoline,False,36.0,...,Malibu,1.0,16900.0,False,6-Speed Automatic,LT FWD,FWD,2017,184,0
4,Sedan,Mars,20.0,56,3000.0,I6,False,Gasoline,False,28.0,...,7 Series,1.0,59550.0,False,Automatic,740i xDrive AWD,AWD,2020,331,1


In [4]:
# Columns that get replaced by one-hot indicator columns.
# The order here determines the order of the generated feature groups.
columns_to_encode = [
    'body_type',
    'city',
    'engine_type',
    'frame_damaged',
    'fuel_type',
    'has_accidents',
    'isCab',
    'is_new',
    'make_name',
    'maximum_seating',
    'model_name',
    'theft_title',
    'transmission_display',
    'wheel_system',
    'trim_name',
]

In [5]:

# One-hot encode every column listed in `columns_to_encode` and replace the
# originals with the generated indicator columns.
# NOTE: this cell rebinds `df` and removes the source columns, so re-running it
# without first reloading `df` fails on the column selection below.
encoder = OneHotEncoder()

# Fit and transform the selected columns (the result is a sparse matrix)
encoded_data = encoder.fit_transform(df[columns_to_encode])

# Names of the generated indicator columns, derived from the input column names
feature_names = encoder.get_feature_names_out(input_features=columns_to_encode)

# Convert the sparse matrix to a dense DataFrame that shares df's index, so the
# column-wise concat below pairs rows by label even when df's index is not 0..n-1
encoded_df = pd.DataFrame(encoded_data.toarray(), index=df.index, columns=feature_names)

# Drop the original columns that were encoded, then append the encoded columns.
# Dropping first keeps the wide intermediate frame from carrying the source columns.
df = pd.concat([df.drop(columns=columns_to_encode), encoded_df], axis=1)

# Display the first rows of the resulting DataFrame
df.head()

Unnamed: 0,city_fuel_economy,daysonmarket,engine_displacement,highway_fuel_economy,horsepower,mileage,owner_count,price,year,torque,...,trim_name_xDrive28d AWD,trim_name_xDrive28i AWD,trim_name_xDrive30i AWD,trim_name_xDrive35d AWD,trim_name_xDrive35i AWD,trim_name_xDrive35i Premium AWD,trim_name_xDrive35i Sport Activity AWD,trim_name_xDrive40i AWD,trim_name_xDrive48i AWD,trim_name_xDrive50i AWD
0,14.0,68,4000.0,18.0,190.0,99246.0,3.0,16499.0,2004,235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.0,159,3700.0,21.0,210.0,238098.0,7.0,2465.0,2007,235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,25.0,247,2400.0,36.0,185.0,30597.0,1.0,12499.0,2017,178,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,27.0,80,1500.0,36.0,160.0,26454.0,1.0,16900.0,2017,184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20.0,56,3000.0,28.0,335.0,12197.0,1.0,59550.0,2020,331,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Inspect the column labels of the encoded frame.
df.columns

Index(['city_fuel_economy', 'daysonmarket', 'engine_displacement',
       'highway_fuel_economy', 'horsepower', 'mileage', 'owner_count', 'price',
       'year', 'torque',
       ...
       'trim_name_xDrive28d AWD', 'trim_name_xDrive28i AWD',
       'trim_name_xDrive30i AWD', 'trim_name_xDrive35d AWD',
       'trim_name_xDrive35i AWD', 'trim_name_xDrive35i Premium AWD',
       'trim_name_xDrive35i Sport Activity AWD', 'trim_name_xDrive40i AWD',
       'trim_name_xDrive48i AWD', 'trim_name_xDrive50i AWD'],
      dtype='object', length=7775)

In [9]:

# Column count of the encoded frame, read from its (rows, columns) shape.
num_columns = df.shape[1]
print(num_columns)

7775


In [10]:
# The export below is commented out so that the large CSV file is not regenerated on every run
# df.to_csv('../data/one_hot_encoded.csv', index=False)