# Import

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()

import sklearn
from sklearn.preprocessing import OneHotEncoder
import scipy

# Loading Data

In [2]:
# Location of the raw data set, given relative to the working directory.
filename = 'data/CarPrice_Assignment.csv'
# Parse the CSV into a DataFrame using pandas' default reader settings.
car_price = pd.read_csv(filepath_or_buffer=filename)

In [3]:
# Dimensions of the raw table as (rows, columns).
car_price.shape

(205, 26)

In [8]:
# Column labels available in the raw table.
car_price.columns

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')

In [9]:
# Columns kept for modelling: the target ('price') first, then the predictors.
# NOTE: 'brand_category' stays disabled; it is not among the columns of the loaded CSV.
selected_columns = [
    'price',
    'fueltype',
    'aspiration',
    'carbody',
    'drivewheel',
    'wheelbase',
    'curbweight',
    'enginetype',
    'cylindernumber',
    'enginesize',
    'boreratio',
    'horsepower',
    'carlength',
    'carwidth',
    'citympg',
    'highwaympg',
]

In [10]:
# Keep all rows, but only the columns chosen for modelling (in the listed order).
car_price_selected = car_price.loc[:, selected_columns]

In [12]:
# Sanity check: the column count of the selection should equal the number of selected names.
len(selected_columns), car_price_selected.shape

(16, (205, 16))

In [13]:
# Target vector: the 'price' column.
y = car_price_selected['price']
# Feature matrix: every selected column except the target.
X = car_price_selected.drop(columns='price')

In [14]:
# Feature matrix dimensions: same rows as the selection, one column fewer ('price' removed).
X.shape

(205, 15)

# One-Hot Encoding

## Categorical Columns

In [19]:
# Shape of the feature matrix before any encoding is applied.
X.shape

(205, 15)

In [18]:
# (column, dtype) pairs; object dtype ('O') flags the non-numeric columns.
list(X.dtypes.items())

[('fueltype', dtype('O')),
 ('aspiration', dtype('O')),
 ('carbody', dtype('O')),
 ('drivewheel', dtype('O')),
 ('wheelbase', dtype('float64')),
 ('curbweight', dtype('int64')),
 ('enginetype', dtype('O')),
 ('cylindernumber', dtype('O')),
 ('enginesize', dtype('int64')),
 ('boreratio', dtype('float64')),
 ('horsepower', dtype('int64')),
 ('carlength', dtype('float64')),
 ('carwidth', dtype('float64')),
 ('citympg', dtype('int64')),
 ('highwaympg', dtype('int64'))]

In [20]:
# Labels of the object-dtype columns, i.e. the ones that need one-hot encoding.
categorical_columns = X.select_dtypes(include='object').columns

In [21]:
# Inspect which columns were picked up as categorical.
categorical_columns

Index(['fueltype', 'aspiration', 'carbody', 'drivewheel', 'enginetype',
       'cylindernumber'],
      dtype='object')

In [22]:
# Select every numeric dtype rather than only float64/int64, so that columns
# stored as e.g. int32 or float32 are not silently dropped from the feature set.
numerical_columns = X.select_dtypes(include='number').columns

In [23]:
# Inspect which columns were picked up as numerical.
numerical_columns

Index(['wheelbase', 'curbweight', 'enginesize', 'boreratio', 'horsepower',
       'carlength', 'carwidth', 'citympg', 'highwaympg'],
      dtype='object')

In [24]:
# Sanity check: the two group sizes should add up to the number of columns in X.
X.shape, categorical_columns.shape, numerical_columns.shape

((205, 15), (6,), (9,))

## `OneHotEncoder`

In [25]:
# Initialize the encoder.
# OneHotEncoder is imported by name at the top of the notebook: a bare
# `import sklearn` is not guaranteed to expose `sklearn.preprocessing` as an
# attribute on every scikit-learn release.
# drop='first' omits the first category of each feature, so a column with
# k levels yields k - 1 indicator columns.
encoder = OneHotEncoder(drop='first')

In [26]:
# Learn the category levels and encode the categorical columns in a single call.
X_encoded = encoder.fit_transform(X.loc[:, categorical_columns])

In [27]:
# Check the container type returned by fit_transform (a SciPy sparse matrix in the recorded run).
type(X_encoded)

scipy.sparse._csr.csr_matrix

In [35]:
# Number of distinct levels in each categorical column, as a plain dictionary.
value_count_dict = X[categorical_columns].nunique().to_dict()
# Expected encoded width: (levels - 1) per column, since one level per column is dropped.
sum(n_levels - 1 for n_levels in value_count_dict.values()), value_count_dict

(20,
 {'fueltype': 2,
  'aspiration': 2,
  'carbody': 5,
  'drivewheel': 3,
  'enginetype': 7,
  'cylindernumber': 7})

In [32]:
# Names the fitted encoder assigns to its output columns, derived from the source column labels.
encoded_columns = encoder.get_feature_names_out(input_features=categorical_columns)

In [34]:
# One generated name per indicator column, formatted as <column>_<category>.
encoded_columns.shape, encoded_columns

((20,),
 array(['fueltype_gas', 'aspiration_turbo', 'carbody_hardtop',
        'carbody_hatchback', 'carbody_sedan', 'carbody_wagon',
        'drivewheel_fwd', 'drivewheel_rwd', 'enginetype_dohcv',
        'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf',
        'enginetype_ohcv', 'enginetype_rotor', 'cylindernumber_five',
        'cylindernumber_four', 'cylindernumber_six',
        'cylindernumber_three', 'cylindernumber_twelve',
        'cylindernumber_two'], dtype=object))

In [42]:
# Densify the sparse encoder output and label it with the generated feature names.
# Reuse X's index so the later column-wise concat with X[numerical_columns] aligns
# row-for-row even when X does not carry a default 0..n-1 index (e.g. after
# filtering or shuffling); a fresh RangeIndex would misalign and introduce NaNs.
X_encoded_df_categorical = pd.DataFrame(
    X_encoded.toarray(),
    index=X.index,
    columns=encoded_columns,
)

In [43]:
# Expect one row per row of X and one column per encoded feature name.
X_encoded_df_categorical.shape

(205, 20)

In [44]:
# Build the model-ready table: numerical features on the left,
# one-hot indicator columns on the right, joined side by side.
X_encoded_df = pd.concat(
    [X.loc[:, numerical_columns], X_encoded_df_categorical],
    axis='columns',
)

In [46]:
# Sanity check: the combined frame's column count should equal numerical + encoded column counts.
len(numerical_columns) + len(encoded_columns), X_encoded_df.shape

(29, (205, 29))

In [47]:
# Shape of the combined feature table.
X_encoded_df.shape

(205, 29)

In [48]:
# Preview the first five rows of the combined feature table.
X_encoded_df.head(5)

Unnamed: 0,wheelbase,curbweight,enginesize,boreratio,horsepower,carlength,carwidth,citympg,highwaympg,fueltype_gas,...,enginetype_ohc,enginetype_ohcf,enginetype_ohcv,enginetype_rotor,cylindernumber_five,cylindernumber_four,cylindernumber_six,cylindernumber_three,cylindernumber_twelve,cylindernumber_two
0,88.6,2548,130,3.47,111,168.8,64.1,21,27,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,88.6,2548,130,3.47,111,168.8,64.1,21,27,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,94.5,2823,152,2.68,154,171.2,65.5,19,26,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,99.8,2337,109,3.19,102,176.6,66.2,24,30,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,99.4,2824,136,3.19,115,176.6,66.4,18,22,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
