# Assignment 4
## Jacob Wallace

In [1]:
## Imports
import numpy as np
import pandas as pd
from sklearn import preprocessing as pp

# Load the used-car listings from the CSV file into a DataFrame
usedCar_df = pd.read_csv(filepath_or_buffer='data/cars.csv')

In [2]:
## Check the dimensions of the loaded data as a (rows, columns) tuple
usedCar_df.shape

(38531, 30)

In [3]:
## Preview the first rows of the raw data
usedCar_df.head()

Unnamed: 0,manufacturer_name,model_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_has_gas,engine_type,engine_capacity,...,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,duration_listed
0,Subaru,Outback,automatic,silver,190000,2010,gasoline,False,gasoline,2.5,...,True,True,True,False,True,False,True,True,True,16
1,Subaru,Outback,automatic,blue,290000,2002,gasoline,False,gasoline,3.0,...,True,False,False,True,True,False,False,False,True,83
2,Subaru,Forester,automatic,red,402000,2001,gasoline,False,gasoline,2.5,...,True,False,False,False,False,False,False,True,True,151
3,Subaru,Impreza,mechanical,blue,10000,1999,gasoline,False,gasoline,3.0,...,False,False,False,False,False,False,False,False,False,86
4,Subaru,Legacy,automatic,black,280000,2001,gasoline,False,gasoline,2.5,...,True,False,True,True,False,False,False,False,True,7


In [4]:
## Split the numerical columns off from the frame that will be one-hot encoded.
## select_dtypes picks the numeric columns directly instead of computing a full
## describe() summary only to read its column labels. It also yields an empty
## selection when no numeric columns are left, whereas describe() falls back to
## describing every remaining column in that case -- so re-running this cell used
## to drop all of the categorical columns as well.
## NOTE: this cell still consumes usedCar_df; re-run from the read_csv cell to
## regenerate nonEncodedColumns.
nonEncodedColumns = usedCar_df.select_dtypes(include='number')
## Rebind rather than drop(inplace=True) so the frame object is not mutated in place
usedCar_df = usedCar_df.drop(columns=nonEncodedColumns.columns.to_list())
nonEncodedColumns.head()

Unnamed: 0,odometer_value,year_produced,engine_capacity,price_usd,number_of_photos,up_counter,duration_listed
0,190000,2010,2.5,10900.0,9,13,16
1,290000,2002,3.0,5000.0,12,54,83
2,402000,2001,2.5,2800.0,4,72,151
3,10000,1999,3.0,9999.0,9,42,86
4,280000,2001,2.5,2134.11,14,7,7


In [5]:
## Remove the already encoded columns described as the feature columns
## (feature_0, feature_1, ...), which hold True/False flags and need no one-hot encoding.
## The anchored regex matches only names of the form feature_<digits>; the previous
## like='feature' was a substring match and would also have pulled out any other
## column whose name merely contains "feature".
featureColumns = usedCar_df.filter(regex=r'^feature_\d+$')
## Rebind rather than drop(inplace=True) so the frame object is not mutated in place
usedCar_df = usedCar_df.drop(columns=featureColumns.columns.to_list())
featureColumns.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,False,True,True,True,False,True,False,True,True,True
1,False,True,False,False,True,True,False,False,False,True
2,False,True,False,False,False,False,False,False,True,True
3,True,False,False,False,False,False,False,False,False,False
4,False,True,False,True,True,False,False,False,False,True


In [6]:
## Initialize a one-hot encoder from sklearn
encoder = pp.OneHotEncoder(handle_unknown='ignore')

## Perform one-hot encoding on the remaining (categorical) columns and build the
## encoded dataframe in one step:
##  - columns: the headers generated by the encoder (arguments are evaluated left to
##    right, so the encoder is already fitted when get_feature_names_out() is called)
##  - index: reuse the source frame's row labels so the later column-wise concat
##    lines rows up by label instead of relying on a fresh 0..n-1 index
usedCar_encoded_df = pd.DataFrame(
    encoder.fit_transform(usedCar_df).toarray(),
    index=usedCar_df.index,
    columns=encoder.get_feature_names_out(),
)

In [7]:
## Preview the one-hot encoded categorical columns
usedCar_encoded_df.head()

Unnamed: 0,manufacturer_name_Acura,manufacturer_name_Alfa Romeo,manufacturer_name_Audi,manufacturer_name_BMW,manufacturer_name_Buick,manufacturer_name_Cadillac,manufacturer_name_Chery,manufacturer_name_Chevrolet,manufacturer_name_Chrysler,manufacturer_name_Citroen,...,drivetrain_front,drivetrain_rear,is_exchangeable_False,is_exchangeable_True,location_region_Брестская обл.,location_region_Витебская обл.,location_region_Гомельская обл.,location_region_Гродненская обл.,location_region_Минская обл.,location_region_Могилевская обл.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
## Stitch the separate dataframes back together column-wise (one-hot encoded
## categoricals, feature flags, numerical columns) to recombine the data for later processing
final_df = pd.concat(
    objs=[usedCar_encoded_df, featureColumns, nonEncodedColumns],
    axis='columns',
)
final_df.head()

Unnamed: 0,manufacturer_name_Acura,manufacturer_name_Alfa Romeo,manufacturer_name_Audi,manufacturer_name_BMW,manufacturer_name_Buick,manufacturer_name_Cadillac,manufacturer_name_Chery,manufacturer_name_Chevrolet,manufacturer_name_Chrysler,manufacturer_name_Citroen,...,feature_7,feature_8,feature_9,odometer_value,year_produced,engine_capacity,price_usd,number_of_photos,up_counter,duration_listed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,True,190000,2010,2.5,10900.0,9,13,16
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,True,290000,2002,3.0,5000.0,12,54,83
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,True,402000,2001,2.5,2800.0,4,72,151
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,10000,1999,3.0,9999.0,9,42,86
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,True,280000,2001,2.5,2134.11,14,7,7


In [9]:
## Display the combined dataframe (the notebook renders a truncated view of its first and last rows)
final_df

Unnamed: 0,manufacturer_name_Acura,manufacturer_name_Alfa Romeo,manufacturer_name_Audi,manufacturer_name_BMW,manufacturer_name_Buick,manufacturer_name_Cadillac,manufacturer_name_Chery,manufacturer_name_Chevrolet,manufacturer_name_Chrysler,manufacturer_name_Citroen,...,feature_7,feature_8,feature_9,odometer_value,year_produced,engine_capacity,price_usd,number_of_photos,up_counter,duration_listed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,True,True,190000,2010,2.5,10900.00,9,13,16
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,True,290000,2002,3.0,5000.00,12,54,83
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,True,True,402000,2001,2.5,2800.00,4,72,151
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,10000,1999,3.0,9999.00,9,42,86
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,True,280000,2001,2.5,2134.11,14,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,True,True,290000,2000,3.5,2750.00,5,85,301
38527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,True,True,321000,2004,2.2,4800.00,4,20,317
38528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,True,True,777957,2000,3.5,4300.00,3,63,369
38529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,False,True,20000,2001,2.0,4000.00,7,156,490
