In [54]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from IPython.display import HTML

In [55]:
# Load the raw car dataset; fields in the file are separated by semicolons.
# Forward slashes keep these relative paths valid on Windows, Linux and macOS,
# whereas backslash-separated paths only resolve on Windows.
df = pd.read_csv('../data/unclean/car_dataset.csv', delimiter=";")

# Report the number of missing values per column.
print("\nNumber of missing values in dataset:")
print(df.isnull().sum())

# Directory prefix for the cleaned output. The trailing slash is required
# because the export step joins it to the file name by string concatenation.
output_path = "../data/clean/"


Number of missing values in dataset:
buying          0
maintenance     0
doors           0
persons         0
luggage_boot    0
safety          0
class_values    0
dtype: int64


In [56]:
# Ordered levels for every ordinal feature. A level's position in its list is
# the number it is encoded to (first entry -> 0.0, second -> 1.0, ...).
ordinal_categories = {
    'buying': ['low', 'med', 'high', 'vhigh'],
    'safety': ['low', 'med', 'high'],
    'maintenance': ['low', 'med', 'high', 'vhigh'],
    'doors': ['2', '3', '4', '5more'],
    'luggage_boot': ['small', 'med', 'big'],
    'persons': ['2', '4', 'more'],
}

# NOTE(review): this default encoder is not used by the loop below, which
# builds one encoder per column instead.
ordinal_encoder = OrdinalEncoder()

# Encode on a copy so the raw frame `df` stays unchanged.
df_encoded = df.copy()

for feature_name, ordered_levels in ordinal_categories.items():
    # Passing the explicit level order stops the encoder from inferring its
    # own ordering from the data.
    feature_encoder = OrdinalEncoder(categories=[ordered_levels])
    encoded_values = feature_encoder.fit_transform(df_encoded[[feature_name]])
    df_encoded[feature_name] = encoded_values

print(df_encoded.head())

   buying  maintenance  doors  persons  luggage_boot  safety class_values
0     3.0          3.0    0.0      0.0           0.0     0.0        unacc
1     3.0          3.0    0.0      0.0           0.0     1.0        unacc
2     3.0          3.0    0.0      0.0           0.0     2.0        unacc
3     3.0          3.0    0.0      0.0           1.0     0.0        unacc
4     3.0          3.0    0.0      0.0           1.0     1.0        unacc


In [57]:
# Render the encoding scheme as one outer HTML table: the first row names each
# encoded feature, the second row holds that feature's value -> code lookup.
header_cells = "".join(f"<th>{feature}</th>" for feature in ordinal_categories)

mapping_cells = []
for levels in ordinal_categories.values():
    # Codes are the list positions, matching the order given to the encoder.
    lookup = pd.DataFrame({
        'Original Value': levels,
        'Encoded Value': range(len(levels)),
    })
    lookup_html = lookup.to_html(header=True, border=1, index=False)
    mapping_cells.append(f"<td>{lookup_html}</td>")

output = (
    "<table><tr>"
    + header_cells
    + "</tr><tr>"
    + "".join(mapping_cells)
    + "</tr></table>"
)

display(HTML(output))

Original Value,Encoded Value,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
Original Value,Encoded Value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Original Value,Encoded Value,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Original Value,Encoded Value,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Original Value,Encoded Value,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Original Value,Encoded Value,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
low,0,,,,
med,1,,,,
high,2,,,,
vhigh,3,,,,
low,0,,,,
med,1,,,,
high,2,,,,
low,0,,,,
med,1,,,,
high,2,,,,

Original Value,Encoded Value
low,0
med,1
high,2
vhigh,3

Original Value,Encoded Value
low,0
med,1
high,2

Original Value,Encoded Value
low,0
med,1
high,2
vhigh,3

Original Value,Encoded Value
2,0
3,1
4,2
5more,3

Original Value,Encoded Value
small,0
med,1
big,2

Original Value,Encoded Value
2,0
4,1
more,2


In [58]:
# Encode the target column as integer labels. LabelEncoder numbers the labels
# in sorted order (inspect `le.classes_` for the mapping), so the resulting
# codes are identifiers and do not express a ranking between classes.
le = LabelEncoder()

# Rename first: `rename` ignores a missing key by default, so this is a no-op
# when the column has already been renamed. The labels are then fitted from the
# raw frame `df`, which still carries the original 'class_values' column.
# Together this keeps the cell re-runnable; an in-place rename followed by a
# read of df_encoded['class_values'] raises a KeyError on a second execution.
df_encoded = df_encoded.rename(columns={'class_values': 'class_value'})
df_encoded['class_value'] = le.fit_transform(df['class_values'])

print(df_encoded)


      buying  maintenance  doors  persons  luggage_boot  safety  class_value
0        3.0          3.0    0.0      0.0           0.0     0.0            2
1        3.0          3.0    0.0      0.0           0.0     1.0            2
2        3.0          3.0    0.0      0.0           0.0     2.0            2
3        3.0          3.0    0.0      0.0           1.0     0.0            2
4        3.0          3.0    0.0      0.0           1.0     1.0            2
...      ...          ...    ...      ...           ...     ...          ...
1723     0.0          0.0    3.0      2.0           1.0     1.0            1
1724     0.0          0.0    3.0      2.0           1.0     2.0            3
1725     0.0          0.0    3.0      2.0           2.0     0.0            2
1726     0.0          0.0    3.0      2.0           2.0     1.0            1
1727     0.0          0.0    3.0      2.0           2.0     2.0            3

[1728 rows x 7 columns]


In [59]:
# Write the encoded frame under `output_path`, omitting the row index.
file_name = "ordinal_encoded.csv"
destination = output_path + file_name
df_encoded.to_csv(destination, index=False)