**ANN Project**

In [8]:
#import tensorflow as tf
#from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Prepping the dataset for training the model

#### Cleaning milage, price, and dropping some columns

In [9]:
# Load the dataset
used_cars = pd.read_csv('used_cars.csv')

used_cars = used_cars.drop(columns=['int_col', 'clean_title'])

print("Before dropping rows with NaN values:")
print(used_cars.shape[0])

used_cars = used_cars.dropna()

print("After dropping rows with NaN values:")
print(used_cars.shape[0])

used_cars['price'] = used_cars['price'].str.replace('$', '').str.replace(',', '').astype(int)
used_cars['milage'] = used_cars['milage'].str.replace('mi.', '').str.replace(',', '').astype(int)

used_cars.head(10)

Before dropping rows with NaN values:
4009
After dropping rows with NaN values:
3730


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,accident,price
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,At least 1 accident or damage reported,10300
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,At least 1 accident or damage reported,38005
2,Lexus,RX 350 RX 350,2022,22372,Gasoline,3.5 Liter DOHC,Automatic,Blue,None reported,54598
3,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,None reported,15500
4,Audi,Q3 45 S line Premium Plus,2021,9835,Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,None reported,34999
5,Acura,ILX 2.4L,2016,136397,Gasoline,2.4 Liter,F,Silver,None reported,14798
6,Audi,S3 2.0T Premium Plus,2017,84000,Gasoline,292.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,Blue,None reported,31000
7,BMW,740 iL,2001,242000,Gasoline,282.0HP 4.4L 8 Cylinder Engine Gasoline Fuel,A/T,Green,None reported,7300
8,Lexus,RC 350 F Sport,2021,23436,Gasoline,311.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,6-Speed A/T,Black,None reported,41927
10,Land,Rover Range Rover Sport 3.0 Supercharged HST,2021,27608,Gasoline,V6,Automatic,Fuji White,None reported,73897


#### Cleaning the Accident Column

In [10]:
print(used_cars['accident'].unique())

def clean_accident(text):
    if pd.isna(text):
        return None
    elif "None reported" in text:
        return 0
    else:
        return 1
    
used_cars["accident_reported"] = used_cars["accident"].apply(clean_accident)
used_cars = used_cars.dropna(subset=["accident_reported"])

print(used_cars['accident_reported'].value_counts(dropna=False))

used_cars.head(10)

['At least 1 accident or damage reported' 'None reported']
accident_reported
0    2752
1     978
Name: count, dtype: int64


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,accident,price,accident_reported
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,At least 1 accident or damage reported,10300,1
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,At least 1 accident or damage reported,38005,1
2,Lexus,RX 350 RX 350,2022,22372,Gasoline,3.5 Liter DOHC,Automatic,Blue,None reported,54598,0
3,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,None reported,15500,0
4,Audi,Q3 45 S line Premium Plus,2021,9835,Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,None reported,34999,0
5,Acura,ILX 2.4L,2016,136397,Gasoline,2.4 Liter,F,Silver,None reported,14798,0
6,Audi,S3 2.0T Premium Plus,2017,84000,Gasoline,292.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,Blue,None reported,31000,0
7,BMW,740 iL,2001,242000,Gasoline,282.0HP 4.4L 8 Cylinder Engine Gasoline Fuel,A/T,Green,None reported,7300,0
8,Lexus,RC 350 F Sport,2021,23436,Gasoline,311.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,6-Speed A/T,Black,None reported,41927,0
10,Land,Rover Range Rover Sport 3.0 Supercharged HST,2021,27608,Gasoline,V6,Automatic,Fuji White,None reported,73897,0


In [11]:
def clean_transmission(text):
    if pd.isna(text):
        return None
    elif 'automatic' in text.lower() or 'a/t' in text.lower():
        return 0
    elif 'manual' in text.lower() or 'm/t' in text.lower():
        return 1
    else:
        return 2

# Apply the function to clean the transmission column
used_cars["transmission"] = used_cars["transmission"].apply(clean_transmission)

# Drop rows with NaN values in the 'transmission' column
used_cars = used_cars.dropna(subset=["transmission"])


used_cars.head(10)

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,accident,price,accident_reported
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,0,Black,At least 1 accident or damage reported,10300,1
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC,0,Moonlight Cloud,At least 1 accident or damage reported,38005,1
2,Lexus,RX 350 RX 350,2022,22372,Gasoline,3.5 Liter DOHC,0,Blue,None reported,54598,0
3,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,0,Black,None reported,15500,0
4,Audi,Q3 45 S line Premium Plus,2021,9835,Gasoline,2.0L I4 16V GDI DOHC Turbo,0,Glacier White Metallic,None reported,34999,0
5,Acura,ILX 2.4L,2016,136397,Gasoline,2.4 Liter,2,Silver,None reported,14798,0
6,Audi,S3 2.0T Premium Plus,2017,84000,Gasoline,292.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,0,Blue,None reported,31000,0
7,BMW,740 iL,2001,242000,Gasoline,282.0HP 4.4L 8 Cylinder Engine Gasoline Fuel,0,Green,None reported,7300,0
8,Lexus,RC 350 F Sport,2021,23436,Gasoline,311.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,0,Black,None reported,41927,0
10,Land,Rover Range Rover Sport 3.0 Supercharged HST,2021,27608,Gasoline,V6,0,Fuji White,None reported,73897,0


#### Cleaning Brand

In [12]:
# Assign unique numbers to each brand
used_cars['brand_category'] = pd.Categorical(used_cars['brand']).codes + 1


columns_to_drop = [col for col in used_cars.columns if col.startswith('brand_') and col != 'brand_category']
used_cars = used_cars.drop(columns=columns_to_drop)

# Create a mapping table for brand names and their corresponding category numbers
brand_mapping = used_cars[['brand', 'brand_category']].drop_duplicates().sort_values(by='brand_category')

# Display the mapping table
print(brand_mapping)

used_cars.head(10)

              brand  brand_category
5             Acura               1
151            Alfa               2
11            Aston               3
4              Audi               4
7               BMW               5
40          Bentley               6
229         Bugatti               7
492           Buick               8
67         Cadillac               9
21        Chevrolet              10
55         Chrysler              11
17            Dodge              12
704            FIAT              13
152         Ferrari              14
0              Ford              15
101             GMC              16
82          Genesis              17
41            Honda              18
52           Hummer              19
1           Hyundai              20
3          INFINITI              21
14           Jaguar              22
35             Jeep              23
33              Kia              24
76      Lamborghini              25
10             Land              26
2             Lexus         

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,accident,price,accident_reported,brand_category
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,0,Black,At least 1 accident or damage reported,10300,1,15
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC,0,Moonlight Cloud,At least 1 accident or damage reported,38005,1,20
2,Lexus,RX 350 RX 350,2022,22372,Gasoline,3.5 Liter DOHC,0,Blue,None reported,54598,0,27
3,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,0,Black,None reported,15500,0,21
4,Audi,Q3 45 S line Premium Plus,2021,9835,Gasoline,2.0L I4 16V GDI DOHC Turbo,0,Glacier White Metallic,None reported,34999,0,4
5,Acura,ILX 2.4L,2016,136397,Gasoline,2.4 Liter,2,Silver,None reported,14798,0,1
6,Audi,S3 2.0T Premium Plus,2017,84000,Gasoline,292.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,0,Blue,None reported,31000,0,4
7,BMW,740 iL,2001,242000,Gasoline,282.0HP 4.4L 8 Cylinder Engine Gasoline Fuel,0,Green,None reported,7300,0,5
8,Lexus,RC 350 F Sport,2021,23436,Gasoline,311.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,0,Black,None reported,41927,0,27
10,Land,Rover Range Rover Sport 3.0 Supercharged HST,2021,27608,Gasoline,V6,0,Fuji White,None reported,73897,0,26


#### Cleaning ext_col

In [13]:
# print(used_cars['ext_col'].unique())

# exploring the exotic colors 
# simple_colors = [
#     'white',
#     'black',
#     'blue',
#     'gray',
#     'red',
#     'silver',
#     'green',
#     'brown',
#     'orange',
#     'yellow',
#     "gold",
#     "beige",
#     "purple"
# ]

# for c in used_cars['ext_col']:
#     if pd.isna(c):
#         continue  
#     color = c.lower()
#     if not any(simple in color for simple in simple_colors):
#         print(color.lower())


def simplify_color(color):
    color = str(color).lower()

    if any(x in color for x in ['white', 'chalk', 'pearl', 'bianco']):
        return 'White'
    elif any(x in color for x in ['black', 'nero', 'ebony', 'obsidian']):
        return 'Black'
    elif any(x in color for x in ['blue', 'blu', 'stormy']):
        return 'Blue'
    elif any(x in color for x in ['gray', 'grey', 'graphite', 'slate', 'granite', 'quartzite']):
        return 'Gray'
    elif any(x in color for x in ['red', 'rosso', 'ruby', 'scarlet']):
        return 'Red'
    elif any(x in color for x in ['silver', 'iridium', 'tungsten']):
        return 'Silver'
    elif any(x in color for x in ['green', 'moss', 'verde']):
        return 'Green'
    elif any(x in color for x in ['brown', 'bronze', 'dune']):
        return 'Brown'
    elif any(x in color for x in ['orange', 'mango', 'arancio']):
        return 'Orange'
    elif any(x in color for x in ['yellow', 'hellayella']):
        return 'Yellow'
    elif any(x in color for x in ['purple', 'plum', 'ametrin']):
        return 'Purple'
    elif any(x in color for x in ['beige', 'tan', 'sandstone']):
        return 'Beige'
    elif any(x in color for x in ['gold']):
        return 'Gold'
    else:
        return 'Non-Stardard Color'

used_cars['ext_color_simple'] = used_cars['ext_col'].apply(simplify_color)

used_cars = used_cars.dropna(subset=["ext_color_simple"])
used_cars = used_cars.drop(columns=["ext_col"])
print(used_cars['ext_color_simple'].value_counts(dropna=False))

# Assign unique numbers to each color
used_cars['color_category'] = pd.Categorical(used_cars['ext_color_simple']).codes + 1

# Drop any existing one-hot encoded color columns (if they exist)
columns_to_drop = [col for col in used_cars.columns if col.startswith('color_') and col != 'color_category']
used_cars = used_cars.drop(columns=columns_to_drop)

# Create a mapping table for color names and their corresponding numbers
color_mapping = used_cars[['ext_color_simple', 'color_category']].drop_duplicates().sort_values(by='color_category')

# Display the mapping table
print("Color mapping:")
print(color_mapping)

used_cars.head(10)

ext_color_simple
Black                 951
White                 905
Gray                  508
Silver                393
Blue                  363
Red                   278
Green                  73
Non-Stardard Color     50
Brown                  44
Beige                  43
Gold                   41
Orange                 38
Yellow                 30
Purple                 13
Name: count, dtype: int64
Color mapping:
       ext_color_simple  color_category
99                Beige               1
0                 Black               2
2                  Blue               3
342               Brown               4
38                 Gold               5
16                 Gray               6
7                 Green               7
1    Non-Stardard Color               8
97               Orange               9
18               Purple              10
35                  Red              11
5                Silver              12
4                 White              13
12               Y

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,accident,price,accident_reported,brand_category,ext_color_simple,color_category
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,0,At least 1 accident or damage reported,10300,1,15,Black,2
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC,0,At least 1 accident or damage reported,38005,1,20,Non-Stardard Color,8
2,Lexus,RX 350 RX 350,2022,22372,Gasoline,3.5 Liter DOHC,0,None reported,54598,0,27,Blue,3
3,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,0,None reported,15500,0,21,Black,2
4,Audi,Q3 45 S line Premium Plus,2021,9835,Gasoline,2.0L I4 16V GDI DOHC Turbo,0,None reported,34999,0,4,White,13
5,Acura,ILX 2.4L,2016,136397,Gasoline,2.4 Liter,2,None reported,14798,0,1,Silver,12
6,Audi,S3 2.0T Premium Plus,2017,84000,Gasoline,292.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,0,None reported,31000,0,4,Blue,3
7,BMW,740 iL,2001,242000,Gasoline,282.0HP 4.4L 8 Cylinder Engine Gasoline Fuel,0,None reported,7300,0,5,Green,7
8,Lexus,RC 350 F Sport,2021,23436,Gasoline,311.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,0,None reported,41927,0,27,Black,2
10,Land,Rover Range Rover Sport 3.0 Supercharged HST,2021,27608,Gasoline,V6,0,None reported,73897,0,26,White,13


# Clean Fuel_type Column

Gasoline = 0

Diesel = 1

Hybrid = 2

E85 Flex Fuel = 3

Other = 4



In [6]:
print(used_cars['fuel_type'].value_counts(dropna=False))

def clean_fuel(value):
    value = value.lower()
    if 'gasoline' in value:
        return 0
    elif 'diesel' in value:
        return 1
    elif 'hybrid' in value:
        return 2
    elif 'e85 flex fuel' in value:
        return 3
    else:
        return 4

used_cars['fuel_type'] = used_cars['fuel_type'].apply(clean_fuel)

used_cars.head(10)

fuel_type
Gasoline          3218
Hybrid             190
E85 Flex Fuel      133
Diesel             110
–                   44
Plug-In Hybrid      33
not supported        2
Name: count, dtype: int64


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,price,accident_reported
0,Ford,Utility Police Interceptor Base,2013,51000,3,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,0,Black,10300,1
1,Hyundai,Palisade SEL,2021,34742,0,3.8L V6 24V GDI DOHC,0,Moonlight Cloud,38005,1
2,Lexus,RX 350 RX 350,2022,22372,0,3.5 Liter DOHC,0,Blue,54598,0
3,INFINITI,Q50 Hybrid Sport,2015,88900,2,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,0,Black,15500,0
4,Audi,Q3 45 S line Premium Plus,2021,9835,0,2.0L I4 16V GDI DOHC Turbo,0,Glacier White Metallic,34999,0
5,Acura,ILX 2.4L,2016,136397,0,2.4 Liter,2,Silver,14798,0
6,Audi,S3 2.0T Premium Plus,2017,84000,0,292.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,0,Blue,31000,0
7,BMW,740 iL,2001,242000,0,282.0HP 4.4L 8 Cylinder Engine Gasoline Fuel,0,Green,7300,0
8,Lexus,RC 350 F Sport,2021,23436,0,311.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,0,Black,41927,0
10,Land,Rover Range Rover Sport 3.0 Supercharged HST,2021,27608,0,V6,0,Fuji White,73897,0
