# Importing Libraries

In [355]:
import pandas as pd
import numpy as np
import re
import os

# File Loading

In [356]:
folder = 'data_cars'
files = os.listdir(folder)

dfs = []

headers = ['URL','Car','Status','Price_1','Price_2','Mileage','Dealer_1','Dealer_2','Dealer_3']

for file in files:
    path = os.path.join(folder, file)
    
    df = pd.read_excel(path, names=headers)
    
    dfs.append(df)

Cars = pd.concat(dfs)

Cars = Cars.drop_duplicates(subset='URL')
Cars.dropna(subset=['Car'], inplace=True)
Cars

Unnamed: 0,URL,Car,Status,Price_1,Price_2,Mileage,Dealer_1,Dealer_2,Dealer_3
1,https://www.cars.com/vehicledetail/5ee74d2b-dd...,2023 Mazda CX-50 2.5 S Premium Plus Package,New,36703,,"\n\n\n $36,703\n\n MSRP $38,635\n\n\n",,,
2,https://www.cars.com/vehicledetail/460b9155-fa...,2023 Kia Sportage S,New,28990,,"\n\n\n $28,990\n\n\n\n",,Classic Kia,
3,https://www.cars.com/vehicledetail/a0594120-25...,2024 Chevrolet Camaro 2LT,New,41425,,"\n\n\n $41,425\n\n MSRP $41,425\n\n\n",,Classic Chevrolet Beaumont,
4,https://www.cars.com/vehicledetail/b185d19c-b4...,2023 Ford Bronco Badlands,Used,,58900.0,"1,551 mi.",Mike Smith Chrysler Dodge Jeep RAM,,
5,https://www.cars.com/vehicledetail/52bb1768-17...,2021 Acura TLX Advance,Used,,34499.0,"30,384 mi.",Mike Smith Nissan,,
...,...,...,...,...,...,...,...,...,...
10494,https://www.cars.com/vehicledetail/06665049-49...,2023 Chevrolet Corvette Stingray w/3LT,New,122455,,"\n\n\n $122,455\n\n\n\n",,,Chase Chevrolet
10495,https://www.cars.com/vehicledetail/9e367454-56...,2023 Chevrolet Corvette Stingray w/3LT,New,125930,,"\n\n\n $125,930\n\n\n\n",,,Chase Chevrolet
10496,https://www.cars.com/vehicledetail/d578331d-53...,2023 Jeep Wrangler Rubicon,New,53715,,"\n\n\n $53,715\n\n MSRP $58,715\n\n\n ...",,Stockton Dodge Chrysler Jeep RAM,
10497,https://www.cars.com/vehicledetail/3b7dc7d4-60...,2023 Jeep Wrangler Rubicon 392,New,91450,,"\n\n\n $91,450\n\n MSRP $86,455\n\n\n ...",,Stockton Dodge Chrysler Jeep RAM,


Let's navigate into the dataset:

# Data Cleaning

In [357]:
Cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18660 entries, 1 to 10498
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   URL       18660 non-null  object
 1   Car       18660 non-null  object
 2   Status    18660 non-null  object
 3   Price_1   9427 non-null   object
 4   Price_2   9233 non-null   object
 5   Mileage   18660 non-null  object
 6   Dealer_1  8942 non-null   object
 7   Dealer_2  8603 non-null   object
 8   Dealer_3  1082 non-null   object
dtypes: object(9)
memory usage: 1.4+ MB


In [358]:
Cars.describe()

Unnamed: 0,URL,Car,Status,Price_1,Price_2,Mileage,Dealer_1,Dealer_2,Dealer_3
count,18660,18660,18660,9427,9233.0,18660,8942,8603,1082
unique,18660,7086,30,7285,5921.0,17281,3087,3370,793
top,https://www.cars.com/vehicledetail/5ee74d2b-dd...,2023 Ford F-150 XLT,New,Not Priced,33995.0,\n\n\n Not Priced\n\n,Auto Hub,Morgan City Toyota,Doral Kia
freq,1,72,9427,386,26.0,163,64,34,7


We need to get the next fields:

- Brand
- Model
- Year
- Status
- Mileage
- Dealer
- Price

Let's work on this!

## Brand

In order to get the Brand field, I see two possible approaches:

- Extract from the first words of Car field.
- Use a dictionary.

Personally I prefer the second one, since I suspect the Brand name isn't always in the same position, as well as we will find Brands with one, two, three words that gonna make difficult the task.

In [359]:
brands_df = pd.read_excel('master_data/brands.xlsx')

brands_df

Unnamed: 0,keyword,brands
0,Ford,Ford
1,Chevrolet,Chevrolet
2,Toyota,Toyota
3,Honda,Honda
4,Nissan,Nissan
5,Jeep,Jeep
6,Subaru,Subaru
7,Hyundai,Hyundai
8,Kia,Kia
9,Dodge,Dodge


In [360]:
brands_dict = brands_df.set_index('keyword')['brands'].to_dict()

def assign_profile(row):
    for col in ['Car']:
        cell_value = str(row[col])  # Convierte el valor a cadena de texto
        for keyword, brands in brands_dict.items():
            if keyword.lower() in cell_value.lower():
                return brands
    return 'Other'

Cars['Brand'] = Cars.apply(assign_profile, axis=1)

In [361]:
brands_counts = Cars['Brand'].value_counts()

print(brands_counts)

Ford                   2386
Toyota                 1783
Chevrolet              1720
Jeep                   1108
GMC                     941
BMW                     893
Mercedes                831
Honda                   774
Lexus                   692
Kia                     682
Audi                    602
Mazda                   522
RAM                     511
Nissan                  494
Hyundai                 429
Volkswagen              408
Cadillac                382
Dodge                   365
Subaru                  335
Land Rover              328
Acura                   319
Volvo                   265
Porsche                 247
Lincoln                 241
Infiniti                218
Genesis                 207
Buick                   154
Jaguar                  115
Tesla                   114
Chrysler                103
Mitsubishi               82
MINI                     69
Alfa Romeo               61
Maserati                 57
Bentley                  40
Aston Martin        

In [362]:
other_cars = Cars[Cars['Brand'] == 'Other']['Car']

print(other_cars)

Series([], Name: Car, dtype: object)


Perfect!

## Model

The approach is the same as the brand, creating and using a dictionary:

In [363]:
models_df = pd.read_excel('master_data/models.xlsx')

models_df

Unnamed: 0,keyword,model
0,Outlander,Outlander
1,Mustang,Mustang
2,F-150,F-150
3,Explorer,Explorer
4,Escape,Escape
...,...,...
739,924 S,924 S
740,2,2
741,1,1
742,3,3


In [364]:
models_dict = models_df.set_index('keyword')['model'].to_dict()

def assign_profile(row):
    for col in ['Car']:
        cell_value = str(row[col])  # Convierte el valor a cadena de texto
        for keyword, model in models_dict.items():
            if isinstance(keyword, str) and keyword.lower() in cell_value.lower():
                return model
    return 'Other'

Cars['Model'] = Cars.apply(assign_profile, axis=1)

In [365]:
models_counts = Cars['Model'].value_counts()

print(models_counts)

F-150            639
Silverado        462
Sierra           415
Wrangler         409
Mustang          371
                ... 
Alpina             1
Silver Seraph      1
Marquis            1
C10                1
924 S              1
Name: Model, Length: 500, dtype: int64


In [366]:
Cars[['Brand','Model','Price_1','Price_2','Mileage']]

Unnamed: 0,Brand,Model,Price_1,Price_2,Mileage
1,Mazda,CX-5,36703,,"\n\n\n $36,703\n\n MSRP $38,635\n\n\n"
2,Kia,Sportage,28990,,"\n\n\n $28,990\n\n\n\n"
3,Chevrolet,Camaro,41425,,"\n\n\n $41,425\n\n MSRP $41,425\n\n\n"
4,Ford,Bronco,,58900.0,"1,551 mi."
5,Acura,TLX,,34499.0,"30,384 mi."
...,...,...,...,...,...
10494,Chevrolet,Corvette,122455,,"\n\n\n $122,455\n\n\n\n"
10495,Chevrolet,Corvette,125930,,"\n\n\n $125,930\n\n\n\n"
10496,Jeep,Wrangler,53715,,"\n\n\n $53,715\n\n MSRP $58,715\n\n\n ..."
10497,Jeep,Wrangler,91450,,"\n\n\n $91,450\n\n MSRP $86,455\n\n\n ..."


In [367]:
other_cars = Cars[Cars['Model'] == 'Other']['Car']

print(other_cars)

Series([], Name: Car, dtype: object)


Now looks nice!

## Year

In [368]:
Cars['Year'] = Cars['Car'].str[:4]

In [369]:
Cars['Year'].value_counts()

2023    7294
2024    2430
2022    1641
2021    1300
2020    1299
2019     924
2018     741
2017     601
2016     486
2015     418
2014     311
2013     256
2012     164
2011     164
2010      99
2008      90
2007      81
2006      65
2009      59
2004      47
2005      42
2003      26
2002      21
2001      19
1999      11
2000      11
1997       7
1998       6
1990       6
1995       5
1992       3
1993       3
1994       3
1979       2
1987       2
1989       2
1988       2
1970       2
1991       2
1996       2
1969       1
1973       1
1982       1
1971       1
1975       1
1977       1
1976       1
1972       1
1983       1
1978       1
1984       1
1985       1
1968       1
Name: Year, dtype: int64

Perfect!

## Status

To get the Status, we will transform the Certified ones into a single "Certified" Category:

In [370]:
Cars['Status'].value_counts()

New                        9427
Used                       7971
Toyota Certified            151
Ford Certified              128
Honda Certified              97
Chevrolet Certified          85
Mercedes-Benz Certified      82
BMW Certified                72
Porsche Certified            64
Kia Certified                58
Volvo Certified              54
Audi Certified               50
Volkswagen Certified         50
GMC Certified                49
Cadillac Certified           43
Jeep Certified               41
Acura Certified              39
Nissan Certified             38
INFINITI Certified           32
Hyundai Certified            30
Subaru Certified             25
Lincoln Certified            22
Dodge Certified              18
RAM Certified                17
Buick Certified               4
Ferrari Certified             3
Bentley Certified             3
Chrysler Certified            3
Rolls-Royce Certified         3
Maserati Certified            1
Name: Status, dtype: int64

In [371]:
Cars['Status'] = Cars['Status'].astype(str)
Cars['Status'] = Cars['Status'].apply(lambda x: 'Certified' if 'Certified' in x else x)

In [372]:
Cars['Status'].value_counts()

New          9427
Used         7971
Certified    1262
Name: Status, dtype: int64

## Mileage

We need to transform the Mileage column. First of all we need to delete ones that include the ' mi.' string, and then, for the rest we will delete the ' mi.' string:

In [373]:
def transform_mileage(mileage):
    if isinstance(mileage, str) and ' mi.' in mileage:
        # Elimina " mi." y convierte el valor a número (eliminando comas y espacios en blanco)
        return float(mileage.replace(' mi.', '').replace(',', '').strip())
    else:
        # Si no contiene " mi.", establece el valor como NaN
        return np.nan

# Aplica la función a la columna "Mileage"
Cars['Mileage'] = Cars['Mileage'].apply(transform_mileage)

In [374]:
Cars

Unnamed: 0,URL,Car,Status,Price_1,Price_2,Mileage,Dealer_1,Dealer_2,Dealer_3,Brand,Model,Year
1,https://www.cars.com/vehicledetail/5ee74d2b-dd...,2023 Mazda CX-50 2.5 S Premium Plus Package,New,36703,,,,,,Mazda,CX-5,2023
2,https://www.cars.com/vehicledetail/460b9155-fa...,2023 Kia Sportage S,New,28990,,,,Classic Kia,,Kia,Sportage,2023
3,https://www.cars.com/vehicledetail/a0594120-25...,2024 Chevrolet Camaro 2LT,New,41425,,,,Classic Chevrolet Beaumont,,Chevrolet,Camaro,2024
4,https://www.cars.com/vehicledetail/b185d19c-b4...,2023 Ford Bronco Badlands,Used,,58900.0,1551.0,Mike Smith Chrysler Dodge Jeep RAM,,,Ford,Bronco,2023
5,https://www.cars.com/vehicledetail/52bb1768-17...,2021 Acura TLX Advance,Used,,34499.0,30384.0,Mike Smith Nissan,,,Acura,TLX,2021
...,...,...,...,...,...,...,...,...,...,...,...,...
10494,https://www.cars.com/vehicledetail/06665049-49...,2023 Chevrolet Corvette Stingray w/3LT,New,122455,,,,,Chase Chevrolet,Chevrolet,Corvette,2023
10495,https://www.cars.com/vehicledetail/9e367454-56...,2023 Chevrolet Corvette Stingray w/3LT,New,125930,,,,,Chase Chevrolet,Chevrolet,Corvette,2023
10496,https://www.cars.com/vehicledetail/d578331d-53...,2023 Jeep Wrangler Rubicon,New,53715,,,,Stockton Dodge Chrysler Jeep RAM,,Jeep,Wrangler,2023
10497,https://www.cars.com/vehicledetail/3b7dc7d4-60...,2023 Jeep Wrangler Rubicon 392,New,91450,,,,Stockton Dodge Chrysler Jeep RAM,,Jeep,Wrangler,2023


## Dealer

In order to get the Dealer we will combine the three Dealer columns, since just one of them has the right value, and the other ones has NaN:

In [375]:
Cars['Dealer'] = Cars['Dealer_1'].combine_first(Cars['Dealer_2']).combine_first(Cars['Dealer_3'])

In [376]:
Cars

Unnamed: 0,URL,Car,Status,Price_1,Price_2,Mileage,Dealer_1,Dealer_2,Dealer_3,Brand,Model,Year,Dealer
1,https://www.cars.com/vehicledetail/5ee74d2b-dd...,2023 Mazda CX-50 2.5 S Premium Plus Package,New,36703,,,,,,Mazda,CX-5,2023,
2,https://www.cars.com/vehicledetail/460b9155-fa...,2023 Kia Sportage S,New,28990,,,,Classic Kia,,Kia,Sportage,2023,Classic Kia
3,https://www.cars.com/vehicledetail/a0594120-25...,2024 Chevrolet Camaro 2LT,New,41425,,,,Classic Chevrolet Beaumont,,Chevrolet,Camaro,2024,Classic Chevrolet Beaumont
4,https://www.cars.com/vehicledetail/b185d19c-b4...,2023 Ford Bronco Badlands,Used,,58900.0,1551.0,Mike Smith Chrysler Dodge Jeep RAM,,,Ford,Bronco,2023,Mike Smith Chrysler Dodge Jeep RAM
5,https://www.cars.com/vehicledetail/52bb1768-17...,2021 Acura TLX Advance,Used,,34499.0,30384.0,Mike Smith Nissan,,,Acura,TLX,2021,Mike Smith Nissan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10494,https://www.cars.com/vehicledetail/06665049-49...,2023 Chevrolet Corvette Stingray w/3LT,New,122455,,,,,Chase Chevrolet,Chevrolet,Corvette,2023,Chase Chevrolet
10495,https://www.cars.com/vehicledetail/9e367454-56...,2023 Chevrolet Corvette Stingray w/3LT,New,125930,,,,,Chase Chevrolet,Chevrolet,Corvette,2023,Chase Chevrolet
10496,https://www.cars.com/vehicledetail/d578331d-53...,2023 Jeep Wrangler Rubicon,New,53715,,,,Stockton Dodge Chrysler Jeep RAM,,Jeep,Wrangler,2023,Stockton Dodge Chrysler Jeep RAM
10497,https://www.cars.com/vehicledetail/3b7dc7d4-60...,2023 Jeep Wrangler Rubicon 392,New,91450,,,,Stockton Dodge Chrysler Jeep RAM,,Jeep,Wrangler,2023,Stockton Dodge Chrysler Jeep RAM


## Price

Same approach as Dealer, but we need also to transform into a numeric:

In [377]:
Cars['Price'] = Cars['Price_1'].combine_first(Cars['Price_2'])
Cars['Price'] = pd.to_numeric(Cars['Price'], errors='coerce')

In [378]:
Cars.head(50)

Unnamed: 0,URL,Car,Status,Price_1,Price_2,Mileage,Dealer_1,Dealer_2,Dealer_3,Brand,Model,Year,Dealer,Price
1,https://www.cars.com/vehicledetail/5ee74d2b-dd...,2023 Mazda CX-50 2.5 S Premium Plus Package,New,36703,,,,,,Mazda,CX-5,2023,,36703.0
2,https://www.cars.com/vehicledetail/460b9155-fa...,2023 Kia Sportage S,New,28990,,,,Classic Kia,,Kia,Sportage,2023,Classic Kia,28990.0
3,https://www.cars.com/vehicledetail/a0594120-25...,2024 Chevrolet Camaro 2LT,New,41425,,,,Classic Chevrolet Beaumont,,Chevrolet,Camaro,2024,Classic Chevrolet Beaumont,41425.0
4,https://www.cars.com/vehicledetail/b185d19c-b4...,2023 Ford Bronco Badlands,Used,,58900.0,1551.0,Mike Smith Chrysler Dodge Jeep RAM,,,Ford,Bronco,2023,Mike Smith Chrysler Dodge Jeep RAM,58900.0
5,https://www.cars.com/vehicledetail/52bb1768-17...,2021 Acura TLX Advance,Used,,34499.0,30384.0,Mike Smith Nissan,,,Acura,TLX,2021,Mike Smith Nissan,34499.0
6,https://www.cars.com/vehicledetail/6c4f8f63-05...,2022 Volkswagen Golf GTI 2.0T Autobahn,Certified,,34000.0,13895.0,Volkswagen of Beaumont,,,Volkswagen,Golf,2022,Volkswagen of Beaumont,34000.0
7,https://www.cars.com/vehicledetail/7dd93af8-87...,2021 GMC Yukon Denali,Used,,56954.0,68506.0,BMW of Beaumont,,,GMC,Yukon,2021,BMW of Beaumont,56954.0
8,https://www.cars.com/vehicledetail/78ee816a-be...,2023 BMW M340 i,New,61715,,,,BMW of Beaumont,,BMW,M340,2023,BMW of Beaumont,61715.0
9,https://www.cars.com/vehicledetail/566bdb25-ca...,2023 Hyundai Sonata Hybrid Limited,New,37945,,,,Hyundai of Silsbee,,Hyundai,Sonata,2023,Hyundai of Silsbee,37945.0
10,https://www.cars.com/vehicledetail/fce328b2-f8...,2023 Hyundai Sonata Hybrid SEL,New,33495,,,,Hyundai of Silsbee,,Hyundai,Sonata,2023,Hyundai of Silsbee,33495.0


# Generating files

Now let's generate the files!

In [379]:
Cars_file = Cars[['Brand','Model','Year','Status','Mileage','Dealer','Price']]
Cars_file.to_excel('cars.xlsx', index=False)
Cars_file.to_csv('cars.csv', index=False, encoding="utf-16")