# Car Market Analysis
### Objective
Analyze current car market data for the Toyota RAV4, Honda CR-V, and Honda Pilot models.

### Cleaning the Data

In [8]:
# Import libraries
import ast
import re

import numpy as np
import pandas as pd

In [7]:
# Column names for csv files; passed to read_csv through names= for both files
cols_name = ['car_url', 'year_make_model', 'price_col', 'details_short', 'details_long' ]
 
# Load the raw Honda CR-V and Honda Pilot listing files.
# NOTE(review): both paths are absolute and specific to one machine.
honda_crv_df = pd.read_csv("/home/lfigil/Documents/car_scraper/car_dataset_original/honda_data.csv", names=cols_name)
honda_pilot_df = pd.read_csv("/home/lfigil/Documents/car_scraper/car_dataset_original/honda_pilot.csv", names=cols_name)

In [11]:
# Quick view of the (rows, columns) shape of each raw DataFrame
print(honda_crv_df.shape)
print(honda_pilot_df.shape)

(2612, 5)
(707, 5)


In [48]:
# Preview the first five rows of the raw CR-V listings
honda_crv_df.head(5)

Unnamed: 0,car_url,year_make_model,price_col,details_short,details_long
0,https://www.edmunds.com/honda/cr-v/2019/vin/5J...,2019 Honda CR-V\nLX 4dr SUV,"$15,000\ngreat price\n$4,956 below market","65,977 miles\nNo accidents, 2 Owners, Personal...","['Located in Elizabeth, NJ / 19 miles away fro..."
1,https://www.edmunds.com/honda/cr-v/2020/vin/5J...,2020 Honda CR-V\nLX 4dr SUV,"$17,000\ngreat price\n$4,757 below market","42,476 miles\nNo accidents, 3 Owners, Rental v...","['Located in Elizabeth, NJ / 19 miles away fro..."
2,https://www.edmunds.com/honda/cr-v/2019/vin/2H...,2019 Honda CR-V\nLX 4dr SUV,"$14,900\ngreat price\n$4,251 below market","80,621 miles\nNo accidents, 2 Owners, Personal...","['Located in Elizabeth, NJ / 19 miles away fro..."
3,https://www.edmunds.com/honda/cr-v/2021/vin/5J...,Certified 2021 Honda CR-V\nEX-L 4dr SUV,"$25,949\ngreat price\n$3,020 below market","17,096 miles\nNo accidents, 1 Owner, Personal ...","['Located in Jersey City, NJ / 12 miles away f..."
4,https://www.edmunds.com/honda/cr-v/2023/vin/7F...,2023 Honda CR-V\nSport Touring Hybrid 4dr SUV,"$31,685\ngreat price\n$6,409 below market","6,309 miles\nNo accidents, 1 Owner, Personal u...","['Located in Valley Stream, NY / 23 miles away..."


In [19]:
# Exploratory split of the title: newlines become spaces, then the text is split on single spaces.
# NOTE(review): col1_tmp is not referenced by any later cell visible here.
col1_tmp = honda_crv_df['year_make_model'].str.replace("\n", " ").str.strip().str.split(" ", expand=True)
# Label the first six positional columns
col1_tmp = col1_tmp.rename(columns={0: 'Year', 1: 'Make', 2: 'Model', 3: 'Trim', 4: 'Doors', 5: 'type'})
# Keep only rows whose first token is 'Certified' and shift their values one column to the left
col1_tmp = col1_tmp[col1_tmp['Year'] == 'Certified'].shift(-1, axis=1)

In [64]:
# Separate the `year_make_model` column at the newline; column 0 is the first line, column 1 the second
tmp_cols_1 = honda_crv_df['year_make_model'].str.split("\n", expand=True)

# Remove the word 'Certified' from the first line and trim the whitespace left behind
tmp_cols_1[0] = tmp_cols_1[0].str.replace('Certified', '').str.strip()

In [66]:
# Split the first title line into Year, Make and Model columns on the original df.
# n=2 limits the split to two cuts, so everything after the make stays in 'Model'.
# An unbounded split yields a fourth column as soon as one title has more than
# three tokens, and the three-column assignment then raises.
honda_crv_df[['Year', 'Make', 'Model']] = tmp_cols_1[0].str.split(" ", n=2, expand=True)

In [9]:
def extract_trim_info(text):
    """Split a trim line such as ``'EX-L 4dr SUV'`` into trim, doors and body type.

    Parameters
    ----------
    text : str
        Text laid out as ``<trim> <N>dr <type>``; anything after the type on
        the same line is ignored.

    Returns
    -------
    tuple
        ``(trim, doors, car_type)`` as strings, e.g. ``('EX-L', '4dr', 'SUV')``.
        ``(None, None, None)`` when ``text`` is not a string (for example a
        missing cell) or does not follow the expected layout.
    """
    # Missing cells arrive as None/NaN; re.match raises TypeError on those,
    # so report them the same way as text that does not match.
    if not isinstance(text, str):
        return None, None, None

    # Lazy trim name, then the door count and the body type in their own
    # groups; the trailing (.*)$ swallows the rest of the line.
    match = re.match(r'^(.*?)\s+(\d+dr)\s(\S+)(.*)$', text)
    if match is None:
        return None, None, None

    trim, doors, car_type = match.group(1, 2, 3)
    return trim.strip(), doors, car_type

In [89]:
# Run extract_trim_info on the second title line and unpack the returned
# 3-tuples into the Trim, Doors and Type columns
honda_crv_df['Trim'], honda_crv_df['Doors'], honda_crv_df['Type'] = zip(*tmp_cols_1[1].apply(extract_trim_info))

In [21]:
# Extract the dollar value from the price_col column and convert into numeric form
def extract_price(text):
    """Return the first number found in ``text`` as a float.

    Parameters
    ----------
    text : str
        Price text such as ``'$15,000\\ngreat price\\n$4,956 below market'``.

    Returns
    -------
    float or None
        The first numeric value in the text with thousands separators removed
        (``15000.0`` for the example above). ``None`` when ``text`` is not a
        string or contains no digits.
    """
    # Missing cells (None/NaN) cannot be searched with a regex.
    if not isinstance(text, str):
        return None

    # \d+ (not \d{1,3}) for the leading group, so an amount written without
    # thousands separators, e.g. "$15000", is read whole instead of as "150".
    match = re.search(r'\$?(\d+(?:,\d{3})*(?:\.\d+)?)', text)
    if match is None:
        return None

    # Remove commas before converting to float
    return float(match.group(1).replace(',', ''))

In [None]:
# Numeric price parsed from the raw price text.
# NOTE(review): this column is named 'price' here but 'Price' in the Pilot section.
honda_crv_df['price'] = honda_crv_df['price_col'].apply(extract_price)

In [123]:
# Separate the details_short column into new columns by new line for further information extraction
tmp_3 = honda_crv_df['details_short'].str.split("\n", expand=True)

# Name the first two pieces; these new columns still need further cleaning
tmp_3 = tmp_3.rename(columns={0: 'mileage', 1: 'details1'})

In [24]:
def extract_miles(text):
    """Return the mileage at the end of a string such as ``'65,977 miles'``.

    Parameters
    ----------
    text : str
        Text that ends with ``<number> miles``.

    Returns
    -------
    int or None
        The number with thousands separators removed, or ``None`` when
        ``text`` is not a string or does not end with ``<number> miles``.
    """
    # Missing cells (None/NaN) cannot be searched with a regex.
    if not isinstance(text, str):
        return None

    # \d+ (not \d{1,3}) for the leading group: with re.search the narrower form
    # matches only the last three digits of an unseparated number
    # ("65977 miles" was read as 977).
    match = re.search(r'(\d+(?:,\d{3})*)\s+miles$', text)
    if match is None:
        return None

    return int(match.group(1).replace(',', ''))

In [None]:
# Integer mileage parsed from the first line of details_short
honda_crv_df['Mileage'] = tmp_3['mileage'].apply(extract_miles)

In [None]:
# Break "<accidents>, <owners>, <usage>" into its comma-separated parts (columns 0, 1, 2)
tmp_4 = tmp_3['details1'].str.split(", ", expand=True)

# Keep only the leading token of each part ("2" from "2 Owners", "No" from "No accidents").
# Taking the first token directly does not depend on how many words each part
# has, unlike assigning the expanded split to a fixed list of column names.
tmp_4['Owners'] = tmp_4[1].str.split(" ").str[0]
tmp_4['Accidents'] = tmp_4[0].str.split(" ").str[0].replace("No", "0")
tmp_4 = tmp_4.rename(columns={2: "Usage"})

# The owner count is stored under 'Owners'; tmp_4 has no column named 'n',
# so selecting it raises KeyError.
honda_crv_df[['Accidents', 'Owners', 'Usage']] = tmp_4[['Accidents', 'Owners', 'Usage']]

In [154]:
# Remove the cap on how many columns pandas prints when displaying a DataFrame
pd.set_option('display.max_columns', None)

In [158]:
# Comma-split view of the raw details_long text.
# NOTE(review): tmp_details_long is not referenced by any later cell visible here — confirm it is still needed.
tmp_details_long = honda_crv_df['details_long'].str.split(",", expand=True)
# Output column name -> text prefix to look for inside each details_long entry
keys_to_extract = {
    'Title Details': 'Title Details:',
    'Salvage Vehicle': 'Salvage Vehicle:',
    'Frame Damage': 'Frame Damage:',
    'Theft History': 'Theft History:',
    'Lemon Status': 'Lemon Status:',
    'Certified Pre-Owned': 'Certified Pre-Owned:',
    'Listed since': 'Listed since'  # this prefix has no trailing colon
}

# Create one column per key, initially filled with None
for key in keys_to_extract:
    honda_crv_df[key] = None

# Function to extract values based on the specified keys
def extract_information(details_list, keys):
    """Collect the text that follows each known prefix in a list of strings.

    Args:
        details_list: iterable of strings to scan.
        keys: mapping of output name -> prefix text to look for.

    Returns:
        dict with one entry per output name whose prefix occurs in at least
        one string. The value is the stripped text right after the first
        occurrence of the prefix in that string; when several strings contain
        the same prefix, the last string wins.
    """
    found = {}
    for entry in details_list:
        # Record every prefix present in this entry; later entries overwrite
        # earlier ones for the same output name.
        found.update(
            (name, entry.split(marker)[1].strip())
            for name, marker in keys.items()
            if marker in entry
        )
    return found

# Iterate through each row in the DataFrame
for index, row in honda_crv_df.iterrows():
    # details_long holds the text form of a Python list of strings.
    # ast.literal_eval only accepts literals, so scraped text is parsed
    # without ever being executed as code (which eval() would do).
    details_list = ast.literal_eval(row['details_long'])

    # Extract the information using the function
    extracted_info = extract_information(details_list, keys_to_extract)

    # Assign the extracted information to the appropriate columns
    for key, value in extracted_info.items():
        honda_crv_df.loc[index, key] = value

# Keep only the first line of the certified flag. expand=True returns one column
# per line, so [0] selects the first line of every row; indexing the un-expanded
# split with [0] would instead pick the list that belongs to row label 0.
honda_crv_df['Certified Pre-Owned'] = honda_crv_df['Certified Pre-Owned'].str.split("\n",expand=True)[0]
# Drop the ": " left in front of the date by the colon-less prefix, then parse it
honda_crv_df['Listed since'] = pd.to_datetime(honda_crv_df['Listed since'].str.replace(": ", "").str.strip())

In [184]:
# Get only the clean data into new df (raw source columns other than car_url are left out)
honda_crv_clean = honda_crv_df[['Year', 'Make', 'Model', 'Trim', 'Doors', 'Type', 'price', 'Mileage', 
                                'Accidents', 'Owners', 'Usage', 'Title Details', 'Salvage Vehicle', 
                                'Frame Damage', 'Theft History', 'Lemon Status', 'Certified Pre-Owned', 
                                'Listed since', 'car_url']]

# NOTE(review): absolute, machine-specific output path
save_path = "/home/lfigil/Documents/car_scraper/car_dataset_original/honda_crv_clean.csv"

# Write without the index column
honda_crv_clean.to_csv(save_path, index=False)

print(f"file save at {save_path}...")

file save at /home/lfigil/Documents/car_scraper/car_dataset_original/honda_crv_clean.csv...


In [183]:
# List the columns currently present on the CR-V frame
honda_crv_df.columns

Index(['car_url', 'year_make_model', 'price_col', 'details_short',
       'details_long', 'Year', 'Make', 'Model', 'Trim', 'Doors', 'Type',
       'price', 'mileage', 'Mileage', 'Accidents', 'Owners', 'Usage',
       'Title Details', 'Salvage Vehicle', 'Frame Damage', 'Theft History',
       'Lemon Status', 'VIN', 'Stock', 'Certified Pre-Owned', 'Listed since'],
      dtype='object')

Now it's time to clean the Honda Pilot data.

In [3]:
# Column names for csv files (same list as used for the CR-V load earlier in the notebook)
cols_name = ['car_url', 'year_make_model', 'price_col', 'details_short', 'details_long' ]
 
# Load the raw Honda Pilot listings again, giving a fresh, unmodified frame.
# NOTE(review): absolute, machine-specific path.
honda_pilot_df = pd.read_csv("/home/lfigil/Documents/car_scraper/car_dataset_original/honda_pilot.csv", names=cols_name)

In [13]:
# Split the year_make_model column at the newline; column 0 is the first line, column 1 the second
temp_1 = honda_pilot_df['year_make_model'].str.split("\n", expand=True)


In [17]:
# Run extract_trim_info on the second title line and unpack the returned
# 3-tuples into the Trim, Doors and Type columns
honda_pilot_df['Trim'], honda_pilot_df['Doors'], honda_pilot_df['Type'] = zip(*temp_1[1].apply(extract_trim_info))

In [19]:
# Remove the word 'Certified' from the first title line and trim the whitespace left behind
temp_1[0] = temp_1[0].str.replace('Certified', '').str.strip()

# Split into Year, Make and Model columns on the original df.
# n=2 limits the split to two cuts, so everything after the make stays in 'Model'.
# An unbounded split yields a fourth column as soon as one title has more than
# three tokens, and the three-column assignment then raises.
honda_pilot_df[['Year', 'Make', 'Model']] = temp_1[0].str.split(" ", n=2, expand=True)

In [22]:
# Numeric price parsed from the raw price text.
# NOTE(review): this column is named 'Price' here but 'price' in the CR-V section.
honda_pilot_df['Price'] = honda_pilot_df['price_col'].apply(extract_price)

In [25]:
# Separate the details_short column into new columns by new line for further information extraction
temp_2 = honda_pilot_df['details_short'].str.split("\n", expand=True)

# Name the first two pieces; these new columns still need further cleaning
temp_2 = temp_2.rename(columns={0: 'mileage', 1: 'details1'})

In [27]:
# Integer mileage parsed from the first line of details_short
honda_pilot_df['Mileage'] = temp_2['mileage'].apply(extract_miles)

In [28]:
# Display the split details_short pieces to inspect their layout
temp_2

Unnamed: 0,mileage,details1,2,3,4
0,"57,108 miles","No accidents, 1 Owner, Personal use",6cyl Automatic,M Sport Motor Cars (16 mi away),Home delivery*
1,"65,589 miles","4 Accidents, 2 Owners, Personal use only",6cyl Automatic,Lynnes Nissan City (10 mi away),Five Star Dealer
2,"59,065 miles","No accidents, 2 Owners, Rental vehicle",6cyl Automatic,Auto Lux (23 mi away),
3,"59,291 miles","No accidents, 1 Owner, Personal use only",6cyl Automatic,North Shore Honda (23 mi away),Home delivery*
4,"41,423 miles","2 Accidents, 1 Owner, Personal use",6cyl Automatic,Hudson Honda (7 mi away),Home delivery*
...,...,...,...,...,...
702,"65,258 miles","2 Accidents, 1 Owner, Personal use only",6cyl Automatic,Millennium Honda (25 mi away),
703,"43,976 miles","1 Accident, 3 Owners, Personal use",6cyl Automatic,Drive Deleon (11 mi away),
704,"51,664 miles","2 Accidents, 2 Owners, Personal use",6cyl Automatic,Drive Deleon (11 mi away),
705,"67,223 miles","1 Accident, 1 Owner, Personal use",6cyl Automatic,Straight Motor Sales (6 mi away),


In [47]:
# Break "<accidents>, <owners>, <usage>" into its comma-separated parts (columns 0, 1, 2)
temp_3 = temp_2['details1'].str.split(", ",expand=True)

# Keep only the leading token of each part ("2" from "2 Owners", "No" from "No accidents").
# Taking the first token directly does not depend on how many words each part
# has, unlike assigning the expanded split to a fixed list of column names.
temp_3['Owners'] = temp_3[1].str.split(" ").str[0]
temp_3['Accidents'] = temp_3[0].str.split(" ").str[0].replace("No", "0")
temp_3 = temp_3.rename(columns={2: "Usage"})


In [49]:
# Copy the cleaned accident count, owner count and usage text onto the Pilot frame
honda_pilot_df[['Accidents', 'Owners', 'Usage']] = temp_3[['Accidents', 'Owners', 'Usage']]

In [52]:
# Function to extract values based on the specified keys
# (this repeats the definition given in the CR-V section of the notebook)
def extract_information(details_list, keys):
    """Collect the text that follows each known prefix in a list of strings.

    Args:
        details_list: iterable of strings to scan.
        keys: mapping of output name -> prefix text to look for.

    Returns:
        dict with one entry per output name whose prefix occurs in at least
        one string. The value is the stripped text right after the first
        occurrence of the prefix in that string; when several strings contain
        the same prefix, the last string wins.
    """
    found = {}
    for entry in details_list:
        # Record every prefix present in this entry; later entries overwrite
        # earlier ones for the same output name.
        found.update(
            (name, entry.split(marker)[1].strip())
            for name, marker in keys.items()
            if marker in entry
        )
    return found

In [54]:
# Comma-split view of the raw details_long text.
# NOTE(review): tmp_details_long is not referenced by any later cell visible here — confirm it is still needed.
tmp_details_long = honda_pilot_df['details_long'].str.split(",", expand=True)

# Output column name -> text prefix to look for inside each details_long entry
keys_to_extract = {
    'Title Details': 'Title Details:',
    'Salvage Vehicle': 'Salvage Vehicle:',
    'Frame Damage': 'Frame Damage:',
    'Theft History': 'Theft History:',
    'Lemon Status': 'Lemon Status:',
    'Certified Pre-Owned': 'Certified Pre-Owned:',
    'Listed since': 'Listed since'  # this prefix has no trailing colon
}

# Create one column per key, initially filled with None
for key in keys_to_extract:
    honda_pilot_df[key] = None

# Iterate through each row in the DataFrame
for index, row in honda_pilot_df.iterrows():
    # details_long holds the text form of a Python list of strings.
    # ast.literal_eval only accepts literals, so scraped text is parsed
    # without ever being executed as code (which eval() would do).
    details_list = ast.literal_eval(row['details_long'])

    # Extract the information using the function
    extracted_info = extract_information(details_list, keys_to_extract)

    # Assign the extracted information to the appropriate columns
    for key, value in extracted_info.items():
        honda_pilot_df.loc[index, key] = value

In [61]:
# Keep only the first line of the certified flag (column 0 of the expanded newline split)
honda_pilot_df['Certified Pre-Owned'] = honda_pilot_df['Certified Pre-Owned'].str.split("\n",expand=True)[0]
# Drop the ": " left in front of the date by the colon-less 'Listed since' prefix, then parse it
honda_pilot_df['Listed since'] = pd.to_datetime(honda_pilot_df['Listed since'].str.replace(": ", "").str.strip())

In [63]:
# List the columns currently present on the Pilot frame
honda_pilot_df.columns

Index(['car_url', 'year_make_model', 'price_col', 'details_short',
       'details_long', 'Trim', 'Doors', 'Type', 'Year', 'Make', 'Model',
       'Price', 'Mileage', 'Accidents', 'Owners', 'Usage', 'Title Details',
       'Salvage Vehicle', 'Frame Damage', 'Theft History', 'Lemon Status',
       'Certified Pre-Owned', 'Listed since'],
      dtype='object')

In [64]:
# Keep only the cleaned columns (raw source columns other than car_url are left out)
honda_pilot_clean = honda_pilot_df[['Year', 'Make', 'Model', 'Trim', 'Doors', 'Type', 'Price', 'Mileage', 'Accidents', 
                                    'Owners', 'Usage', 'Title Details', 'Salvage Vehicle', 'Frame Damage', 'Theft History', 
                                    'Lemon Status', 'Certified Pre-Owned', 'Listed since', 'car_url']]

In [66]:
# NOTE(review): absolute, machine-specific output path
save_path = "/home/lfigil/Documents/car_scraper/car_dataset_original/honda_pilot_clean.csv"

# Write without the index column
honda_pilot_clean.to_csv(save_path, index=False)

print(f"file save at {save_path} !!!")

file save at /home/lfigil/Documents/car_scraper/car_dataset_original/honda_pilot_clean.csv !!!


# Check for data types

In [68]:
# Read clean data.
# NOTE(review): these paths are relative to the working directory, while the clean
# Honda files were written to absolute paths; toyota_clean.csv is not produced by
# any cell visible here.
toyota_df = pd.read_csv("./car_dataset_original/toyota_clean.csv")
honda_p_df = pd.read_csv("./car_dataset_original/honda_pilot_clean.csv")
honda_c_df = pd.read_csv("./car_dataset_original/honda_crv_clean.csv")

In [69]:
# (rows, columns) of the Toyota frame as loaded
toyota_df.shape

(2282, 22)

In [72]:
# Keep a fixed subset of the Toyota columns, in this order; toyota_df is rebound to the narrower frame
toyota_df = toyota_df[['year', 'make', 'model', 'trim', 'doors', 'type', 'mileage', 'price', 'Title Details',
                       'Salvage Vehicle', 'Frame Damage', 'Theft History', 'Lemon Status', 'Certified Pre-Owned',
                       'Accidents', 'Owners', 'Usage', 'Listed since', 'car_url']]

In [75]:
# Display the Toyota frame after column selection
toyota_df

Unnamed: 0,year,make,model,trim,doors,type,mileage,price,Title Details,Salvage Vehicle,Frame Damage,Theft History,Lemon Status,Certified Pre-Owned,Accidents,Owners,Usage,Listed since,car_url
0,2010,Toyota,RAV4,Base,4dr,SUV,154830.0,6999.0,Clean Title,No,No,No,No,No,No accidents,4 Owners,Corporate fleet vehicle,,https://www.edmunds.com/toyota/rav4/2010/vin/2...
1,2021,Toyota,RAV4,XLE,4dr,SUV,59711.0,18879.0,Clean Title,No,No,No,No,No,1 Accident,1 Owner,Rental vehicle,2024-01-24,https://www.edmunds.com/toyota/rav4/2021/vin/2...
2,2022,Toyota,RAV4,LE,4dr,SUV,31372.0,20900.0,Clean Title,No,No,No,No,No,1 Accident,2 Owners,Corporate fleet vehicle,2023-12-30,https://www.edmunds.com/toyota/rav4/2022/vin/2...
3,2021,Toyota,RAV4,XLE,4dr,SUV,64709.0,16900.0,Clean Title,No,No,No,No,No,1 Accident,3 Owners,Corporate fleet vehicle,2024-01-25,https://www.edmunds.com/toyota/rav4/2021/vin/2...
4,2020,Toyota,RAV4,LE,4dr,SUV,91273.0,18980.0,Clean Title,No,No,No,No,No,No accidents,2 Owners,Rental vehicle,2024-02-21,https://www.edmunds.com/toyota/rav4/2020/vin/2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,2017,Toyota,RAV4,Limited,4dr,SUV,88059.0,,Clean Title,No,No,No,No,No,1 Accident,4 Owners,Corporate fleet vehicle,2023-11-02,https://www.edmunds.com/toyota/rav4/2017/vin/2...
2278,2021,Toyota,RAV4,XLE,4dr,SUV,65205.0,25698.0,Clean Title,No,No,No,No,No,1 Accident,1 Owner,Personal use only,2024-04-04,https://www.edmunds.com/toyota/rav4/2021/vin/2...
2279,2021,Toyota,RAV4,XLE,4dr,SUV,65875.0,25698.0,,,,,,No,4cyl Automatic,,,2024-04-01,https://www.edmunds.com/toyota/rav4/2021/vin/2...
2280,2017,Toyota,RAV4,Limited,4dr,SUV,90000.0,,Clean Title,No,No,No,No,No,1 Accident,4 Owners,Personal use,2023-12-01,https://www.edmunds.com/toyota/rav4/2017/vin/2...


In [74]:
# Inspect the dtype of every Toyota column before converting types
toyota_df.dtypes

year                    object
make                    object
model                   object
trim                    object
doors                   object
type                    object
mileage                float64
price                  float64
Title Details           object
Salvage Vehicle         object
Frame Damage            object
Theft History           object
Lemon Status            object
Certified Pre-Owned     object
Accidents               object
Owners                  object
Usage                   object
Listed since            object
car_url                 object
dtype: object

In [76]:
# Convert the year to a number; values that cannot be parsed become NaN
toyota_df['year'] = pd.to_numeric(toyota_df['year'], errors='coerce')
# 'doors' holds labels such as "4dr". to_numeric on the raw label would coerce
# every row to NaN, so pull out the leading digits first. astype(str) lets the
# .str accessor work whatever dtype the column was read as; values without
# leading digits still end up as NaN.
toyota_df['doors'] = pd.to_numeric(
    toyota_df['doors'].astype(str).str.extract(r'^\s*(\d+)', expand=False),
    errors='coerce',
)
# Parse the listing date; unparseable or missing values become NaT
toyota_df['Listed since'] = pd.to_datetime(toyota_df['Listed since'], errors='coerce')