### Car Analysis


In [48]:
import pandas as pd

# Read toyota cars information
# Column names
cols = ['car_url', 'year_make_model', 'price', 'details_short', 'details_long' ]

toyota_df = pd.read_csv('../car_dataset_original/toyota_data.csv', names=cols)

In [77]:
import numpy as np

In [49]:
toyota_df.head()

Unnamed: 0,car_url,year_make_model,price,details_short,details_long
0,https://www.edmunds.com/toyota/rav4/2010/vin/2...,2010 Toyota RAV4\nBase 4dr SUV,"$6,999\ngreat price\n$2,293 below market","154,830 miles\nNo accidents, 4 Owners, Corpora...","['Located in South Hackensack, NJ / 1 miles aw..."
1,https://www.edmunds.com/toyota/rav4/2021/vin/2...,2021 Toyota RAV4\nXLE 4dr SUV,"$18,879\ngreat price\n$5,582 below market","59,711 miles\n1 Accident, 1 Owner, Rental vehi...","['Located in Elmont, NY / 23 miles away from H..."
2,https://www.edmunds.com/toyota/rav4/2022/vin/2...,2022 Toyota RAV4\nLE 4dr SUV,"$20,900\ngreat price\n$4,722 below market","31,372 miles\n1 Accident, 2 Owners, Corporate ...","['Located in Elizabeth, NJ / 19 miles away fro..."
3,https://www.edmunds.com/toyota/rav4/2021/vin/2...,2021 Toyota RAV4\nXLE 4dr SUV,"$16,900\ngreat price\n$4,250 below market","64,709 miles\n1 Accident, 3 Owners, Corporate ...","['Located in Elizabeth, NJ / 19 miles away fro..."
4,https://www.edmunds.com/toyota/rav4/2020/vin/2...,2020 Toyota RAV4\nLE 4dr SUV,"$18,980\ngreat price\n$2,402 below market","91,273 miles\nNo accidents, 2 Owners, Rental v...","['Located in Queens, NY / 18 miles away from H..."


# Cleaning process

- Separate the **year**, **make**, **model**, **trim**, **doors**.
- Separate the **price**.
- Separate details short into four columns: **mileage**, **Accidents**, **Owners**, **Car_usage**, **Cyl**, **Dealer**
- Separate details long into the following columns: 'Title Details', 'Salvage Vehicle', 'Frame Damage', 'Theft History', 'Lemon Status', 'VIN', 'Stock', 'Certified Pre-Owned'

In [50]:
year_make_model_split = pd.DataFrame()
year_make_model_split[['col1', 'col2']] = toyota_df['year_make_model'].str.split("\n", expand=True)

In [51]:
year_make_model_split

Unnamed: 0,col1,col2
0,2010 Toyota RAV4,Base 4dr SUV
1,2021 Toyota RAV4,XLE 4dr SUV
2,2022 Toyota RAV4,LE 4dr SUV
3,2021 Toyota RAV4,XLE 4dr SUV
4,2020 Toyota RAV4,LE 4dr SUV
...,...,...
2277,2017 Toyota RAV4,Limited 4dr SUV
2278,2021 Toyota RAV4,XLE 4dr SUV
2279,2021 Toyota RAV4,XLE 4dr SUV
2280,2017 Toyota RAV4,Limited 4dr SUV


In [52]:
toyota_df[['year', 'make', 'model', 'trim', 'doors', 'type', 'none1', 'none2']] = toyota_df['year_make_model'].str.split(r'[\n\s]', expand=True)

In [53]:
price = toyota_df['price'].str.split(r'[\n\s]', expand=True)

In [54]:
toyota_df['price'] = price[0]

In [55]:
details_short_tmp = toyota_df['details_short'].str.split(r'[\n]', expand=True)
toyota_df[['mileage', 'details1', 'cyl', 'dealer']] = details_short_tmp.iloc[:, 0:4]

In [56]:
# Define the key phrases you want to extract
keys_to_extract = {
    'Title Details': 'Title Details:',
    'Salvage Vehicle': 'Salvage Vehicle:',
    'Frame Damage': 'Frame Damage:',
    'Theft History': 'Theft History:',
    'Lemon Status': 'Lemon Status:',
    'VIN': 'VIN:',
    'Stock': 'Stock:',
    'Certified Pre-Owned': 'Certified Pre-Owned:',
    'Listed since': 'Listed since'
}

# Initialize empty columns for the keys
for key in keys_to_extract:
    toyota_df[key] = None

# Function to extract values based on the specified keys
def extract_information(details_list, keys):
    # Create a dictionary to hold the extracted values
    extracted_info = {}
    
    # Iterate through each string in the list
    for detail in details_list:
        # Check each key to see if it is present in the current detail string
        for key, prefix in keys.items():
            if prefix in detail:
                # Extract the value after the key prefix
                value = detail.split(prefix)[1].strip()
                # Store the value in the dictionary
                extracted_info[key] = value
    
    return extracted_info

# Iterate through each row in the DataFrame
for index, row in toyota_df.iterrows():
    # Parse the list of strings from the 'details_long' column
    details_list = eval(row['details_long'])  # Convert string representation of list to actual list
    
    # Extract the information using the function
    extracted_info = extract_information(details_list, keys_to_extract)
    
    # Assign the extracted information to the appropriate columns
    for key, value in extracted_info.items():
        toyota_df.loc[index, key] = value

In [57]:
toyota_v1 = toyota_df[['car_url', 'price', 'year', 'make', 'model', 'trim', 'doors', 'type',
                       'mileage', 'details1', 'cyl', 'dealer', 'Title Details', 'Salvage Vehicle', 
                       'Frame Damage', 'Theft History', 'Lemon Status', 'VIN', 'Certified Pre-Owned',
                       'Listed since']]

In [58]:
vin = toyota_v1['VIN'].str.split("\n", expand=True)
toyota_v1.loc[:, 'VIN'] = vin[0]

In [59]:
certified = toyota_v1['Certified Pre-Owned'].str.split("\n", expand=True)

In [60]:
toyota_v1.loc[:, 'Certified Pre-Owned'] = certified[0]

In [61]:
listed = toyota_v1['Listed since'].str.split(":", expand=True)
toyota_v1.loc[:, 'Listed since'] = listed[1]

In [69]:
# Create a copy of toyota_v1
toyota_v1_copy = toyota_v1.copy()

# Split the 'details1' column by commas and expand into multiple columns
details1_split = toyota_v1_copy['details1'].str.split(',', expand=True)

# Assign the expanded columns to the DataFrame with new column names
toyota_v1_copy[['Accidents', 'Owners', 'Usage']] = details1_split

In [73]:
toyota_v1_copy['mileage'] = toyota_v1_copy['mileage'].str.replace(r'[^0-9]', '', regex=True)

In [78]:
# Step 1: Identify rows containing the word 'Not' and replace them with NaN
toyota_v1_copy.loc[toyota_v1_copy['price'].str.contains('Not', case=False, na=False), 'price'] = np.nan

# Step 2: Remove dollar signs and commas from the 'price' column
toyota_v1_copy['price'] = toyota_v1_copy['price'].str.replace(r'[$,]', '', regex=True)

# Step 3: Convert the cleaned 'price' column to numeric
toyota_v1_copy['price'] = pd.to_numeric(toyota_v1_copy['price'])

In [80]:
toyota_v1_copy['Listed since'] = pd.to_datetime(toyota_v1_copy['Listed since'], errors='coerce')

In [81]:
toyota_v1_copy.columns

Index(['car_url', 'price', 'year', 'make', 'model', 'trim', 'doors', 'type',
       'mileage', 'details1', 'cyl', 'dealer', 'Title Details',
       'Salvage Vehicle', 'Frame Damage', 'Theft History', 'Lemon Status',
       'VIN', 'Certified Pre-Owned', 'Listed since', 'Accidents', 'Owners',
       'Usage'],
      dtype='object')

In [88]:
final_toyota_df = toyota_v1_copy[['year', 'make', 'model', 'trim', 'doors', 'type',
                             'mileage', 'price', 'cyl', 'dealer', 'Title Details',
                             'Salvage Vehicle', 'Frame Damage', 'Theft History', 'Lemon Status',
                             'VIN', 'Certified Pre-Owned', 'Accidents', 'Owners',
                             'Usage', 'Listed since', 'car_url', ]]

In [89]:
final_toyota_df.to_csv("toyota_clean.csv", index=False)

In [90]:
final_toyota_df

Unnamed: 0,year,make,model,trim,doors,type,mileage,price,cyl,dealer,...,Frame Damage,Theft History,Lemon Status,VIN,Certified Pre-Owned,Accidents,Owners,Usage,Listed since,car_url
0,2010,Toyota,RAV4,Base,4dr,SUV,154830,6999.0,4cyl Automatic,Auto Spot (1 mi away),...,No,No,No,2T3BF4DV2AW034006,No,No accidents,4 Owners,Corporate fleet vehicle,NaT,https://www.edmunds.com/toyota/rav4/2010/vin/2...
1,2021,Toyota,RAV4,XLE,4dr,SUV,59711,18879.0,4cyl Automatic,Auto Lux (23 mi away),...,No,No,No,2T3P1RFV4MC144673,No,1 Accident,1 Owner,Rental vehicle,2024-01-24,https://www.edmunds.com/toyota/rav4/2021/vin/2...
2,2022,Toyota,RAV4,LE,4dr,SUV,31372,20900.0,4cyl Automatic,JD Motors (19 mi away),...,No,No,No,2T3F1RFV5NW290088,No,1 Accident,2 Owners,Corporate fleet vehicle,2023-12-30,https://www.edmunds.com/toyota/rav4/2022/vin/2...
3,2021,Toyota,RAV4,XLE,4dr,SUV,64709,16900.0,4cyl Automatic,Auto Outlet (19 mi away),...,No,No,No,2T3W1RFV3MW115112,No,1 Accident,3 Owners,Corporate fleet vehicle,2024-01-25,https://www.edmunds.com/toyota/rav4/2021/vin/2...
4,2020,Toyota,RAV4,LE,4dr,SUV,91273,18980.0,4cyl Automatic,Queens Auto Mall (18 mi away),...,No,No,No,2T3F1RFV4LW103405,No,No accidents,2 Owners,Rental vehicle,2024-02-21,https://www.edmunds.com/toyota/rav4/2020/vin/2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,2017,Toyota,RAV4,Limited,4dr,SUV,88059,,4cyl Automatic,Auto Outlet of Irvington (14 mi away),...,No,No,No,2T3DFREVXHW578185,No,1 Accident,4 Owners,Corporate fleet vehicle,2023-11-02,https://www.edmunds.com/toyota/rav4/2017/vin/2...
2278,2021,Toyota,RAV4,XLE,4dr,SUV,65205,25698.0,4cyl Automatic,Enterprise Car Sales (In-stock online),...,No,No,No,2T3P1RFV2MC213912,No,1 Accident,1 Owner,Personal use only,2024-04-04,https://www.edmunds.com/toyota/rav4/2021/vin/2...
2279,2021,Toyota,RAV4,XLE,4dr,SUV,65875,25698.0,Enterprise Car Sales (In-stock online),Home delivery*,...,,,,2T3P1RFV0MW210702,No,4cyl Automatic,,,2024-04-01,https://www.edmunds.com/toyota/rav4/2021/vin/2...
2280,2017,Toyota,RAV4,Limited,4dr,SUV,90000,,4cyl Automatic,Buy Here Pay Here Auto Sales (13 mi away),...,No,No,No,2T3DFREV4HW679819,No,1 Accident,4 Owners,Personal use,2023-12-01,https://www.edmunds.com/toyota/rav4/2017/vin/2...


# Analysis

In [91]:
df = pd.read_csv("../car_dataset_original/toyota_clean.csv")

In [92]:
df.shape

(2282, 22)

In [94]:
df.describe()

Unnamed: 0,mileage,price
count,2279.0,2209.0
mean,52560.086441,24568.946582
std,36351.201132,5887.428206
min,1.0,4477.0
25%,26948.5,21590.0
50%,43736.0,25452.0
75%,65633.5,27999.0
max,259410.0,40998.0
