In [8]:
import numpy as np
import pandas as pd

df_sales = pd.read_csv('../dataset/vehicles.csv')
df_details = pd.read_json('../dataset/vehicle_details.json')

In [9]:
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 26 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            426880 non-null  int64  
 1   url           426880 non-null  object 
 2   region        426880 non-null  object 
 3   region_url    426880 non-null  object 
 4   price         426880 non-null  int64  
 5   year          425675 non-null  float64
 6   manufacturer  409234 non-null  object 
 7   model         421603 non-null  object 
 8   condition     252776 non-null  object 
 9   cylinders     249202 non-null  object 
 10  fuel          423867 non-null  object 
 11  odometer      422480 non-null  float64
 12  title_status  418638 non-null  object 
 13  transmission  424324 non-null  object 
 14  VIN           265838 non-null  object 
 15  drive         296313 non-null  object 
 16  size          120519 non-null  object 
 17  type          334022 non-null  object 
 18  pain

In [10]:
# Pick a small sample of the data for testing
# Store processed data to prevent reload
sample_df_sales = df_sales.sample(n=10000).reset_index(drop=True)

## #1 - Select relevant columns
We start by selecting only the relevant columns from the `sample_df_sales` and `df_details` dataframes. This step ensures that we focus on the necessary information for further processing and analysis. Irrelevant or redundant columns are dropped to streamline the workflow and reduce memory usage.

In [12]:
# Data preprocessing
sample_df_sales = sample_df_sales.drop(columns=['id', 'url', 'region_url', 'VIN', 'image_url',
              'description', 'county', 'lat', 'long', 'size'], errors='ignore')

df_details = df_details[['make', 'model', 'year', 'cylinders',
                         'displ', 'drive', 'fueltype1', 'vclass']]

# Remove rows with suspicious values
sample_df_sales = sample_df_sales[sample_df_sales['price'] > 1000] # Remove rows with price less than 1000
sample_df_sales = sample_df_sales[sample_df_sales['year'] > 1990] # Remove rows with year less than 1990

## #2 - Fill the car details with actual data
The `sample_df_sales` may have missing data or information that doesn't match the model of the car. We can fill that information using `df_details`.

In [13]:
from fuzzywuzzy import process

# Merge sample_df_sales with df_details using fuzzy matching for make and model
# Use a dictionary to store previously seen matches
match_cache = {}

def get_closest_match(row, column, choices):
    if pd.isnull(row[column]):
        return None
    value = row[column]
    if value in match_cache:
        return match_cache[value]
    match, score = process.extractOne(value, choices)
    result = match if score > 80 else None
    
    # if score <= 80 :
        # print(f"Value: {value}, Match: {match}, Score: {score}")
    match_cache[value] = result
    return result

# Apply fuzzy matching for manufacturer first
df_details['make'] = df_details['make'].str.lower()
sample_df_sales['matched_make'] = sample_df_sales.apply(get_closest_match, axis=1, column='manufacturer', choices=df_details['make'].unique())

# Filter df_details to only include rows with the matched manufacturer
def filter_models(row):
    if pd.isnull(row['matched_make']):
        return np.array([])  # Return an empty NumPy array
    return df_details[df_details['make'] == row['matched_make']]['model'].unique()

# Apply fuzzy matching for model based on the filtered models
def get_closest_model(row):
    models = filter_models(row)
    if models.size == 0:  # Explicitly check if the array is empty
        return None
    return get_closest_match(row, 'model', models)

sample_df_sales['matched_model'] = sample_df_sales.apply(get_closest_model, axis=1)
# Calculate the number of matched and unmatched rows

matched_count = sample_df_sales['matched_model'].notnull().sum()
print(f"Matched: {matched_count} out of {len(sample_df_sales)} rows")

sample_df_sales = sample_df_sales[sample_df_sales['matched_model'].notnull()]

Matched: 7771 out of 8579 rows


In [14]:
def get_detail_value(row, details_col, fallback_col):
    if pd.isnull(row['matched_make']) or pd.isnull(row['matched_model']):
        return row.get(fallback_col, None)

    match = df_details[
        (df_details['make'] == row['matched_make']) &
        (df_details['model'] == row['matched_model'])
    ]

    if not match.empty:
        val = match[details_col].iloc[0]
        return val if pd.notnull(val) else row.get(fallback_col, None)

    return row.get(fallback_col, None)


# Map df_details columns to new or existing sample_df_sales columns
columns_to_replace = {
    'year': 'year',
    'cylinders': 'cylinders',
    'displ': 'engine_displacement',
    'drive': 'drive',
    'fueltype1': 'fuel',
    'vclass': 'type'
}

# Apply replacements
for details_col, sales_col in columns_to_replace.items():
    sample_df_sales[sales_col] = sample_df_sales.apply(
        lambda row: get_detail_value(row, details_col, sales_col),
        axis=1
    )

# Replace 'make' and 'model' with matched values
sample_df_sales[['make', 'model']] = sample_df_sales[[
    'matched_make', 'matched_model']]

# Drop matched columns
sample_df_sales.drop(
    columns=['matched_make', 'matched_model'], inplace=True, errors='ignore')

In [15]:
sample_df_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7771 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   region               7771 non-null   object 
 1   price                7771 non-null   int64  
 2   year                 7771 non-null   float64
 3   manufacturer         7771 non-null   object 
 4   model                7771 non-null   object 
 5   condition            4940 non-null   object 
 6   cylinders            7598 non-null   object 
 7   fuel                 7771 non-null   object 
 8   odometer             7727 non-null   float64
 9   title_status         7635 non-null   object 
 10  transmission         7734 non-null   object 
 11  drive                7735 non-null   object 
 12  type                 7767 non-null   object 
 13  paint_color          5644 non-null   object 
 14  state                7771 non-null   object 
 15  posting_date         7771 non-null   object

In [16]:
sample_df_sales

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,posting_date,engine_displacement,make
0,meadville,13895,2015.0,chevrolet,Cruze,good,4.0,Regular Gasoline,70422.0,clean,automatic,Front-Wheel Drive,Midsize Cars,blue,pa,2021-04-14T16:54:17-0400,1.4,chevrolet
1,stockton,7999,2015.0,dodge,Grand Caravan,excellent,6.0,Regular Gasoline,151220.0,clean,automatic,Front-Wheel Drive,Minivan - 2WD,blue,ca,2021-04-12T13:37:06-0700,3.6,dodge
2,las vegas,8995,2014.0,porsche,Cayenne,,6.0,Premium Gasoline,104297.0,clean,automatic,All-Wheel Drive,Standard Sport Utility Vehicle 4WD,,nv,2021-05-02T11:01:08-0700,3.6,porsche
3,SF bay area,12990,2008.0,bmw,128i Convertible,like new,6.0,Premium Gasoline,64138.0,clean,automatic,Rear-Wheel Drive,Subcompact Cars,black,ca,2021-05-02T15:14:43-0700,3.0,bmw
4,lexington,13535,2013.0,hyundai,Genesis,,6.0,Regular Gasoline,97809.0,clean,automatic,Rear-Wheel Drive,Large Cars,white,ky,2021-04-26T11:36:34-0400,3.8,hyundai
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,"st louis, MO",12795,2015.0,ford,Escape AWD,good,4.0,Regular Gasoline,136000.0,clean,automatic,All-Wheel Drive,Small Sport Utility Vehicle 4WD,blue,il,2021-04-28T10:54:40-0500,2.0,ford
9995,orlando,10900,2020.0,ford,Escape FWD PHEV,excellent,4.0,Regular Gasoline,108000.0,clean,automatic,Front-Wheel Drive,Small Sport Utility Vehicle 2WD,blue,fl,2021-05-03T14:36:42-0400,2.5,ford
9996,inland empire,12995,2013.0,nissan,Xterra 4WD,,6.0,Regular Gasoline,129131.0,clean,automatic,4-Wheel Drive,Small Sport Utility Vehicle 4WD,custom,ca,2021-04-29T12:06:42-0700,4.0,nissan
9997,"washington, DC",15990,1984.0,audi,Quattro,good,5.0,Regular Gasoline,75159.0,clean,other,,Subcompact Cars,,dc,2021-05-03T09:51:24-0400,2.1,audi


## #3 - Replace the unknowns
In this step, we handle missing values in the `sample_df_sales` dataframe. Numeric columns are filled with `median` to indicate missing data, while categorical columns are filled with the string `'mode'`. This ensures that the dataset is complete and ready for further analysis or modeling without introducing biases due to missing values.

In [17]:
# Fill missing numeric values with the median of each column
numeric_columns = sample_df_sales.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    median = sample_df_sales[col].median()  # Calculate the median for the column
    sample_df_sales[col] = sample_df_sales[col].fillna(median)

# Fill missing categorical values with the mode of each column
categorical_columns = sample_df_sales.select_dtypes(include=['object']).columns
for col in categorical_columns:
    mode = sample_df_sales[col].mode()[0]  # Get the most frequent value (mode) of the column
    sample_df_sales[col] = sample_df_sales[col].fillna(mode)


In [18]:
sample_df_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7771 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   region               7771 non-null   object 
 1   price                7771 non-null   int64  
 2   year                 7771 non-null   float64
 3   manufacturer         7771 non-null   object 
 4   model                7771 non-null   object 
 5   condition            7771 non-null   object 
 6   cylinders            7771 non-null   object 
 7   fuel                 7771 non-null   object 
 8   odometer             7771 non-null   float64
 9   title_status         7771 non-null   object 
 10  transmission         7771 non-null   object 
 11  drive                7771 non-null   object 
 12  type                 7771 non-null   object 
 13  paint_color          7771 non-null   object 
 14  state                7771 non-null   object 
 15  posting_date         7771 non-null   object

In [19]:
sample_df_sales.to_csv('../dataset/cleaned_sales_data.csv', index=False)