In [66]:
import numpy as np
import pandas as pd

# Load the sales data (CSV) and the vehicle details (JSON) from the dataset folder.
df_sales = pd.read_csv(filepath_or_buffer='../dataset/vehicles.csv')
df_details = pd.read_json(path_or_buf='../dataset/vehicle_details.json')

In [70]:
# Work on a fixed-seed 500-row sample while testing, renumbering the rows from 0.
df_sales = (
    df_sales
    .sample(n=500, random_state=42)
    .reset_index(drop=True)
)

## #1 - Select relevant columns
We start by selecting only the relevant columns from the `df_sales` and `df_details` dataframes, so that later steps work with just the information they need. Irrelevant or redundant columns are dropped to streamline the workflow and reduce memory usage. In the same step we also discard rows whose price or year looks implausible.

In [72]:
# Data preprocessing
# Drop the columns that are not needed in the following steps; names that are
# absent from the frame are skipped because of errors='ignore'.
df_sales = df_sales.drop(columns=['id', 'url', 'region_url', 'VIN', 'image_url',
              'description', 'county', 'lat', 'long', 'size'], errors='ignore')

# Keep only the detail columns used for enrichment. The explicit copy makes
# df_details an independent frame, so later column assignments on it do not
# raise SettingWithCopyWarning.
df_details = df_details[['make', 'model', 'year', 'cylinders',
                         'displ', 'drive', 'fueltype1', 'vclass']].copy()

# Remove rows with suspicious values: keep only rows with price above 1000 and
# year after 1990. Rows where price or year is missing fail the comparison and
# are removed as well.
plausible_rows = (df_sales['price'] > 1000) & (df_sales['year'] > 1990)
# Copy so that new columns can be assigned to the filtered frame without
# SettingWithCopyWarning.
df_sales = df_sales[plausible_rows].copy()

## #2 - Fill the car details with actual data
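The `df_sales` dataframe may contain missing values, or details that do not match the listed car model. We fill in or overwrite that information with the data in `df_details`, using fuzzy matching on the make and model names to link the two dataframes.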

In [73]:
from fuzzywuzzy import process

# Merge df_sales with df_details using fuzzy matching for make and model
# Use a dictionary to store previously seen matches.
# Keys are (column, value, choices): the best match depends on the candidate
# list, so a result computed against one list must never be reused for another
# (e.g. the same model string looked up under two different makes).
match_cache = {}

# A candidate is accepted only when its fuzzy score is strictly above this value.
MATCH_SCORE_THRESHOLD = 80

def get_closest_match(row, column, choices):
    """Return the entry of ``choices`` that best fuzzy-matches ``row[column]``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Object supporting ``row[column]``.
    column : str
        Name of the field in ``row`` whose value is matched.
    choices : iterable of str
        Candidate values to match against.

    Returns
    -------
    The best-scoring candidate when its score is above
    ``MATCH_SCORE_THRESHOLD``; ``None`` when the value is missing, when
    ``choices`` is empty, or when no candidate scores high enough.

    Results are memoised in ``match_cache`` per (column, value, choices).
    """
    value = row[column]
    if pd.isnull(value):
        return None

    # Materialise once: gives a hashable cache key and a cheap emptiness check.
    candidates = tuple(choices)
    if not candidates:
        return None

    cache_key = (column, value, candidates)
    if cache_key in match_cache:
        return match_cache[cache_key]

    # extractOne yields a (match, score) pair, or None when nothing can be scored.
    best = process.extractOne(value, candidates)
    if best is not None and best[1] > MATCH_SCORE_THRESHOLD:
        result = best[0]
    else:
        result = None

    match_cache[cache_key] = result
    return result

# Match manufacturers first: lower-case the reference makes, then look up each
# sales row's manufacturer among the distinct makes.
df_details['make'] = df_details['make'].str.lower()
known_makes = df_details['make'].unique()
df_sales['matched_make'] = df_sales.apply(
    get_closest_match,
    axis=1,
    column='manufacturer',
    choices=known_makes,
)

# Restrict the candidate models to those of the row's matched manufacturer
def filter_models(row):
    """Return the distinct models in ``df_details`` for ``row['matched_make']``.

    An empty NumPy array is returned when the row has no matched make.
    """
    make = row['matched_make']
    if pd.isnull(make):
        return np.array([])
    same_make = df_details['make'] == make
    return df_details.loc[same_make, 'model'].unique()

# Fuzzy-match the row's model against the models of its matched manufacturer
def get_closest_model(row):
    """Return the best-matching model for ``row``, or ``None``.

    The candidates come from ``filter_models(row)``; when that array is empty
    no lookup is attempted and ``None`` is returned.
    """
    candidate_models = filter_models(row)
    has_candidates = candidate_models.size != 0
    return get_closest_match(row, 'model', candidate_models) if has_candidates else None

df_sales['matched_model'] = df_sales.apply(get_closest_model, axis=1)

# Report how many rows received a model match
matched_count = df_sales['matched_model'].notnull().sum()
print(f"Matched: {matched_count} out of {len(df_sales)} rows")

# Keep only the matched rows. Copy so the column assignments below write to an
# independent frame instead of a filtered slice (avoids SettingWithCopyWarning).
df_sales = df_sales[df_sales['matched_model'].notnull()].copy()

# Replace or add the listed columns from df_details to df_sales
# (key: column name in df_details, value: column name in df_sales)
columns_to_replace = {
    'year': 'year',
    'cylinders': 'cylinders',
    'displ': 'engine_displacement',
    'drive': 'drive',
    'fueltype1': 'fuel',
    'vclass': 'type'
}

# Build the lookup once instead of scanning df_details for every row and every
# column. For each (make, model) pair the FIRST row in df_details order is kept,
# which is the row the previous per-row `.iloc[0]` lookup selected.
details_lookup = (
    df_details
    .drop_duplicates(subset=['make', 'model'], keep='first')
    .set_index(['make', 'model'])
)
match_keys = pd.MultiIndex.from_frame(df_sales[['matched_make', 'matched_model']])

# NOTE(review): 'year' is overwritten as well, with the year of the first
# df_details row for the matched make/model rather than the year already on the
# row - confirm this is intended.
for details_col, sales_col in columns_to_replace.items():
    # to_numpy() assigns positionally, so df_sales keeps its own index.
    df_sales[sales_col] = details_lookup[details_col].reindex(match_keys).to_numpy()

# Replace the model and make with matched_model and matched_make
df_sales['model'] = df_sales['matched_model']
df_sales['make'] = df_sales['matched_make']

# Drop the matched columns
df_sales = df_sales.drop(columns=['matched_make', 'matched_model'], errors='ignore')

Matched: 465 out of 500 rows


In [75]:
# Summarise df_sales (columns, non-null counts, dtypes) after the enrichment step
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 465 entries, 0 to 499
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   region               465 non-null    object 
 1   price                465 non-null    int64  
 2   year                 465 non-null    int64  
 3   manufacturer         465 non-null    object 
 4   model                465 non-null    object 
 5   condition            289 non-null    object 
 6   cylinders            434 non-null    float64
 7   fuel                 465 non-null    object 
 8   odometer             456 non-null    float64
 9   title_status         459 non-null    object 
 10  transmission         464 non-null    object 
 11  drive                461 non-null    object 
 12  type                 465 non-null    object 
 13  paint_color          321 non-null    object 
 14  state                465 non-null    object 
 15  posting_date         465 non-null    object 


## #3 - Replace the unknowns
In this step, we handle missing values in the `df_sales` dataframe. Numeric columns are filled with `-1` to indicate missing data, while categorical columns are filled with the string `'unknown'`. This leaves the dataset without null values, so it is ready for further analysis or modeling. Note that `-1` and `'unknown'` are placeholders rather than real observations, so later steps should treat them as "missing" to avoid introducing bias.

In [76]:
# Identify the numeric (float64/int64) and the categorical (object) columns
numeric_columns = df_sales.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df_sales.select_dtypes(include=['object']).columns

# Fill both groups in one pass: -1 for numeric gaps, 'unknown' for categorical gaps
fill_values = {
    **dict.fromkeys(numeric_columns, -1),
    **dict.fromkeys(categorical_columns, 'unknown'),
}
df_sales = df_sales.fillna(value=fill_values)

In [78]:
# Summarise df_sales again to check the non-null counts after filling
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 465 entries, 0 to 499
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   region               465 non-null    object 
 1   price                465 non-null    int64  
 2   year                 465 non-null    int64  
 3   manufacturer         465 non-null    object 
 4   model                465 non-null    object 
 5   condition            465 non-null    object 
 6   cylinders            465 non-null    float64
 7   fuel                 465 non-null    object 
 8   odometer             465 non-null    float64
 9   title_status         465 non-null    object 
 10  transmission         465 non-null    object 
 11  drive                465 non-null    object 
 12  type                 465 non-null    object 
 13  paint_color          465 non-null    object 
 14  state                465 non-null    object 
 15  posting_date         465 non-null    object 
