In [160]:
import numpy as np
import pandas as pd

df_sales = pd.read_csv('../dataset/vehicles.csv')
df_details = pd.read_json('../dataset/vehicle_details.json')

In [181]:
# Pick a small sample of the data for testing
# Store processed data to prevent reload
sample_df_sales = df_sales.sample(n=10000).reset_index(drop=True)

## #1 - Select relevant columns
We start by selecting only the relevant columns from the `sample_df_sales` and `df_details` dataframes. This step ensures that we focus on the necessary information for further processing and analysis. Irrelevant or redundant columns are dropped to streamline the workflow and reduce memory usage.

In [174]:
# Data preprocessing
def select_columns(df, df_details):
    """
    Preprocess the sales and details dataframes by dropping irrelevant columns,
    removing rows with suspicious values, and normalizing the posting_date column.

    Parameters:
    df_sales (pd.DataFrame): The sales dataframe.
    df_details (pd.DataFrame): The details dataframe.

    Returns:
    pd.DataFrame, pd.DataFrame: The preprocessed sales and details dataframes.
    """
    # Select relevant columns
    df = df[['region', 'price', 'year', 'manufacturer', 'model', 
                         'condition', 'cylinders', 'fuel', 'odometer', 
                         'title_status', 'transmission', 'drive', 'type', 
                         'paint_color', 'state', 'posting_date']]
    # Useful details columns
    df_details = df_details[['make', 'model', 'pv4', 'lv4', 
                         'displ', 'fuelcost08', 'yousavespend', 'fescore', 
                         'ghgscore', 'barrels08', 'co2tailpipegpm', 'vclass', 
                         'highway08', 'uhighway', 'comb08', 'ghgscorea']]

    return df, df_details

## #2 - Fill the car details with actual data
The `sample_df_sales` may have missing data or information that doesn't match the model of the car. We can fill that information using `df_details`.

In [175]:
from fuzzywuzzy import process
from tqdm import tqdm

# Use a dictionary to store previously seen matches
fuzzy_cache = {}

def get_matched_models(df, df_details):
    """
    Perform fuzzy matching to match makes and models between sample_df_sales and df_details.

    Parameters:
    sample_df_sales (pd.DataFrame): The sales dataframe containing manufacturer and model information.
    df_details (pd.DataFrame): The details dataframe containing make and model information.

    Returns:
    pd.DataFrame: The updated sample_df_sales dataframe with matched_make and matched_model columns.
    """

    def get_closest_match(row, column, choices):
        if pd.isnull(row[column]):
            return None
        value = row[column]
        if value in fuzzy_cache:
            return fuzzy_cache[value]
        match, score = process.extractOne(value, choices)
        result = match if score > 80 else None
        fuzzy_cache[value] = result
        return result

    # Apply fuzzy matching for manufacturer first
    df_details.loc[:, 'make'] = df_details['make'].str.lower()
    
    # Add a progress bar to the process
    tqdm.pandas(desc="Matching manufacturers")
    df.loc[:, 'matched_make'] = df.apply(
        lambda row: get_closest_match(row, 'manufacturer', df_details['make'].unique()), axis=1
    )

    # Filter df_details to only include rows with the matched manufacturer
    def filter_models(row):
        if pd.isnull(row['matched_make']):
            return np.array([])  # Return an empty NumPy array
        return df_details[df_details['make'] == row['matched_make']]['model'].unique()

    # Apply fuzzy matching for model based on the filtered models
    def get_closest_model(row):
        models = filter_models(row)
        if models.size == 0:  # Explicitly check if the array is empty
            return None
        return get_closest_match(row, 'model', models)

    # Add a progress bar to the process
    tqdm.pandas(desc="Matching models")
    df['matched_model'] = df.progress_apply(get_closest_model, axis=1)

    # Calculate the number of matched and unmatched rows
    matched_count = df['matched_model'].notnull().sum()
    print(f"Matched: {matched_count} out of {len(df)} rows")

    return df

In [None]:
def clean_data(df_data, df_details):
    """
    Clean the sales and details dataframes by removing rows with missing values
    and resetting the index.

    Parameters:
    df_data (pd.DataFrame): The sales dataframe.
    df_details (pd.DataFrame): The details dataframe.

    Returns:
    pd.DataFrame, pd.DataFrame: The cleaned sales and details dataframes.
    """
    
    # Select relevant columns
    df_data, df_details = select_columns(df_data, df_details)
    
    # Merge the dataframes to get more details
    df_data = get_matched_models(df_data, df_details)
    df_data = df_data.merge(
        df_details,
        left_on=['matched_make', 'matched_model'],
        right_on=['make', 'model'],
        how='left',
        suffixes=('', '_details')
    )
    df_data = df_data.drop(columns=['matched_make', 'matched_model', 'make', 'model_details'])
    
    # Impute missing values
    numeric_columns = df_data.select_dtypes(include=[np.number]).columns
    categorical_columns = df_data.select_dtypes(include=[object]).columns
    
    # Fill missing numeric values with the median
    df_data[numeric_columns] = df_data[numeric_columns].fillna(df_data[numeric_columns].median())

    # Fill missing categorical values with the mode
    df_data[categorical_columns] = df_data[categorical_columns].fillna(df_data[categorical_columns].mode().iloc[0])
    
    return df_data

In [182]:
sample_df_sales = clean_data(sample_df_sales, df_details)
sample_df_sales.info()

10000
10000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'matched_make'] = df.apply(
Matching models: 100%|██████████| 10000/10000 [01:43<00:00, 96.54it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['matched_model'] = df.progress_apply(get_closest_model, axis=1)


Matched: 8892 out of 10000 rows
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376219 entries, 0 to 376218
Data columns (total 30 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   region          376219 non-null  object 
 1   price           376219 non-null  int64  
 2   year            376219 non-null  float64
 3   manufacturer    376219 non-null  object 
 4   model           376219 non-null  object 
 5   condition       376219 non-null  object 
 6   cylinders       376219 non-null  object 
 7   fuel            376219 non-null  object 
 8   odometer        376219 non-null  float64
 9   title_status    376219 non-null  object 
 10  transmission    376219 non-null  object 
 11  drive           376219 non-null  object 
 12  type            376219 non-null  object 
 13  paint_color     376219 non-null  object 
 14  state           376219 non-null  object 
 15  posting_date    376219 non-null  object 
 16  pv4             376219 n

In [183]:
sample_df_sales.to_csv('../dataset/cleaned_sales_data.csv', index=False)