In [None]:
import numpy as np
import pandas as pd

# Load the two raw inputs. Both paths are relative to the notebook's working
# directory, so the notebook must be run from its own folder.
# NOTE(review): presumably vehicles.csv holds the sales listings and
# vehicle_details.json the per-model reference data — confirm against the dataset.
df_car_sales = pd.read_csv('../dataset/vehicles.csv')
df_car_details = pd.read_json('../dataset/vehicle_details.json')

In [None]:
# Pick a small sample of the data for testing.
# The sample is drawn into a new frame, so df_car_sales stays untouched and
# the raw file does not need to be reloaded when this cell is re-run.
SAMPLE_SIZE = 50000  # upper bound on sampled rows
SAMPLE_SEED = 42     # fixed seed so Restart & Run All reproduces the same sample
sample_df_sales = (
    df_car_sales
    # Cap at the frame length: sample() raises if n exceeds the row count.
    .sample(n=min(SAMPLE_SIZE, len(df_car_sales)), random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
# To run on the full dataset instead:
# sample_df_sales = df_car_sales

## #1 - Select relevant columns
We start by selecting only the relevant columns from the `sample_df_sales` and `df_car_details` dataframes. This step ensures that we focus on the information needed for further processing and analysis. Irrelevant or redundant columns are dropped to streamline the workflow and reduce memory usage.

In [None]:
# Data preprocessing
def select_columns(df_data, df_details):
    """
    Keep only the columns used downstream from the sales and details dataframes.

    This step only narrows the column set: no rows are filtered and no values
    are modified. Both results are explicit copies, so later steps can add or
    overwrite columns on them without touching the caller's frames and without
    triggering pandas' SettingWithCopyWarning.

    Parameters:
    df_data (pd.DataFrame): The sales dataframe.
    df_details (pd.DataFrame): The details dataframe.

    Returns:
    pd.DataFrame, pd.DataFrame: Column-reduced copies of the sales and details dataframes.

    Raises:
    KeyError: If either dataframe is missing one of the expected columns.
    """
    # Columns kept from the sales listings
    sales_columns = ['region', 'price', 'year', 'manufacturer', 'model',
                     'condition', 'cylinders', 'fuel', 'odometer',
                     'title_status', 'transmission', 'drive', 'type',
                     'paint_color', 'state', 'posting_date']
    # Columns kept from the vehicle details
    details_columns = ['make', 'model', 'year', 'pv4', 'lv4',
                       'displ', 'fuelcost08', 'yousavespend', 'fescore',
                       'ghgscore', 'barrels08', 'co2tailpipegpm', 'vclass',
                       'highway08', 'uhighway', 'comb08', 'ghgscorea']

    # .copy() detaches the selections from their parents; callers assign new
    # columns on these frames.
    df_data = df_data[sales_columns].copy()
    df_details = df_details[details_columns].copy()

    return df_data, df_details

## #2 - Fill the car details with actual data
The `sample_df_sales` dataframe may have missing data, or information that doesn't match the car's actual model. We can fill in that information using `df_car_details`.

In [None]:
from fuzzywuzzy import process
from tqdm import tqdm

# Memo of fuzzy-match results, shared across calls. Keys are
# (column, scope, raw value): the scope is the matched make for model lookups,
# so a model string matched against one make's model list is never reused for
# another make, and manufacturer and model lookups cannot collide.
fuzzy_cache = {}

def get_matched_models(df_data, df_details):
    """
    Fuzzy-match each sales row's manufacturer and model against df_details.

    The manufacturer is matched first against all known makes; the model is
    then matched only against the models listed for that matched make.

    Side effects:
    - df_details['make'] is lower-cased in place. Callers that later join
      matched_make against df_details['make'] rely on this.
    - df_data gains 'matched_make' and 'matched_model' columns in place.
    - fuzzy_cache is populated and reused by later calls.

    Parameters:
    df_data (pd.DataFrame): The sales dataframe containing manufacturer and model information.
    df_details (pd.DataFrame): The details dataframe containing make and model information.

    Returns:
    pd.DataFrame: df_data with matched_make and matched_model columns added
    (None where the input is missing or no candidate scores above 80).
    """
    min_score = 80  # a candidate must score strictly above this to count as a match

    def get_closest_match(row, column, choices, scope=None):
        value = row[column]
        if pd.isnull(value):
            return None
        cache_key = (column, scope, value)
        if cache_key in fuzzy_cache:
            return fuzzy_cache[cache_key]
        best = process.extractOne(value, choices)
        # extractOne returns None when there are no candidates to compare against
        result = best[0] if best is not None and best[1] > min_score else None
        fuzzy_cache[cache_key] = result
        return result

    # Sales manufacturers are compared against lower-cased makes
    df_details['make'] = df_details['make'].str.lower()

    # Candidate lists do not change per row, so build them once up front
    known_makes = df_details['make'].dropna().unique()
    models_by_make = (
        df_details.dropna(subset=['make', 'model'])
        .groupby('make')['model']
        .unique()
        .to_dict()
    )

    # Match manufacturers, with a progress bar
    tqdm.pandas(desc="Matching manufacturers")
    df_data['matched_make'] = df_data.progress_apply(
        lambda row: get_closest_match(row, 'manufacturer', known_makes), axis=1
    )

    def get_closest_model(row):
        make = row['matched_make']
        if pd.isnull(make):
            return None
        models = models_by_make.get(make)
        if models is None or len(models) == 0:
            return None
        # Scope the cache entry to the make whose model list was searched
        return get_closest_match(row, 'model', models, scope=make)

    # Match models within the matched manufacturer, with a progress bar
    tqdm.pandas(desc="Matching models")
    df_data['matched_model'] = df_data.progress_apply(get_closest_model, axis=1)

    # Report how many rows ended up with a matched model
    matched_count = df_data['matched_model'].notnull().sum()
    print(f"Matched: {matched_count} out of {len(df_data)} rows")

    return df_data

In [None]:
def clean_data(df_data, df_details):
    """
    Build the cleaned sales dataframe enriched with vehicle details.

    Steps: select the relevant columns, fuzzy-match each listing's
    manufacturer/model to a known make/model, drop listings without a match,
    then left-join the details on (make, model, year).

    Missing values are left as they are; no imputation is performed. A listing
    whose make/model matched but whose year has no details row keeps NaN in
    the details columns.

    Parameters:
    df_data (pd.DataFrame): The sales dataframe.
    df_details (pd.DataFrame): The details dataframe.

    Returns:
    pd.DataFrame: The matched sales listings with the details columns attached.
    The index is the fresh 0..n-1 range produced by the merge.
    """
    # Select relevant columns
    df_data, df_details = select_columns(df_data, df_details)

    # Match listings to known makes/models and keep only rows matched on both
    df_data = get_matched_models(df_data, df_details)
    is_matched = df_data['matched_make'].notna() & df_data['matched_model'].notna()
    df_data = df_data[is_matched]

    # matched_make is lower-case, so normalise the join key on the details side
    # here rather than depending on an earlier step having done it.
    details_unique = (
        df_details
        .assign(make=df_details['make'].str.lower())
        .drop_duplicates(subset=['make', 'model', 'year'])
    )

    # The right side is unique on the join keys, so the left join cannot
    # multiply listings.
    df_data = pd.merge(
        df_data,
        details_unique,
        left_on=['matched_make', 'matched_model', 'year'],
        right_on=['make', 'model', 'year'],
        how='left',
        suffixes=('', '_details')
    )

    # Drop the helper match columns and the duplicated make/model from details
    df_data = df_data.drop(columns=['matched_make', 'matched_model', 'make', 'model_details'])

    return df_data

In [None]:
# Run the cleaning pipeline on the sample. The name sample_df_sales is rebound
# to the cleaned result, so the raw sample is no longer reachable afterwards.
sample_df_sales = clean_data(sample_df_sales, df_car_details)
# Print the resulting columns, dtypes and non-null counts
sample_df_sales.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Split the cleaned frame's columns by dtype; the object columns get encoded
numeric_columns = sample_df_sales.select_dtypes(include=[np.number]).columns
categorical_columns = sample_df_sales.select_dtypes(include=[object]).columns

# Work on a copy so the cleaned frame keeps its original string values
encoded_sample_df = sample_df_sales.copy()

# Replace each categorical column with integer labels, keeping one fitted
# encoder per column so the labels can be mapped back later
encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    encoders[col] = le
    encoded_sample_df[col] = le.fit_transform(encoded_sample_df[col])

# Pairwise correlations across all (now numeric) columns
correlation_matrix = encoded_sample_df.corr()

# Draw the annotated heatmap
plt.figure(figsize=(10, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
)
plt.title('Correlation Matrix')
plt.show()

# Rank every column by the strength (absolute value) of its correlation with 'price'
price_correlation = correlation_matrix['price'].abs()
print(price_correlation.sort_values(ascending=False))

In [None]:
# Persist the cleaned sample as CSV (row index omitted) for later use.
# The path is relative to the notebook's working directory.
sample_df_sales.to_csv('../dataset/cleaned_sales_data.csv', index=False)