In [None]:
import numpy as np
import pandas as pd

# Load the raw sales listings and the vehicle specification table.
# Every later statement in this file refers to these frames as `sales_df` and
# `detail_df`, so bind those names here; `df_sales` / `df_details` are kept as
# aliases of the freshly loaded frames for any code that still uses them.
sales_df = df_sales = pd.read_csv('../dataset/vehicles.csv')
detail_df = df_details = pd.read_csv('../dataset/vehicle_details.csv')

In [None]:
from fuzzywuzzy import process

# Drop unnecessary columns; errors='ignore' keeps this re-runnable once the
# columns are already gone.
sales_df.drop(columns=['id', 'url', 'region_url', 'image_url', 'description'], inplace=True, errors='ignore')

# Select only relevant columns. Take an explicit copy so that later column
# assignments on detail_df operate on an independent frame, not on a slice.
detail_df = detail_df[['make', 'model', 'year', 'cylinders',
                       'engine displacement', 'drive', 'fuel type1', 'vehicle size class']].copy()

# Remove rows with suspicious values
sales_df = sales_df[sales_df['price'] > 2000]
print("Minimum Price of the Car is", sales_df['price'].min())
print("Maximum Price of the Car is", sales_df['price'].max())

# Copy after the last filter so the column assignments below (and any new
# columns added later) are written to a standalone frame.
sales_df = sales_df[sales_df['year'] > 1990].copy()
print("Minimum Year of the Car is", sales_df['year'].min())

# Impute missing coordinates with the column mean. Assign the result back
# instead of calling fillna(inplace=True) on a column selection: the chained
# in-place form does not reliably update the parent frame. Series.mean()
# already skips NaN, so no dropna() is needed.
sales_df['lat'] = sales_df['lat'].fillna(sales_df['lat'].mean())
sales_df['long'] = sales_df['long'].fillna(sales_df['long'].mean())

# Merge sales_df with detail_df using fuzzy matching for make and model.
# Fuzzy lookups are memoised because the same strings recur across many rows.
# Keys are (value, tuple(choices)) so a result computed against one candidate
# list is never reused for a different candidate list.
match_cache = {}


def get_closest_match(row, column, choices):
    """Return the candidate in `choices` that best fuzzy-matches `row[column]`.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row).
        column: Name of the field in `row` whose value is matched.
        choices: Iterable of candidate values to match against.

    Returns:
        The best-scoring candidate when its score is above 80; otherwise
        None. None is also returned when `row[column]` is missing or the
        matcher yields no candidate at all.
    """
    value = row[column]
    if pd.isnull(value):
        return None

    # The candidate list is part of the key: the same value can legitimately
    # resolve to different matches against different candidate lists.
    cache_key = (value, tuple(choices))
    if cache_key in match_cache:
        return match_cache[cache_key]

    best = process.extractOne(value, choices)
    if best is None:
        # No candidate was scored (e.g. empty `choices`).
        result = None
    else:
        # Only the first two fields (match, score) are needed; some choice
        # containers make extractOne return an extra key field.
        match, score = best[:2]
        if score > 80:
            result = match
        else:
            # Report rejected matches so the threshold can be reviewed.
            print(f"Value: {value}, Match: {match}, Score: {score}")
            result = None

    match_cache[cache_key] = result
    return result

# Manufacturer pass: lower-case the reference makes, then fuzzy-match each
# sale's manufacturer against the distinct makes.
detail_df['make'] = detail_df['make'].str.lower()
sales_df['matched_make'] = sales_df.apply(
    get_closest_match,
    axis=1,
    args=('manufacturer', detail_df['make'].unique()),
)

def filter_models(row):
    """Return the distinct models that detail_df lists for the row's matched make.

    An empty NumPy array is returned when the row has no matched make.
    """
    make = row['matched_make']
    if pd.isnull(make):
        return np.array([])
    same_make = detail_df['make'] == make
    return detail_df.loc[same_make, 'model'].unique()

def get_closest_model(row):
    """Fuzzy-match the row's model against the models of its matched make.

    Returns None when there are no candidate models for the row.
    """
    candidates = filter_models(row)
    if candidates.size:
        return get_closest_match(row, 'model', candidates)
    return None

# Model pass: resolve each sale's model row by row.
sales_df['matched_model'] = sales_df.apply(get_closest_model, axis='columns')