In [49]:
import numpy as np
import pandas as pd

df_sales = pd.read_csv('../dataset/vehicles.csv')
df_details = pd.read_json('../dataset/vehicle_details.json')

In [59]:
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 26 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            426880 non-null  int64  
 1   url           426880 non-null  object 
 2   region        426880 non-null  object 
 3   region_url    426880 non-null  object 
 4   price         426880 non-null  int64  
 5   year          425675 non-null  float64
 6   manufacturer  409234 non-null  object 
 7   model         421603 non-null  object 
 8   condition     252776 non-null  object 
 9   cylinders     249202 non-null  object 
 10  fuel          423867 non-null  object 
 11  odometer      422480 non-null  float64
 12  title_status  418638 non-null  object 
 13  transmission  424324 non-null  object 
 14  VIN           265838 non-null  object 
 15  drive         296313 non-null  object 
 16  size          120519 non-null  object 
 17  type          334022 non-null  object 
 18  pain

In [None]:
# Pick a small sample of the data for testing
# Store processed data to prevent reload
sample_df_sales = df_sales.sample(n=10000).reset_index(drop=True)

## #1 - Select relevant columns
We start by selecting only the relevant columns from the `sample_df_sales` and `df_details` dataframes. This step ensures that we focus on the necessary information for further processing and analysis. Irrelevant or redundant columns are dropped to streamline the workflow and reduce memory usage.

In [61]:
# Data preprocessing
sample_df_sales = sample_df_sales.drop(columns=['id', 'url', 'region_url', 'VIN', 'image_url',
              'description', 'county', 'lat', 'long', 'size'], errors='ignore')

df_details = df_details[['make', 'model', 'year', 'cylinders',
                         'displ', 'drive', 'fueltype1', 'vclass']]

# Remove rows with suspicious values
sample_df_sales = sample_df_sales[sample_df_sales['price'] > 1000] # Remove rows with price less than 1000
sample_df_sales = sample_df_sales[sample_df_sales['year'] > 1990] # Remove rows with year less than 1990

## #2 - Fill the car details with actual data
The `sample_df_sales` may have missing data or information that doesn't match the model of the car. We can fill that information using `df_details`.

In [None]:
from fuzzywuzzy import process

# Merge sample_df_sales with df_details using fuzzy matching for make and model
# Use a dictionary to store previously seen matches
match_cache = {}

def get_closest_match(row, column, choices):
    if pd.isnull(row[column]):
        return None
    value = row[column]
    if value in match_cache:
        return match_cache[value]
    match, score = process.extractOne(value, choices)
    result = match if score > 80 else None
    
    # if score <= 80 :
        # print(f"Value: {value}, Match: {match}, Score: {score}")
    match_cache[value] = result
    return result

# Apply fuzzy matching for manufacturer first
df_details['make'] = df_details['make'].str.lower()
sample_df_sales['matched_make'] = sample_df_sales.apply(get_closest_match, axis=1, column='manufacturer', choices=df_details['make'].unique())

# Filter df_details to only include rows with the matched manufacturer
def filter_models(row):
    if pd.isnull(row['matched_make']):
        return np.array([])  # Return an empty NumPy array
    return df_details[df_details['make'] == row['matched_make']]['model'].unique()

# Apply fuzzy matching for model based on the filtered models
def get_closest_model(row):
    models = filter_models(row)
    if models.size == 0:  # Explicitly check if the array is empty
        return None
    return get_closest_match(row, 'model', models)

sample_df_sales['matched_model'] = sample_df_sales.apply(get_closest_model, axis=1)
# Calculate the number of matched and unmatched rows

matched_count = sample_df_sales['matched_model'].notnull().sum()
print(f"Matched: {matched_count} out of {len(sample_df_sales)} rows")

sample_df_sales = sample_df_sales[sample_df_sales['matched_model'].notnull()]

# Replace or add the listed columns from df_details to sample_df_sales
columns_to_replace = {
    'year': 'year',
    'cylinders': 'cylinders',
    'displ': 'engine_displacement',
    'drive': 'drive',
    'fueltype1': 'fuel',
    'vclass': 'type'
}

for details_col, sales_col in columns_to_replace.items():
    sample_df_sales[sales_col] = sample_df_sales.apply(
        lambda row: df_details.loc[
            (df_details['make'] == row['matched_make']) &
            (df_details['model'] == row['matched_model']),
            details_col
        ].iloc[0] if not pd.isnull(row['matched_make']) and not pd.isnull(row['matched_model']) else row[sales_col],
        axis=1
    )

# Replace the model and make with matched_model and matched_make
sample_df_sales['model'] = sample_df_sales['matched_model']
sample_df_sales['make'] = sample_df_sales['matched_make']

# Drop the matched columns
sample_df_sales = sample_df_sales.drop(columns=['matched_make', 'matched_model'], errors='ignore')

Matched: 7718 out of 8540 rows


IndexError: single positional indexer is out-of-bounds

In [63]:
sample_df_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7718 entries, 1 to 9998
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   region         7718 non-null   object 
 1   price          7718 non-null   int64  
 2   year           7718 non-null   float64
 3   manufacturer   7718 non-null   object 
 4   model          7718 non-null   object 
 5   condition      4751 non-null   object 
 6   cylinders      4492 non-null   object 
 7   fuel           7655 non-null   object 
 8   odometer       7664 non-null   float64
 9   title_status   7583 non-null   object 
 10  transmission   7673 non-null   object 
 11  drive          5389 non-null   object 
 12  type           6167 non-null   object 
 13  paint_color    5492 non-null   object 
 14  state          7718 non-null   object 
 15  posting_date   7718 non-null   object 
 16  matched_make   7718 non-null   object 
 17  matched_model  7718 non-null   object 
dtypes: float64(2)

In [64]:
sample_df_sales

Unnamed: 0,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,posting_date,matched_make,matched_model
1,akron / canton,3995,2005.0,chevrolet,trailblazer lt,,,gas,166000.0,clean,automatic,,,,oh,2021-04-26T21:39:48-0400,chevrolet,Trailblazer FWD
2,raleigh / durham / CH,9931,2014.0,chevrolet,cruze,,4 cylinders,gas,95550.0,clean,automatic,fwd,sedan,red,nc,2021-04-29T16:00:06-0400,chevrolet,Cruze
3,houston,24999,2013.0,ram,1500 sport,excellent,,gas,5000.0,clean,automatic,,truck,black,tx,2021-05-01T09:19:25-0500,ram,1500 Classic 2WD
4,austin,19400,2011.0,toyota,4runner,good,6 cylinders,gas,116309.0,clean,automatic,,SUV,white,tx,2021-04-30T13:53:09-0500,toyota,4Runner 4WD
5,rhode island,1500,2009.0,nissan,rogue,,,gas,168.0,clean,automatic,,,white,ri,2021-04-06T13:02:14-0400,nissan,Rogue Sport FWD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,el paso,8500,2016.0,chrysler,town country,,,gas,89498.0,rebuilt,automatic,,van,,tx,2021-05-03T09:02:43-0600,chrysler,Town and Country
9995,ann arbor,5400,1996.0,ram,1500 4wd,good,8 cylinders,gas,226000.0,clean,automatic,4wd,truck,white,mi,2021-04-15T18:28:31-0400,ram,1500 4WD
9996,norfolk / hampton roads,3200,2000.0,ford,explorer eddie bauer 4x4,good,6 cylinders,gas,174000.0,clean,automatic,4wd,SUV,red,va,2021-04-30T12:20:22-0400,ford,EXP
9997,oklahoma city,16900,2017.0,volvo,s60,good,,gas,55000.0,clean,automatic,,,,ok,2021-04-20T21:20:25-0500,volvo,S60


## #3 - Replace the unknowns
In this step, we handle missing values in the `sample_df_sales` dataframe. Numeric columns are filled with `median` to indicate missing data, while categorical columns are filled with the string `'mode'`. This ensures that the dataset is complete and ready for further analysis or modeling without introducing biases due to missing values.

In [65]:
# Fill missing numeric values with the median of each column
numeric_columns = sample_df_sales.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_columns:
    median = sample_df_sales[col].median()  # Calculate the median for the column
    sample_df_sales[col] = sample_df_sales[col].fillna(median)

# Fill missing categorical values with the mode of each column
categorical_columns = sample_df_sales.select_dtypes(include=['object']).columns
for col in categorical_columns:
    mode = sample_df_sales[col].mode()[0]  # Get the most frequent value (mode) of the column
    sample_df_sales[col] = sample_df_sales[col].fillna(mode)


In [66]:
sample_df_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7718 entries, 1 to 9998
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   region         7718 non-null   object 
 1   price          7718 non-null   int64  
 2   year           7718 non-null   float64
 3   manufacturer   7718 non-null   object 
 4   model          7718 non-null   object 
 5   condition      7718 non-null   object 
 6   cylinders      7718 non-null   object 
 7   fuel           7718 non-null   object 
 8   odometer       7718 non-null   float64
 9   title_status   7718 non-null   object 
 10  transmission   7718 non-null   object 
 11  drive          7718 non-null   object 
 12  type           7718 non-null   object 
 13  paint_color    7718 non-null   object 
 14  state          7718 non-null   object 
 15  posting_date   7718 non-null   object 
 16  matched_make   7718 non-null   object 
 17  matched_model  7718 non-null   object 
dtypes: float64(2)

In [67]:
sample_df_sales.to_csv('../dataset/cleaned_sales_data.csv', index=False)