In [15]:
# Title: Exploratory Data Analysis (EDA) for Vehicle Ads Dataset
# Introduction:
# This project aims to analyze a dataset of vehicle advertisements.
# We will preprocess the data to handle missing values and create visualizations
# to better understand the distribution of the data.

import os

import numpy as np
import pandas as pd
import plotly.express as px

# Load the CSV file.
# The location defaults to the path used when this notebook was written, but it
# can be overridden with the VEHICLES_CSV environment variable so the notebook
# can run on another machine without editing the code.
DATA_PATH = os.environ.get("VEHICLES_CSV", "C:/Users/DCV/Cars/vehicles_us.csv")

df = pd.read_csv(DATA_PATH)

In [16]:
# Display the first few rows of the dataset.
# head() with no argument returns the first 5 rows; as the last expression in
# the cell it is rendered as the cell's output.
df.head()

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [17]:
# General information: one line per column with its non-null count and dtype,
# followed by the dtype summary and memory usage.
# Columns whose non-null count is below the number of entries contain missing values.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [18]:
# Descriptive statistics for the numeric columns: count, mean, std, min,
# quartiles (25%, 50%, 75%) and max.
# Useful for spotting extreme values before deciding how to treat outliers.

df.describe()

Unnamed: 0,price,model_year,cylinders,odometer,is_4wd,days_listed
count,51525.0,47906.0,46265.0,43633.0,25572.0,51525.0
mean,12132.46492,2009.75047,6.125235,115553.461738,1.0,39.55476
std,10040.803015,6.282065,1.66036,65094.611341,0.0,28.20427
min,1.0,1908.0,3.0,0.0,1.0,0.0
25%,5000.0,2006.0,4.0,70000.0,1.0,19.0
50%,9000.0,2011.0,6.0,113000.0,1.0,33.0
75%,16839.0,2014.0,8.0,155000.0,1.0,53.0
max,375000.0,2019.0,12.0,990000.0,1.0,271.0


In [19]:
# Data Preprocessing
# 1. Fill missing values in 'model_year' using the median by model.
# The per-model median is broadcast to every row of that model and used only
# where 'model_year' is missing; existing values are left untouched.
model_year_median_by_model = df.groupby('model')['model_year'].transform('median')
df['model_year'] = df['model_year'].fillna(model_year_median_by_model)

In [20]:
# 2. Fill missing values in 'cylinders' using the median by model.
# The per-model median is broadcast to every row of that model and used only
# where 'cylinders' is missing; existing values are left untouched.
cylinders_median_by_model = df.groupby('model')['cylinders'].transform('median')
df['cylinders'] = df['cylinders'].fillna(cylinders_median_by_model)

In [21]:
# 3. Fill missing values in 'odometer' using the median by model and year.
# A (model, model_year) group that has no odometer reading at all has a NaN
# median, so its gaps would silently survive the imputation. To avoid that,
# fall back to the median of the whole model, and finally to the overall
# median of the column. Existing (non-missing) readings are never changed.
odometer_median_by_model_year = df.groupby(['model', 'model_year'])['odometer'].transform('median')
odometer_median_by_model = df.groupby('model')['odometer'].transform('median')
df['odometer'] = (
    df['odometer']
    .fillna(odometer_median_by_model_year)   # preferred: same model and year
    .fillna(odometer_median_by_model)        # fallback: same model, any year
    .fillna(df['odometer'].median())         # last resort: overall median
)

In [22]:
# Remove outliers in 'model_year' and 'price'.
# Compute the bounds first: for each column the lower limit is its 5th
# percentile and the upper limit is its 95th percentile.
year_lower_limit, year_upper_limit = df['model_year'].quantile([0.05, 0.95])
price_lower_limit, price_upper_limit = df['price'].quantile([0.05, 0.95])

In [23]:
# Filter the dataset to remove outliers.
# Keep only the rows whose 'model_year' and 'price' both fall inside their
# limits; the bounds themselves are included.
year_in_range = df['model_year'].between(year_lower_limit, year_upper_limit)
price_in_range = df['price'].between(price_lower_limit, price_upper_limit)
df = df[year_in_range & price_in_range]


In [24]:
# Exploratory Visualizations
# 1. Price Histogram: distribution of the 'price' column.
fig_hist = px.histogram(
    data_frame=df,
    x="price",
    title="Price Distribution",
)
fig_hist.show()


In [25]:
# 2. Scatter Plot: Price vs Odometer ('odometer' on the x-axis, 'price' on the y-axis).
fig_scatter = px.scatter(
    data_frame=df,
    x="odometer",
    y="price",
    title="Price vs Odometer",
)
fig_scatter.show()

In [26]:
# 3. Bar Chart: Vehicle Condition Distribution
# Build an explicit two-column table ('condition', 'count') instead of passing
# the bare value_counts() Series to plotly. With a bare Series plotly falls back
# to its wide-form handling, so the axis labels depend on how pandas names the
# value_counts() result and an extra "variable" legend is drawn. Naming the
# columns and axes explicitly gives the same bars with stable labels.
condition_counts = (
    df['condition']
    .value_counts()
    .rename_axis('condition')
    .reset_index(name='count')
)
fig_bar = px.bar(
    condition_counts,
    x='condition',
    y='count',
    title="Vehicle Condition Distribution",
)
fig_bar.show()

In [27]:
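# Conclusion:
# We preprocessed the data by filling missing values in 'model_year', 'cylinders' and 'odometer' with group medians,
# and removed outliers in 'model_year' and 'price' (values outside the 5th-95th percentile range) to improve the quality of the analysis.
# The visualizations allow us to explore the distribution of prices, the relationship between price and odometer, and the distribution of vehicle conditions.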