## Importing required libraries

In [11]:
import pandas as pd
import numpy as np
import plotly.express as px
import altair as alt

## Reading data from CSV file

In [2]:
# Load the raw vehicle listings; the path is relative to the notebook's directory.
data = pd.read_csv(
    '../vehicles_us.csv',
)

## Checking data

In [3]:
# Preview a random sample of rows. The seed is fixed so that the preview is
# identical on every "Restart Kernel -> Run All".
data.sample(10, random_state=42)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
21976,11995,2009.0,chevrolet tahoe,good,8.0,gas,151373.0,automatic,SUV,red,1.0,2019-04-06,56
46155,13000,2012.0,toyota tacoma,good,,gas,,automatic,pickup,silver,1.0,2018-12-18,41
31809,50900,2015.0,ram 3500,like new,6.0,diesel,32844.0,automatic,truck,white,1.0,2019-02-04,19
14421,24990,2006.0,ford f-350 sd,good,8.0,gas,65603.0,automatic,truck,red,1.0,2018-06-11,40
4114,32995,2018.0,toyota 4runner,excellent,6.0,gas,45587.0,automatic,SUV,red,1.0,2018-06-17,12
3965,16750,1985.0,chevrolet corvette,like new,8.0,gas,24540.0,automatic,hatchback,white,,2018-10-14,0
21876,4490,2008.0,ford f-150,excellent,8.0,gas,116445.0,automatic,truck,blue,,2018-12-26,63
36576,37995,2018.0,jeep wrangler,good,6.0,gas,,automatic,SUV,white,1.0,2018-10-11,53
1520,12000,2011.0,ford f-150,excellent,,gas,126000.0,automatic,truck,white,1.0,2018-11-30,15
27920,16750,2009.0,jeep wrangler unlimited,excellent,6.0,gas,59890.0,automatic,SUV,white,1.0,2018-09-02,38


In [4]:
# Summarize column dtypes and non-null counts to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


So now we know that 5 columns have missing data: `model_year`, `cylinders`, `odometer`, `paint_color`, and `is_4wd`.

Now let's start cleaning the data by filling in the missing values first.

In [5]:
# A missing 4WD flag is filled with 0; the observed values in this column are 1.0.
data['is_4wd'] = data['is_4wd'].fillna(0)

# Keep rows with no recorded colour, but label them explicitly.
data['paint_color'] = data['paint_color'].fillna('Unknown')

# Model year: use the median year of listings for the same model.
data['model_year'] = data['model_year'].fillna(
    data.groupby('model')['model_year'].transform('median')
)

# Odometer: roughly 15% of the values are missing. Filling all of them with a
# single global median would pile thousands of rows onto one value and distort
# the odometer distribution. Instead, impute from the median of vehicles with
# the same model year (model_year is already filled above), and fall back to
# the floored global median only for years that have no observed reading.
odometer_by_year = data.groupby('model_year')['odometer'].transform('median')
data['odometer'] = (
    data['odometer']
    .fillna(np.floor(odometer_by_year))
    .fillna(np.floor(data['odometer'].median()))
)

# Cylinders: fill with the overall median of the observed values.
data['cylinders'] = data['cylinders'].fillna(data['cylinders'].median())

# Confirm that no column has missing values left.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    51525 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51525 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      51525 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   51525 non-null  object 
 10  is_4wd        51525 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


Now that the missing data has been filled, we should check for duplicate rows, if any.

In [6]:
# Show every row that is an exact repeat of an earlier row (an empty frame means no duplicates).
data.loc[data.duplicated()]

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed


We did not find any duplicated rows, so we can proceed to the EDA.

## Exploratory Data Analysis (EDA)

In [16]:
# Histogram of odometer readings; axis titles are set on the layout in the same chain.
fig = px.histogram(
    data,
    x='odometer',
    nbins=30,
    title='Distribution of Odometer Readings',
).update_layout(
    xaxis_title='Odometer',
    yaxis_title='Count',
)
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

As we can see here, most of the cars' odometer readings are between 100k and 150k miles.

In [17]:
# Scatter of price against model year, coloured by condition.
# Model and cylinder count are added to the hover tooltip.
fig = px.scatter(
    data,
    x='model_year',
    y='price',
    color='condition',
    hover_data=['model', 'cylinders'],
    labels=dict(model_year='Model Year', price='Price'),
    title='Price vs. Model Year',
)
fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Newer model years tend to have higher prices. Vehicles in poor condition cluster at lower price points.