# Exploratory Data Analysis (EDA)

## Overview
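This notebook contains an exploratory analysis of the vehicle listings dataset (`vehicles_us.csv`): loading the data, inspecting its structure and summary statistics, and visualizing price, model year, odometer reading, and condition.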

## 1. Setup and Data Loading

In [11]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

## 2. Load Dataset

In [12]:
# Load the vehicle listings CSV (path is relative to the working directory).
df = pd.read_csv('../vehicles_us.csv')
# Plain literal: an f-prefix is only needed when the string has a placeholder.
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# Preview the first rows as the cell's rich output.
df.head()

Dataset loaded successfully!
Shape: (51525, 13)


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


## 3. Basic Data Exploration

In [13]:
# Title of the analysis. It is printed instead of being passed to st.header():
# Streamlit display calls render nothing when the code runs in a notebook
# kernel rather than under `streamlit run`, so the title would be invisible.
print("🚗 Vehicle Data Explorer")

# Column names, non-null counts and dtypes (df.info() prints to stdout itself).
print("Dataset Info:")
df.info()
print("\n" + "=" * 50 + "\n")

# Summary statistics; left as the last expression so the notebook renders
# the resulting table as the cell's output.
print("Summary Statistics:")
df.describe()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


Summary Statistics:


Unnamed: 0,price,model_year,cylinders,odometer,is_4wd,days_listed
count,51525.0,47906.0,46265.0,43633.0,25572.0,51525.0
mean,12132.46492,2009.75047,6.125235,115553.461738,1.0,39.55476
std,10040.803015,6.282065,1.66036,65094.611341,0.0,28.20427
min,1.0,1908.0,3.0,0.0,1.0,0.0
25%,5000.0,2006.0,4.0,70000.0,1.0,19.0
50%,9000.0,2011.0,6.0,113000.0,1.0,33.0
75%,16839.0,2014.0,8.0,155000.0,1.0,53.0
max,375000.0,2019.0,12.0,990000.0,1.0,271.0


## 4. Data Visualizations
This section contains histograms and scatter plots created with Plotly Express.

In [14]:
# Figures are displayed with Plotly's own notebook renderer (fig.show()).
# The Streamlit calls used previously do not work inside a Jupyter kernel:
# st.checkbox() returns its default (False) outside a Streamlit session, so the
# price histogram was never built, and st.write()/st.plotly_chart() render
# nothing, leaving only a stray "DeltaGenerator()" repr as the cell output.

# Set to False to skip the price histogram (replaces the former checkbox).
SHOW_PRICE_HISTOGRAM = True

# 1. Histogram of vehicle prices (optional, controlled by the flag above)
if SHOW_PRICE_HISTOGRAM:
    fig1 = px.histogram(df, x='price', nbins=50,
                        title='Distribution of Vehicle Prices',
                        labels={'price': 'Price ($)'})
    # px.histogram names the aggregated axis "count" itself and ignores a
    # 'count' key in `labels`, so the y-axis title is set explicitly.
    fig1.update_layout(yaxis_title='Number of Vehicles', showlegend=False)
    fig1.show()

# 2. Histogram of vehicle model years (always shown)
fig2 = px.histogram(df, x='model_year', nbins=30,
                    title='Distribution of Vehicle Model Years',
                    labels={'model_year': 'Model Year'})
fig2.update_layout(yaxis_title='Number of Vehicles', showlegend=False)
fig2.show()

# 3. Scatterplot: Price vs Odometer reading (always shown)
# Partial opacity keeps dense regions of overlapping points readable.
fig3 = px.scatter(df, x='odometer', y='price',
                  title='Vehicle Price vs Odometer Reading',
                  labels={'odometer': 'Odometer (miles)', 'price': 'Price ($)'},
                  opacity=0.6)
fig3.show()

# 4. Scatterplot: Price vs Model Year, colored by condition (always shown)
fig4 = px.scatter(df, x='model_year', y='price', color='condition',
                  title='Vehicle Price vs Model Year by Condition',
                  labels={'model_year': 'Model Year', 'price': 'Price ($)'})
# fig.show() returns None, so the cell ends without a stray object repr.
fig4.show()


DeltaGenerator()