In [106]:
# INTRO: This project focuses on cars that have over 200,000 miles on the odometer.
# It showcases the models that have historically proven reliable over hundreds of thousands of miles.




In [109]:

import pandas as pd
import numpy as np
import streamlit as st

import plotly.express as px



# Load the vehicle listings from the CSV in the working directory.
# `df` is plotted directly by a later Streamlit histogram cell and is not
# modified anywhere in the visible cells.
df = pd.read_csv("vehicles_us.csv")




In [110]:
# Working copy of the listings used for cleaning and analysis.
# The CSV was already parsed into `df` in the cell above, so take an
# independent copy of it instead of reading the same file from disk again;
# later in-place cleaning of `vehicles_df` therefore leaves `df` untouched.
vehicles_df = df.copy()

In [91]:
# Explore the data: print each column's non-null count and dtype,
# plus the overall row count and memory usage.
vehicles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [92]:
# Preview the first five listings
vehicles_df.head(n=5)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28


In [93]:
# Show the column labels of the loaded frame
vehicles_df.columns

Index(['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel',
       'odometer', 'transmission', 'type', 'paint_color', 'is_4wd',
       'date_posted', 'days_listed'],
      dtype='object')

In [115]:
# --- Missing values: count nulls per column, report only the affected ones ---
st.write("### Missing Values")
missing_values = vehicles_df.isna().sum()
has_missing = missing_values.gt(0)
st.write(missing_values.loc[has_missing])

display(missing_values)

# --- Duplicates: number of fully duplicated rows ---
st.write("### Duplicate Rows")
duplicate_count = vehicles_df.duplicated().sum()
st.write(f"Number of duplicate rows: {duplicate_count}")

display(duplicate_count)

# --- Outliers: 1.5 * IQR rule applied to every int64/float64 column ---
st.write("### Outliers Detection")
numerical_columns = vehicles_df.select_dtypes(include=['int64', 'float64']).columns


def _count_iqr_outliers(values):
    """Return how many entries of `values` fall outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]."""
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    below_fence = values < first_quartile - 1.5 * spread
    above_fence = values > third_quartile + 1.5 * spread
    return int((below_fence | above_fence).sum())


outlier_counts = {
    column: _count_iqr_outliers(vehicles_df[column]) for column in numerical_columns
}

# One row per numeric column; Streamlit gets the table sorted by count,
# the notebook display keeps the original column order.
outliers_info = pd.DataFrame.from_dict(
    outlier_counts, orient='index', columns=['Outlier Count']
)
st.write(outliers_info.sort_values(by='Outlier Count', ascending=False))

display(outliers_info)


price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

0

Unnamed: 0,Outlier Count
price,1646
model_year,709
cylinders,0
odometer,487
is_4wd,0
days_listed,1618


In [None]:
# Impute missing values column by column instead of one blanket mean fill.

# is_4wd looks like a yes/no flag: in the preview rows it is either 1.0 or
# missing (NOTE(review): confirm no other values occur). A mean fill would
# turn every missing entry into 1.0, i.e. mark every car as four-wheel drive,
# so missing is treated as "not 4wd" instead.
vehicles_df['is_4wd'] = vehicles_df['is_4wd'].fillna(0)

# Remaining numeric gaps (model_year, cylinders, odometer) get the column
# median: unlike the mean it is not dragged by the outliers counted above and
# it does not invent fractional model years or cylinder counts.
numerical_columns = vehicles_df.select_dtypes(include=['int64', 'float64']).columns
vehicles_df[numerical_columns] = vehicles_df[numerical_columns].fillna(
    vehicles_df[numerical_columns].median()
)

# Fill missing values with mode for non-numerical columns (optional)
categorical_columns = vehicles_df.select_dtypes(include=['object']).columns
vehicles_df[categorical_columns] = vehicles_df[categorical_columns].fillna(
    vehicles_df[categorical_columns].mode().iloc[0]
)

# Check for missing values after filling (null counts computed once)
st.write("### Missing Values After Filling")
remaining_missing = vehicles_df.isnull().sum()
st.write(remaining_missing[remaining_missing > 0])

# Display the cleaned DataFrame
st.write("### Cleaned Data")
st.write(vehicles_df.head())


In [94]:
# Keep only the listings whose odometer reading exceeds 200,000 miles
over_200k_miles = vehicles_df['odometer'].gt(200000)
high_mileage_cars = vehicles_df.loc[over_200k_miles]


In [95]:
# Histogram of car models among listings with over 200,000 miles.
# The dataset has no manufacturer column and the x-axis is `model`, so the
# title and the label mapping refer to models; `labels` keys must be names of
# plotted columns to have any effect.
fig = px.histogram(
    high_mileage_cars,
    x="model",
    title="Number of Cars by Model with Over 200,000 Miles",
    labels={"model": "Car Model"},
    color_discrete_sequence=["blue"]
)


In [96]:
# Layout tweaks: blank x-axis title, explicit y-axis title, and bars ordered
# from the most to the least frequent category.
layout_overrides = dict(
    xaxis_title="  ",
    yaxis_title="Count of Cars",
    xaxis={'categoryorder': 'total descending'},
)
fig.update_layout(**layout_overrides)

In [97]:



# Vehicle types per model (the data has a `model` column, not a manufacturer one)
st.header('Vehicle types by model')
st.write(px.histogram(df, x='model', color='type'))

# Condition breakdown per model year; this header previously had no chart under it
st.header('Histogram of `condition` vs `model_year`')
st.write(px.histogram(df, x='model_year', color='condition'))



DeltaGenerator()

In [99]:


# The five models that appear most often among the >200,000-mile listings.
# value_counts() is already sorted by descending frequency, so the first five
# index entries are the top five models.
model_counts = high_mileage_cars['model'].value_counts()
top_5_models = list(model_counts.index[:5])





In [108]:


# Streamlit App
st.title("High Mileage Cars Analysis")

# The checkbox decides whether the next chart covers every high-mileage model
# or only the five most common ones.
show_top_5_only = st.checkbox(
    "Show Only Top 5 Models",
    help="Toggle to show only the 5 most common high-mileage models",
)

if not show_top_5_only:
    st.write("### Histogram of Vehicle Types for All Models")
    # No restriction: chart all high mileage cars
    filtered_cars = high_mileage_cars
else:
    st.write("### Histogram of Vehicle Types for Top 5 Models")
    # Restrict to rows whose model is one of the top 5
    is_top_5_model = high_mileage_cars['model'].isin(top_5_models)
    filtered_cars = high_mileage_cars[is_top_5_model]




In [101]:

# Stacked histogram of model counts, with each bar split by vehicle type
histogram_settings = dict(
    x="model",
    color="type",
    title="Number of Vehicles by Model and Type",
    labels={"model": "Car Model", "type": "Vehicle Type"},
    barmode="stack",
)
fig = px.histogram(filtered_cars, **histogram_settings)

# Render the figure
st.write(fig)


In [102]:


# The >200,000-mile subset (`high_mileage_cars`) and the page title
# "High Mileage Cars Analysis" are already created in earlier cells, so this
# cell no longer repeats the filter or renders the title a second time; it
# only opens the scatter-plot section of the page.
st.header("Price vs Odometer")



DeltaGenerator()

In [103]:


# Scatter plot of price vs. odometer for the high-mileage listings.
# The figure is built and rendered only while the checkbox is ticked, so the
# checkbox gates the whole plot and not just its heading.
if st.checkbox("Show Scatter Plot"):
    st.write("### Scatter Plot of High Mileage Cars")

    fig = px.scatter(
        high_mileage_cars,
        x="odometer",
        y="price",
        color="model",  # one colour per car model
        title="Price vs Odometer for High Mileage Cars",
        labels={"odometer": "Odometer (miles)", "price": "Price ($)"}
    )

    # Display the plot
    st.write(fig)
 