In [None]:
import pandas as pd
import streamlit as st
import plotly.express as px
from matplotlib import pyplot as plt
import plotly.io as pio



# Load the raw vehicle listings (path is relative to the working directory).
df = pd.read_csv('vehicles_us.csv')

# Quick look at the raw data: first rows, the column names (printed once),
# then dtypes and non-null counts. DataFrame.info() prints its own report.
print(df.head())
print(df.columns.tolist())
df.info()

def clean_data1(df):
    """Return a cleaned copy of the vehicle listings.

    The input frame is left untouched; a new DataFrame is returned so the
    raw data stays available to the caller.

    Args:
        df: DataFrame containing the columns ``model_year``, ``cylinders``,
            ``odometer``, ``paint_color`` and ``is_4wd``.

    Returns:
        A new DataFrame with the same columns in the same order, where
        missing ``model_year`` and ``cylinders`` become 0 (``model_year`` is
        also cast to int), missing ``odometer`` becomes the column median,
        missing ``paint_color`` becomes ``'unknown'`` and ``is_4wd`` is cast
        to bool with missing values treated as False.
    """
    # assign() builds a new frame, so the caller's object is never mutated.
    return df.assign(
        # Handle missing values
        model_year=df['model_year'].fillna(0).astype(int),
        cylinders=df['cylinders'].fillna(0),
        odometer=df['odometer'].fillna(df['odometer'].median()),
        paint_color=df['paint_color'].fillna('unknown'),
        # Convert boolean columns
        is_4wd=df['is_4wd'].fillna(0).astype(bool),
    )

if __name__ == '__main__':
    # Clean the raw frame and persist it (without the index column) so the
    # dashboard code below can read it back from disk.
    clean_df = clean_data1(df)
    clean_df.to_csv(path_or_buf='cleaned_vehicles.csv', index=False)


@st.cache_data
def load_data1():
    """Read ``cleaned_vehicles.csv`` from the working directory into a DataFrame.

    The function is wrapped in ``st.cache_data``.
    """
    cleaned = pd.read_csv('cleaned_vehicles.csv')
    return cleaned

# Load the cleaned listings written above.
df = load_data1()

# App title
st.title('Car Sales Dashboard')

# Sidebar filters.
# Every widget and chart below gets an explicit `key`: this file builds the
# same dashboard a second time further down, and Streamlit refuses to create
# two elements of the same type with identical labels/arguments unless their
# keys differ. (`key` on st.plotly_chart needs a Streamlit release that
# accepts it - NOTE(review): confirm against the pinned version.)
st.sidebar.header('Filters')

# Compute the option list and the price bounds once instead of per argument.
vehicle_types = df['type'].unique()
selected_types = st.sidebar.multiselect(
    'Vehicle Types',
    vehicle_types,
    vehicle_types,
    key='vehicle_types_filter_1',
)

price_min = float(df['price'].min())
price_max = float(df['price'].max())
price_range = st.sidebar.slider(
    'Price Range',
    price_min,
    price_max,
    (price_min, price_max),
    key='price_range_filter_1',
)

# Filter data: keep rows of the selected types whose price lies inside the
# (inclusive) slider range.
filtered_df = df[
    (df['type'].isin(selected_types)) &
    (df['price'] >= price_range[0]) &
    (df['price'] <= price_range[1])
]

# Show filtered data
st.write(f"Displaying {len(filtered_df)} vehicles")

# Interactive plots
tab1, tab2, tab3 = st.tabs(["Price Analysis", "Odometer vs Price", "Condition Analysis"])

with tab1:
    st.header("Price Distribution")
    fig1 = px.histogram(filtered_df, x='price', nbins=50)
    st.plotly_chart(fig1, use_container_width=True, key='price_hist_1')

with tab2:
    st.header("Odometer vs Price")
    fig2 = px.scatter(
        filtered_df,
        x='odometer',
        y='price',
        color='type',
        hover_data=['model_year', 'model']
    )
    st.plotly_chart(fig2, use_container_width=True, key='odometer_scatter_1')

with tab3:
    st.header("Condition Analysis")
    fig3 = px.box(filtered_df, x='condition', y='price', color='type')
    st.plotly_chart(fig3, use_container_width=True, key='condition_box_1')

# Raw data view
if st.checkbox('Show raw data', key='show_raw_data_1'):
    st.subheader('Raw Data')
    st.dataframe(filtered_df)

# ====================================


# Basic info
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
df.info()
print(df.describe())

# ============================================== Test ================
# NOTE: everything below runs after the charts above were already built from
# `filtered_df`, so it does not change what the dashboard shows.

# Check data types
print(df.dtypes)

# Check for null values
print(df['price'].isnull().sum())

# Check sample values
print(df['price'].head())

# Convert price to numeric; values that cannot be parsed become NaN
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Drop rows with NaN prices
df = df.dropna(subset=['price'])

# Keep only prices between the 1st and 99th percentile (bounds inclusive)
df = df[df['price'].between(df['price'].quantile(0.01), df['price'].quantile(0.99))]

# ============================================ Test============================
# Histogram of prices
# Remove rows where price is missing (none remain after the dropna above, so
# this yields a separate frame with the same rows):
df_clean = df.dropna(subset=['price'])
# fig1 = px.histogram(df_clean, x='price', title='Distribution of Car Prices')

# t pyplot as plt
# import plotly.io as pio



# Load the raw vehicle listings. A path relative to the working directory is
# used (as in the first load at the top of this cell) instead of a
# machine-specific absolute path, so the code runs on any checkout.
df = pd.read_csv('vehicles_us.csv')

# Quick look at the raw data: first rows, the column names (printed once),
# then dtypes and non-null counts. DataFrame.info() prints its own report.
print(df.head())
print(df.columns.tolist())
df.info()

def clean_data(df):
    """Return a cleaned copy of the vehicle listings.

    The input frame is not modified; all replacements are applied to a new
    DataFrame, which keeps the raw data intact for the caller.

    Args:
        df: DataFrame containing the columns ``model_year``, ``cylinders``,
            ``odometer``, ``paint_color`` and ``is_4wd``.

    Returns:
        A new DataFrame with the same columns in the same order, where
        missing ``model_year`` and ``cylinders`` become 0 (``model_year`` is
        also cast to int), missing ``odometer`` becomes the column median,
        missing ``paint_color`` becomes ``'unknown'`` and ``is_4wd`` is cast
        to bool with missing values treated as False.
    """
    # assign() returns a new frame, so the caller's object is never mutated.
    return df.assign(
        # Handle missing values
        model_year=df['model_year'].fillna(0).astype(int),
        cylinders=df['cylinders'].fillna(0),
        odometer=df['odometer'].fillna(df['odometer'].median()),
        paint_color=df['paint_color'].fillna('unknown'),
        # Convert boolean columns
        is_4wd=df['is_4wd'].fillna(0).astype(bool),
    )

if __name__ == '__main__':
    # Clean the raw frame and persist it (without the index column) so the
    # dashboard code below can read it back from disk.
    clean_df = clean_data(df)
    clean_df.to_csv(path_or_buf='cleaned_vehicles.csv', index=False)


@st.cache_data
def load_data():
    """Read ``cleaned_vehicles.csv`` from the working directory into a DataFrame.

    The function is wrapped in ``st.cache_data``.
    """
    cleaned = pd.read_csv('cleaned_vehicles.csv')
    return cleaned

# Load the cleaned listings written above.
df = load_data()

# App title
st.title('Car Sales Dashboard')

# Sidebar filters.
# Every widget and chart below gets an explicit `key`: this file already
# builds the same dashboard once further up, and Streamlit refuses to create
# two elements of the same type with identical labels/arguments unless their
# keys differ. (`key` on st.plotly_chart needs a Streamlit release that
# accepts it - NOTE(review): confirm against the pinned version.)
st.sidebar.header('Filters')

# Compute the option list and the price bounds once instead of per argument.
vehicle_types = df['type'].unique()
selected_types = st.sidebar.multiselect(
    'Vehicle Types',
    vehicle_types,
    vehicle_types,
    key='vehicle_types_filter_2',
)

price_min = float(df['price'].min())
price_max = float(df['price'].max())
price_range = st.sidebar.slider(
    'Price Range',
    price_min,
    price_max,
    (price_min, price_max),
    key='price_range_filter_2',
)

# Filter data: keep rows of the selected types whose price lies inside the
# (inclusive) slider range.
filtered_df = df[
    (df['type'].isin(selected_types)) &
    (df['price'] >= price_range[0]) &
    (df['price'] <= price_range[1])
]

# Show filtered data
st.write(f"Displaying {len(filtered_df)} vehicles")

# Interactive plots
tab1, tab2, tab3 = st.tabs(["Price Analysis", "Odometer vs Price", "Condition Analysis"])

with tab1:
    st.header("Price Distribution")
    fig1 = px.histogram(filtered_df, x='price', nbins=50)
    st.plotly_chart(fig1, use_container_width=True, key='price_hist_2')

with tab2:
    st.header("Odometer vs Price")
    fig2 = px.scatter(
        filtered_df,
        x='odometer',
        y='price',
        color='type',
        hover_data=['model_year', 'model']
    )
    st.plotly_chart(fig2, use_container_width=True, key='odometer_scatter_2')

with tab3:
    st.header("Condition Analysis")
    fig3 = px.box(filtered_df, x='condition', y='price', color='type')
    st.plotly_chart(fig3, use_container_width=True, key='condition_box_2')

# Raw data view
if st.checkbox('Show raw data', key='show_raw_data_2'):
    st.subheader('Raw Data')
    st.dataframe(filtered_df)

# ====================================


# Basic info
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
df.info()
print(df.describe())

# ============================================== Test ================
# NOTE: everything below runs after the charts above were already built from
# `filtered_df`, so it does not change what the dashboard shows.

# Check data types
print(df.dtypes)

# Check for null values
print(df['price'].isnull().sum())

# Check sample values
print(df['price'].head())

# Convert price to numeric; values that cannot be parsed become NaN
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Drop rows with NaN prices
df = df.dropna(subset=['price'])

# Keep only prices between the 1st and 99th percentile (bounds inclusive)
df = df[df['price'].between(df['price'].quantile(0.01), df['price'].quantile(0.99))]

# ============================================ Test============================
# Histogram of prices
# Remove rows where price is missing (none remain after the dropna above, so
# this yields a separate frame with the same rows):
df_clean = df.dropna(subset=['price'])
# fig1 = px.histogram(df_clean, x='price', title='Distribution of Car Prices')



   price  model_year           model  condition  cylinders fuel  odometer  \
0   9400      2011.0          bmw x5       good        6.0  gas  145000.0   
1  25500         NaN      ford f-150       good        6.0  gas   88705.0   
2   5500      2013.0  hyundai sonata   like new        4.0  gas  110000.0   
3   1500      2003.0      ford f-150       fair        8.0  gas       NaN   
4  14900      2017.0    chrysler 200  excellent        4.0  gas   80903.0   

  transmission    type paint_color  is_4wd date_posted  days_listed  
0    automatic     SUV         NaN     1.0  2018-06-23           19  
1    automatic  pickup       white     1.0  2018-10-19           50  
2    automatic   sedan         red     NaN  2019-02-07           79  
3    automatic  pickup         NaN     NaN  2019-03-22            9  
4    automatic   sedan       black     NaN  2019-04-02           28  
['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel', 'odometer', 'transmission', 'type', 'paint_color',

2025-06-15 08:53:35.639 No runtime found, using MemoryCacheStorageManager


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    51525 non-null  int64  
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51525 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      51525 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   51525 non-null  object 
 10  is_4wd        51525 non-null  bool   
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: bool(1), float64(2), int64(3), object(7)
memory usage: 4.8+ MB
None
               price    model_year     cylinders       odometer  days_listed
count   51525.000000  51525.000000  51525.000000   51525.000000  51525.00000
mean    12132.46

2025-06-15 08:53:38.348 No runtime found, using MemoryCacheStorageManager


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    51525 non-null  int64  
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51525 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      51525 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   51525 non-null  object 
 10  is_4wd        51525 non-null  bool   
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: bool(1), float64(2), int64(3), object(7)
memory usage: 4.8+ MB
None
               price    model_year     cylinders       odometer  days_listed
count   51525.000000  51525.000000  51525.000000   51525.000000  51525.00000
mean    12132.46

   price  model_year           model  condition  cylinders fuel  odometer  \
0   9400      2011.0          bmw x5       good        6.0  gas  145000.0   
1  25500         NaN      ford f-150       good        6.0  gas   88705.0   
2   5500      2013.0  hyundai sonata   like new        4.0  gas  110000.0   
3   1500      2003.0      ford f-150       fair        8.0  gas       NaN   
4  14900      2017.0    chrysler 200  excellent        4.0  gas   80903.0   

  transmission    type paint_color  is_4wd date_posted  days_listed  
0    automatic     SUV         NaN     1.0  2018-06-23           19  
1    automatic  pickup       white     1.0  2018-10-19           50  
2    automatic   sedan         red     NaN  2019-02-07           79  
3    automatic  pickup         NaN     NaN  2019-03-22            9  
4    automatic   sedan       black     NaN  2019-04-02           28  
['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel', 'odometer', 'transmission', 'type', 'paint_color',

2025-06-15 08:11:16.023 No runtime found, using MemoryCacheStorageManager


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    51525 non-null  int64  
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51525 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      51525 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   51525 non-null  object 
 10  is_4wd        51525 non-null  bool   
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: bool(1), float64(2), int64(3), object(7)
memory usage: 4.8+ MB
None
               price    model_year     cylinders       odometer  days_listed
count   51525.000000  51525.000000  51525.000000   51525.000000  51525.00000
mean    12132.46

In [None]:
import pandas as pd
import streamlit as st
import plotly.express as px
from matplotlib import pyplot as plt
import plotly.io as pio



# Load the raw vehicle listings. A path relative to the working directory is
# used instead of a machine-specific absolute path, so the cell runs on any
# checkout that keeps vehicles_us.csv in the working directory.
df = pd.read_csv('vehicles_us.csv')

# Each figure is rendered twice on purpose: st.plotly_chart() sends it to the
# Streamlit app, fig.show() displays it through Plotly's default renderer.

# Histogram
# Shows how vehicle prices are distributed across different ranges
# Reveals common price brackets and potential outliers
# Uses automatic binning to group prices into categories
fig1 = px.histogram(df, x='price', title='Distribution of Car Prices')
st.plotly_chart(fig1)
fig1.show()

# Scatterplot
# Examines relationship between mileage (odometer) and price
# Color-coded by vehicle type to show category patterns
# Helps identify if higher mileage correlates with lower prices
fig2 = px.scatter(df, x='odometer', y='price', color='type',
                  title='Price vs Odometer Reading by Vehicle Type')
st.plotly_chart(fig2)
fig2.show()

# Boxplot of price by vehicle type
# Compares price distributions across different vehicle types
# Shows median price (center line), quartiles (box), and outliers (dots)
# Highlights which vehicle types have wider price ranges
fig3 = px.box(df, x='type', y='price', title='Price Distribution by Vehicle Type')
st.plotly_chart(fig3)  # previously missing: fig1/fig2 were sent to Streamlit, fig3 was not
fig3.show()
