In [1]:
# Loading all the libraries
import pandas as pd
import numpy as np
import scipy.stats
import streamlit as st
import time
import plotly.express as px


In [2]:
#Load Data

df = pd.read_csv('../vehicles_us.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [4]:
print ("info: \n",df['type'].info())

print("head: \n",df['type'].head(n=15))

print("sample: \n",df['type'].sample(n=15))

<class 'pandas.core.series.Series'>
RangeIndex: 51525 entries, 0 to 51524
Series name: type
Non-Null Count  Dtype 
--------------  ----- 
51525 non-null  object
dtypes: object(1)
memory usage: 402.7+ KB
info: 
 None
head: 
 0        SUV
1     pickup
2      sedan
3     pickup
4      sedan
5      sedan
6      sedan
7        SUV
8        SUV
9        SUV
10    pickup
11     sedan
12    pickup
13     sedan
14       SUV
Name: type, dtype: object
sample: 
 45717    pickup
30352     truck
15687       SUV
49641     sedan
36577    pickup
45336     sedan
22657       SUV
30148     coupe
44630       van
12708     sedan
36301       SUV
1306      sedan
10369     truck
45123     sedan
35042    pickup
Name: type, dtype: object


In [5]:
def explore_df(df, name):
    print("="*50)
    print(f"Exploring Column: \"{name}\"")
    print("First few rows:")
    print(df.head(), "\n")
    print("Data types:")
    print(df.dtypes, "\n")
    print("Summary statistics:")
    print(df.describe(include='all'), "\n")
    print("Data Frame Info:")
    print(df.info(), "\n")
    print("Get 20 random rows:")
    print(df.sample(n=20), "\n")
    print("="*50, "\n")

df.info()

for col in df:
    explore_df(df[col],col)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB
Exploring Column: "price"
First few rows:
0     9400
1    25500
2     5500
3     1500
4    14900
Name: price, dtype: int64 

Data types:
int64 

Summary statistics:
count     51525.000

# Correct datatypes

0   price         51525 non-null  int64    #Should be float?
1   model_year    47906 non-null  float64  #Should be int
2   model         51525 non-null  object 
3   condition     51525 non-null  object   
4   cylinders     46265 non-null  float64  #Should be int, missing data
5   fuel          51525 non-null  object 
6   odometer      43633 non-null  float64  #missing data
7   transmission  51525 non-null  object
8   type          51525 non-null  object 
9   paint_color   42258 non-null  object   #missing data
10  is_4wd        25572 non-null  float64  #Should be boolean, not binary, not float
11  date_posted   51525 non-null  object   #Should be datetype
12  days_listed   51525 non-null  int64 

In [6]:


def dataprep(df):
    """
    Cleans and converts columns in the DataFrame to the desired data types and drops rows with missing data.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: The cleaned and converted DataFrame.
    """
    # Drop rows with missing data
    #df = df.dropna()

    #conversions datatype
    # Convert 'price' to float
    df['price'] = df['price'].astype('float64')

    # Convert 'model_year' to int, skipping NaN values
    df['model_year'] = df['model_year'].apply(lambda x: int(x) if not pd.isna(x) else np.nan)
    
    # Convert 'cylinders' to int, skipping NaN values
    df['cylinders'] = df['cylinders'].apply(lambda x: int(x) if not pd.isna(x) else np.nan)

    # Convert 'is_4wd' to boolean, skipping NaN values
    df['is_4wd'] = df['is_4wd'].map({1.0: True, 0.0: False})

    # Convert 'date_posted' to datetime
    df['date_posted'] = pd.to_datetime(df['date_posted']).dt.floor('D')
    
    
    
    return df

df_clean = dataprep(df)

df_clean.info()
print("First few rows:")
print(df_clean.head(n=50), "\n")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         51525 non-null  float64       
 1   model_year    47906 non-null  float64       
 2   model         51525 non-null  object        
 3   condition     51525 non-null  object        
 4   cylinders     46265 non-null  float64       
 5   fuel          51525 non-null  object        
 6   odometer      43633 non-null  float64       
 7   transmission  51525 non-null  object        
 8   type          51525 non-null  object        
 9   paint_color   42258 non-null  object        
 10  is_4wd        25572 non-null  object        
 11  date_posted   51525 non-null  datetime64[ns]
 12  days_listed   51525 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(1), object(7)
memory usage: 5.1+ MB
First few rows:
      price  model_year                     

# What do we see?
I'm imagining the type of company similar to CoPart. I twould be useful to make it modular so the user is able to perform simple data visualization by taking in filters.

- What is the most popular car?
    + Frequency in dataset using:
        = Model / Yr
        = condition
        = type
        = color

- Which inventory is not going to sell?
    + Model / days listed

- Market trends ? -  Is it true that average car prices rose during the pandemic?
    + Normalize data for each year surrounding COVID19 event by model
    

In [7]:
def plot_histogram_price_filtered(df, model_year=None, cylinders=None, condition=None, fuel=None, transmission=None, 
                                  car_type=None, paint_color=None, is_4wd=None, models=None, aggregation='mean'):
    """
    Plots a histogram of car prices by model with optional filters.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    model_year (list or int, optional): The specific model year(s) to filter and plot. Defaults to None.
    cylinders (list or int, optional): The number of cylinders to filter and plot. Defaults to None.
    condition (list or str, optional): The condition of the car to filter and plot. Defaults to None.
    fuel (list or str, optional): The type of fuel to filter and plot. Defaults to None.
    transmission (list or str, optional): The type of transmission to filter and plot. Defaults to None.
    car_type (list or str, optional): The type of car to filter and plot. Defaults to None.
    paint_color (list or str, optional): The paint color of the car to filter and plot. Defaults to None.
    is_4wd (list or bool, optional): Whether the car is 4WD to filter and plot. Defaults to None.
    models (list of str, optional): List of substrings to filter models. Defaults to None.
    aggregation (str, optional): The aggregation method for price ('mean' or 'sum'). Defaults to 'mean'.
    """
    # Apply filters
    filters_applied = {
        'model_year': model_year,
        'cylinders': cylinders,
        'condition': condition,
        'fuel': fuel,
        'transmission': transmission,
        'car_type': car_type,
        'paint_color': paint_color,
        'is_4wd': is_4wd,
        'models': models
    }
    
    if model_year is not None:
        if isinstance(model_year, list):
            df = df[df['model_year'].isin(model_year)]
        else:
            df = df[df['model_year'] == model_year]
    if cylinders is not None:
        if isinstance(cylinders, list):
            df = df[df['cylinders'].isin(cylinders)]
        else:
            df = df[df['cylinders'] == cylinders]
    if condition is not None:
        if isinstance(condition, list):
            df = df[df['condition'].isin(condition)]
        else:
            df = df[df['condition'] == condition]
    if fuel is not None:
        if isinstance(fuel, list):
            df = df[df['fuel'].isin(fuel)]
        else:
            df = df[df['fuel'] == fuel]
    if transmission is not None:
        if isinstance(transmission, list):
            df = df[df['transmission'].isin(transmission)]
        else:
            df = df[df['transmission'] == transmission]
    if car_type is not None:
        if isinstance(car_type, list):
            df = df[df['type'].isin(car_type)]
        else:
            df = df[df['type'] == car_type]
    if paint_color is not None:
        if isinstance(paint_color, list):
            df = df[df['paint_color'].isin(paint_color)]
        else:
            df = df[df['paint_color'] == paint_color]
    if is_4wd is not None:
        if isinstance(is_4wd, list):
            df = df[df['is_4wd'].isin(is_4wd)]
        else:
            df = df[df['is_4wd'] == is_4wd]
    if models is not None:
        model_pattern = '|'.join(models)
        df = df[df['model'].str.contains(model_pattern, case=False, na=False)]
    
    # Set the aggregation function for price
    if aggregation == 'mean':
        df_grouped = df.groupby('model')['price'].mean().reset_index()
        y_title = 'Mean Price'
    else:
        df_grouped = df.groupby('model')['price'].sum().reset_index()
        y_title = 'Total Inventory Value'
    
    # Create the bar chart
    fig = px.bar(df_grouped, x='model', y='price',
                 title="Histogram of Car Prices by Model" + (f" for {model_year}" if model_year else ""),
                 color_discrete_sequence=px.colors.qualitative.Pastel)
    fig.update_layout(xaxis_title='Model', yaxis_title=y_title, template='plotly_white')
    
    # Add annotation for filters
    filters_text = '<br>'.join([f"{key}: {value}" for key, value in filters_applied.items() if value is not None])
    fig.add_annotation(
        xref="paper", yref="paper",
        x=1.05, y=1,
        showarrow=False,
        text=filters_text,
        align="left",
        bordercolor="black",
        borderwidth=1
    )

    fig.show()
# Plot histogram with filters and aggregation
plot_histogram_price_filtered(df_clean, model_year=[2013, 2014], condition=['good'], fuel=['gas'], models=['bmw x5', 'ford f-150'], aggregation='sum')

# Plot histogram for all model years with specific filters and default aggregation (mean)
plot_histogram_price_filtered(df_clean, cylinders=[4, 6], transmission=['automatic'], car_type=['sedan'], models=['honda', 'toyota'])

Integrate this into a pretty app using streamlit


In [8]:
# Define the function to plot histogram with filters
def plot_histogram_price_filtered_streamlit(df, model_year=None, cylinders=None, condition=None, fuel=None, transmission=None, 
                                  car_type=None, paint_color=None, is_4wd=None, models=None, aggregation='mean'):
    """
    Plots a histogram of car prices by model with optional filters.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    model_year (list or int, optional): The specific model year(s) to filter and plot. Defaults to None.
    cylinders (list or int, optional): The number of cylinders to filter and plot. Defaults to None.
    condition (list or str, optional): The condition of the car to filter and plot. Defaults to None.
    fuel (list or str, optional): The type of fuel to filter and plot. Defaults to None.
    transmission (list or str, optional): The type of transmission to filter and plot. Defaults to None.
    car_type (list or str, optional): The type of car to filter and plot. Defaults to None.
    paint_color (list or str, optional): The paint color of the car to filter and plot. Defaults to None.
    is_4wd (list or bool, optional): Whether the car is 4WD to filter and plot. Defaults to None.
    models (list of str, optional): List of substrings to filter models. Defaults to None.
    aggregation (str, optional): The aggregation method for price ('mean' or 'sum'). Defaults to 'mean'.
    """
    # Apply filters
    if model_year is not None:
        if isinstance(model_year, list):
            df = df[df['model_year'].isin(model_year)]
        else:
            df = df[df['model_year'] == model_year]
    if cylinders is not None:
        if isinstance(cylinders, list):
            df = df[df['cylinders'].isin(cylinders)]
        else:
            df = df[df['cylinders'] == cylinders]
    if condition is not None:
        if isinstance(condition, list):
            df = df[df['condition'].isin(condition)]
        else:
            df = df[df['condition'] == condition]
    if fuel is not None:
        if isinstance(fuel, list):
            df = df[df['fuel'].isin(fuel)]
        else:
            df = df[df['fuel'] == fuel]
    if transmission is not None:
        if isinstance(transmission, list):
            df = df[df['transmission'].isin(transmission)]
        else:
            df = df[df['transmission'] == transmission]
    if car_type is not None:
        if isinstance(car_type, list):
            df = df[df['type'].isin(car_type)]
        else:
            df = df[df['type'] == car_type]
    if paint_color is not None:
        if isinstance(paint_color, list):
            df = df[df['paint_color'].isin(paint_color)]
        else:
            df = df[df['paint_color'] == paint_color]
    if is_4wd is not None:
        if isinstance(is_4wd, list):
            df = df[df['is_4wd'].isin(is_4wd)]
        else:
            df = df[df['is_4wd'] == is_4wd]
    if models is not None:
        model_pattern = '|'.join(models)
        df = df[df['model'].str.contains(model_pattern, case=False, na=False)]
    
    # Set the aggregation function for price
    if aggregation == 'mean':
        df_grouped = df.groupby('model')['price'].mean().reset_index()
        y_title = 'Mean Price'
    else:
        df_grouped = df.groupby('model')['price'].sum().reset_index()
        y_title = 'Total Price'
    
    # Create the bar chart
    fig = px.bar(df_grouped, x='model', y='price',
                 title="Histogram of Car Prices by Model" + (f" for {model_year}" if model_year else ""),
                 color_discrete_sequence=px.colors.qualitative.Pastel)
    fig.update_layout(xaxis_title='Model', yaxis_title=y_title, template='plotly_white')
    
    # Add annotation for filters
    filters_applied = {
        'model_year': model_year,
        'cylinders': cylinders,
        'condition': condition,
        'fuel': fuel,
        'transmission': transmission,
        'car_type': car_type,
        'paint_color': paint_color,
        'is_4wd': is_4wd,
        'models': models
    }
    filters_text = '<br>'.join([f"{key}: {value}" for key, value in filters_applied.items() if value is not None])
    fig.add_annotation(
        xref="paper", yref="paper",
        x=1.05, y=1,
        showarrow=False,
        text=filters_text,
        align="left",
        bordercolor="black",
        borderwidth=1
    )

    return fig

In [9]:
# Streamlit app
st.title("Car Price Histogram with Filters")

# Get unique filter options based on current selection
def get_unique_options(df, column):
    return df[column].dropna().unique().tolist()

# Sidebar filters
model_year = st.sidebar.multiselect("Model Year", get_unique_options(df, 'model_year'))
cylinders = st.sidebar.multiselect("Cylinders", get_unique_options(df, 'cylinders'))
condition = st.sidebar.multiselect("Condition", get_unique_options(df, 'condition'))
fuel = st.sidebar.multiselect("Fuel", get_unique_options(df, 'fuel'))
transmission = st.sidebar.multiselect("Transmission", get_unique_options(df, 'transmission'))
car_type = st.sidebar.multiselect("Car Type", get_unique_options(df, 'type'))
paint_color = st.sidebar.multiselect("Paint Color", get_unique_options(df, 'paint_color'))
is_4wd = st.sidebar.multiselect("Is 4WD", [True, False])
models = st.sidebar.multiselect("Models", get_unique_options(df, 'model'))
aggregation = st.sidebar.radio("Aggregation Method", ['mean', 'sum'])

# Apply button
if st.sidebar.button("Apply"):
    fig = plot_histogram_price_filtered(df, model_year=model_year, cylinders=cylinders, condition=condition, fuel=fuel, 
                                        transmission=transmission, car_type=car_type, paint_color=paint_color, 
                                        is_4wd=is_4wd, models=models, aggregation=aggregation)
    st.plotly_chart(fig)

2024-07-21 18:13:13.269 
  command:

    streamlit run c:\Users\Jose\miniconda3\envs\tripleten\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
