In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Location of the cleaned used-car dataset.
# The default is the original author's absolute local path; set the
# USED_CARS_CSV environment variable to point at the file on another machine
# instead of editing this cell.
DATA_PATH = os.environ.get(
    "USED_CARS_CSV",
    "/Users/hayden/Downloads/Projects/used_car_project/data/cleaned/used_cars_cleaned.csv",
)

df = pd.read_csv(DATA_PATH)
# Preview the first rows as the cell output.
df.head()

# Histogram of Prices

In [None]:
# Histogram of listing prices in 40 bins; kde=True asks seaborn to overlay a
# kernel density estimate curve on the bars.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='price', bins=40, kde=True, ax=ax)
ax.set_xlim(0, 150000)  # only show prices between 0 and 150,000
ax.set_title('Distribution of Car Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
ax.grid(True)
plt.show()

# Scatterplot of Mileage vs. Price

In [None]:
# Price plotted against mileage, with mileage on a logarithmic x-axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='mileage', y='price', ax=ax)
ax.set_ylim(0, 100000)  # only show prices between 0 and 100,000
ax.set_xscale('log')
ax.set_title('Mileage vs Price')
ax.set_xlabel('Mileage')
ax.set_ylabel('Price')
plt.show()

# Distribution of Car Brands

In [None]:
# Number of listings per brand, most frequent brand first.
# The column is passed via data=/x= keywords: recent seaborn releases bind a
# bare positional Series to `data` rather than `x`, which does not yield one
# bar per brand.
brand_order = df['brand'].value_counts().index

plt.figure(figsize=(12, 12))
sns.countplot(data=df, x='brand', order=brand_order)
plt.title('Distribution of Car Brands')
plt.xlabel('Brand')
plt.ylabel('Count')
plt.xticks(rotation=90)  # vertical labels so adjacent brand names do not overlap
plt.grid(True)
plt.show()

# Mileage vs Price
    Mileage is grouped into ranges (bins) here because the raw mileage vs. price scatterplot above did not show a clear pattern.

In [None]:
# Mean price per mileage bin, bins ordered from highest to lowest mean price.
# `order` has to be a sequence of unique category labels. Passing the whole
# price-sorted 'mile_bin' column would give one entry per row (with
# duplicates), so the order is derived from the per-bin mean instead, the same
# way the brand and colour plots build their ordering.
bin_order = (
    df.groupby('mile_bin')['price']
    .mean()
    .sort_values(ascending=False)
    .index
)

plt.figure(figsize=(12, 8))
sns.barplot(data=df, x='mile_bin', y='price', alpha=0.6, order=bin_order)
plt.title('Average Price by Mileage Range')
plt.xlabel('Mileage Range')
plt.ylabel('Average Price')
plt.show()

# Box plot for accident history
    Shows that on average cars with no accidents are priced higher than those with accidents.

In [None]:
# Box plot of price for each value of the 'accident' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='accident', y='price', ax=ax)
ax.set_ylim(0, 150000)  # only show prices between 0 and 150,000
ax.set_title('Price Distribution by Accident History')
ax.set_xlabel('Accident History')
ax.set_ylabel('Price')
plt.show()

# Average price by maker

In [None]:
# Mean price per brand, bars ordered from the lowest to the highest mean.
brands_by_mean_price = (
    df.groupby('brand')['price']
    .mean()
    .sort_values(ascending=True)
    .index
)

fig, ax = plt.subplots(figsize=(50, 6))  # very wide canvas, one bar per brand
sns.barplot(
    data=df,
    x='brand',
    y='price',
    estimator='mean',
    order=brands_by_mean_price,
    ax=ax,
)
ax.set_ylim(0, 200000)
ax.set_title('Average Price by Maker')
ax.set_xlabel('Maker')
ax.set_ylabel('Average Price')
plt.show()

# Car age vs Price

In [None]:
# Price against vehicle age; price is drawn on a logarithmic y-axis and the
# x-axis is limited to ages 0-30.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='car_age', y='price', alpha=0.6, ax=ax)
ax.set_title('Car Age vs Price')
ax.set_xlabel('Car Age (Years)')
ax.set_ylabel('Price')
ax.set_yscale('log')  # log scale for better visibility
ax.set_xlim(0, 30)
plt.show()

# Car price by fuel type

In [None]:
# Box plot of price per fuel type, boxes ordered by ascending mean price.
# Rows whose fuel_type is "–" or "not supported" are left out of the ordering,
# so those two labels are not listed in `order`.
placeholder_fuels = ["–", "not supported"]
known_fuel_rows = df[~df['fuel_type'].isin(placeholder_fuels)]
fuel_order = known_fuel_rows.groupby('fuel_type')['price'].mean().sort_values().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='fuel_type', y='price', order=fuel_order, ax=ax)
ax.set_title('Average price by fuel type')
ax.set_xlabel('Fuel type')
ax.set_ylabel('Price')
ax.set_ylim(0, 125000)
plt.show()

# Car price by exterior color

In [None]:
# Box plot of price per grouped exterior colour, boxes ordered by ascending
# mean price.
color_order = (
    df.groupby('ext_col_grouped')['price']
    .mean()
    .sort_values(ascending=True)
    .index
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='ext_col_grouped', y='price', order=color_order, ax=ax)
ax.set_title('Average price by exterior color')
ax.set_xlabel('Exterior color')
ax.set_ylabel('Price')
ax.set_ylim(0, 100000)
plt.show()

# Car price by clean title

In [None]:
# Box plot of price for each value of the 'clean_title' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='clean_title', y='price', ax=ax)
ax.set_title('Average price by clean title')
# NOTE(review): the tick labels are assigned purely by position (0 -> 'Yes',
# 1 -> 'No'). Confirm that the first box really is the clean-title group; the
# category order depends on how 'clean_title' is encoded in the CSV.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Yes', 'No'])
ax.set_xlabel('Does the vehicle have clean title')
ax.set_ylabel('Price')
ax.set_ylim(0, 125000)
plt.show()

# Heatmap

In [None]:
# Pairwise correlations between the numeric columns.
corr = df.corr(numeric_only=True)
# Keep only the 'price' column and drop price's correlation with itself.
price_corr = corr[['price']].drop('price')
# Strongest positive correlation at the top of the heatmap.
price_corr_sorted = price_corr.sort_values(by='price', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    price_corr_sorted,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    ax=ax,
)
ax.set_title("Correlation of Features with Price", fontsize=14)
plt.show()