In [1]:
# Data Manipulation
import pandas as pd

# Data Visualization
import plotly.express as px
import matplotlib.pyplot as plt

# Other libraries if needed
import seaborn as sns

import seaborn as sns
import matplotlib.pyplot as plt

import os
#print(os.getcwd())

In [None]:
# Read the vehicle listings CSV; the path is relative to the working directory
listings_csv = '../vehicles_us.csv'
df = pd.read_csv(listings_csv)

# Print the column / dtype / non-null overview, then preview the leading rows
df.info()
df.head()

In [None]:
# Summary statistics for the loaded listings, as produced by DataFrame.describe()
df.describe()


In [None]:
# Count the missing (NaN) entries in every column
df.isna().sum()

In [5]:
# Numeric columns: replace each NaN with that column's median
for numeric_col in ('model_year', 'cylinders', 'odometer'):
    column_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(column_median)

# 'is_4wd': a NaN is stored as 0, i.e. treated as "not 4WD"
# (assumption made by this analysis -- confirm against the data source)
df['is_4wd'] = df['is_4wd'].fillna(0)

# Text columns: replace each NaN with the column's mode (the first one if tied)
for label_col in ('paint_color', 'type'):
    most_common = df[label_col].mode()[0]
    df[label_col] = df[label_col].fillna(most_common)


In [None]:
# Re-count missing entries per column to see what is left after the fills
print(df.isna().sum())

In [None]:
# Parse the 'date_posted' strings (expected as YYYY-MM-DD) into datetime values
df['date_posted'] = pd.to_datetime(df['date_posted'], format='%Y-%m-%d')

# Columns to store with the pandas 'category' dtype for grouping and analysis
categorical_columns = ['fuel', 'transmission', 'type', 'paint_color', 'condition']
df = df.astype({name: 'category' for name in categorical_columns})

# Show the resulting dtypes
df.info()

In [None]:
# 1. Price distribution: histogram of the 'price' column with nbins=50
fig1 = px.histogram(
    data_frame=df,
    x='price',
    nbins=50,
    title='Distribution of Car Prices',
)
fig1.show()

In [None]:
# 2. Odometer reading against price, one marker per row of df
odometer_price_labels = {
    'odometer': 'Odometer Reading (miles)',
    'price': 'Price (USD)',
}
fig2 = px.scatter(
    df,
    x='odometer',
    y='price',
    labels=odometer_price_labels,
    title='Odometer Reading vs. Price',
)
fig2.show()

In [None]:
# 3. Model year against price, one marker per row of df
year_price_labels = {
    'model_year': 'Model Year',
    'price': 'Price (USD)',
}
fig3 = px.scatter(
    df,
    x='model_year',
    y='price',
    labels=year_price_labels,
    title='Model Year vs. Price',
)
fig3.show()

In [None]:
# 4. Histogram of the 'days_listed' column with nbins=30
fig4 = px.histogram(
    data_frame=df,
    x='days_listed',
    nbins=30,
    title='Distribution of Days Listed',
)
fig4.show()

In [None]:
# Price against odometer reading, with markers colored by the 'condition' column
# so trends can be compared across conditions.
fig5 = px.scatter(
    data_frame=df,
    x='odometer',
    y='price',
    color='condition',
    title='Price vs Odometer',
)
fig5.show()

In [None]:
# Box plot of price for each fuel type, to compare price spread across fuels
fig6 = px.box(
    data_frame=df,
    x='fuel',
    y='price',
    title='Box Plot of Car Prices by Fuel Type',
)
fig6.show()

In [None]:
# Bar-style histogram counting the rows of df for each transmission type
fig7 = px.histogram(
    data_frame=df,
    x='transmission',
    title='Count of Cars by Transmission Type',
)
fig7.show()

In [None]:
# Heatmap of the average price for each (model-year bin, condition) cell.
#
# px.density_heatmap bins the x axis and aggregates z inside every cell; when z
# is supplied without histfunc the aggregate is a SUM. Feeding it pre-averaged
# rows therefore showed "sum of per-year averages" per cell, not an average.
# Plotting the listings directly with histfunc='avg' gives the mean price of
# the rows that fall in each cell, which is what the title promises.
#
# avg_price is still built as a tidy (model_year, condition, price) summary
# table for inspection or reuse. observed=True keeps only the combinations that
# actually occur when 'condition' is a categorical column, and makes the result
# independent of the pandas version's default for that option.
avg_price = (
    df.groupby(['model_year', 'condition'], observed=True)['price']
    .mean()
    .reset_index()
)
fig8 = px.density_heatmap(df, x='model_year', y='condition', z='price', histfunc='avg',
                          title='Heatmap of Average Price by Model Year and Condition')
fig8.show()

In [None]:
# Pie chart whose slices are the share of rows for each value of 'type'
fig9 = px.pie(
    data_frame=df,
    names='type',
    title='Distribution of Car Types',
)
fig9.show()

In [None]:
# Listings over time: bucket each posting date into its calendar month and
# store the month as a string in a new 'month_year' column.
posted_on = pd.to_datetime(df['date_posted'])
df['month_year'] = posted_on.dt.to_period('M').astype(str)

# Number of rows per month, sorted by the month label
listings_per_month = df['month_year'].value_counts().sort_index()

# Line chart of the monthly counts
fig10 = px.line(listings_per_month, title='Number of Car Listings Over Time')
fig10.show()
