# 🚗 Car Price Prediction - Comprehensive EDA

## Overview
This notebook provides a comprehensive Exploratory Data Analysis (EDA) for the EU Car Pricing dataset. We'll analyze data quality, distributions, relationships, and prepare insights for the machine learning pipeline.

## Dataset Information
- **Source**: EUDS_CaseStudy_Pricing.csv
- **Target Variable**: targetPrice (car price in euros)
- **Features**: 18 features including car specifications, grades, and metadata
- **Size**: ~18,575 records


## 📦 Package Installation


In [None]:
# Install required packages
%pip install -Uq pip
%pip install -Uq numpy scipy seaborn matplotlib pandas pyarrow plotly scikit-learn


## 📚 Import Libraries


In [1]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

# Create output directory
import os
os.makedirs("artifacts", exist_ok=True)

print("✅ Libraries imported successfully!")


✅ Libraries imported successfully!


## 📊 Data Loading


In [4]:
# Load the dataset
df = pd.read_csv('../data/EUDS_CaseStudy_Pricing.csv')

print("📈 Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"Columns: {list(df.columns)}")


📈 Dataset loaded successfully!
Shape: (18575, 19)
Memory usage: 12.04 MB
Columns: ['vehicleID', 'registrationDate', 'kilometers', 'colour', 'aestheticGrade', 'mechanicalGrade', 'saleDate', 'make', 'model', 'doorNumber', 'type', 'fuel', 'transmission', 'yearIntroduced', 'cylinder', 'cubeCapacity', 'powerKW', 'powerHP', 'targetPrice']


## 🔍 Basic Data Information


In [None]:
# Basic information about the dataset
print("=== DATASET OVERVIEW ===")
print(f"Rows: {df.shape[0]:,}")
print(f"Columns: {df.shape[1]}")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n=== DATA TYPES ===")
print(df.dtypes.value_counts())

print("\n=== FIRST FEW ROWS ===")
df.head()


## 📋 Missing Values Analysis


In [None]:
# Missing values analysis
missing_data = df.isnull().sum()
missing_pct = (missing_data / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing %': missing_pct
}).sort_values('Missing Count', ascending=False)

print("=== MISSING VALUES SUMMARY ===")
print(missing_df[missing_df['Missing Count'] > 0])

# Visualize missing values
if missing_data.sum() > 0:
    plt.figure(figsize=(12, 6))
    missing_df[missing_df['Missing Count'] > 0].plot(kind='bar', y='Missing Count')
    plt.title('Missing Values by Column')
    plt.xlabel('Columns')
    plt.ylabel('Missing Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('artifacts/missing_values_bar.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    # Missing values heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(df.isnull(), cbar=True, yticklabels=False, cmap='viridis')
    plt.title('Missing Values Heatmap')
    plt.tight_layout()
    plt.savefig('artifacts/missing_values_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("✅ No missing values found!")


## 🎯 Target Variable Analysis


In [None]:
# Target variable analysis
target = 'targetPrice'

print("=== TARGET VARIABLE STATISTICS ===")
print(f"Mean: €{df[target].mean():,.2f}")
print(f"Median: €{df[target].median():,.2f}")
print(f"Std: €{df[target].std():,.2f}")
print(f"Min: €{df[target].min():,.2f}")
print(f"Max: €{df[target].max():,.2f}")
print(f"Skewness: {df[target].skew():.3f}")
print(f"Kurtosis: {df[target].kurtosis():.3f}")

# Create comprehensive target variable visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Histogram
axes[0, 0].hist(df[target], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 0].set_title('Target Price Distribution (Histogram)')
axes[0, 0].set_xlabel('Price (€)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].grid(True, alpha=0.3)

# Box plot
axes[0, 1].boxplot(df[target], patch_artist=True, boxprops=dict(facecolor='lightgreen'))
axes[0, 1].set_title('Target Price Distribution (Box Plot)')
axes[0, 1].set_ylabel('Price (€)')
axes[0, 1].grid(True, alpha=0.3)

# Log-transformed histogram
log_target = np.log1p(df[target])
axes[1, 0].hist(log_target, bins=50, alpha=0.7, color='orange', edgecolor='black')
axes[1, 0].set_title('Log-Transformed Target Price Distribution')
axes[1, 0].set_xlabel('Log(Price + 1)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].grid(True, alpha=0.3)

# Q-Q plot for normality check
from scipy import stats
stats.probplot(df[target], dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot: Target Price vs Normal Distribution')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('artifacts/target_variable_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Price ranges analysis
print("\n=== PRICE RANGES ANALYSIS ===")
price_ranges = pd.cut(df[target], bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
print(price_ranges.value_counts().sort_index())


## 📊 Numeric Features Analysis


In [None]:
# Identify numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric columns: {numeric_cols}")

# Basic statistics for numeric columns
print("\n=== NUMERIC FEATURES SUMMARY ===")
numeric_summary = df[numeric_cols].describe()
print(numeric_summary)

# Create distribution plots for all numeric features
n_cols = 3
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows))
axes = axes.flatten() if n_rows > 1 else [axes] if n_rows == 1 else axes

for i, col in enumerate(numeric_cols):
    if i < len(axes):
        # Histogram
        axes[i].hist(df[col].dropna(), bins=30, alpha=0.7, color='skyblue', edgecolor='black')
        axes[i].set_title(f'{col} Distribution')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Frequency')
        axes[i].grid(True, alpha=0.3)
        
        # Add statistics text
        stats_text = f'Mean: {df[col].mean():.2f}\nStd: {df[col].std():.2f}\nSkew: {df[col].skew():.2f}'
        axes[i].text(0.7, 0.8, stats_text, transform=axes[i].transAxes, 
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Hide empty subplots
for i in range(len(numeric_cols), len(axes)):
    axes[i].set_visible(False)

plt.tight_layout()
plt.savefig('artifacts/numeric_distributions.png', dpi=300, bbox_inches='tight')
plt.show()


## 📦 Box Plots for Numeric Features


In [None]:
# Create box plots for numeric features to identify outliers
n_cols = 3
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows))
axes = axes.flatten() if n_rows > 1 else [axes] if n_rows == 1 else axes

for i, col in enumerate(numeric_cols):
    if i < len(axes):
        # Box plot
        box_plot = axes[i].boxplot(df[col].dropna(), patch_artist=True)
        box_plot['boxes'][0].set_facecolor('lightcoral')
        axes[i].set_title(f'{col} Box Plot')
        axes[i].set_ylabel(col)
        axes[i].grid(True, alpha=0.3)
        
        # Add outlier count
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)][col]
        axes[i].text(0.7, 0.8, f'Outliers: {len(outliers)}', 
                    transform=axes[i].transAxes,
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Hide empty subplots
for i in range(len(numeric_cols), len(axes)):
    axes[i].set_visible(False)

plt.tight_layout()
plt.savefig('artifacts/numeric_boxplots.png', dpi=300, bbox_inches='tight')
plt.show()


## 🔗 Correlation Analysis


In [None]:
# Correlation analysis
correlation_matrix = df[numeric_cols].corr()

# Create correlation heatmap
plt.figure(figsize=(14, 12))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', cbar_kws={"shrink": .8})
plt.title('Correlation Matrix Heatmap')
plt.tight_layout()
plt.savefig('artifacts/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Find high correlations
print("=== HIGH CORRELATIONS (>0.7) ===")
high_corr_pairs = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        corr_val = correlation_matrix.iloc[i, j]
        if abs(corr_val) > 0.7:
            high_corr_pairs.append((correlation_matrix.columns[i], correlation_matrix.columns[j], corr_val))

if high_corr_pairs:
    for col1, col2, corr in high_corr_pairs:
        print(f"{col1} - {col2}: {corr:.3f}")
else:
    print("No high correlations found (>0.7)")

# Target variable correlations
print("\n=== TARGET VARIABLE CORRELATIONS ===")
target_correlations = correlation_matrix[target].drop(target).sort_values(key=abs, ascending=False)
print(target_correlations)


## 📊 Categorical Features Analysis


In [None]:
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns: {categorical_cols}")

# Analyze each categorical feature
print("\n=== CATEGORICAL FEATURES SUMMARY ===")
for col in categorical_cols:
    print(f"\n{col}:")
    print(f"  Unique values: {df[col].nunique()}")
    print(f"  Most common values:")
    print(df[col].value_counts().head(5))

# Create visualizations for categorical features
n_cols = 2
n_rows = (len(categorical_cols) + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows))
axes = axes.flatten() if n_rows > 1 else [axes] if n_rows == 1 else axes

for i, col in enumerate(categorical_cols):
    if i < len(axes):
        # Bar plot for top 10 values
        value_counts = df[col].value_counts().head(10)
        value_counts.plot(kind='bar', ax=axes[i], color='lightgreen')
        axes[i].set_title(f'{col} Value Counts (Top 10)')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Count')
        axes[i].tick_params(axis='x', rotation=45)
        axes[i].grid(True, alpha=0.3)

# Hide empty subplots
for i in range(len(categorical_cols), len(axes)):
    axes[i].set_visible(False)

plt.tight_layout()
plt.savefig('artifacts/categorical_distributions.png', dpi=300, bbox_inches='tight')
plt.show()


## 🎯 Target vs Features Analysis


In [None]:
# Analyze relationship between target and key features
key_features = ['kilometers', 'doorNumber', 'yearIntroduced', 'powerKW', 'powerHP']
key_features = [col for col in key_features if col in df.columns]

n_cols = 2
n_rows = (len(key_features) + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows))
axes = axes.flatten() if n_rows > 1 else [axes] if n_rows == 1 else axes

for i, col in enumerate(key_features):
    if i < len(axes):
        # Scatter plot
        axes[i].scatter(df[col], df[target], alpha=0.5, s=1)
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Target Price (€)')
        axes[i].set_title(f'Target Price vs {col}')
        axes[i].grid(True, alpha=0.3)
        
        # Add correlation coefficient
        corr = df[col].corr(df[target])
        axes[i].text(0.05, 0.95, f'Correlation: {corr:.3f}', 
                    transform=axes[i].transAxes,
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Hide empty subplots
for i in range(len(key_features), len(axes)):
    axes[i].set_visible(False)

plt.tight_layout()
plt.savefig('artifacts/target_vs_features.png', dpi=300, bbox_inches='tight')
plt.show()


## 📊 Categorical Features vs Target Analysis


In [None]:
# Analyze categorical features vs target
categorical_target_cols = ['colour', 'aestheticGrade', 'mechanicalGrade', 'type', 'fuel', 'transmission']
categorical_target_cols = [col for col in categorical_target_cols if col in df.columns]

n_cols = 2
n_rows = (len(categorical_target_cols) + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows))
axes = axes.flatten() if n_rows > 1 else [axes] if n_rows == 1 else axes

for i, col in enumerate(categorical_target_cols):
    if i < len(axes):
        # Box plot for each category
        df.boxplot(column=target, by=col, ax=axes[i])
        axes[i].set_title(f'Target Price by {col}')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Target Price (€)')
        axes[i].tick_params(axis='x', rotation=45)
        axes[i].grid(True, alpha=0.3)

# Hide empty subplots
for i in range(len(categorical_target_cols), len(axes)):
    axes[i].set_visible(False)

plt.tight_layout()
plt.savefig('artifacts/categorical_vs_target.png', dpi=300, bbox_inches='tight')
plt.show()


## 📋 Summary and Recommendations


In [None]:
# Generate comprehensive summary
print("=== EDA SUMMARY AND RECOMMENDATIONS ===")
print(f"\n📊 Dataset Overview:")
print(f"  • Total records: {df.shape[0]:,}")
print(f"  • Total features: {df.shape[1]}")
print(f"  • Missing values: {df.isnull().sum().sum()}")
print(f"  • Duplicate rows: {df.duplicated().sum()}")

print(f"\n🎯 Target Variable (targetPrice):")
print(f"  • Mean: €{df[target].mean():,.2f}")
print(f"  • Median: €{df[target].median():,.2f}")
print(f"  • Range: €{df[target].min():,.2f} - €{df[target].max():,.2f}")
print(f"  • Skewness: {df[target].skew():.3f} (right-skewed)")

print(f"\n🔍 Data Quality Issues:")
if missing_data.sum() > 0:
    print(f"  • Missing values in: {missing_data[missing_data > 0].index.tolist()}")
else:
    print(f"  • No missing values found")

print(f"\n📈 Key Insights:")
if 'powerKW' in df.columns and 'powerHP' in df.columns:
    print(f"  • High correlation between powerKW and powerHP: {df['powerKW'].corr(df['powerHP']):.3f}")
print(f"  • Strongest predictors of price: {target_correlations.head(3).index.tolist()}")
if 'type' in df.columns:
    print(f"  • Most common car type: {df['type'].mode().iloc[0]}")
if 'fuel' in df.columns:
    print(f"  • Most common fuel type: {df['fuel'].mode().iloc[0]}")

print(f"\n🛠️ Preprocessing Recommendations:")
print(f"  • Handle missing values in colour column")
print(f"  • Consider log transformation for target variable (right-skewed)")
print(f"  • Remove or transform outliers in kilometers, powerKW, powerHP")
if 'powerKW' in df.columns and 'powerHP' in df.columns:
    print(f"  • Drop one of powerKW/powerHP due to high correlation")
print(f"  • Encode categorical variables (colour, aestheticGrade, mechanicalGrade, etc.)")
print(f"  • Create derived features: car_age_years, km_per_year")

print(f"\n📊 Model Recommendations:")
print(f"  • Use tree-based models (Random Forest, XGBoost, LightGBM) for high cardinality features")
print(f"  • Consider ensemble methods for better performance")
print(f"  • Use time series cross-validation due to temporal nature of data")
print(f"  • Apply feature selection to reduce multicollinearity")

print(f"\n✅ EDA Complete! All visualizations saved to 'artifacts/' directory.")


## 🔍 YData Profiling - Original Data


In [None]:
# Install ydata-profiling if not already installed
%pip install -Uq ydata-profiling


In [None]:
# Import ydata-profiling
from ydata_profiling import ProfileReport
import os

# Create artifacts directory if it doesn't exist
os.makedirs("artifacts", exist_ok=True)

print("🔍 Generating YData Profiling report for original data...")
print("This may take a few minutes for large datasets...")

# Generate comprehensive profiling report for original data
report_original = ProfileReport(
    df, 
    title="🚗 Car Pricing Dataset - Original Data EDA", 
    explorative=True,
    minimal=False,
    correlations={
        "pearson": {"calculate": True},
        "spearman": {"calculate": True},
        "kendall": {"calculate": True},
        "phi_k": {"calculate": True},
        "cramers": {"calculate": True}
    },
    interactions={
        "continuous": True,
        "targets": ["targetPrice"]
    },
    missing_diagrams={
        "matrix": True,
        "bar": True,
        "heatmap": True,
        "dendrogram": True
    },
    duplicates={
        "head": 10
    },
    samples={
        "head": 5,
        "tail": 5
    }
)

# Save HTML report
html_path_original = "artifacts/eda_report_original.html"
report_original.to_file(html_path_original)

# Save JSON report
json_path_original = "artifacts/eda_report_original.json"
with open(json_path_original, "w") as f:
    f.write(report_original.to_json())

print(f"✅ Original data profiling complete!")
print(f"📄 HTML report saved: {html_path_original}")
print(f"📄 JSON report saved: {json_path_original}")
print(f"🌐 Open the HTML file in your browser to view the interactive report")


## 🔍 YData Profiling - Cleaned Data


In [None]:
# Load cleaned data if it exists
try:
    df_clean = pd.read_csv('../artifacts/clean_data.csv')
    
    print("🔍 Generating YData Profiling report for cleaned data...")
    print("This may take a few minutes for large datasets...")
    
    # Generate comprehensive profiling report for cleaned data
    report_clean = ProfileReport(
        df_clean, 
        title="🚗 Car Pricing Dataset - Cleaned Data EDA", 
        explorative=True,
        minimal=False,
        correlations={
            "pearson": {"calculate": True},
            "spearman": {"calculate": True},
            "kendall": {"calculate": True},
            "phi_k": {"calculate": True},
            "cramers": {"calculate": True}
        },
        interactions={
            "continuous": True,
            "targets": ["targetPrice"]
        },
        missing_diagrams={
            "matrix": True,
            "bar": True,
            "heatmap": True,
            "dendrogram": True
        },
        duplicates={
            "head": 10
        },
        samples={
            "head": 5,
            "tail": 5
        }
    )
    
    # Save HTML report
    html_path_clean = "artifacts/eda_report_clean.html"
    report_clean.to_file(html_path_clean)
    
    # Save JSON report
    json_path_clean = "artifacts/eda_report_clean.json"
    with open(json_path_clean, "w") as f:
        f.write(report_clean.to_json())
    
    print(f"✅ Cleaned data profiling complete!")
    print(f"📄 HTML report saved: {html_path_clean}")
    print(f"📄 JSON report saved: {json_path_clean}")
    print(f"🌐 Open the HTML file in your browser to view the interactive report")
    
    # Compare original vs cleaned data
    print(f"\n📊 Data Comparison:")
    print(f"Original data shape: {df.shape}")
    print(f"Cleaned data shape: {df_clean.shape}")
    print(f"Rows removed: {df.shape[0] - df_clean.shape[0]}")
    print(f"Columns added: {df_clean.shape[1] - df.shape[1]}")
    
except FileNotFoundError:
    print("⚠️ Cleaned data not found. Run 'make preprocess' first to generate clean_data.csv")
    print("Skipping cleaned data profiling...")


## 📊 YData Profiling Summary


In [None]:
# Display YData Profiling reports summary
print("=== YDATA PROFILING REPORTS GENERATED ===")
print("\n📄 Available Reports:")
print("1. Original Data EDA Report:")
print(f"   • HTML: artifacts/eda_report_original.html")
print(f"   • JSON: artifacts/eda_report_original.json")
print("\n2. Cleaned Data EDA Report:")
print(f"   • HTML: artifacts/eda_report_clean.html")
print(f"   • JSON: artifacts/eda_report_clean.json")

print("\n🔍 What's included in YData Profiling reports:")
print("• Dataset overview and statistics")
print("• Variable types and data quality")
print("• Missing values analysis with visualizations")
print("• Duplicate rows detection")
print("• Correlation analysis (Pearson, Spearman, Kendall, Phi-K, Cramers)")
print("• Interaction plots for continuous variables")
print("• Sample data (head and tail)")
print("• Alerts for data quality issues")
print("• Distribution plots for all variables")
print("• Target variable analysis")

print("\n🌐 To view the reports:")
print("1. Open the HTML files in your web browser")
print("2. Navigate through the interactive sections")
print("3. Use the JSON files for programmatic access to the data")

print("\n💡 Pro tip: The YData Profiling reports provide:")
print("• Automated data quality assessment")
print("• Statistical summaries for all variables")
print("• Interactive visualizations")
print("• Data quality alerts and recommendations")
print("• Professional presentation-ready reports")


# 🚗 Car Price Prediction - Comprehensive EDA

## Overview
This notebook provides a comprehensive Exploratory Data Analysis (EDA) for the EU Car Pricing dataset. We'll analyze data quality, distributions, relationships, and prepare insights for the machine learning pipeline.

## Dataset Information
- **Source**: EUDS_CaseStudy_Pricing.csv
- **Target Variable**: targetPrice (car price in euros)
- **Features**: 18 features including car specifications, grades, and metadata
- **Size**: ~18,575 records

In [None]:
## 📦 Package Installation

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Install required packages
%pip install -Uq pip
%pip install -Uq numpy scipy seaborn matplotlib pandas pyarrow plotly scikit-learn

In [None]:
## 📚 Import Libraries


# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

# Create output directory
import os
os.makedirs("artifacts", exist_ok=True)

print("✅ Libraries imported successfully!")

In [None]:
## 📊 Data Loading

: 

Quick EDA using Pandas (ydata) Profiling
**Overview**
**1.** The data is already quite clean with < 0.1% missing cells and 0 duplicate rows.
**2.** The data is small and can fit in RAM, we will avoid dropping rows due to a small n.
**3.** The data has a healthy mix of data types.
**4.** vehicleID is the primary key with all unique values.

**Alerts**
**1.** There are 9 alerts for High correlation, meaning that we will have to be careful of multicollinearity.
**2.** There are a couple of Imbalanced features, meaning that underrepresentation can exist in the data sample.


**Columns**
**registrationDate:** Proxy for car age, normally distributed and centered around ~2011, meaning that the average age of the cars sold are ~14 years (Assuming that the cars are registered around the same time that they are built).
**kilometers:** Heavily right skewed with at least one extreme outlier. The outlier(s) will be removed and the feature will be log transformed to make it more gaussian shaped.
**colour:** Modestly high cardinality, we will use model-based feature importance methods later on to derive it's importance (how much variance it explains in the target feature) and decide whether to keep it or not.
**aestheticGrade:** Low cardinality and likely a modest predictor for the target variable.
**mechanicalGrade:** Low cardinality and likely a modest predictor for the target variable.
**saleDate:** Has the correct datatype, useful as an index for the dataset. Need to remove outliers.
**make:** High cardinality which is okay for tree-based models but may be problematic for linear models.
**model:** High cardinality which is okay for tree-based models but will be problematic for linear models. We may need to bin this feature or exclude it.
**doorNumber:** Low cardinality, fine as is.
**type:** Low cardinality, fine as is.
**fuel:** Low cardinality with an underrepresentation for Electric vehicles.
**transmission:** Binary feature we can map to 1/0; Automatic cars are underrepresented and downsampling is an option if n were larger.
**yearIntroduced:** May have a high correlation with derived car age from registrationDate.
**cylinder:** Need to clip the outlier(s) and impute nulls or drop them.
**cubeCapacity:** Will treat 0's as nulls and impute using the median.
**powerKW:** Likely perfect collinearity with powerHP, will drop one.
**powerHP:** Likely perfect collinearity with powerKW, will drop one.
**targetPrice:** Will predict the log of the target and try quantile predictors.



In [3]:
from ydata_profiling import ProfileReport
import os

os.makedirs("artifacts", exist_ok=True)

report = ProfileReport(df, title="EDA – Car Pricing", explorative=True)

# HTML
html_path = "artifacts/eda_report.html"
report.to_file(html_path)

# JSON (best-supported path)
json_path = "artifacts/eda_report.json"
with open(json_path, "w") as f:
    f.write(report.to_json())

print(f"Saved: {html_path}\nSaved: {json_path}")


NameError: name 'df' is not defined

In [None]:
ProfileReport(df, title="EDA – Car Pricing", explorative=True)

NameError: name 'df' is not defined

Oldschool (Manual) EDA

In [None]:
# df.head()
# df.info()
# df.describe()
# df.isnull().sum()
# df.duplicated().sum()
# df.shape
# df.corr()
# df.skew()
# df.kurtosis()
# df.hist(figsize=(20,20))
# plt.show()





In [None]:
import numpy as np
import pandas as pd

# ensure proper dtypes
df['saleDate'] = pd.to_datetime(df['saleDate'], errors='coerce')
df['yearIntroduced'] = pd.to_numeric(df['yearIntroduced'], errors='coerce')

mask = (
    df['saleDate'].notna() &
    df['yearIntroduced'].notna() &
    (df['yearIntroduced'] > df['saleDate'].dt.year)
)

df_bad = df.loc[mask, ['saleDate','yearIntroduced']]
df_bad

Unnamed: 0,saleDate,yearIntroduced
25,1932-09-02,2014


Target EDA

In [None]:
# df["targetPrice"].hist(bins=100)
# plt.show()

# df["targetPrice"].describe()




Cleaned Data EDA

In [None]:
clean = pd.read_csv('../artifacts/clean_data.csv')
ProfileReport(clean, title="Cleaned Data EDA", explorative=True)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# pd.set_option('display.max_rows', None)
# pd.DataFrame(df["model"].value_counts(normalize=True)[:50])

In [None]:
# pd.DataFrame(df["make"].value_counts(normalize=True)[:50])