# Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import os

# The quoted '__file__' is an ordinary relative path (not the module
# attribute), so abspath() resolves it against the current working directory
# and dirname() then yields that directory.
placeholder_path = os.path.abspath('__file__')
notebook_dir = os.path.dirname(placeholder_path)

# Location of the feature-engineered dataset, relative to the directory above.
# Change if necessary.
data_path = os.path.join(
    notebook_dir,
    '../data/processed/car_prices_after_FE.csv',
)

# Load the CSV into the dataframe used by the rest of the notebook
df = pd.read_csv(data_path)

## Initial exploration and visualization

In [None]:
# Preview the first five rows to recall the dataframe's layout
df.head(n=5)

In [None]:
# By default a notebook cell only echoes its final expression, so the shape
# has to be printed explicitly; as a bare expression it was silently dropped.
print(df.shape)

# info() writes its column/dtype/non-null summary to stdout on its own
df.info()

# Kept as the last expression so the notebook renders the summary statistics
df.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Grid of pairwise plots across the dataframe's columns, with a kernel
# density estimate drawn on the diagonal
sns.pairplot(df, diag_kind="kde")
plt.show()

## Null and unique values

In [None]:
# Count the missing entries in every column
null_values = df.isna().sum()

# Report the raw count of missing values per column
print("Missing values per column:", null_values, sep="\n")

# Express the same counts as a share of the total number of rows (optional)
missing_percentage = null_values.div(len(df)).mul(100)
print("\nPercentage of missing values per column:", missing_percentage, sep="\n")

In [None]:
# Number of distinct non-null values in each column
df.nunique(axis=0, dropna=True)

## Correlation matrix

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns
tmap = sns.heatmap(
    df.corr(numeric_only=True),
    cmap='coolwarm',
    linecolor="white",
    linewidths=0.5,
    fmt="0.2f",
    annot=True,
)

# Blank out weak correlations so only cells with |r| >= 0.3 (as rounded by
# fmt) keep their annotation. The negated comparison also blanks "nan"
# labels, because every comparison against NaN is false.
for label in tmap.texts:
    if not abs(float(label.get_text())) >= 0.3:
        label.set_text("")

plt.title('correlation matrix', loc='center')

# tight_layout() only adjusts what is already on the figure, so it must run
# after the heatmap and title exist rather than before anything is drawn.
plt.tight_layout()
plt.show()