# Library Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Loading data

In [None]:
# Locations of the raw train/test splits. These are relative paths, so they
# resolve against the kernel's current working directory.
TRAIN_CSV = '../data/raw/train.csv'
TEST_CSV = '../data/raw/test.csv'

# Load both splits up front so later cells can compare them.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# General Inspection

In [None]:
# Structural summary of the training set: columns, dtypes and non-null counts.
train_df.info()

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns, not just the numeric ones.
train_df.describe(include='all')

In [None]:
# Peek at the first rows to see what the raw values actually look like.
train_df.head()

Comparison with test_df

In [None]:
# Same structural summary for the test set, to compare its columns and dtypes
# against the training set shown above.
test_df.info()

# Target Distribution

In [None]:
# Histogram of the target with a KDE overlay, to show the overall shape of the
# price distribution and how long its tail is.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train_df['price'], kde=True, bins=40, ax=ax)
ax.set_title("Price Distribution")
ax.set_xlabel("Price (£)")
ax.set_ylabel("Count")
# Render explicitly so the cell does not echo the repr of its last expression.
plt.show()

### Interpretation

The price distribution is **heavily right-skewed**, with the majority of cars priced between **£5,000 and £30,000**, and a long tail extending to over **£140,000**.

### Consequences

- This skewness could **negatively impact regression models**, especially those sensitive to outliers or assuming normality (e.g., Linear Regression).
- A **log-transformation** of the target variable (`log(price)` or `log1p(price)`) might help stabilize variance and improve model performance.
- There may be **outliers** at the higher end of the price spectrum — consider capping or removing them depending on how models behave.

# Feature relationships

In [None]:
# Pearson correlation of every numeric column with the target, strongest
# positive first. 'price' itself appears in the result with a correlation of 1.
price_corr = train_df.corr(numeric_only=True)['price']
price_corr.sort_values(ascending=False)

# Relationship between categorical variables and price

Transmission vs price

In [None]:
# Price spread per transmission type, one box per category.
fig, ax = plt.subplots()
sns.boxplot(x='transmission', y='price', data=train_df, ax=ax)
ax.set_title("Price by Transmission")
# Render explicitly so the cell does not echo the Axes repr.
plt.show()

Mean price per brand

In [None]:
# Average price for each brand, most expensive brand first.
brand_mean_price = train_df.groupby('brand')['price'].mean()
brand_mean_price.sort_values(ascending=False)

# Checking for missing values

In [None]:
# Number of missing entries per column in the training set, worst column first.
missing_per_column = train_df.isna().sum()
missing_per_column.sort_values(ascending=False)

Perfect — the training set has no missing values that need handling.

# Comparison of train vs test sets

In [None]:
# Overlay the train and test density of each numeric feature, one figure per
# feature, to eyeball whether the two splits come from the same distribution.
compare_cols = ['mileage', 'engineSize', 'mpg']
for feature in compare_cols:
    fig, ax = plt.subplots()
    sns.kdeplot(train_df[feature], label='train', ax=ax)
    sns.kdeplot(test_df[feature], label='test', ax=ax)
    ax.set_title(feature)
    ax.legend()
    plt.show()

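The train and test distributions of `mileage`, `engineSize`, and `mpg` look sufficiently similar, so there is no obvious shift between the two sets for these features.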

# Outlier detection

In [None]:
num_cols = ['price', 'mileage', 'mpg', 'engineSize', 'tax']

# One box plot per numeric column, side by side, to spot extreme values.
# squeeze=False keeps `axes` two-dimensional even when num_cols has one entry,
# so the loop below works for any non-empty column list.
fig, axes = plt.subplots(1, len(num_cols), figsize=(15, 6), squeeze=False)
for ax, col in zip(axes[0], num_cols):
    sns.boxplot(y=train_df[col], ax=ax)
    ax.set_title(col)

# The layout only needs computing once, after every panel has been drawn.
fig.tight_layout()
plt.show()

# Feature combination exploration

In [None]:
# Price against mileage, coloured by brand; alpha keeps overlapping points readable.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=train_df,
    x='mileage',
    y='price',
    hue='brand',
    alpha=0.5,
    ax=ax,
)
ax.set_title("Price vs Mileage by Brand")
plt.show()

In [None]:
# Fuel economy against engine size, coloured by brand; alpha keeps overlapping
# points readable.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=train_df,
    x='engineSize',
    y='mpg',
    hue='brand',
    alpha=0.5,
    ax=ax,
)
ax.set_title("MPG vs Engine Size by Brand")
plt.show()