# Analysis of car parameters

The data comes from [otomoto.pl](https://www.otomoto.pl/osobowe); the access date and time are encoded in the file name (`carsDD_MM_YYYY_hh_mm.csv`).

The main goal of the analysis is to find driver profiles for the model.

In [None]:
import os
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from utils import add_path, get_path_with, save_in

# Called before the `model` import below; presumably add_path() makes the
# project package importable (e.g. by extending sys.path) — confirm in utils.
add_path()
from model import CV, EV, PHEV

# Vehicle-type labels in the order reused by later cells for loops and as
# the category order of plots.
car_types = [CV, PHEV, EV]

### Set plot parameters

In [None]:
# Global matplotlib appearance: translucent grid lines and LaTeX text rendering.
# Alternative themes that were tried: plt.style.use("dark_background"),
# plt.style.use("grayscale").
plt.rcParams["grid.alpha"] = 0.3
plt.rcParams["text.usetex"] = True

# Keyword arguments presumably meant for seaborn KDE plots; not referenced by
# the cells shown in this notebook.
kde_params = dict(fill=True, linewidth=1, common_norm=False)

# Keyword arguments unpacked into sns.histplot calls: filled, borderless bars
# on a density scale with a KDE overlay, normalised and binned per hue level.
hist_params = dict(
    fill=True,
    linewidth=0,
    common_norm=False,
    common_bins=False,
    kde=True,
    stat="density",
)

### Set data

In [None]:
# Load the scraped listings. The file name encodes the scrape timestamp as
# carsDD_MM_YYYY_hh_mm.csv.
data_file = os.sep.join([get_path_with("data"), "data", "cars02_05_2023_01_21.csv"])
df = pd.read_csv(data_file)

# Map every raw fuel type onto one of the three vehicle-type labels.
aliases = {
    "petrol": CV,
    "diesel": CV,
    "petrol-lpg": CV,
    "petrol-cng": CV,
    "electric": EV,
    "hybrid": PHEV,
}

# Keep failing with KeyError on fuel types that have no alias (including
# missing values), but report all offending values at once.
unknown_fuel_types = set(df["fuel_type"].unique()) - set(aliases)
if unknown_fuel_types:
    raise KeyError(
        f"fuel_type values without an alias: {sorted(unknown_fuel_types, key=str)}"
    )
df["type"] = df["fuel_type"].map(aliases)

# Reference year for the age computation, taken from the timestamp in the
# file name so it stays correct when a different scrape is loaded.
today_year = datetime.datetime.strptime(
    os.path.basename(data_file), "cars%d_%m_%Y_%H_%M.csv"
).year
df["age"] = today_year - df["year"]

# Average mileage per year of age (floor division). Cars of age 0 get NaN
# because their yearly mileage is undefined.
df["mean_year_mileage"] = df["mileage"] // df["age"].where(df["age"] != 0)

# Drop cars that are 30 years old or older.
df = df[df["age"] < 30]

In [None]:
# Per-column count of non-missing values among the rows whose type is CV.
df[df["type"] == CV].count()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, extremes) of `df`.
df.describe()

## Initial car distribution

In [None]:
# Share of each vehicle type among cars younger than five years.
# The age filter is evaluated once and reused for the plot and the shares.
young_cars = df[df["age"] < 5]
sns.countplot(young_cars, x="type", order=car_types)

initial_factorial = {}
for c_type in car_types:
    # Fraction of young cars that belong to this vehicle type.
    initial_factorial[c_type] = len(young_cars[young_cars["type"] == c_type]) / len(
        young_cars
    )
    print(f'"{c_type}": {initial_factorial[c_type]}')

## Mean mileage

In [None]:
# Hand-picked annual-mileage value per vehicle type (units presumably
# km/year — TODO confirm). In the cells shown here only the keys are
# iterated; the values appear solely in a commented-out axvline call.
suggest_profiles = {
    CV: 14000,
    PHEV: 8000,
    EV: 3000,
}

In [None]:
# Box plot of car age per vehicle type on an explicit 10x5 inch axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(df, x="type", y="age", ax=ax)

ax.set_title("Car age");

In [None]:
# Age cut-off (in years) used by the following cells to select recent cars.
max_age = 5

In [None]:
fig, ax = plt.subplots(figsize=(16, 5))
max_age_median = max_age

# Cars no older than the cut-off, selected once and reused below.
# NOTE(review): this filter is inclusive (`<=`) while other cells use
# `age < max_age` — confirm which bound is intended.
recent_cars = df[df["age"] <= max_age_median]

# Median of the per-car average annual mileage for each vehicle type
# (missing values are skipped by median()).
median_mean_mileage = {
    c_type: recent_cars.loc[
        recent_cars["type"] == c_type, "mean_year_mileage"
    ].median()
    for c_type in (CV, EV, PHEV)
}

# Density histogram per vehicle type; cars at or above 75,000 per year are
# left out of the plot.
hist_plot = sns.histplot(
    recent_cars[recent_cars["mean_year_mileage"] < 75_000],
    x="mean_year_mileage",
    hue="type",
    **hist_params,
)

# Mark every per-type median with a vertical line.
for median in median_mean_mileage.values():
    plt.axvline(x=median, color="m")

plt.xlim([None, 35_000])
plt.title("histplot of mean year mileage")
plt.xlabel("mean year mileage");

In [None]:
fig, ax = plt.subplots(figsize=(5, 3))

# One horizontal median marker per vehicle type. xmin/xmax are fractions of
# the axes width, so each type gets an equal-width slot; the slot count is
# taken from car_types instead of a literal 3.
n_types = len(car_types)
for i, key in enumerate(car_types):
    plt.axhline(
        y=median_mean_mileage[key], xmin=i / n_types, xmax=(i + 1) / n_types
    )

# Violin per vehicle type, in the same order as the median markers.
# NOTE(review): this subset uses `age < max_age`, whereas the medians were
# computed with `age <= max_age` — confirm the bounds should differ.
sns.violinplot(
    df[(df["mean_year_mileage"] < 100000) & (df["age"] < max_age)],
    y="mean_year_mileage",
    x="type",
    order=car_types,
)

plt.legend(["median"])
plt.ylabel("average annual mileage")
plt.title("average annual mileage")
save_in("averageAnnualMileage", "pictures", plt)

In [None]:
# Density histogram (with KDE overlay) of average annual mileage for all
# vehicle types together, with the per-type medians marked.
fig, ax = plt.subplots(figsize=(16, 5))

is_plotted = (df["mean_year_mileage"] < 75_000) & (df["age"] <= max_age_median)
sns.histplot(df[is_plotted], x="mean_year_mileage", ax=ax, **hist_params)

for vehicle_type in suggest_profiles:
    ax.axvline(median_mean_mileage[vehicle_type], color="m")

ax.set_xlim(None, 35_000)
ax.set_title("KDE of mean year mileage")
ax.set_xlabel("mean year mileage");

In [None]:
# Display the per-type median of the average annual mileage.
median_mean_mileage

# Profiles distribution

In [None]:
# Single violin of average annual mileage over all cars younger than
# max_age, with the per-type medians drawn as horizontal lines.
sns.violinplot(df[(df["age"] < max_age)], y="mean_year_mileage")
for key in suggest_profiles:
    plt.axhline(y=median_mean_mileage[key], color="m")

# Set the label explicitly: the default would be the column name, whose
# underscores are not valid LaTeX text while text.usetex is enabled.
plt.ylabel("average annual mileage");

In [None]:
profiles_distribution = {}
# Weight (out of 10) given to the outer median when placing a threshold
# between it and the PHEV median.
k = 9

# Recent cars with a defined average annual mileage. Cars of age 0 have NaN
# mileage; every comparison with NaN is False, so keeping them in the
# denominator would silently push them into the PHEV remainder below.
recent_cars = df[(df["age"] < max_age) & df["mean_year_mileage"].notna()]
recent_mileage = recent_cars["mean_year_mileage"]
n_recent = len(recent_cars)

# Thresholds: k : (10 - k) weighted means of the CV (resp. EV) median and
# the PHEV median.
upper_threshold = (
    median_mean_mileage[CV] * k + (10 - k) * median_mean_mileage[PHEV]
) / 10
lower_threshold = (
    median_mean_mileage[EV] * k + (10 - k) * median_mean_mileage[PHEV]
) / 10

# Share of cars driven more than the upper threshold -> CV profile.
profiles_distribution[CV] = len(recent_cars[recent_mileage > upper_threshold]) / n_recent

# Share of cars driven less than the lower threshold -> EV profile.
profiles_distribution[EV] = len(recent_cars[recent_mileage < lower_threshold]) / n_recent

# Everything in between is assigned to the PHEV profile.
profiles_distribution[PHEV] = 1 - profiles_distribution[CV] - profiles_distribution[EV]

In [None]:
# Mileage threshold above which a car is counted towards the CV profile:
# a k : (10 - k) weighted mean of the CV and PHEV medians.
(median_mean_mileage[CV] * k + (10 - k) * median_mean_mileage[PHEV]) / 10

In [None]:
# Display the median average annual mileage of the CV type.
median_mean_mileage[CV]

In [None]:
# Display all per-type medians for comparison with the threshold above.
median_mean_mileage

In [None]:
# Display the resulting share of each driver profile (keys are CV, EV, PHEV).
profiles_distribution