In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno

from core.src.constants import CORE_FORMATTED_CSV, ROCAR_CSV

In [None]:
# Global seaborn/matplotlib defaults. Cells that need a different size create
# their own figure right before plotting, so no figure is opened here (an
# unused one would only show up as an empty output).
sns.set(rc={"figure.figsize": (10, 10)})

# Load the dataset from CORE_FORMATTED_CSV; every later cell reads from `df`.
df = pd.read_csv(CORE_FORMATTED_CSV)

In [None]:
# Nullity matrix: shows where values are missing across the rows of each column.
msno.matrix(df)

In [None]:
# Bar chart of how many non-null values each column holds.
msno.bar(df)

In [None]:
def plot_distribution_on_categorised_columns(df, columns=None):
    """Print the value counts and draw a count plot for each categorical column.

    For every column, the value counts (NaN included) are printed first, then a
    count plot is shown with the bars ordered from the most to the least
    frequent category and the x tick labels rotated by 90 degrees.

    Args:
        df: DataFrame that holds the columns to summarise.
        columns: Optional iterable of column names to plot. When omitted, the
            five categorical columns explored by this notebook are used.

    Raises:
        KeyError: If any requested column is not present in ``df``. The check
            runs up front so no partial output is produced.
    """
    if columns is None:
        columns = [
            "marca",
            "cutie de viteze",
            "tip caroserie",
            "stare",
            "combustibil",
        ]
    else:
        # Materialise once so a generator can be both validated and iterated.
        columns = list(columns)

    missing = [column for column in columns if column not in df.columns]
    if missing:
        raise KeyError(f"columns not found in DataFrame: {missing}")

    for column in columns:
        print(f"{column}: {df[column].value_counts(dropna=False)}")
        plt.figure(figsize=(10, 8))
        # value_counts() sorts descending, so the bars go from most to least common.
        sns.countplot(x=column, data=df, order=df[column].value_counts().index)
        plt.xticks(rotation=90)
        plt.show()


# Run the categorical overview on `df` as it stands at this point in the
# notebook, i.e. before the price filter applied further down.
plot_distribution_on_categorised_columns(df)

In [None]:
# Bucket prices into 20k-wide bins; each bin is labelled with its lower edge.
price_bin_width = 20000
price_edges = np.arange(0, df["price"].max() + price_bin_width, price_bin_width)

df["price_bin"] = pd.cut(
    df["price"],
    bins=price_edges,
    # Labels come from the edges themselves so the two can never disagree in length.
    labels=price_edges[:-1],
    # Without this, a price of exactly 0 falls outside the first (0, 20000]
    # interval and silently becomes NaN.
    include_lowest=True,
)

plt.figure(figsize=(10, 8))
# Bars are ordered by frequency (most common bin first), not by price.
sns.countplot(x="price_bin", data=df, order=df["price_bin"].value_counts().index)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Remove outliers priced above 100k. The comparison is False for NaN, so rows
# without a price are dropped as well.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
df = df[df["price"] <= 100000].copy()

# Plot the distribution again, this time with 5k-wide bins labelled by their
# lower edge.
price_bin_width = 5000
price_edges = np.arange(0, df["price"].max() + price_bin_width, price_bin_width)

df["price_bin"] = pd.cut(
    df["price"],
    bins=price_edges,
    # Labels come from the edges themselves so the two can never disagree in length.
    labels=price_edges[:-1],
    # Without this, a price of exactly 0 falls outside the first (0, 5000]
    # interval and silently becomes NaN.
    include_lowest=True,
)

plt.figure(figsize=(10, 8))
# Bars are ordered by frequency (most common bin first), not by price.
sns.countplot(x="price_bin", data=df, order=df["price_bin"].value_counts().index)
plt.xticks(rotation=90)
plt.show()

In [None]:
def save_df_to_csv(df, path):
    """Write ``df`` to ``path`` as CSV, leaving the index out of the file."""
    df.to_csv(path_or_buf=path, index=False)


# Persist the current frame to ROCAR_CSV.
# NOTE(review): at this point `df` has already been filtered to price <= 100000
# and carries the derived "price_bin" column added for plotting, so both end up
# in the written file - confirm that this is intended.
save_df_to_csv(df, ROCAR_CSV)

In [None]:
# Show the price distribution for each value of "oferit de".
sns.boxplot(x="oferit de", y="price", data=df)
# End on plt.show() like the earlier plotting cells, so the cell renders only
# the figure and not the repr of the returned Axes.
plt.show()

In [None]:
# Show the price distribution for each value of "marca".
sns.boxplot(x="marca", y="price", data=df)
plt.xticks(rotation=90)
# End on plt.show() like the earlier plotting cells, so the cell renders only
# the figure and not the repr of the value returned by plt.xticks.
plt.show()

In [None]:
# Show the price distribution for each value of "anul producției".
sns.boxplot(x="anul producției", y="price", data=df)
plt.xticks(rotation=90)
# End on plt.show() like the earlier plotting cells, so the cell renders only
# the figure and not the repr of the value returned by plt.xticks.
plt.show()

In [None]:
# Show the price distribution per "km" bucket, using 40k-wide bins labelled by
# their lower edge.
km_bin_width = 40000
km_edges = np.arange(0, df["km"].max() + km_bin_width, km_bin_width)

df["km_bin"] = pd.cut(
    df["km"],
    bins=km_edges,
    # Labels come from the edges themselves so the two can never disagree in length.
    labels=km_edges[:-1],
    # Without this, a value of exactly 0 (e.g. a listing with 0 km) falls
    # outside the first (0, 40000] interval and is left out of the plot.
    include_lowest=True,
)

sns.boxplot(x="km_bin", y="price", data=df)
plt.xticks(rotation=90)
# End on plt.show() like the earlier plotting cells, so the cell renders only
# the figure and not the repr of the value returned by plt.xticks.
plt.show()

In [None]:
# Show the price distribution per "putere" bucket, using bins of width 50
# labelled by their lower edge.
putere_bin_width = 50
putere_edges = np.arange(0, df["putere"].max() + putere_bin_width, putere_bin_width)

df["putere_bin"] = pd.cut(
    df["putere"],
    bins=putere_edges,
    # Labels come from the edges themselves so the two can never disagree in length.
    labels=putere_edges[:-1],
    # Without this, a value of exactly 0 falls outside the first (0, 50]
    # interval and is left out of the plot.
    include_lowest=True,
)

sns.boxplot(x="putere_bin", y="price", data=df)
# End on plt.show() like the earlier plotting cells, so the cell renders only
# the figure and not the repr of the returned Axes.
plt.show()

In [None]:
# Show the price distribution per "capacitate cilindrica" bucket, using bins of
# width 500 labelled by their lower edge.
capacitate_bin_width = 500
capacitate_edges = np.arange(
    0,
    df["capacitate cilindrica"].max() + capacitate_bin_width,
    capacitate_bin_width,
)

df["capacitate cilindrica_bin"] = pd.cut(
    df["capacitate cilindrica"],
    bins=capacitate_edges,
    # Labels come from the edges themselves so the two can never disagree in length.
    labels=capacitate_edges[:-1],
    # Without this, a value of exactly 0 falls outside the first (0, 500]
    # interval and is left out of the plot.
    include_lowest=True,
)

sns.boxplot(x="capacitate cilindrica_bin", y="price", data=df)
# End on plt.show() like the earlier plotting cells, so the cell renders only
# the figure and not the repr of the returned Axes.
plt.show()

In [None]:
# Show the price distribution for each value of "cutie de viteze".
sns.boxplot(x="cutie de viteze", y="price", data=df)
# End on plt.show() like the earlier plotting cells, so the cell renders only
# the figure and not the repr of the returned Axes.
plt.show()

In [None]:
# Show the price distribution for each value of "tip caroserie".
sns.boxplot(x="tip caroserie", y="price", data=df)
# End on plt.show() like the earlier plotting cells, so the cell renders only
# the figure and not the repr of the returned Axes.
plt.show()

In [None]:
# Show the price distribution for each value of "stare".
sns.boxplot(x="stare", y="price", data=df)
# End on plt.show() like the earlier plotting cells, so the cell renders only
# the figure and not the repr of the returned Axes.
plt.show()