In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import missingno as msno

from core.src.constants import FINAL_CSV, CLEANED_CSV
from utils.format import drop_na

In [None]:
# Global seaborn styling; figures created without an explicit size default to 10x10 inches.
sns.set(rc={"figure.figsize": (10, 10)})

# Load the dataset to clean. No figure is opened in this cell: nothing is
# plotted here, and a bare plt.figure(...) would only render an empty canvas
# (the missingno plots below open their own figures).
df = pd.read_csv(FINAL_CSV)

In [None]:
# Nullity matrix: a row-by-row visual overview of where values are missing in each column.
msno.matrix(df)

In [None]:
# Bar chart of how many non-null values each column holds.
msno.bar(df)

# Removing url and path columns

In [None]:
# "url" and "path" are dropped here; df.drop raises KeyError if either is absent.
df = df.drop(columns=["url", "path"])

# Show each column and the percentage of its values that are not null

In [None]:
def show_columns(df):
    """Print one line per column with the percentage of its rows that are non-null.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. Output goes to stdout as ``<column>: <percent>%`` with two decimals.
    """
    row_total = len(df)
    for name in df.columns:
        # Series.count() counts only the non-null entries of the column.
        filled = df[name].count()
        print(f"{name}: {filled / row_total * 100:.2f}%")


# Print the non-null share of every column currently in df.
show_columns(df)

# Remove electric vehicles and their specific columns as they are outliers

In [None]:
# Row count before filtering, used at the end of the cell to report how many rows were removed.
initial_len = len(df)


def remove_electric_vehicles(df):
    """Return the rows of ``df`` whose "combustibil" value is not "electric".

    Rows with a missing "combustibil" are kept, since a missing value does not
    compare equal to "electric". The input frame is not modified.
    """
    not_electric = df["combustibil"].ne("electric")
    return df.loc[not_electric]


def remove_electric_columns(df):
    """Return a copy of ``df`` without the electric-vehicle-specific columns.

    Raises:
        KeyError: if any of the listed columns is absent from ``df``.
    """
    electric_only = [
        "vehicule electrice",
        "autonomie",
        "capacitate baterie",
        "contract baterie",
        "timp de incarcare",
    ]
    return df.drop(columns=electric_only)


# Filter out the electric rows first, then discard the electric-only columns.
df = remove_electric_columns(remove_electric_vehicles(df))

# Only the row filter changes the length, so the difference is the number of removed vehicles.
print(f"Removed {initial_len - len(df)} electric vehicles")

# Remove vehicles that are in leasing, as their price is not accurate for our needs

In [None]:
# Row count before the leasing filter, used at the end of the cell to report how many rows were removed.
initial_len = len(df)


def remove_leasing(df):
    """Return the rows of ``df`` whose "predare leasing" value is not "da".

    Rows with a missing "predare leasing" are kept, since a missing value does
    not compare equal to "da". The input frame is not modified.
    """
    not_leasing = df["predare leasing"].ne("da")
    return df.loc[not_leasing]


def remove_leasing_columns(df):
    """Return a copy of ``df`` without the leasing-specific columns.

    Raises:
        KeyError: if any of the listed columns is absent from ``df``.
    """
    leasing_only = [
        "predare leasing",
        "plata initiala (la predare)",
        "valoare rata lunara",
        "numar de rate lunare ramase",
        "valoare reziduala",
    ]
    return df.drop(columns=leasing_only)


# Filter out the leasing rows first, then discard the leasing-only columns.
df = remove_leasing_columns(remove_leasing(df))

# Only the row filter changes the length, so the difference is the number of removed vehicles.
print(f"Removed {initial_len - len(df)} leasing vehicles")

# Remove irrelevant columns

In [None]:
# The "tuning" column was scraped as a bare "True"/"False" flag. In real life the
# effect of tuning depends on how much money went into it, so a boolean alone
# does not help the car price prediction and the column is removed.

df = df.drop(columns=["tuning"])

# Drop all rows that have NaN values in the CORE columns

In [None]:
# Core attributes that every kept listing must have a value for.
# `drop_na` is a project helper (utils.format); presumably it returns `df`
# without the rows where the given column is NaN — TODO confirm against its
# implementation.
for column in [
    "id",
    "oferit de",
    "marca",
    "model",
    "anul producției",
    "km",
    "combustibil",
    "putere",
    "capacitate cilindrica",
    "cutie de viteze",
    "tip caroserie",
    "stare",
    "price",
]:
    # Reassign on every pass so the result of each call feeds the next one.
    df = drop_na(df, column)

# Drop all rows whose currency is not 'eur' (we could transform RON to EUR, but there are only a few such rows)

In [None]:
initial_len = len(df)

# Keep only the listings priced in "eur"; any other or missing currency is dropped.
df = df.loc[df["currency"].eq("eur")]

print(f"Removed {initial_len - len(df)} non eur currency vehicles")

# Every remaining row now has the same currency, so the column is removed as well.
df = df.drop(columns=["currency"])

# Drop the column "masina de epoca", as there are only a few data points for it (outliers)

In [None]:
# Discard the "masina de epoca" column; df.drop raises KeyError if it is absent.
df = df.drop(columns=["masina de epoca"])

# Drop the column "volan pe dreapta", as there are only a few data points for it (outliers)

In [None]:
# Discard the "volan pe dreapta" column; df.drop raises KeyError if it is absent.
df = df.drop(columns=["volan pe dreapta"])

In [None]:
# Write the cleaned frame to CLEANED_CSV; index=False keeps the pandas row index out of the file.
df.to_csv(CLEANED_CSV, index=False)