In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from core.src.constants import CORE_CLEANED_CSV, CORE_FORMATTED_CSV

In [None]:
# Seaborn/matplotlib defaults for this notebook: default figure size of 10x10 inches.
figure_rc = {"figure.figsize": (10, 10)}
sns.set(rc=figure_rc)

# Open an explicit 10x8 inch figure.
# NOTE(review): no cell visible in this notebook draws on this figure — confirm it is still needed.
plt.figure(figsize=(10, 8))

# Format all fields that need it in the order they appear in final.csv

In [None]:
df = pd.read_csv(CORE_CLEANED_CSV)

df["anul producției"] = df["anul producției"].astype(int)


def _unit_text_to_int(values, unit):
    """Convert strings such as "120 000 km" into integers such as 120000.

    Args:
        values: string Series whose entries end with ``unit``.
        unit: the trailing unit, used as a regular-expression fragment
            (e.g. "km", "cp", "cm3").

    Returns:
        Series of ints with the unit and the " " thousands separator removed.

    The unit is removed with an anchored pattern rather than ``str.rstrip``:
    ``rstrip`` treats its argument as a *set of characters*, so
    ``"1 993 cm3".rstrip(" cm3")`` also eats the trailing "3" of the number
    and yields "1 99".
    """
    without_unit = values.str.replace(rf"\s*{unit}\s*$", "", regex=True)
    return without_unit.str.replace(" ", "", regex=False).astype(int)


# Format "km" column by removing the " km" suffix and the " " separator, save it as int
df["km"] = _unit_text_to_int(df["km"], "km")

# Format "putere" column by removing the " cp" suffix, save it as int
df["putere"] = _unit_text_to_int(df["putere"], "cp")

# Format "capacitate cilindrica" column by removing the " cm3" suffix and the separator " ", save it as int
df["capacitate cilindrica"] = _unit_text_to_int(df["capacitate cilindrica"], "cm3")

# Format "norma de poluare" column by making all values that start with "euro 6" to "euro 6", all values that start with "euro 5" to "euro 5" and so on
# expand=False returns a Series (not a one-column DataFrame); values without a match become NaN
df["norma de poluare"] = df["norma de poluare"].str.extract(r"(euro \d)", expand=False)

# # Format "consum extraurban" column by removing the suffix " l/100 km" and replacing "," with ".", save it as float
# df["consum extraurban"] = df["consum extraurban"].str.rstrip(" l/100 km").str.replace(",", ".").astype(float)
#
# # Same for "consum urban" and "consum mixt" and "consum mediu" columns
# df["consum urban"] = df["consum urban"].str.rstrip(" l/100 km").str.replace(",", ".").astype(float)
# df["consum mixt"] = df["consum mixt"].str.rstrip(" l/100 km").str.replace(",", ".").astype(float)
# df["consum mediu"] = df["consum mediu"].str.rstrip(" l/100 km").str.replace(",", ".").astype(float)
#
# # Format "emisii co2" by removing the suffix " g/km", save it as int
# df["emisii co2"] = df["emisii co2"].str.rstrip(" g/km").astype(int)

# Format "price" by dropping everything from the first "." or "," onwards (the decimal part), save it as int
price_without_decimals = df["price"].str.replace(r"[.,].*", "", regex=True)
df["price"] = price_without_decimals.astype(int)

# # Format "masina de epoca" by making all "da" values to True, and "nu" or NaN to False
# df["masina de epoca"] = df["masina de epoca"].replace({"da": True, "nu": False, np.nan: False}).astype(bool)
#
# # Same for "volan pe dreapta" column
# df["volan pe dreapta"] = df["volan pe dreapta"].replace({"da": True, "nu": False, np.nan: False}).astype(bool)

# Create "consum" column, which is a string that contains the values of "consum extraurban", "consum urban", "consum mixt" and "consum mediu" columns, drop the original columns

In [None]:
# Turn every consumption column into text, with missing values ("nan") as the empty string
for consum_col in ("consum extraurban", "consum urban", "consum mixt", "consum mediu"):
    df[consum_col] = df[consum_col].astype(str).replace("nan", "")


def concatenate_consum(row):
    """Join the non-empty consumption fields of ``row`` into a single string.

    Each non-empty field is rendered as "<column name> <value>" and the
    pieces are joined with ", ". Returns None when all four fields are "".
    """
    parts = []
    for name in ("consum extraurban", "consum urban", "consum mixt", "consum mediu"):
        entry = row[name]
        if entry != "":
            parts.append(f"{name} {entry}")
    return ", ".join(parts) if parts else None


# Build the combined "consum" text row by row, then discard the columns it was built from
consum_sources = ["consum extraurban", "consum urban", "consum mixt", "consum mediu"]
df["consum"] = df.apply(concatenate_consum, axis="columns")
df = df.drop(columns=consum_sources)

# Create "garantie" column, which is a string that contains the values of "garantie dealer (inclusa in pret)" and "sau in limita a" and "garantie de la producator pana la" columns, drop the original columns

In [None]:
# Turn every warranty column into text, with missing values ("nan") as the empty string
for garantie_col in (
    "garantie dealer (inclusa in pret)",
    "sau in limita a",
    "garantie de la producator pana la",
):
    df[garantie_col] = df[garantie_col].astype(str).replace("nan", "")


def concatenate_garantie(row):
    """Join the non-empty warranty fields of ``row`` into a single string.

    Each non-empty field is rendered as "<column name> <value>" and the
    pieces are joined with ", ". Returns None when all three fields are "".
    """
    field_names = (
        "garantie dealer (inclusa in pret)",
        "sau in limita a",
        "garantie de la producator pana la",
    )
    filled = [(name, row[name]) for name in field_names if row[name] != ""]
    if not filled:
        return None
    return ", ".join(f"{name} {entry}" for name, entry in filled)


# Build the combined "garantie" text row by row, then discard the columns it was built from
garantie_sources = [
    "garantie dealer (inclusa in pret)",
    "sau in limita a",
    "garantie de la producator pana la",
]
df["garantie"] = df.apply(concatenate_garantie, axis="columns")
df = df.drop(columns=garantie_sources)

# Create "non important details" column, which collects the columns that cannot be used as independent features, either because they have too few values in our dataset or because their values can neither be assumed nor filled in manually. This column can be appended to the description of the car and then processed by BERT

In [None]:
# Columns that are folded into the free-text "non important details" column
DESCRIPTION_COLUMNS = [
    "are vin (serie sasiu)",
    "se emite factura",
    "eligibil pentru finantare",
]

# Turn each of them into text, with missing values ("nan") as the empty string
for description_col in DESCRIPTION_COLUMNS:
    df[description_col] = df[description_col].astype(str).replace("nan", "")


def concatenate_non_important_details(row):
    """Render the non-empty DESCRIPTION_COLUMNS fields of ``row`` as text.

    Each non-empty field becomes a "<column name>: <value>" line; lines are
    joined with a newline. Returns None when every field is "".
    """
    lines = []
    for name in DESCRIPTION_COLUMNS:
        entry = row[name]
        if entry != "":
            lines.append(f"{name}: {entry}")
    return "\n".join(lines) if lines else None


# Build "non important details" row by row and drop the source columns in one chain
df = df.assign(
    **{"non important details": df.apply(concatenate_non_important_details, axis=1)}
).drop(columns=DESCRIPTION_COLUMNS)

# Create "istoric" column, which is a string that contains the values of the "tara de origine", "data primei inmatriculari", "inmatriculat", "primul proprietar (de nou)", "fara accident in istoric" and "carte de service" columns, one per line in the form "column: value"; drop the original columns and ignore the NaN values

In [None]:
# Turn every history column into text, with missing values ("nan") as the empty string
for istoric_col in (
    "tara de origine",
    "data primei inmatriculari",
    "inmatriculat",
    "primul proprietar (de nou)",
    "fara accident in istoric",
    "carte de service",
):
    df[istoric_col] = df[istoric_col].astype(str).replace("nan", "")


def concatenate_istoric(row):
    """Render the non-empty vehicle-history fields of ``row`` as text.

    Each non-empty field becomes a "<column name>: <value>" line; lines are
    joined with a newline. Returns None when all six fields are "".
    """
    field_names = (
        "tara de origine",
        "data primei inmatriculari",
        "inmatriculat",
        "primul proprietar (de nou)",
        "fara accident in istoric",
        "carte de service",
    )
    # dicts keep insertion order, so the output follows the order of field_names
    present = {name: row[name] for name in field_names if row[name] != ""}
    if not present:
        return None
    return "\n".join(f"{name}: {entry}" for name, entry in present.items())


# Build the combined "istoric" text row by row, then discard the columns it was built from
istoric_sources = [
    "tara de origine",
    "data primei inmatriculari",
    "inmatriculat",
    "primul proprietar (de nou)",
    "fara accident in istoric",
    "carte de service",
]
df["istoric"] = df.apply(concatenate_istoric, axis="columns")
df = df.drop(columns=istoric_sources)

# Create "poluare" column, which is a string that contains the values of the "norma de poluare" and "emisii co2" columns, one per line in the form "column: value"; drop the original columns and ignore the NaN values

In [None]:
# Turn both pollution columns into text, with missing values ("nan") as the empty string
for poluare_col in ("norma de poluare", "emisii co2"):
    df[poluare_col] = df[poluare_col].astype(str).replace("nan", "")


def concatenate_poluare(row):
    """Render the non-empty pollution fields of ``row`` as text.

    Each non-empty field becomes a "<column name>: <value>" line; lines are
    joined with a newline. Returns None when both fields are "".
    """
    field_names = ("norma de poluare", "emisii co2")
    filled_names = [name for name in field_names if row[name] != ""]
    if not filled_names:
        return None
    return "\n".join(f"{name}: {row[name]}" for name in filled_names)


# Build "poluare" row by row and drop the two source columns in one chain
df = df.assign(poluare=df.apply(concatenate_poluare, axis=1)).drop(
    columns=["norma de poluare", "emisii co2"]
)

# Create "culoare" column, which is a string that contains the values of the "culoare" and "optiuni culoare" columns, one per line in the form "column: value"; drop the original "optiuni culoare" column and ignore the NaN values

In [None]:
# Turn both colour columns into text, with missing values ("nan") as the empty string
for culoare_col in ("culoare", "optiuni culoare"):
    df[culoare_col] = df[culoare_col].astype(str).replace("nan", "")


def concatenate_culoare(row):
    """Render the non-empty colour fields of ``row`` as text.

    Each non-empty field becomes a "<column name>: <value>" line; lines are
    joined with a newline. Returns None when both fields are "".
    """
    lines = []
    for name in ("culoare", "optiuni culoare"):
        entry = row[name]
        if entry == "":
            continue
        lines.append(f"{name}: {entry}")
    if lines:
        return "\n".join(lines)
    return None


# Overwrite "culoare" with the combined text, then drop the now-redundant options column
combined_culoare = df.apply(concatenate_culoare, axis="columns")
df["culoare"] = combined_culoare
df = df.drop(columns="optiuni culoare")

# Create "detalii generale" column, which is a string that contains the values of the "marca", "model", "versiune", "generatie", "anul producției", "combustibil", "km", "putere", "capacitate cilindrica", "transmisie", "cutie de viteze", "tip caroserie", "numar de portiere", "numar locuri" and "stare" columns, one per line in the form "column: value"; ignore the NaN values

In [None]:
# Work on a copy so the numeric columns of df keep their dtypes
temp = df.copy()

# Turn these columns into text, with missing values ("nan") as the empty string
for text_col in (
    "marca",
    "model",
    "versiune",
    "generatie",
    "anul producției",
    "combustibil",
    "km",
    "putere",
    "capacitate cilindrica",
):
    temp[text_col] = temp[text_col].astype(str).replace("nan", "")

# append the corresponding suffixes to the values (empty values stay empty)
unit_suffixes = {"km": " km", "putere": " cp", "capacitate cilindrica": " cm3"}
for unit_col, suffix in unit_suffixes.items():
    temp[unit_col] = np.where(temp[unit_col] != "", temp[unit_col] + suffix, temp[unit_col])


def concatenate_detalii_generale(row):
    """Render the general-details fields of ``row`` as "column: value" lines.

    Args:
        row: mapping-like object (e.g. a DataFrame row) that contains every
            column listed in ``cols``.

    Returns:
        The non-missing fields, one "<column name>: <value>" per line in the
        order of ``cols``, or None when every field is missing.

    A field counts as missing when it is the empty string or a null value
    (NaN/None). The null check matters because not every listed column is
    guaranteed to have been converted to ""-for-missing text beforehand;
    without it such fields would be emitted as "column: nan".
    """
    cols = [
        "marca",
        "model",
        "versiune",
        "generatie",
        "anul producției",
        "combustibil",
        "km",
        "putere",
        "capacitate cilindrica",
        "transmisie",
        "cutie de viteze",
        "tip caroserie",
        "numar de portiere",
        "numar locuri",
        "stare",
    ]
    values = []
    for col in cols:
        value = row[col]
        # pd.isna comes first so NaN/None are skipped before the string comparison
        if pd.isna(value) or value == "":
            continue
        values.append(f"{col}: {value}")
    if not values:
        return None
    return "\n".join(values)


# Build "detalii generale" from the text copy (temp) and drop the columns that are
# now only kept inside that text
folded_into_details = ["generatie", "numar locuri", "numar de portiere", "transmisie", "versiune"]
df = df.assign(
    **{"detalii generale": temp.apply(concatenate_detalii_generale, axis=1)}
).drop(columns=folded_into_details)

## Remove vehicles with more than 600k km and vehicles produced before 1990, as both are outliers

In [None]:
# Keep only vehicles with at most 600 000 km that were produced in 1990 or later
rows_before = len(df)

within_km_limit = df["km"] <= 600000
recent_enough = df["anul producției"] >= 1990
df = df[within_km_limit & recent_enough]

rows_after = len(df)
print(f"Removed {rows_before - rows_after} outliers")

In [None]:
# Persist the formatted frame; index=False keeps the row index out of the CSV
df.to_csv(path_or_buf=CORE_FORMATTED_CSV, index=False)