In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from core.src.constants import CLEANED_CSV, FORMATTED_CSV

In [37]:
sns.set(rc={"figure.figsize": (10, 10)})
plt.figure(figsize=(10, 8))

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>

# Format all fields that need it in the order they appear in final.csv

In [38]:
df = pd.read_csv(CLEANED_CSV)

# Format "km" column by removing the " km" suffix and the " " separator, save it as int
df["km"] = df["km"].str.rstrip(" km").str.replace(" ", "").astype(int)

# Format "putere" column by removing the " cp" suffix, save it as int
df["putere"] = df["putere"].str.rstrip(" cp").str.replace(" ", "").astype(int)

# Format "capacitate cilindrica" column by removing the " cm3" suffix and the separator " ", save it as int
df["capacitate cilindrica"] = df["capacitate cilindrica"].str.rstrip(" cm3").str.replace(" ", "").astype(int)

# Format "norma de poluare" column by making all values that start with "euro 6" to "euro 6", all values that start with "euro 5" to "euro 5" and so on
df["norma de poluare"] = df["norma de poluare"].str.extract(r"(euro \d)")

# # Format "consum extraurban" column by removing the suffix " l/100 km" and replacing "," with ".", save it as float
# df["consum extraurban"] = df["consum extraurban"].str.rstrip(" l/100 km").str.replace(",", ".").astype(float)
#
# # Same for "consum urban" and "consum mixt" and "consum mediu" columns
# df["consum urban"] = df["consum urban"].str.rstrip(" l/100 km").str.replace(",", ".").astype(float)
# df["consum mixt"] = df["consum mixt"].str.rstrip(" l/100 km").str.replace(",", ".").astype(float)
# df["consum mediu"] = df["consum mediu"].str.rstrip(" l/100 km").str.replace(",", ".").astype(float)
#
# # Format "emisii co2" by removing the suffix " g/km", save it as int
# df["emisii co2"] = df["emisii co2"].str.rstrip(" g/km").astype(int)

# Format "price" by removing the decimal values after one of the following separators: ",", ",", save it as int
df["price"] = df["price"].str.replace(r"\..*", "", regex=True)
df["price"] = df["price"].str.replace(r",.*", "", regex=True)
df["price"] = df["price"].astype(int)

# Format "masina de epoca" by making all "da" values to True, and "nu" or NaN to False
df["masina de epoca"] = df["masina de epoca"].replace({"da": True, "nu": False, np.nan: False}).astype(bool)

# Same for "volan pe dreapta" column
df["volan pe dreapta"] = df["volan pe dreapta"].replace({"da": True, "nu": False, np.nan: False}).astype(bool)

  df["masina de epoca"] = df["masina de epoca"].replace({"da": True, "nu": False, np.nan: False}).astype(bool)
  df["volan pe dreapta"] = df["volan pe dreapta"].replace({"da": True, "nu": False, np.nan: False}).astype(bool)


# Create "consum" column, which is a string that contains the values of "consum extraurban", "consum urban", "consum mixt" and "consum mediu" columns, drop the original columns

In [39]:
df["consum extraurban"] = df["consum extraurban"].astype(str).replace("nan", "")
df["consum urban"] = df["consum urban"].astype(str).replace("nan", "")
df["consum mixt"] = df["consum mixt"].astype(str).replace("nan", "")
df["consum mediu"] = df["consum mediu"].astype(str).replace("nan", "")


def concatenate_consum(row):
    cols = ["consum extraurban", "consum urban", "consum mixt", "consum mediu"]
    values = [f"{col} {row[col]}" for col in cols if row[col] != ""]
    if len(values) == 0:
        return None
    return ", ".join(values)


df["consum"] = df.apply(concatenate_consum, axis=1)
df = df.drop(columns=["consum extraurban", "consum urban", "consum mixt", "consum mediu"])

# Create "garantie" column, which is a string that contains the values of "garantie dealer (inclusa in pret)" and "sau in limita a" and "garantie de la producator pana la" columns, drop the original columns

In [40]:
df["garantie dealer (inclusa in pret)"] = df["garantie dealer (inclusa in pret)"].astype(str).replace("nan", "")
df["sau in limita a"] = df["sau in limita a"].astype(str).replace("nan", "")
df["garantie de la producator pana la"] = df["garantie de la producator pana la"].astype(str).replace("nan", "")


def concatenate_garantie(row):
    cols = ["garantie dealer (inclusa in pret)", "sau in limita a", "garantie de la producator pana la"]
    values = [f"{col} {row[col]}" for col in cols if row[col] != ""]
    if len(values) == 0:
        return None
    return ", ".join(values)


df["garantie"] = df.apply(concatenate_garantie, axis=1)
df = df.drop(columns=["garantie dealer (inclusa in pret)", "sau in limita a", "garantie de la producator pana la"])

In [41]:
df.to_csv(FORMATTED_CSV, index=False)