In [None]:
from constants import FINAL_CSV
import pandas as pd

In [None]:
# Load the dataset from the path given by FINAL_CSV (imported from `constants`).
df = pd.read_csv(FINAL_CSV)

In [None]:
# Discard the "url" and "path" columns, then exclude the row whose id is "ID7HeZrU".
# NOTE(review): presumably a known-bad listing — confirm why this id is excluded.
df = df.drop(columns=["url", "path"])
df = df.loc[df["id"] != "ID7HeZrU"]

In [None]:
# Summarise the column dtypes and non-null counts before cleaning.
df.info()

## Remove electric vehicles and their specific columns as they are outliers

In [None]:
def remove_electric_vehicles(df):
    """Drop electric vehicles together with the columns that only describe them.

    Rows whose "combustibil" equals "electric" are filtered out (rows with a
    missing fuel value are kept), the number of removed rows is printed, and
    the electric-only columns are dropped.

    Args:
        df: DataFrame containing "combustibil" and the electric-only columns.

    Returns:
        A new DataFrame without electric vehicles and electric-only columns.
    """
    electric_only_columns = [
        "vehicule electrice",
        "autonomie",
        "capacitate baterie",
        "contract baterie",
        "timp de incarcare",
    ]
    rows_before = len(df)
    kept = df.loc[df["combustibil"] != "electric"]
    print(f"Removed {rows_before - len(kept)} electric vehicles")
    return kept.drop(columns=electric_only_columns)


# Show the fuel-type distribution and distinct values, then drop electric vehicles.
print(df["combustibil"].value_counts())
print(df["combustibil"].unique())
df = remove_electric_vehicles(df)

## Remove vehicles that are in leasing, as their price is not accurate to our needs

In [None]:
def remove_leasing(df):
    """Drop leasing take-over listings and the leasing-specific columns.

    Rows whose "predare leasing" equals "da" are filtered out (rows with a
    missing value are kept), the number of removed rows is printed, and the
    leasing-only columns are dropped.

    Args:
        df: DataFrame containing "predare leasing" and the leasing columns.

    Returns:
        A new DataFrame without leasing listings and leasing columns.
    """
    leasing_columns = [
        "predare leasing",
        "plata initiala (la predare)",
        "valoare rata lunara",
        "numar de rate lunare ramase",
        "valoare reziduala",
    ]
    rows_before = len(df)
    kept = df.loc[df["predare leasing"] != "da"]
    print(f"Removed {rows_before - len(kept)} leasing vehicles")
    return kept.drop(columns=leasing_columns)


# Drop leasing listings and their leasing-only columns.
df = remove_leasing(df)

## Remove tuned cars as they are outliers, and also not detailed enough, as tuning can vary a lot

In [None]:
def remove_tuning(df):
    """Drop tuned cars and the "tuning" column.

    Rows whose "tuning" equals "da" are filtered out (missing values are
    kept) and the number of removed rows is printed.

    Args:
        df: DataFrame containing a "tuning" column.

    Returns:
        A new DataFrame without tuned cars and without the "tuning" column.
    """
    rows_before = len(df)
    untuned = df.loc[df["tuning"] != "da"]
    print(f"Removed {rows_before - len(untuned)} tuned vehicles")
    return untuned.drop(columns=["tuning"])


# Show the "tuning" distribution and distinct values, then drop tuned cars.
print(df["tuning"].value_counts())
print(df["tuning"].unique())
df = remove_tuning(df)

## Drop rows that have currency different than eur

In [None]:
def remove_non_eur_currency(df):
    """Keep only listings priced in EUR and drop the "currency" column.

    Only rows whose "currency" equals "eur" survive, so rows with a missing
    currency are removed as well. The number of removed rows is printed.

    Args:
        df: DataFrame containing a "currency" column.

    Returns:
        A new DataFrame with EUR listings only and no "currency" column.
    """
    rows_before = len(df)
    eur_only = df.loc[df["currency"] == "eur"]
    print(f"Removed {rows_before - len(eur_only)} non eur currency vehicles")
    return eur_only.drop(columns=["currency"])


# Show missing count, distribution and distinct values of "currency", then keep EUR listings only.
print(df["currency"].isna().sum())
print(df["currency"].value_counts())
print(df["currency"].unique())
df = remove_non_eur_currency(df)

## Drop vintage cars and cars produced before 2000, as they are outliers

In [None]:
def remove_vintage_cars(df):
    """Drop vintage cars and cars produced before 2000.

    First removes rows flagged "da" in "masina de epoca" (missing flags are
    kept), then keeps only rows whose "anul producției" is >= 2000. Rows with
    a missing production year fail that comparison and are removed too. Both
    removal counts are printed, and the "masina de epoca" column is dropped.

    Args:
        df: DataFrame containing "masina de epoca" and "anul producției".

    Returns:
        A new DataFrame without vintage / pre-2000 cars and without the
        "masina de epoca" column.
    """
    rows_before_flag_filter = len(df)
    not_vintage = df.loc[df["masina de epoca"] != "da"]
    print(f"Removed {rows_before_flag_filter - len(not_vintage)} vintage cars")

    rows_before_year_filter = len(not_vintage)
    recent = not_vintage.loc[not_vintage["anul producției"] >= 2000]
    print(f"Removed {rows_before_year_filter - len(recent)} older than 2000 cars")

    return recent.drop(columns=["masina de epoca"])


# Inspect the vintage flag and the production year, then drop vintage and pre-2000 cars.
print(df["masina de epoca"].value_counts())
print(df["masina de epoca"].unique())
print(df["anul producției"].isna().sum())
print(df["anul producției"].value_counts())
print(df["anul producției"].unique())
df = remove_vintage_cars(df)

## Drop right-hand-drive cars and the "volan pe dreapta" column, as there are only a few such data points (outliers)

In [None]:
def remove_volan_pe_dreapta(df):
    """Drop right-hand-drive cars and the "volan pe dreapta" column.

    Rows whose "volan pe dreapta" equals "da" are filtered out (missing
    values are kept) and the number of removed rows is printed.

    Args:
        df: DataFrame containing a "volan pe dreapta" column.

    Returns:
        A new DataFrame without right-hand-drive cars and without the column.
    """
    rows_before = len(df)
    left_hand_drive = df.loc[df["volan pe dreapta"] != "da"]
    print(f"Removed {rows_before - len(left_hand_drive)} right hand drive cars")
    return left_hand_drive.drop(columns=["volan pe dreapta"])


# Show the right-hand-drive flag distribution and distinct values, then drop those cars.
print(df["volan pe dreapta"].value_counts())
print(df["volan pe dreapta"].unique())
df = remove_volan_pe_dreapta(df)

In [None]:
import missingno as msno

# Visualise per-column data completeness after the row filters above.
msno.bar(df)

## Remove fuel outliers

In [None]:
# Fuel-type distribution before restricting the data to benzina / diesel.
print(df["combustibil"].value_counts())


def remove_fuel_outliers(df):
    """Keep only cars whose fuel type is "benzina" or "diesel".

    Any other fuel value, including a missing one, is removed. The number of
    removed rows is printed. The "combustibil" column itself is retained.

    Args:
        df: DataFrame containing a "combustibil" column.

    Returns:
        A new DataFrame restricted to benzina and diesel cars.
    """
    accepted_fuels = ["benzina", "diesel"]
    rows_before = len(df)
    kept = df.loc[df["combustibil"].isin(accepted_fuels)]
    print(f"Removed {rows_before - len(kept)} fuel outliers")
    return kept


# Restrict to benzina / diesel, then re-check the column (missing count, distribution, distinct values).
df = remove_fuel_outliers(df)
print(df["combustibil"].isna().sum())
print(df["combustibil"].value_counts())
print(df["combustibil"].unique())

## Format numeric columns

In [None]:
def format_numeric_columns(df):
    """Convert the textual "km", "putere", "capacitate cilindrica" and "price" columns to integers.

    The unit suffixes (" km", " cp", " cm3") and any spaces are stripped
    before the cast. For "price", everything from the first "." or ","
    onwards is discarded, so only the leading integer part is kept.

    The conversion is done on a copy: the caller's frame is left untouched,
    which also avoids SettingWithCopyWarning when `df` is a filtered slice.

    Args:
        df: DataFrame whose "km", "putere" and "capacitate cilindrica" columns
            hold strings and whose "price" column holds values convertible to
            str. None of the four columns may contain missing values.

    Returns:
        A new DataFrame in which the four columns are integers.

    Raises:
        ValueError: If a value cannot be parsed as an integer after cleaning.
    """
    df = df.copy()

    def strip_unit(column, unit):
        # Literal (non-regex) replacement of the unit suffix and of any spaces.
        text = df[column].str.replace(unit, "", regex=False).str.replace(" ", "", regex=False)
        return text.astype(int)

    df["km"] = strip_unit("km", " km")
    df["putere"] = strip_unit("putere", " cp")
    df["capacitate cilindrica"] = strip_unit("capacitate cilindrica", " cm3")

    # NOTE(review): a "," is treated as a decimal separator here, so a price written
    # with a thousands separator (e.g. "12,500") would become 12 — confirm the raw format.
    price_text = df["price"].astype(str)
    price_text = price_text.str.replace(r"\..*", "", regex=True).str.replace(r",.*", "", regex=True)
    df["price"] = price_text.astype(int)

    return df


# Count the missing values in each column that is about to be converted to integers.
print(df["km"].isna().sum())
print(df["putere"].isna().sum())
print(df["capacitate cilindrica"].isna().sum())
print(df["price"].isna().sum())

# Set the mileage of every new car ("stare" == "nou") to "1 km"
# (presumably so new cars without a mileage are not dropped below — confirm).
df.loc[df["stare"] == "nou", "km"] = "1 km"
# Drop rows that still lack any of the four values.
df = df.dropna(subset=["km", "putere", "capacitate cilindrica", "price"])

# Re-count the missing values after the dropna above.
print(df["km"].isna().sum())
print(df["putere"].isna().sum())
print(df["capacitate cilindrica"].isna().sum())
print(df["price"].isna().sum())

df = format_numeric_columns(df)

## Create "consum" column, which is a string that contains the values of "consum extraurban", "consum urban", "consum mixt" and "consum mediu" columns, drop the original columns (the concatenation is currently commented out, so the original columns are only dropped)

In [None]:
# Fuel-consumption columns. The code that would merge them into a single "consum"
# column is commented out below, so they are only dropped.
CONSUM_COLS = ["consum extraurban", "consum urban", "consum mixt", "consum mediu"]

# for col in CONSUM_COLS:
#     df[f"copy_{col}"] = df[col].copy()
#
# COPY_COLUMNS = [f"copy_{col}" for col in CONSUM_COLS]
#
# for col in COPY_COLUMNS:
#     print(df[col].dtype)
#     print(df[col].isna().sum())
#     print(df[col].value_counts())
#     print(df[col].unique())
#     df.fillna({col: ""}, inplace=True)
#
#
# def concatenate_consum(row):
#     values = [f"{col.replace('copy_', '')} {row[col]}" for col in COPY_COLUMNS if row[col] != "" and row[col] != "nan"]
#     if len(values) == 0:
#         return None
#     return ", ".join(values)
#
#
# df["consum"] = df.apply(concatenate_consum, axis=1)
# df = df.drop(columns=CONSUM_COLS)
# df = df.drop(columns=COPY_COLUMNS)
# print(df["consum"].isna().sum())

# Discard the fuel-consumption columns; no "consum" replacement column is created here.
df = df.drop(columns=CONSUM_COLS)

## Create "garantie" column, which is a string that contains the values of "garantie dealer (inclusa in pret)", "sau in limita a" and "garantie de la producator pana la" columns, drop the original columns (the concatenation is currently commented out, so the original columns are only dropped)

In [None]:
# Warranty columns. The code that would merge them into a single "garantie"
# column is commented out below, so they are only dropped.
GARANTIE_COLS = [
    "garantie dealer (inclusa in pret)",
    "sau in limita a",
    "garantie de la producator pana la",
]

# for col in GARANTIE_COLS:
#     df[f"copy_{col}"] = df[col].copy()
#
# COPY_COLUMNS = [f"copy_{col}" for col in GARANTIE_COLS]
#
# for col in COPY_COLUMNS:
#     print(df[col].dtype)
#     print(df[col].isna().sum())
#     print(df[col].value_counts())
#     print(df[col].unique())
#     df.fillna({col: ""}, inplace=True)
#
#
# def concatenate_garantie(row):
#     values = [f"{col.replace('copy_', '')} {row[col]}" for col in COPY_COLUMNS if row[col] != "" and row[col] != "nan"]
#     if len(values) == 0:
#         return None
#     return ", ".join(values)
#
#
# df["garantie"] = df.apply(concatenate_garantie, axis=1)
# df = df.drop(columns=GARANTIE_COLS)
# df = df.drop(columns=COPY_COLUMNS)
# print(df["garantie"].isna().sum())

# Discard the warranty columns; no "garantie" replacement column is created here.
df = df.drop(columns=GARANTIE_COLS)

In [None]:
# TODO: maybe use them?

# Columns that are currently not used by the pipeline and are dropped outright.
UNUSED_COLUMNS = [
    "are vin (serie sasiu)",
    "se emite factura",
    "eligibil pentru finantare",
]

df = df.drop(columns=UNUSED_COLUMNS)

## History

In [None]:
# Vehicle-history columns: inspected below, then dropped.
HISTORY_COLS = [
    "data primei inmatriculari",
    "inmatriculat",
    "primul proprietar (de nou)",
    "fara accident in istoric",
    "carte de service",
]

# Print dtype, missing count, distribution and distinct values for each history column.
for col in HISTORY_COLS:
    print(df[col].dtype)
    print(df[col].isna().sum())
    print(df[col].value_counts())
    print(df[col].unique())

# TODO: drop them but might be useful
df = df.drop(columns=HISTORY_COLS)

## Create "poluare" column, which is a string that contains the values of "norma de poluare" and "emisii co2" columns, one "column: value" pair per line, ignoring the NaN values; drop the original columns (the concatenation is currently commented out, so the original columns are only dropped)

In [None]:
# Pollution columns. The code that would merge them into a single "poluare"
# column is commented out below, so they are only dropped.
POLUARE_COLS = ["norma de poluare", "emisii co2"]

# for col in POLUARE_COLS:
#     df[f"copy_{col}"] = df[col].copy()
#
# COPY_COLUMNS = [f"copy_{col}" for col in POLUARE_COLS]
#
# for col in COPY_COLUMNS:
#     print(df[col].dtype)
#     print(df[col].isna().sum())
#     print(df[col].value_counts())
#     print(df[col].unique())
#     df.fillna({col: ""}, inplace=True)
#
#
# def concatenate_poluare(row):
#     values = [f"{col.replace('copy_', '')}: {row[col]}" for col in COPY_COLUMNS if row[col] != "" and row[col] != "nan"]
#     if len(values) == 0:
#         return None
#     return "\n".join(values)
#
#
# df["poluare"] = df.apply(concatenate_poluare, axis=1)
# df = df.drop(columns=POLUARE_COLS)
# df = df.drop(columns=COPY_COLUMNS)

# Discard the pollution columns; no "poluare" replacement column is created here.
df = df.drop(columns=POLUARE_COLS)

## Color

In [None]:
# Inspect the colour column: missing count, distribution, distinct values.
print(df["culoare"].isna().sum())
print(df["culoare"].value_counts())
print(df["culoare"].unique())

# Same inspection for the colour-options column.
print(df["optiuni culoare"].isna().sum())
print(df["optiuni culoare"].value_counts())
print(df["optiuni culoare"].unique())

# A missing colour option is replaced by "standard"; re-inspect afterwards.
df["optiuni culoare"] = df["optiuni culoare"].fillna("standard")
print(df["optiuni culoare"].isna().sum())
print(df["optiuni culoare"].value_counts())
print(df["optiuni culoare"].unique())

## Concat custom options

In [None]:
# Columns that each hold a ", "-separated list of equipment options.
CUSTOM_OPTIONS_COLUMNS = [
    "audio si tehnologie",
    "confort si echipamente optionale",
    "electronice si sisteme de asistenta",
    "performanta",
    "siguranta",
]

# Join the non-missing option strings of each row, then split them into one flat list of features.
joined_options = df[CUSTOM_OPTIONS_COLUMNS].apply(lambda row: ", ".join(row.dropna()), axis=1)
# "".split(", ") yields [""], so a row without any option would otherwise carry a bogus
# empty-string feature; empty entries are filtered out so such rows get [] instead.
df["combined"] = joined_options.str.split(", ").apply(lambda features: [feature for feature in features if feature])

df = df.drop(columns=CUSTOM_OPTIONS_COLUMNS)

# Intermediate snapshot of the frame; the "combined" lists are written as their string representation.
df.to_csv("core_formatted_v1.csv", index=False)


df.head(100)

In [None]:
from tqdm import tqdm

# Peek at the feature list of the first row.
for features in df["combined"][:1]:
    print(features)

# Gather every distinct feature that occurs in any row (set order is arbitrary).
all_features = list({feature for row_features in df["combined"] for feature in row_features})
print(all_features[:10])

# new_columns = {feature: df['combined'].apply(lambda features: feature in features).astype(int) for feature in tqdm(all_features)}
#
# df = pd.concat([df, pd.DataFrame(new_columns, index=df.index)], axis=1)
#
# df

# for col in CUSTOM_OPTIONS_COLUMNS:
#     df[f"copy_{col}"] = df[col].copy()
#
# COPY_COLUMNS = [f"copy_{col}" for col in CUSTOM_OPTIONS_COLUMNS]
#
# for col in COPY_COLUMNS:
#     print(df[col].dtype)
#     print(df[col].isna().sum())
#     df.fillna({col: ""}, inplace=True)
#
#
# def concatenate_custom_options(row):
#     values = [f"{col.replace('copy_', '')}: {row[col]}" for col in COPY_COLUMNS if row[col] != "" and row[col] != "nan"]
#     if len(values) == 0:
#         return None
#     return "\n".join(values)
#
#
# df["optiuni"] = df.apply(concatenate_custom_options, axis=1)
#
# df = df.drop(columns=CUSTOM_OPTIONS_COLUMNS)
# df = df.drop(columns=COPY_COLUMNS)

## Replace oferit de column with a boolean column

In [None]:
# Inspect "oferit de": missing count, distribution, distinct values.
print(df["oferit de"].isna().sum())
print(df["oferit de"].value_counts())
print(df["oferit de"].unique())

# Vectorised comparison: True when the value is "firma", False for anything else (including missing values).
df["firma"] = df["oferit de"] == "firma"
df = df.drop(columns=["oferit de"])

# Inspect the resulting boolean column.
print(df["firma"].isna().sum())
print(df["firma"].value_counts())
print(df["firma"].unique())

## Drop numar de portiere column as it is very human error prone

In [None]:
# Drop the door count column.
df = df.drop(columns=["numar de portiere"])

# TODO: we also drop numar locuri for now, but we might use it
df = df.drop(columns=["numar locuri"])

## Replace stare columns with boolean is_new column

In [None]:
# Inspect "stare": missing count, distribution, distinct values.
print(df["stare"].isna().sum())
print(df["stare"].value_counts())
print(df["stare"].unique())

# Vectorised comparison: True when the value is "nou", False for anything else (including missing values).
df["is_new"] = df["stare"] == "nou"
df = df.drop(columns=["stare"])

# Inspect the resulting boolean column.
print(df["is_new"].isna().sum())
print(df["is_new"].value_counts())
print(df["is_new"].unique())

## Drop the is_new column as there are only 1731 rows that are new, and remove those rows as they are outliers

In [None]:
# Keep only used cars, then drop the helper flag (it is constant after the filter).
initial_len = len(df)
df = df[df["is_new"] == False]
df = df.drop(columns=["is_new"])
print(f"Removed {initial_len - len(df)} new cars")

## Create “detalii generale” column, which is a string that contains values from the “marca”, “model”, “versiune”, “generatie”, “transmisie”, “cutie de viteze”, “tara de origine”, “culoare” and “optiuni culoare” columns, separated by newlines in the form “column: value”, ignoring the NaN values

In [None]:
# Columns whose values are merged into the free-text "detalii generale" column.
GENERAL_DETAILS_COLUMNS = [
    "marca",
    "model",
    "versiune",
    "generatie",
    "transmisie",
    "cutie de viteze",
    # TODO: currently including tara de origine, missing a lot of values, maybe use it as structured data
    "tara de origine",
    "culoare",
    "optiuni culoare",
]

# Work on "copy_<column>" duplicates so the originals keep their missing values.
for col in GENERAL_DETAILS_COLUMNS:
    df[f"copy_{col}"] = df[col].copy()

COPY_COLUMNS = [f"copy_{col}" for col in GENERAL_DETAILS_COLUMNS]

# Report dtype and missing count, then replace missing values in each copy with "".
for col in COPY_COLUMNS:
    print(df[col].dtype)
    print(df[col].isna().sum())
    df.fillna({col: ""}, inplace=True)


def concatenate_general_details(row):
    """Build the "column: value" summary text for one row.

    Reads the module-level COPY_COLUMNS list. For each of those columns whose
    value is neither "" nor "nan", a "<name>: <value>" line is emitted, where
    <name> is the column name with "copy_" removed.

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexed by COPY_COLUMNS.

    Returns:
        The lines joined with newlines, or None when no column has a value.
    """
    lines = []
    for copy_col in COPY_COLUMNS:
        value = row[copy_col]
        if value != "" and value != "nan":
            lines.append(f"{copy_col.replace('copy_', '')}: {value}")
    return "\n".join(lines) if lines else None


# Build the summary text per row, then drop the source columns that are not kept as structured
# features ("marca", "model", "transmisie" and "cutie de viteze" stay) and the temporary copies.
df["detalii generale"] = df.apply(concatenate_general_details, axis=1)
df = df.drop(columns=["versiune", "generatie", "tara de origine", "culoare", "optiuni culoare"])
df = df.drop(columns=COPY_COLUMNS)

## Format cutie de viteze, drop the empty rows, make it a boolean column named is_automatic

In [None]:
# Inspect "cutie de viteze": missing count, distribution, distinct values.
print(df["cutie de viteze"].isna().sum())
print(df["cutie de viteze"].value_counts())
print(df["cutie de viteze"].unique())

# Rows without a gearbox value are dropped; the flag is True only for "automata".
df = df.dropna(subset=["cutie de viteze"])
df["is_automatic"] = df["cutie de viteze"] == "automata"
df = df.drop(columns=["cutie de viteze"])

# Inspect the resulting boolean column.
print(df["is_automatic"].isna().sum())
print(df["is_automatic"].value_counts())
print(df["is_automatic"].unique())

## Format unique_id column

In [None]:
# Rows without an id cannot be referenced later, so they are removed first.
initial_len = len(df)
df = df.dropna(subset=["unique_id"])
print(f"Removed {initial_len - len(df)} rows with na unique_id")

unique_ids = df["unique_id"]
if pd.api.types.is_float_dtype(unique_ids):
    # A numeric id column that contained missing values is parsed as float; converting it
    # straight to str would give "123.0" and zero-pad to "0123.0" instead of "000123".
    # NOTE(review): assumes the ids are whole numbers — confirm against the raw data.
    unique_ids = unique_ids.astype("int64")
# Left-pad every id with zeros to at least 6 characters.
df["unique_id"] = unique_ids.astype(str).map(lambda uid: uid.zfill(6))

print(df["unique_id"].isna().sum())
print(df["unique_id"].value_counts())

In [None]:
# Number of rows with a missing "transmisie" value.
print(df["transmisie"].isna().sum())

In [None]:
# Re-check per-column completeness after the transformations above (msno is imported in an earlier cell).
msno.bar(df)

## Concat strings columns to a full description

In [None]:
# Text columns that are merged, in this order, into the final "input" column.
COLUMNS_TO_CONCAT = ["detalii generale", "optiuni", "consum", "garantie", "poluare", "description"]

# The cells that would create "optiuni", "consum", "garantie" and "poluare" are commented
# out, so those columns may not exist; indexing them would raise KeyError on a fresh run.
# Only the columns that are actually present are used.
missing_columns = [col for col in COLUMNS_TO_CONCAT if col not in df.columns]
if missing_columns:
    print(f"Skipping columns missing from the dataframe: {missing_columns}")
COLUMNS_TO_CONCAT = [col for col in COLUMNS_TO_CONCAT if col in df.columns]

# Work on "copy_<column>" duplicates of the text columns.
for col in COLUMNS_TO_CONCAT:
    df[f"copy_{col}"] = df[col].copy()

COPY_COLUMNS = [f"copy_{col}" for col in COLUMNS_TO_CONCAT]

# Report dtype and missing count, then replace missing values in each copy with "".
for col in COPY_COLUMNS:
    print(df[col].dtype)
    print(df[col].isna().sum())
    df.fillna({col: ""}, inplace=True)


def concatenate_full_description(row):
    """Join the non-empty text fields of one row into a single description.

    Reads the module-level COPY_COLUMNS list and keeps, in that order, every
    value that is neither "" nor "nan".

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexed by COPY_COLUMNS.

    Returns:
        The kept values joined with newlines, or None when nothing is kept.
    """
    parts = []
    for copy_col in COPY_COLUMNS:
        text = row[copy_col]
        if text != "" and text != "nan":
            parts.append(text)
    return "\n".join(parts) if parts else None


# Build the "input" text per row, then drop the source text columns and the temporary copies.
df["input"] = df.apply(concatenate_full_description, axis=1)
df = df.drop(columns=COLUMNS_TO_CONCAT)
df = df.drop(columns=COPY_COLUMNS)

## In "tip caroserie", relabel "masina mica" and "masina de oras" as "compacta"

In [None]:
# Within "tip caroserie" only, map both "masina mica" and "masina de oras" to "compacta".
df = df.replace({"tip caroserie": {"masina mica": "compacta", "masina de oras": "compacta"}})

In [None]:
# Per-column completeness after the "input" column was built (msno is imported in an earlier cell).
msno.bar(df)

In [None]:
# The listing id is no longer needed; "unique_id" is the identifier that is kept.
df = df.drop(columns=["id"])

# A row must have a value in every one of these columns to be kept.
# Note that "transmisie" is selected below but is not required here, so it may still be missing.
REQUIRED_COLUMNS = [
    "marca",
    "model",
    "anul producției",
    "combustibil",
    "km",
    "putere",
    "capacitate cilindrica",
    "tip caroserie",
    "price",
    "unique_id",
    "firma",
    "input",
    "is_automatic",
]

initial_len = len(df)
df = df.dropna(subset=REQUIRED_COLUMNS)
print(f"Removed {initial_len - len(df)} rows with na in required columns")

# Keep only the final columns, in their output order.
df = df[
    [
        "unique_id",
        "price",
        "marca",
        "model",
        "anul producției",
        "km",
        "putere",
        "capacitate cilindrica",
        "combustibil",
        "tip caroserie",
        "is_automatic",
        "firma",
        "transmisie",
        "input",
    ]
]

# Safe to cast now: rows with a missing production year were dropped above.
df["anul producției"] = df["anul producției"].astype(int)

## OUTLIERS

In [None]:
# Keep only rows whose putere lies in the inclusive range [50, 600].
# NOTE(review): the earlier comment/message mentioned 650 and 500; the messages now
# report the thresholds the code actually applies — confirm 600 is the intended limit.
initial_len = len(df)
df = df[df["putere"] <= 600]
df = df[df["putere"] >= 50]
print(f"Removed {initial_len - len(df)} rows with putere outside [50, 600]")

# Drop rows with capacitate cilindrica < 500.
initial_len = len(df)
df = df[df["capacitate cilindrica"] >= 500]
print(f"Removed {initial_len - len(df)} rows with capacitate cilindrica < 500")

# Drop rows with capacitate cilindrica > 4000.
initial_len = len(df)
df = df[df["capacitate cilindrica"] <= 4000]
print(f"Removed {initial_len - len(df)} rows with capacitate cilindrica > 4000")

# Drop rows with price > 40 000.
# NOTE(review): the earlier message said 100000 — confirm 40_000 is the intended limit.
initial_len = len(df)
df = df[df["price"] <= 40_000]
print(f"Removed {initial_len - len(df)} rows with price > 40000")

# Drop rows with km > 500 000.
initial_len = len(df)
df = df[df["km"] <= 500_000]
print(f"Removed {initial_len - len(df)} rows with km > 500000")

# Drop every marca that has fewer than 100 remaining listings.
initial_len = len(df)
temp_df = df["marca"].value_counts()
df = df[df["marca"].isin(temp_df[temp_df >= 100].index)]
print(f"Removed {initial_len - len(df)} rows of outlier marca values")

In [None]:
# Final per-column completeness check before export (msno is imported in an earlier cell).
msno.bar(df)

In [None]:
# Write the cleaned dataset without the index.
# NOTE(review): unlike "core_formatted_v1.csv", this file name has no ".csv" extension —
# confirm whether that is intentional before renaming, since downstream readers may rely on it.
df.to_csv("core_formatted_v2", index=False)