In [None]:
from constants import ROCAR_CSV
import pandas as pd
import pickle
from constants import ENCODER_PATH, CATEGORICAL_SCALER_PATH, NUMERICAL_SCALER_PATH, TRAIN_DATA_CSV, TEST_DATA_CSV, TARGET_SCALER_PATH

In [None]:
# Column -> dtype map handed to read_csv, grouped by target type so the
# schema can be read at a glance.
csv_dtypes = {
    **dict.fromkeys(
        ["unique_id", "marca", "model", "combustibil", "tip caroserie", "transmisie", "input"],
        str,
    ),
    **dict.fromkeys(
        ["price", "anul producției", "km", "putere", "capacitate cilindrica"],
        int,
    ),
    **dict.fromkeys(["is_automatic", "firma"], bool),
}

# Load the raw listings with the explicit dtypes above.
df = pd.read_csv(ROCAR_CSV, dtype=csv_dtypes)
df.info()

In [None]:
# Preview the first 10 rows of the freshly loaded frame.
df.head(10)

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
# NOTE(review): the loading cell already ends with the same df.info() call.
df.info()

In [None]:
# String-valued columns that get target-encoded further down.
categorical_columns = [
    "marca",
    "model",
    "combustibil",
    "tip caroserie",
]

# Integer-valued columns that get standardized further down.
numeric_columns = [
    "km",
    "putere",
    "capacitate cilindrica",
    "anul producției",
]

# FORMAT YEAR OF PRODUCTION AS VEHICLE AGE (2024 - YEAR_OF_PRODUCTION)

In [None]:
# Overwrite the production year with (2024 - year), i.e. the age relative to 2024.
# NOTE(review): the column is replaced in place, so running this cell a second
# time turns the ages back into years; the reference year 2024 is hard-coded.
df["anul producției"] = df["anul producției"].rsub(2024)

df.head(5)

# FORMAT BOOLEAN COLUMNS AS 0 AND 1

In [None]:
BOOLEAN_COLUMNS = ["is_automatic", "firma"]

# Cast every flag column to int in a single astype call (True -> 1, False -> 0).
df = df.astype(dict.fromkeys(BOOLEAN_COLUMNS, int))

df.head(5)

# Encode categorical columns

In [None]:
import category_encoders as ce

# Target-encode the categorical columns against the raw price.
# NOTE(review): the encoder is fitted on every row of df, including rows that a
# later cell assigns to the test set, so test-set prices leak into the encoded
# features. Fitting on the training rows only would require splitting first.
target_encoder = ce.TargetEncoder(cols=categorical_columns)
df_encoded = target_encoder.fit_transform(df[categorical_columns], df["price"])

# Persist the fitted encoder to ENCODER_PATH.
with open(ENCODER_PATH, "wb") as f:
    pickle.dump(target_encoder, f)

print(len(df_encoded))

# The preview is the cell's last expression so that the notebook actually
# renders it; a bare expression in the middle of a cell is silently discarded.
df_encoded.head(5)

# Standardize numeric columns and encoded categorical columns

In [None]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): both scalers below are fitted on all rows of df, i.e. before the
# train/test split performed in a later cell, so test-set statistics influence
# the scaling applied to the training data.

# --- numeric features -------------------------------------------------------
scaler = StandardScaler()

# fit_transform returns a bare ndarray. The frame is rebuilt with df's own
# index so that the later column-wise pd.concat with df lines up row for row
# even if df ever stops having a default 0..n-1 index.
numeric_columns_df = pd.DataFrame(
    scaler.fit_transform(df[numeric_columns]),
    columns=numeric_columns,
    index=df.index,
)

with open(NUMERICAL_SCALER_PATH, "wb") as f:
    pickle.dump(scaler, f)

# --- target-encoded categorical features ------------------------------------
# A second, independently fitted scaler; the name `scaler` is rebound, so after
# this cell it refers to the categorical scaler.
scaler = StandardScaler()

df_encoded = pd.DataFrame(
    scaler.fit_transform(df_encoded),
    columns=categorical_columns,
    index=df.index,
)

with open(CATEGORICAL_SCALER_PATH, "wb") as f:
    pickle.dump(scaler, f)

print(len(numeric_columns_df))

# Single preview of both scaled blocks as the cell's last expression; bare
# .head() calls in the middle of a cell are never displayed.
pd.concat([numeric_columns_df, df_encoded], axis=1).head(5)

# Standardize the target column

In [None]:
# Standardize the price into a separate "price_std" column.
# NOTE(review): fitted on all rows, before the train/test split in a later cell.
target_scaler = StandardScaler()

# The 1-D price values are reshaped to a single-column 2-D array for the scaler.
# The result is re-wrapped with df's index so the later column-wise pd.concat
# with df lines up row for row.
target_column_df = pd.DataFrame(
    target_scaler.fit_transform(df["price"].values.reshape(-1, 1)),
    columns=["price_std"],
    index=df.index,
)

# Persist the fitted target scaler to TARGET_SCALER_PATH.
with open(TARGET_SCALER_PATH, "wb") as f:
    pickle.dump(target_scaler, f)

# Preview last so the notebook renders it.
target_column_df.head(5)

In [None]:
# TODO: ignoring culoare, optiuni culoare, and transmisie for now

# Rebuild df column-wise: scaled numerics, scaled target-encoded categoricals,
# raw price, standardized price, then the untouched passthrough columns.
# From this point on "marca", "model", "combustibil" and "tip caroserie" hold
# the scaled encoded values rather than the original strings.
df = pd.concat(
    [
        numeric_columns_df,
        df_encoded,
        df["price"],
        target_column_df,
        df[["firma", "is_automatic", "input", "unique_id"]],
    ],
    axis=1,
)
df.head(20)

In [None]:
stratify_columns = ["marca", "model", "combustibil", "tip caroserie", "is_automatic", "firma"]

# One composite label per row: the string forms of the stratification columns
# joined with "_". The four categorical columns contain their scaled encoded
# values at this point, so the key is built from those numbers' string forms.
df["stratify_key"] = df[stratify_columns].astype(str).agg("_".join, axis=1)

# print the number of unique stratify keys
print(f'Number of unique stratify keys: {df["stratify_key"].nunique()}')

# Preview last so the notebook renders it; mid-cell it was silently discarded.
df.head(5)

In [None]:
print(f"Number of rows before removing outliers: {len(df)}")
# Keep only rows whose stratify_key occurs more than once; keys seen a single
# time are dropped (presumably because a stratified split needs at least two
# members per class - confirm).
df = df[df["stratify_key"].duplicated(keep=False)]
print(f"Number of rows after removing outliers: {len(df)}")
print(f'Number of unique stratify keys: {df["stratify_key"].nunique()}')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified shuffle split with 20% of the rows held out for testing;
# the fixed random_state makes the partition repeatable across runs.
sss = StratifiedShuffleSplit(
    n_splits=1,
    random_state=42,
    test_size=0.2,
)

In [None]:
# The splitter is configured for exactly one split, so take that single
# (train, test) pair of positional index arrays directly.
train_index, test_index = next(sss.split(df, df["stratify_key"]))
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

# The helper key was only needed for stratification; leave it out of the outputs.
train_df = train_set.drop(columns="stratify_key")
test_df = test_set.drop(columns="stratify_key")

train_df.head(5)

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the training split.
train_df.info()

In [None]:
# Write each split to its CSV path, without the index column.
for split_frame, csv_path in ((train_df, TRAIN_DATA_CSV), (test_df, TEST_DATA_CSV)):
    split_frame.to_csv(csv_path, index=False)

# Show the distribution of the price column in each dataset

In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: raw price histogram of the train split (left) and
# of the test split (right).
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

panels = zip(ax, (train_df, test_df), ("Train dataset", "Test dataset"))
for panel_ax, split_frame, panel_title in panels:
    split_frame["price"].plot(kind="hist", ax=panel_ax, title=panel_title)

plt.show()