In [None]:
from constants import ROCAR_CSV
import pandas as pd

In [None]:
# Load the raw dataset from the path constant imported from `constants`.
df = pd.read_csv(ROCAR_CSV)
# Print a summary of the columns (dtypes and non-null counts) to spot missing data early.
df.info()

In [None]:
# Preview the first 10 rows of the raw data.
df.head(10)

In [None]:
# Thresholds for the three row filters applied below, in order.
MIN_PRODUCTION_YEAR = 2000
MIN_LISTINGS_PER_BRAND = 5
MAX_PRICE = 100_000

print(f"Initial number of rows: {len(df)}")

# 1) Keep cars produced in MIN_PRODUCTION_YEAR or later.
is_recent = df["anul producției"] >= MIN_PRODUCTION_YEAR
df = df.loc[is_recent]
print(f"Number of rows after filtering by year: {len(df)}")

# 2) Keep brands that still have enough listings after the year filter.
brand_counts = df["marca"].value_counts()
frequent_brands = brand_counts[brand_counts >= MIN_LISTINGS_PER_BRAND].index
df = df.loc[df["marca"].isin(frequent_brands)]
print(f"Number of rows after filtering by brand: {len(df)}")

# 3) Keep listings priced at or below MAX_PRICE.
is_affordable = df["price"] <= MAX_PRICE
df = df.loc[is_affordable]
print(f"Number of rows after filtering by price: {len(df)}")

In [None]:
# Feature groups used by the preprocessing cells further down.
categorical_columns = [
    "oferit de",
    "marca",
    "model",
    "combustibil",
    "cutie de viteze",
    "tip caroserie",
    "stare",
]
numeric_columns = [
    "km",
    "putere",
    "capacitate cilindrica",
    "anul producției",
]

# Restrict the frame to the selected features plus the `price` target column.
df = df[[*categorical_columns, *numeric_columns, "price"]]

df.head()

In [None]:
# Persist the filtered frame; index=False leaves the row index out of the file.
df.to_csv("rocar_filtered.csv", index=False)

In [None]:
# Reload the filtered data from disk. The file was written with index=False, so the
# frame read back gets a fresh default index instead of the gaps left by filtering.
# NOTE(review): the axis=1 concat further down appears to rely on this index reset —
# confirm before removing or skipping this cell.
df = pd.read_csv("rocar_filtered.csv")

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Standardise the numeric features with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later, so test rows influence the fitted statistics — consider
# fitting on the training split only.
#
# fit_transform returns a bare ndarray; passing index=df.index keeps the rebuilt
# frame aligned with `df`, so the later column-wise concat does not depend on
# `df` happening to carry a default 0..n-1 index.
numeric_columns_df = pd.DataFrame(
    scaler.fit_transform(df[numeric_columns]),
    columns=numeric_columns,
    index=df.index,
)
numeric_columns_df.head(5)

In [None]:
import category_encoders as ce

# Target-encode the categorical features against `price`.
# NOTE(review): the encoder is fitted on the full dataset, before the train/test
# split performed later, so target information from test rows leaks into the
# encoded features — consider fitting on the training split only.
target_encoder = ce.TargetEncoder(cols=categorical_columns)
df_encoded = target_encoder.fit_transform(df[categorical_columns], df["price"])

# Use a dedicated scaler: re-fitting the shared `scaler` here would overwrite the
# statistics it learned for the numeric columns, making it unusable for
# transforming or inverse-transforming them afterwards.
categorical_scaler = StandardScaler()

# fit_transform returns a bare ndarray; index=df.index keeps the rebuilt frame
# aligned with `df` for the later column-wise concat.
df_encoded = pd.DataFrame(
    categorical_scaler.fit_transform(df_encoded),
    columns=categorical_columns,
    index=df.index,
)

df_encoded.head(5)

In [None]:
# A column-wise concat aligns rows on the index; if the pieces carried different
# indices the result would be silently padded with NaN rows. Fail loudly instead.
assert numeric_columns_df.index.equals(df.index), "numeric features are not index-aligned with df"
assert df_encoded.index.equals(df.index), "encoded features are not index-aligned with df"

# Final modelling frame: scaled numeric features, scaled target-encoded
# categorical features, and the untouched `price` target.
df = pd.concat([numeric_columns_df, df_encoded, df["price"]], axis=1)
df.head(5)

In [None]:
# Columns whose combined values define the strata for the train/test split.
stratify_columns = ["anul producției", "marca", "combustibil", "cutie de viteze", "tip caroserie", "stare"]

# Join the per-row values into a single string key. At this point `df` holds the
# scaled / target-encoded values, so the key is built from those numbers rendered
# as text rather than from the raw labels.
# NOTE(review): two different categories that receive the same encoded value
# would share a key — presumably rare; verify if exact strata matter.
df["stratify_key"] = df[stratify_columns].astype(str).agg("_".join, axis=1)

# Report how many distinct strata were produced.
print(f'Number of unique stratify keys: {df["stratify_key"].nunique()}')

In [None]:
# Drop every row whose stratify key occurs only once: a stratified split needs at
# least two members per stratum.
print(f"Number of rows before removing outliers: {len(df)}")
key_is_shared = df["stratify_key"].duplicated(keep=False)
df = df.loc[key_is_shared]
print(f"Number of rows after removing outliers: {len(df)}")
print(f'Number of unique stratify keys: {len(df["stratify_key"].unique())}')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split with 20% of the rows held out for testing;
# the fixed random_state makes the split reproducible across runs.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [None]:
# The splitter is configured with n_splits=1, so it yields exactly one
# (train, test) pair of positional row indices.
train_index, test_index = next(sss.split(df, df["stratify_key"]))
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

# The helper key was only needed for stratification; leave it out of the final sets.
train_df = train_set.drop(columns="stratify_key")
test_df = test_set.drop(columns="stratify_key")

train_df.head(5)

In [None]:
# Write both splits to disk, leaving the row index out of the files.
for split_df, csv_path in ((train_df, "rocar_train.csv"), (test_df, "rocar_test.csv")):
    split_df.to_csv(csv_path, index=False)