In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
from constants import ROCAR_CSV

In [None]:
import category_encoders as ce

# Shared scaler instance; it is only created here and is fitted by a later cell.
scaler = StandardScaler()

# Filtering thresholds, named so they are easy to find and tune.
MIN_PRODUCTION_YEAR = 2000  # keep cars produced in this year or later
MIN_BRAND_COUNT = 5  # keep brands ("marca") with at least this many listings

df = pd.read_csv(ROCAR_CSV)

# Row count before filtering.
print(len(df))

# Drop the rows whose production year is earlier than MIN_PRODUCTION_YEAR.
df = df[df["anul producției"] >= MIN_PRODUCTION_YEAR]

# Drop the rows whose brand appears fewer than MIN_BRAND_COUNT times.
# The counts are taken after the year filter, so they describe the remaining rows.
# NOTE(review): a `price <= 60000` subset used to be computed at this point but
# was never used, so no price cap was ever applied. The dead computation has
# been removed; add an explicit price filter here if outliers should be cut.
brand_counts = df["marca"].value_counts()
frequent_brands = brand_counts[brand_counts >= MIN_BRAND_COUNT].index
df = df[df["marca"].isin(frequent_brands)]

# Renumber the surviving rows 0..n-1. Frames built later from NumPy arrays get
# exactly that default index, so `df` and those frames then line up by label
# as well as by position.
df = df.reset_index(drop=True)

# Row count after filtering.
print(len(df))
df.head(5)

In [None]:
# Columns to encode
categorical_columns = ["oferit de", "marca", "model", "anul producției", "combustibil", "cutie de viteze", "tip caroserie", "stare"]

# Setting up the encoder
target_encoder = ce.TargetEncoder(cols=categorical_columns)

# Fit the encoder on every row of `df` (not only on training rows) and encode
# the same rows in one pass.
# NOTE(review): this runs before the train/validation split, so the prices of
# the validation rows contribute to the encodings (target leakage). Consider
# fitting the encoder on the training rows only.
train_df_encoded = target_encoder.fit_transform(df[categorical_columns], df["price"])

numeric_columns = ["km", "putere", "capacitate cilindrica"]
# The numeric columns get their own scaler. Previously the shared `scaler` was
# fitted here and then immediately re-fitted on the encoded columns below,
# which discarded the numeric scaling parameters and made it impossible to
# transform new rows the same way.
numeric_scaler = StandardScaler()
numeric_columns_df = pd.DataFrame(
    numeric_scaler.fit_transform(df[numeric_columns]),
    columns=numeric_columns,
    index=df.index,  # keep the rows addressable by the same labels as `df`
)

# `scaler` ends up fitted on the target-encoded columns, as it did before.
normalized_train_df_encoded = pd.DataFrame(
    scaler.fit_transform(train_df_encoded),
    columns=categorical_columns,
    index=df.index,
)

# Both frames share `df`'s index, so the column-wise concat pairs rows 1:1.
features = pd.concat([normalized_train_df_encoded, numeric_columns_df], axis=1)

# Display the first 5 rows of the encoded data
features.head(5)

In [None]:
# Number of K-Means clusters; the labels are used to stratify the train/validation split.
N_CLUSTERS = 200  # Choose an appropriate number of clusters

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)

# Cluster on the real feature columns only. If this cell has already run,
# `features` carries a "cluster" column from that run; feeding it back into
# K-Means would change the result on every re-execution. `errors="ignore"`
# makes the drop a no-op on the first run.
cluster_inputs = features.drop(columns=["cluster"], errors="ignore")
features["cluster"] = kmeans.fit_predict(cluster_inputs)

# Cluster sizes, largest first.
print(features["cluster"].value_counts())

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit


# Convert DataFrame to PyTorch tensors
class CarDataset(Dataset):
    """Dataset that pairs each feature row with its label.

    Both inputs are converted to ``float32`` tensors once, at construction
    time, so indexing afterwards is a plain tensor lookup.
    """

    def __init__(self, features, labels):
        """Store ``features.values`` and ``labels.values`` as float32 tensors."""
        feature_array = features.values
        label_array = labels.values
        self.features = torch.tensor(feature_array, dtype=torch.float32)
        self.labels = torch.tensor(label_array, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (rows of the feature tensor)."""
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        row = self.features[idx]
        target = self.labels[idx]
        return row, target


# Regression target: the listing price.
labels = df["price"]

# 80/20 split, stratified on the K-Means cluster label so that both parts
# contain rows from the same clusters in similar proportions.
# NOTE(review): `features` is passed whole, so its "cluster" column also
# reaches the network as an input feature - confirm that this is intended.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=features["cluster"],
)
features_train, features_val, labels_train, labels_val = split_parts

train_dataset = CarDataset(features_train, labels_train)
val_dataset = CarDataset(features_val, labels_val)

# Only the training batches are reshuffled each epoch; validation order is fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=False)

from torch import nn


class Net(nn.Module):
    """Fully connected price regressor: input -> 64 -> 128 -> 256 -> 1.

    ReLU follows each of the three hidden layers; dropout (p=0.2) is applied
    to the last hidden activation before the single-value output layer.

    Args:
        in_features: Width of the input vectors. When omitted, the width is
            read from the module-level ``features`` frame
            (``features.shape[1]``), so a bare ``Net()`` keeps working.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the input from the global frame.
            in_features = features.shape[1]
        self.fc1 = nn.Linear(in_features, 64)  # Input -> first hidden layer (64 units)
        self.fc2 = nn.Linear(64, 128)  # Second hidden layer (128 units)
        self.fc3 = nn.Linear(128, 256)  # Third hidden layer (256 units)
        self.dropout = nn.Dropout(0.2)
        self.fc4 = nn.Linear(256, 1)  # Output layer: one predicted value per row

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` predictions."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return x


# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Net()
model.to(device)

# L1 loss = mean absolute error, reported in the same unit as the price.
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
history = {"train_loss": [], "val_loss": []}
# Halve the learning rate after 10 epochs without validation improvement.
scheduler = ReduceLROnPlateau(optimizer, "min", patience=10, factor=0.5)

# Epoch losses are averaged per sample. The previous divisor,
# `len(dataset) // 128`, ignored the final partial batch that the loaders
# still yield (inflating the averages) and was 0 for fewer than 128
# validation rows, which raised ZeroDivisionError.
n_train_samples = len(train_loader.dataset)
n_val_samples = len(val_loader.dataset)

# Training loop
num_epochs = 2000
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # Targets are 1-D; reshape to (batch, 1) to match the model output.
        loss = criterion(outputs, targets.view(-1, 1))
        loss.backward()
        optimizer.step()
        # The criterion returns the batch mean; weight it by the batch size
        # so that a smaller last batch does not skew the epoch average.
        train_loss += loss.item() * inputs.size(0)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets.view(-1, 1))
            val_loss += loss.item() * inputs.size(0)

    avg_train_loss = train_loss / n_train_samples
    avg_val_loss = val_loss / n_val_samples
    history["train_loss"].append(avg_train_loss)
    history["val_loss"].append(avg_val_loss)

    # The scheduler watches the validation loss to decide when to reduce the LR.
    scheduler.step(avg_val_loss)
    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}, Validation: {avg_val_loss:.4f}")

# plot history
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(history["train_loss"], label="train loss")
ax.plot(history["val_loss"], label="validation loss")
ax.set_xlabel("Epoch")
# The criterion is nn.L1Loss, so the curve is a mean absolute error, not an MSE.
ax.set_ylabel("MAE (L1) Loss")
ax.legend()
plt.show()

In [None]:
# test more visually, i want to see the values before the transformation and the prediction price
import numpy as np

# Inference mode: disables dropout so the predictions are deterministic.
model.eval()
with torch.no_grad():
    # Only the first validation batch is inspected.
    inputs, targets = next(iter(val_loader))
    inputs = inputs.to(device)
    targets = targets.to(device)
    outputs = model(inputs)

# Bring everything back to the CPU as NumPy arrays for printing.
inputs, targets, outputs = (tensor.cpu().numpy() for tensor in (inputs, targets, outputs))

# Each `output` is a length-1 row, hence the [0] when formatting.
for output, target in zip(outputs, targets):
    print(f"Predicted price: {output[0]:.2f}, Actual price: {target:.2f}", "---", sep="\n")