In [6]:
import pandas as pd

# Resolve the project root once so the notebook is not tied to one machine:
# set the AUTOSCOUT_PROJECT_ROOT environment variable to point at another
# checkout; when it is unset the original location is used unchanged.
import os
from pathlib import Path

PROJECT_ROOT = Path(
    os.environ.get(
        "AUTOSCOUT_PROJECT_ROOT",
        r"C:\Users\edson\Documents\Projekte\autoscout-ml-project",
    )
)

# The file is read with ";" as the field separator instead of pandas' default ",".
df = pd.read_csv(PROJECT_ROOT / "data" / "Autoscouttabelle_clean.csv", sep=";")

# Show (rows, columns) as a quick check that the file was parsed as expected.
df.shape


(46338, 10)

In [7]:
# List the column labels of the loaded table.
df.columns

Index(['listing_id', 'mileage', 'brand', 'model', 'fuel', 'gear', 'offerType',
       'price', 'hp', 'year'],
      dtype='object')

In [8]:
# Print dtypes and non-null counts per column to spot missing values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46338 entries, 0 to 46337
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   listing_id  46338 non-null  int64  
 1   mileage     46338 non-null  int64  
 2   brand       46338 non-null  object 
 3   model       46212 non-null  object 
 4   fuel        46338 non-null  object 
 5   gear        46338 non-null  object 
 6   offerType   46338 non-null  object 
 7   price       46338 non-null  int64  
 8   hp          46309 non-null  float64
 9   year        46338 non-null  int64  
dtypes: float64(1), int64(4), object(5)
memory usage: 3.5+ MB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,listing_id,mileage,price,hp,year
count,46338.0,46338.0,46338.0,46309.0,46338.0
mean,23169.5,71244.86,16586.15,133.172969,2016.013488
std,13376.772724,62624.92,19314.6,75.351734,3.155892
min,1.0,0.0,1100.0,18.0,2011.0
25%,11585.25,19890.0,7490.0,86.0,2013.0
50%,23169.5,60000.0,11000.0,116.0,2016.0
75%,34753.75,105000.0,19490.0,150.0,2019.0
max,46338.0,1111111.0,1199900.0,850.0,2021.0


In [10]:
# Listings per brand; value_counts() sorts descending, so the first five
# entries are the five most frequent brands.
top5 = df["brand"].value_counts().iloc[:5]
top5


brand
Volkswagen    6930
Opel          4814
Ford          4442
Skoda         2889
Renault       2806
Name: count, dtype: int64

In [11]:
# Keep only the rows whose brand is one of the five most frequent ones.
# The copy makes df5 an independent frame rather than a slice of df.
top5_brands = list(top5.index)
df5 = df.loc[df["brand"].isin(top5_brands)].copy()
print(df5["brand"].value_counts())

brand
Volkswagen    6930
Opel          4814
Ford          4442
Skoda         2889
Renault       2806
Name: count, dtype: int64


In [12]:
# (rows, columns) of the top-5-brand subset.
df5.shape

(21881, 10)

In [13]:
# Mean price per brand (top-5 brands only), highest average first.
avg_price = (
    df5["price"]
    .groupby(df5["brand"])
    .mean()
    .sort_values(ascending=False)
)
avg_price


brand
Volkswagen    16067.870130
Ford          13791.453399
Skoda         13723.617515
Renault       11338.737349
Opel          10442.955339
Name: price, dtype: float64

In [14]:
# Feature matrix: every column except the identifier and the price.
X = df5.loc[:, ["brand", "model", "fuel", "gear", "offerType", "mileage", "hp", "year"]]

# Target: the listing price.
y = df5.loc[:, "price"]

X.head()


Unnamed: 0,brand,model,fuel,gear,offerType,mileage,hp,year
1,Volkswagen,Golf,Gasoline,Manual,Used,92800,122.0,2011
3,Renault,Megane,Gasoline,Manual,Used,96200,110.0,2011
6,Renault,Scenic,Diesel,Manual,Used,91894,131.0,2011
7,Opel,Zafira,Gasoline,Manual,Used,127500,116.0,2011
9,Ford,Transit,Diesel,Manual,Used,104,86.0,2011


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

(X_train.shape, X_test.shape)


((17504, 8), (4377, 8))

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Column groups: which features are numeric and which are categorical.
num_cols = ["mileage", "hp", "year"]
cat_cols = ["brand", "model", "fuel", "gear", "offerType"]

# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform from raising
# on categories that were not present during fit.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Combine both branches, routing each column group to its own transformer.
preprocess = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

print("Preprocessing-Pipeline erstellt")


Preprocessing-Pipeline erstellt


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import numpy as np
from sklearn.base import clone

# Baseline: preprocessing followed by a linear regression.
# The preprocessor is cloned so this pipeline owns its own instance. A
# Pipeline without caching fits its steps in place, so embedding the shared
# `preprocess` object would let any later pipeline that reuses and refits it
# silently change the encoder this model depends on.
lin_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("model", LinearRegression())
])

# Fit on the training split.
lin_model.fit(X_train, y_train)

# Predict prices for the held-out split.
pred = lin_model.predict(X_test)

# Error metrics on the held-out split.
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)

print(f"Linear Regression -> MAE: {mae:,.0f} €, RMSE: {rmse:,.0f} €, R²: {r2:.3f}")



Linear Regression -> MAE: 2,526 €, RMSE: 4,018 €, R²: 0.821


In [18]:
# Limit outliers: keep only listings with price below 100000 and mileage
# below 300000. The copy makes the result independent of df5.
df5_filtered = df5.query("price < 100000 and mileage < 300000").copy()

# Compare the sizes before and after filtering.
print("Vorher:", df5.shape)
print("Nachher:", df5_filtered.shape)


Vorher: (21881, 10)
Nachher: (21799, 10)


In [19]:
from sklearn.model_selection import train_test_split

# Features and target rebuilt from the outlier-filtered frame.
X_f = df5_filtered.loc[:, ["brand", "model", "fuel", "gear", "offerType", "mileage", "hp", "year"]]
y_f = df5_filtered.loc[:, "price"]

# New 80/20 split with the same seed as the unfiltered split. Because the
# input frame differs, the held-out rows are not the same as in X_test.
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_f, y_f, random_state=42, test_size=0.2
)

(X_train_f.shape, X_test_f.shape)


((17439, 8), (4360, 8))

In [20]:
from sklearn.base import clone

# Retrain the linear model on the outlier-filtered data.
# The preprocessor is cloned: a Pipeline without caching fits its steps in
# place, so passing the shared `preprocess` object would refit the instance
# already embedded in lin_model and leave that model's encoder out of sync
# with its fitted regressor.
lin_model_f = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("model", LinearRegression())
])

# Fit on the filtered training split.
lin_model_f.fit(X_train_f, y_train_f)

# Predict prices for the filtered held-out split.
pred_f = lin_model_f.predict(X_test_f)

# Error metrics. X_test_f is a different hold-out set than X_test, so these
# numbers are not directly comparable with the unfiltered run.
mae_f = mean_absolute_error(y_test_f, pred_f)
rmse_f = np.sqrt(mean_squared_error(y_test_f, pred_f))
r2_f = r2_score(y_test_f, pred_f)

print(f"Gefilterte Daten -> MAE: {mae_f:,.0f} €, RMSE: {rmse_f:,.0f} €, R²: {r2_f:.3f}")


Gefilterte Daten -> MAE: 2,689 €, RMSE: 4,168 €, R²: 0.811


In [21]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.base import clone

# Random forest on the unfiltered split (X_train / y_train).
# The preprocessor is cloned so that fitting this pipeline does not refit the
# `preprocess` instance shared with the linear pipelines; a Pipeline without
# caching fits its steps in place.
rf_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("model", RandomForestRegressor(
        n_estimators=200,
        random_state=42,  # fixed seed for reproducible trees
        n_jobs=-1         # use all available cores
    ))
])

# Fit on the training split.
rf_model.fit(X_train, y_train)

# Predict prices for the held-out split.
pred_rf = rf_model.predict(X_test)

# Error metrics on the held-out split.
mae_rf = mean_absolute_error(y_test, pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
r2_rf = r2_score(y_test, pred_rf)

print(f"Random Forest -> MAE: {mae_rf:,.0f} €, RMSE: {rmse_rf:,.0f} €, R²: {r2_rf:.3f}")


Random Forest -> MAE: 1,321 €, RMSE: 2,462 €, R²: 0.933


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

import os
from pathlib import Path

# Final training run: random forest on the outlier-filtered data, then persist
# the fitted pipeline. df5 (the top-5-brand listings) must already exist from
# the earlier cells.

# Features and target from the filtered frame (price < 100000, mileage < 300000).
df5_filtered = df5[(df5["price"] < 100000) & (df5["mileage"] < 300000)].copy()
X = df5_filtered[["brand", "model", "fuel", "gear", "offerType", "mileage", "hp", "year"]]
y = df5_filtered["price"]

# Column groups.
num_cols = ["mileage", "hp", "year"]
cat_cols = ["brand", "model", "fuel", "gear", "offerType"]

# Preprocessing: median imputation for numeric columns; most-frequent
# imputation followed by one-hot encoding for categorical columns.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

# Random forest pipeline (uses the freshly built preprocessor above).
rf_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
])

# 80/20 train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training split.
rf_model.fit(X_train, y_train)

# Evaluate on the held-out split; all euro amounts use the same
# thousands-separated format as the other cells.
pred = rf_model.predict(X_test)
print(f"MAE: {mean_absolute_error(y_test, pred):,.0f} €")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, pred)):,.0f} €")
print(f"R²: {r2_score(y_test, pred):.3f}")

# Persist the fitted pipeline. The project root can be overridden through the
# AUTOSCOUT_PROJECT_ROOT environment variable; when it is unset the original
# location is used. The target directory is created first because joblib.dump
# does not create missing directories.
project_root = Path(
    os.environ.get(
        "AUTOSCOUT_PROJECT_ROOT",
        r"C:\Users\edson\Documents\Projekte\autoscout-ml-project",
    )
)
model_path = project_root / "model" / "rf_model.pkl"
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_model, model_path)
print("Modell gespeichert!")

MAE: 1,302 €
RMSE: 2266 €
R²: 0.944
Modell gespeichert!
