# Bereinigten Datensatz einlesen

In [20]:
import pandas
# Load the cleaned car listings; the path is relative to the notebook's working directory.
t = pandas.read_csv("data_cars_cleaned.csv")
# Preview the first three rows to check that the file was parsed as expected.
t.head(3)

Unnamed: 0,brand,model,color,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,registration_month,registration_year
0,alfa-romeo,Alfa Romeo GTV,red,1300.0,148,201,Manual,Petrol,10.9,260.0,160500.0,10,1995
1,alfa-romeo,Alfa Romeo Spider,black,4900.0,110,150,Manual,Petrol,9.5,225.0,189500.0,7,1995
2,alfa-romeo,Alfa Romeo Spider,red,7900.0,110,150,Manual,Petrol,9.5,225.0,47307.0,4,1996


In [24]:
# List the distinct values of the "brand" column.
t["brand"].unique()

array(['alfa-romeo', 'aston-martin', 'audi', 'bentley', 'bmw', 'cadillac',
       'chevrolet', 'chrysler', 'citroen', 'dacia', 'daewoo', 'daihatsu',
       'dodge', 'fiat', 'ford', 'honda', 'hyundai', 'infiniti', 'isuzu',
       'jaguar', 'jeep', 'kia', 'lada', 'lancia', 'land-rover',
       'maserati', 'mazda', 'mercedes-benz', 'mini', 'mitsubishi',
       'nissan', 'opel', 'peugeot', 'porsche', 'renault', 'rover', 'saab',
       'seat', 'skoda', 'smart', 'ssangyong', 'toyota', 'volkswagen',
       'volvo'], dtype=object)

# Codierung kategorialer Werte

## Trockenübung zum Thema One-Hot-Encoding

In [4]:
# Toy customer table for the one-hot-encoding dry run:
# two numeric columns and one text column.
d = dict(
    Kundennr=[101, 20993, 330, 4440],
    Alter=[39, 23, 40, 19],
    Lieblingsfarbe=["rot", "grün", "rot", "blau"],
)
t2 = pandas.DataFrame(data=d)
t2

Unnamed: 0,Kundennr,Alter,Lieblingsfarbe
0,101,39,rot
1,20993,23,grün
2,330,40,rot
3,4440,19,blau


In [5]:
# Distinct values of the text column "Lieblingsfarbe".
t2["Lieblingsfarbe"].unique()

array(['rot', 'grün', 'blau'], dtype=object)

In [10]:
# One-hot encode the toy table. No `columns` argument is given, so pandas chooses
# the columns to expand itself (object/string/category dtypes by default); the
# result is only displayed, t2 itself is not modified.
pandas.get_dummies(t2)

Unnamed: 0,Kundennr,Alter,Lieblingsfarbe_blau,Lieblingsfarbe_grün,Lieblingsfarbe_rot
0,101,39,False,False,True
1,20993,23,False,True,False
2,330,40,False,False,True
3,4440,19,True,False,False


In [11]:
# Record the pandas version this notebook was run with.
pandas.__version__

'2.1.2'

## One-Hot-Encoding für unsere Tabelle

In [16]:
# Inspect the column names of the car table.
t.columns

Index(['brand', 'model', 'color', 'price_in_euro', 'power_kw', 'power_ps',
       'transmission_type', 'fuel_type', 'fuel_consumption_l_100km',
       'fuel_consumption_g_km', 'mileage_in_km', 'registration_month',
       'registration_year'],
      dtype='object')

In [26]:
# One-hot encode the text/categorical columns of the full table (pandas selects
# object, string and category dtypes by default).
# NOTE: this rebinds `t`; the unencoded table is only available again after
# re-running the read_csv cell.
t = pandas.get_dummies(t)

In [19]:
# Display the table (pandas abbreviates the middle rows and columns in the rendered output).
t

Unnamed: 0,price_in_euro,power_kw,power_ps,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,registration_month,registration_year,brand_alfa-romeo,brand_aston-martin,...,fuel_type_Diesel,fuel_type_Diesel Hybrid,fuel_type_Electric,fuel_type_Ethanol,fuel_type_Hybrid,fuel_type_Hydrogen,fuel_type_LPG,fuel_type_Other,fuel_type_Petrol,fuel_type_Unknown
0,1300.0,148,201,10.9,260.0,160500.0,10,1995,True,False,...,False,False,False,False,False,False,False,False,True,False
1,4900.0,110,150,9.5,225.0,189500.0,7,1995,True,False,...,False,False,False,False,False,False,False,False,True,False
2,7900.0,110,150,9.5,225.0,47307.0,4,1996,True,False,...,False,False,False,False,False,False,False,False,True,False
3,5500.0,132,179,13.4,320.0,168000.0,7,1996,True,False,...,False,False,False,False,False,False,False,False,True,False
4,6976.0,110,150,9.2,220.0,99000.0,1,1996,True,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162516,37900.0,120,163,6.6,166.0,2145.0,2,2023,False,False,...,False,False,False,False,False,False,False,False,True,False
162517,59890.0,145,197,5.5,144.0,50.0,2,2023,False,False,...,True,False,False,False,False,False,False,False,False,False
162518,37995.0,120,163,6.7,152.0,2100.0,3,2023,False,False,...,False,False,False,False,False,False,False,False,True,False
162519,55400.0,145,197,5.6,142.0,5000.0,5,2023,False,False,...,True,False,False,False,False,False,False,False,False,False


# Daten fürs ML vorbereiten

## Input-/Output-Split

In [27]:
# Input features: every column of the encoded table except the target price.
x = t.drop(columns=["price_in_euro"])
x

Unnamed: 0,power_kw,power_ps,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,registration_month,registration_year,brand_alfa-romeo,brand_aston-martin,brand_audi,...,fuel_type_Diesel,fuel_type_Diesel Hybrid,fuel_type_Electric,fuel_type_Ethanol,fuel_type_Hybrid,fuel_type_Hydrogen,fuel_type_LPG,fuel_type_Other,fuel_type_Petrol,fuel_type_Unknown
0,148,201,10.9,260.0,160500.0,10,1995,True,False,False,...,False,False,False,False,False,False,False,False,True,False
1,110,150,9.5,225.0,189500.0,7,1995,True,False,False,...,False,False,False,False,False,False,False,False,True,False
2,110,150,9.5,225.0,47307.0,4,1996,True,False,False,...,False,False,False,False,False,False,False,False,True,False
3,132,179,13.4,320.0,168000.0,7,1996,True,False,False,...,False,False,False,False,False,False,False,False,True,False
4,110,150,9.2,220.0,99000.0,1,1996,True,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162516,120,163,6.6,166.0,2145.0,2,2023,False,False,False,...,False,False,False,False,False,False,False,False,True,False
162517,145,197,5.5,144.0,50.0,2,2023,False,False,False,...,True,False,False,False,False,False,False,False,False,False
162518,120,163,6.7,152.0,2100.0,3,2023,False,False,False,...,False,False,False,False,False,False,False,False,True,False
162519,145,197,5.6,142.0,5000.0,5,2023,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [31]:
# Check the container type of the input features.
type(x)

pandas.core.frame.DataFrame

In [32]:
# Target: the price column. The double brackets select a one-column DataFrame
# (2-D) rather than a Series.
y = t[["price_in_euro"]]

In [33]:
# Check the container type of the target.
type(y)

pandas.core.frame.DataFrame

In [34]:
# Display the target values.
y

Unnamed: 0,price_in_euro
0,1300.0
1,4900.0
2,7900.0
3,5500.0
4,6976.0
...,...
162516,37900.0
162517,59890.0
162518,37995.0
162519,55400.0


## Train-/Test-Split

In [38]:
from sklearn.model_selection import train_test_split

In [41]:
# Hold out 20 % of the rows as a test set. The fixed random_state makes the
# shuffle deterministic, so re-running the notebook reproduces the same split
# (and therefore the same statistics and scaled values further down).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [42]:
# (rows, feature columns) of the training inputs.
x_train.shape

(130016, 1125)

In [43]:
# (rows, feature columns) of the test inputs.
x_test.shape

(32505, 1125)

In [48]:
# 80 % of the total row count, for comparison with the training split size.
x.shape[0] * 0.8

130016.8

In [49]:
# 20 % of the total row count, for comparison with the test split size.
x.shape[0] * 0.2

32504.2

In [50]:
# (rows, columns) of the training targets.
y_train.shape

(130016, 1)

In [51]:
# (rows, columns) of the test targets.
y_test.shape

(32505, 1)

In [55]:
# Summary statistics of the "power_ps" column in the test split.
x_test["power_ps"].describe()

count    32505.000000
mean       155.668451
std         70.612551
min         14.000000
25%        110.000000
50%        141.000000
75%        184.000000
max        810.000000
Name: power_ps, dtype: float64

In [56]:
# Summary statistics of the "mileage_in_km" column in the test split.
x_test["mileage_in_km"].describe()

count     32505.000000
mean      74396.462852
std       58601.156766
min           0.000000
25%       25000.000000
50%       64500.000000
75%      114820.000000
max      224980.000000
Name: mileage_in_km, dtype: float64

## Standardisierung der Merkmale

In [58]:
from sklearn.preprocessing import StandardScaler

# One scaler per side (inputs / target), each fitted on the training split only,
# so no statistics from the test split enter the scaling parameters.
scaler_input = StandardScaler().fit(x_train)
scaler_output = StandardScaler().fit(y_train)

# Apply the training-split parameters to both splits.
x_train_scaled = scaler_input.transform(x_train)
x_test_scaled = scaler_input.transform(x_test)

y_train_scaled = scaler_output.transform(y_train)
y_test_scaled = scaler_output.transform(y_test)

In [63]:
# First training row before scaling.
x_train.iloc[0].values

array([62, 84, 4.5, ..., False, True, False], dtype=object)

In [62]:
# Row 0 of the scaled training inputs (the scaled counterpart of x_train.iloc[0]).
x_train_scaled[0]

array([-1.0077799 , -1.01360953, -0.78037557, ..., -0.02038397,
        0.79614001, -0.01074168])

In [64]:
# Number of values in one scaled row, i.e. one per feature column.
len(x_train_scaled[0])

1125

In [67]:
# First training target before scaling.
y_train.iloc[0].values

array([8900.])

In [69]:
# Row 0 of the scaled training targets (the scaled counterpart of y_train.iloc[0]).
y_train_scaled[0]

array([-1.05003499])