In [29]:
import pandas as pd
import numpy as np
import re
import gc  # Garbage Collector zur Speicherverwaltung


def drop_records_brand_equal_model(df):
    """
    Drop every row whose ``brand`` and ``model`` are identical after normalisation.

    Normalisation lower-cases the value and removes every character outside
    ``[a-z0-9]``. Missing values normalise to the empty string, so a row in
    which both fields are missing is dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``brand`` and ``model``.

    Returns
    -------
    pandas.DataFrame
        New frame without the matching rows and with a fresh 0..n-1 index.
        The input frame is left untouched.
    """
    def normalize(text):
        if pd.isna(text):
            return ""
        # str() so that non-string cells (e.g. a purely numeric model name)
        # are compared instead of raising AttributeError on .lower().
        return re.sub(r"[^a-z0-9]", "", str(text).lower())

    # Compare temporary Series rather than adding and dropping helper columns;
    # this also leaves any existing 'brand_norm'/'model_norm' columns alone.
    brand_norm = df["brand"].map(normalize)
    model_norm = df["model"].map(normalize)

    # Keep the rows whose normalised values differ.
    return df.loc[brand_norm != model_norm].reset_index(drop=True)
#--------------------------------------   

def _electric_range_to_consumption(df):
    """Move the range of electric cars into ``fuel_consumption_l_100km`` (in place).

    For rows with fuel_type "Electric" whose ``fuel_consumption_g_km`` contains
    the word "Reichweite", the value is cut at the first whitespace and the
    remaining token is written to both ``fuel_consumption_g_km`` and
    ``fuel_consumption_l_100km``. Returns the same frame.
    """
    e_mit_reichweite = (df["fuel_type"] == "Electric") & (
        df["fuel_consumption_g_km"].astype(str).str.contains("Reichweite", na=False)
    )
    # First whitespace-separated token of e.g. "359 km Reichweite" -> "359".
    first_token = df.loc[e_mit_reichweite, "fuel_consumption_g_km"].astype(str).str.split().str[0]
    df.loc[e_mit_reichweite, "fuel_consumption_g_km"] = first_token
    df.loc[e_mit_reichweite, "fuel_consumption_l_100km"] = first_token
    return df


def preprocessing_pipeline(path = '../data.csv'):
    """
    Load the car-listing CSV and apply the first cleaning steps.

    Steps, in order:
      1. read the CSV at ``path``;
      2. drop rows that are duplicates across all listing columns;
      3. drop the redundant index column ``Unnamed: 0`` if it exists;
      4. keep only rows with one of the accepted ``fuel_type`` values and
         renumber the index;
      5. keep only rows whose ``year`` is among the first 29 distinct values
         of the column (in order of appearance);
      6. for electric cars whose ``fuel_consumption_g_km`` holds a range
         ("... Reichweite"), keep only the leading token and copy it into
         ``fuel_consumption_l_100km``.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The index is not renumbered after step 5.
    """
    # Load data
    df = pd.read_csv(path)

    # Remove duplicates
    df = df.drop_duplicates(subset= ['brand', 'model', 'color', 'registration_date', 'year',
       'price_in_euro', 'power_kw', 'power_ps', 'transmission_type',
       'fuel_type', 'fuel_consumption_l_100km', 'fuel_consumption_g_km',
       'mileage_in_km', 'offer_description'])

    # Drop the second index column
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis=1)

    # Whitelist of fuel types: guards against rows whose columns are shifted.
    valid_fuel_types = ['Diesel', 'Petrol','Hybrid', 'Diesel Hybrid', 'Electric', 'LPG', 'CNG', 'Ethanol', 'Hydrogen', 'Other']
    # drop=True keeps the old index from being added as a column.
    df = df.loc[df['fuel_type'].isin(valid_fuel_types)].reset_index(drop=True)

    # Remove rows with invalid years.
    # NOTE(review): this relies on the first 29 distinct values of `year` being
    # the genuine years -- it depends on the row order of the file; confirm.
    yearsToFilter = list(df['year'].unique()[:29])
    # Vectorised membership test; .copy() so the assignments below operate on an
    # independent frame rather than on a slice (avoids SettingWithCopyWarning).
    df = df.loc[df['year'].isin(yearsToFilter)].copy()

    df = _electric_range_to_consumption(df)

    # TODO: the consumption parser (condition
    # `pd.isna(value) or 'l/100 km' not in str(value)`) still has to be reworked,
    # because it would discard all electric cars. Planned behaviour:
    #   - value contains "l/100km": cut at the first space and replace the
    #     comma by a dot, as before;
    #   - value lacks "l/100km" and fuel type is "Electric": leave unchanged;
    #   - value lacks "l/100km" and fuel type is not "Electric": set to NaN.

    return df

In [30]:
# Load the raw listings.
df = pd.read_csv('../data.csv')

# Keep only rows whose `year` is among the first 29 distinct values of the
# column (in order of appearance).
# NOTE(review): presumably these are the genuine years and later distinct
# values come from shifted rows -- this depends on the file's row order; confirm.
yearsToFilter = list(df['year'].unique()[:29])
filt = df['year'].isin(yearsToFilter)
# .copy(): later cells assign into df via .loc; working on an independent frame
# avoids SettingWithCopyWarning on a filtered slice.
df = df.loc[filt].copy()

In [31]:
#df= preprocessing_pipeline()

### Distribution

In [32]:
# Number of listings per fuel type.
df.loc[:, 'fuel_type'].value_counts()

fuel_type
Petrol           143280
Diesel            86421
Hybrid            12607
Electric           5967
LPG                1255
CNG                 508
Diesel Hybrid       476
Other               178
Unknown              96
Hydrogen             82
Ethanol              10
Name: count, dtype: int64

### Hybrid

In [33]:
# Restrict to listings with fuel_type == 'Hybrid'.
hybrid_df = df[df['fuel_type'] == 'Hybrid']
total_hybrid = len(hybrid_df)

# Consumption column as text so the substring search works on every cell.
hybrid_consumption_text = hybrid_df['fuel_consumption_l_100km'].astype(str)

# Rows whose value contains 'l/100 km' / 'km (Ort)'.
mask_l_per_100km = hybrid_consumption_text.str.contains(r'l/100 km', na=False)
mask_km_ort = hybrid_consumption_text.str.contains(r'km \(Ort\)', na=False)

count_l_per_100km = mask_l_per_100km.sum()
count_km_ort = mask_km_ort.sum()

# Shares in percent of all Hybrid rows.
share_l = count_l_per_100km / total_hybrid * 100
share_km = count_km_ort / total_hybrid * 100

print(f"Hybrid mit 'l/100 km': {count_l_per_100km} ({share_l:.2f}%)")
print(f"Hybrid mit 'km (Ort)': {count_km_ort} ({share_km:.2f}%)")

Hybrid mit 'l/100 km': 10212 (81.00%)
Hybrid mit 'km (Ort)': 137 (1.09%)


In [34]:
def clean_fuel_consumption(value):
    """
    Parse a raw consumption cell such as ``"7,8 l/100 km"`` into a float.

    Parameters
    ----------
    value : object
        Raw cell content (string, number or missing value).

    Returns
    -------
    float
        * the number itself if ``value`` is already numeric, so applying the
          cleaning twice does not wipe values that were parsed before;
        * the leading number (decimal comma converted to a dot) if the text
          contains ``'l/100 km'``;
        * ``nan`` for missing values, for texts without ``'l/100 km'`` (e.g.
          ``'kg/100 km'``, ``'kWh/100 km'``, ``'km (Ort)'``) and for texts whose
          leading token is not a number (e.g. ``'- (l/100 km)'``).
    """
    # Already parsed: pass through (NaN stays NaN). bool is excluded on purpose.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return float(value)
    if pd.isna(value):
        return np.nan

    text = str(value)
    if 'l/100 km' not in text:
        return np.nan
    try:
        return float(text.split()[0].replace(',', '.'))
    except ValueError:
        # Leading token is a placeholder such as '-' rather than a number.
        return np.nan

In [35]:
# Parse the consumption column only for Petrol, Diesel and Hybrid listings.
mask = df['fuel_type'].isin(['Petrol', 'Diesel', 'Hybrid'])
consumption_parsed = df.loc[mask, 'fuel_consumption_l_100km'].apply(clean_fuel_consumption)
df.loc[mask, 'fuel_consumption_l_100km'] = consumption_parsed

In [36]:
# Frequency of each consumption value among Hybrid listings.
df.loc[df['fuel_type'] == 'Hybrid', 'fuel_consumption_l_100km'].value_counts()

fuel_consumption_l_100km
1.4     458
4.0     446
3.8     406
3.3     402
4.9     387
       ... 
14.9      1
10.4      1
9.5       1
53.0      1
22.6      1
Name: count, Length: 118, dtype: int64

In [37]:
# Hybrid listings ordered by consumption, highest first.
df.loc[df['fuel_type'] == 'Hybrid'].sort_values('fuel_consumption_l_100km', ascending=False)

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description
83857,83857,hyundai,Hyundai SANTA FE,silver,02/2023,2023,50900,169,230,Automatic,Hybrid,53.0,126 g/km,17.0,Prime Hybrid T-GDI EU6d Facelift HEV 1.6 T-GDi...
250736,250736,volvo,Volvo XC90,grey,04/2022,2022,67950,335,455,Automatic,Hybrid,22.6,34 g/km,38500.0,T8 AWD Recharge Inscription *Head-up*Standhzg*
42917,42917,bmw,BMW 220,grey,04/2023,2023,40990,125,170,Automatic,Hybrid,19.0,144 g/km,1711.0,i Active Tourer Luxury Line Head-Up DAB LED
245955,245955,volkswagen,Volkswagen T7 Multivan,red,01/2023,2023,71281,160,218,Automatic,Hybrid,18.9,42 g/km,8000.0,Energetic eHybrid
40582,40582,bmw,BMW 330,black,07/2020,2020,42880,135,184,Automatic,Hybrid,17.0,38 g/km,23573.0,"e M SPORT HUD,HARMAN/K,KOMFORTZUGANG,LASER"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250939,250939,volvo,Volvo XC90,white,06/2022,2022,77990,228,310,Automatic,Hybrid,,- (g/km),24582.0,T8 long range Recharge R-Design AWD
251032,251032,volvo,Volvo XC60,black,03/2023,2023,73950,228,310,Automatic,Hybrid,,75 km Reichweite,4200.0,Recharge T8 AWD Plug-in Hybrid Ultimate Bright...
251045,251045,volvo,Volvo XC60,black,03/2023,2023,74970,335,455,Automatic,Hybrid,,77 km Reichweite,8000.0,Recharge T8 Plus Bright Standheizung/LED
251074,251074,volvo,Volvo XC40,white,04/2023,2023,57990,192,261,Automatic,Hybrid,,43 km Reichweite,1229.0,Plus Bright T5 Recharge Intellisafe*Surround+P...


### Electric

In [38]:
# All listings whose fuel_type is 'Electric'.
df.loc[df['fuel_type'] == 'Electric']

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description
16552,16552,audi,Audi e-tron,beige,09/2019,2019,51888,300,408,Automatic,Electric,,359 km Reichweite,84800.0,55 Advanced Quattro S-Line Interieur
16559,16559,audi,Audi e-tron,beige,07/2019,2019,53990,300,408,Automatic,Electric,,359 km Reichweite,51000.0,55 quattro advanced Pano B&O AHK Matrix
16561,16561,audi,Audi e-tron,beige,11/2019,2019,54870,300,408,Automatic,Electric,,0 g/km,82814.0,+ADVANCED+55+PANO+LUFT+HU+MATRIX+
16571,16571,audi,Audi e-tron,beige,12/2019,2019,61989,300,408,Automatic,Electric,,0 g/km,55990.0,advanced 55 QU*B&O*MATRIX*PANO*LED*NAV
16579,16579,audi,Audi e-tron,blue,02/2019,2019,32930,300,408,Automatic,Electric,,359 km Reichweite,84300.0,55 qu. S line AHK LED V-Cockp. RüKa Pano
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251033,251033,volvo,Volvo C40,black,05/2023,2023,52890,170,231,Automatic,Electric,,400 km Reichweite,8.0,Recharge Single Motor Ultimate AHK PANO
251037,251037,volvo,Volvo XC40,black,04/2023,2023,49900,170,231,Automatic,Electric,,0 g/km,14900.0,Recharge Single Motor Plus
251048,251048,volvo,Volvo C40,black,01/2023,2023,51990,170,231,Automatic,Electric,,0 g/km,2106.0,Recharge Single Motor Plus AHK Sitzhzg.
251056,251056,volvo,Volvo C40,black,05/2023,2023,60520,170,231,Automatic,Electric,,400 km Reichweite,3000.0,Ultimate 2WD Recharge Pure Electric


In [39]:
# Frequency of each raw fuel_consumption_l_100km value among Electric listings.
df.loc[df['fuel_type'] == 'Electric', 'fuel_consumption_l_100km'].value_counts()

fuel_consumption_l_100km
0 kWh/100 km       101
389 km (Ort)        12
592 km (Ort)        12
305 km (Ort)        11
155 km (Ort)        10
                  ... 
17,8 kWh/100 km      1
540 km (Ort)         1
257 km (Ort)         1
13 kWh/100 km        1
541 km (Ort)         1
Name: count, Length: 175, dtype: int64

In [40]:
# Restrict to listings with fuel_type == 'Electric'.
electric_df = df[df['fuel_type'] == 'Electric']
total_electric = len(electric_df)

# Consumption column as text so the substring search works on every cell.
electric_consumption_text = electric_df['fuel_consumption_l_100km'].astype(str)

# Number of rows whose value contains 'kWh/100 km' / 'km (Ort)'.
count_kwh_per_100km = electric_consumption_text.str.contains(r'kWh/100 km', na=False).sum()
count_km_ort = electric_consumption_text.str.contains(r'km \(Ort\)', na=False).sum()

# Shares in percent of all Electric rows.
share_kwh = count_kwh_per_100km / total_electric * 100
share_km_ort = count_km_ort / total_electric * 100

print(f"Electric mit 'kWh/100 km': {count_kwh_per_100km} ({share_kwh:.2f}%)")
print(f"Electric mit 'km (Ort)': {count_km_ort} ({share_km_ort:.2f}%)")

Electric mit 'kWh/100 km': 125 (2.09%)
Electric mit 'km (Ort)': 343 (5.75%)


In [41]:
# Restrict to Electric listings.
electric_df = df[df['fuel_type'] == 'Electric']
electric_consumption_text = electric_df['fuel_consumption_l_100km'].astype(str)

# One mask per known pattern.
mask_kwh = electric_consumption_text.str.contains(r'kWh/100 km', na=False)
mask_km_ort = electric_consumption_text.str.contains(r'km \(Ort\)', na=False)

# Rows matching neither of the two patterns.
mask_other = ~mask_kwh & ~mask_km_ort
electric_other = electric_df[mask_other]

# Remaining distinct values and their frequencies.
electric_other['fuel_consumption_l_100km'].value_counts()

fuel_consumption_l_100km
- (l/100 km)    3
Name: count, dtype: int64

In [42]:
# Frequency of each raw fuel_consumption_g_km value among Electric listings.
df.loc[df['fuel_type'] == 'Electric', 'fuel_consumption_g_km'].value_counts()

fuel_consumption_g_km
0 g/km               2930
200 km Reichweite     177
230 km Reichweite     120
400 km Reichweite      70
340 km Reichweite      69
                     ... 
710 km Reichweite       1
426 km Reichweite       1
151 km Reichweite       1
262 km Reichweite       1
548 km Reichweite       1
Name: count, Length: 383, dtype: int64

In [43]:
# Restrict to listings with fuel_type == 'Electric'.
electric_df = df[df['fuel_type'] == 'Electric']

# Rows whose fuel_consumption_g_km contains 'km Reichweite'.
# NOTE: the names count_kwh_per_100km / share_kwh are reused from the previous
# Electric cell and kept in case other cells read them; here they refer to
# 'km Reichweite', not to 'kWh/100 km'.
count_kwh_per_100km = electric_df['fuel_consumption_g_km'].astype(str).str.contains(r'km Reichweite', na=False).sum()

# Rows whose fuel_consumption_l_100km contains 'km (Ort)'. Computed here
# explicitly so the cell no longer depends on a value left over from an
# earlier cell.
count_km_ort = electric_df['fuel_consumption_l_100km'].astype(str).str.contains(r'km \(Ort\)', na=False).sum()

# Total number of Electric rows.
total_electric = len(electric_df)

# Shares in percent.
share_kwh = count_kwh_per_100km / total_electric * 100
share_km_ort = count_km_ort / total_electric * 100

# Labels name the pattern and the column that was actually searched.
print(f"Electric mit 'km Reichweite' (fuel_consumption_g_km): {count_kwh_per_100km} ({share_kwh:.2f}%)")
print(f"Electric mit 'km (Ort)' (fuel_consumption_l_100km): {count_km_ort} ({share_km_ort:.2f}%)")

Electric mit 'kWh/100 km': 3009 (50.43%)
Electric mit 'km (Ort)': 343 (5.75%)


In [44]:
# Restrict to Electric listings.
electric_df = df[df['fuel_type'] == 'Electric']

# Rows whose fuel_consumption_g_km contains 'km Reichweite'
# (the name mask_kwh is reused from the earlier Electric cells).
electric_g_km_text = electric_df['fuel_consumption_g_km'].astype(str)
mask_kwh = electric_g_km_text.str.contains(r'km Reichweite', na=False)

# Everything that does not contain that pattern.
mask_other = ~mask_kwh
electric_other = electric_df[mask_other]

# Remaining distinct values and their frequencies.
electric_other['fuel_consumption_g_km'].value_counts()

fuel_consumption_g_km
0 g/km          2930
-/-               16
122 g/km           2
13 g/km            1
460 km (Ort)       1
389 km (Ort)       1
152 g/km           1
60 g/km            1
85 km (Ort)        1
90 km (Ort)        1
235 km (Ort)       1
100 km (Ort)       1
230 km (Ort)       1
Name: count, dtype: int64

In [45]:
# Electric listings whose fuel_consumption_g_km contains 'km Reichweite'.
e_mit_reichweite = (
    df['fuel_type'].eq('Electric')
    & df['fuel_consumption_g_km'].astype(str).str.contains(r'km Reichweite', na=False)
)
# Store the first whitespace-separated token of that text in a new column
# 'electric_range'; it is stored as text.
df.loc[e_mit_reichweite, 'electric_range'] = (
    df.loc[e_mit_reichweite, 'fuel_consumption_g_km'].astype(str).str.split().str[0]
)

### LPG

In [46]:
# All listings whose fuel_type is 'LPG'.
df.loc[df['fuel_type'] == 'LPG']

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description,electric_range
1722,1722,audi,Audi S8,silver,08/1997,1997,21899,250,340,Manual,LPG,"13,9 l/100 km",- (g/km),220000.0,4.2 EL.GSD~LEDER~NAVI~XENON~TEMPOMAT~SHZ,
1872,1872,audi,Audi A3,silver,05/2000,2000,3600,132,179,Manual,LPG,"7,8 l/100 km",- (g/km),268300.0,"1.8 Turbo 180 PS, ViALLE LPG, Glas-SSD, AHK, Temp",
1875,1875,audi,Audi A3,silver,05/2000,2000,3600,132,179,Manual,LPG,"7,8 l/100 km",- (g/km),268300.0,"1.8 Turbo 180 PS, ViALLE LPG, Glas-SSD, AHK, Temp",
1892,1892,audi,Audi A3,silver,05/2000,2000,3600,132,179,Manual,LPG,"7,8 l/100 km",- (g/km),268300.0,"1.8 Turbo 180 PS, ViALLE LPG, Glas-SSD, AHK, Temp",
1903,1903,audi,Audi S6,blue,01/2001,2001,8000,250,340,Automatic,LPG,"14,2 l/100 km",341 g/km,335000.0,Avant 4.2 quattro,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240560,240560,volkswagen,Volkswagen Golf,yellow,10/2020,2020,20970,66,90,Manual,LPG,"4,5 l/100 km",102 g/km,19000.0,VIII 1.0l TSI * LED. Digital Cockpit. PDC vo+h...,
245179,245179,volkswagen,Volkswagen T-Cross,orange,03/2022,2022,24870,81,110,Automatic,LPG,5 l/100 km,114 g/km,20000.0,Active DSG AUTOGAS,
246929,246929,volvo,Volvo V40,blue,12/2000,2000,2490,120,163,Automatic,LPG,10 l/100 km,- (g/km),295000.0,2.0t LPG,
247024,247024,volvo,Volvo S60,black,07/2010,2010,5500,102,139,Manual,LPG,"5,8 l/100 km",- (g/km),232000.0,T3,


In [47]:
# Restrict to listings with fuel_type == 'LPG'.
# NOTE: the hybrid_* / *_hybrid variable names are reused from the Hybrid cell
# and kept in case other cells read them; in this cell they hold LPG data.
hybrid_df = df[df['fuel_type'] == 'LPG']

# Rows whose value contains 'l/100 km'.
mask_l_per_100km = hybrid_df['fuel_consumption_l_100km'].astype(str).str.contains(r'l/100 km', na=False)
count_l_per_100km = mask_l_per_100km.sum()

# Rows whose value contains 'km (Ort)'.
mask_km_ort = hybrid_df['fuel_consumption_l_100km'].astype(str).str.contains(r'km \(Ort\)', na=False)
count_km_ort = mask_km_ort.sum()

# Total number of LPG rows.
total_hybrid = len(hybrid_df)

# Shares in percent.
share_l = count_l_per_100km / total_hybrid * 100
share_km = count_km_ort / total_hybrid * 100

# Labels now name the fuel type that is actually analysed.
print(f"LPG mit 'l/100 km': {count_l_per_100km} ({share_l:.2f}%)")
print(f"LPG mit 'km (Ort)': {count_km_ort} ({share_km:.2f}%)")

Hybrid mit 'l/100 km': 1060 (84.46%)
Hybrid mit 'km (Ort)': 0 (0.00%)


In [48]:
# Restrict to LPG listings (variable names are reused from the Electric cells).
electric_df = df[df['fuel_type'] == 'LPG']
lpg_consumption_text = electric_df['fuel_consumption_l_100km'].astype(str)

# mask_kwh: value contains 'l/100 km'; mask_km_ort: value contains 'km (Ort)'.
mask_kwh = lpg_consumption_text.str.contains(r'l/100 km', na=False)
mask_km_ort = lpg_consumption_text.str.contains(r'km \(Ort\)', na=False)

# Rows matching neither of the two patterns.
mask_other = ~mask_kwh & ~mask_km_ort
electric_other = electric_df[mask_other]

# Remaining distinct values and their frequencies.
electric_other['fuel_consumption_l_100km'].value_counts()

fuel_consumption_l_100km
14,9 kg/100 km    4
4,9 kg/100 km     2
6,3 kg/100 km     1
8,4 kg/100 km     1
8,3 kg/100 km     1
14,7 kg/100 km    1
Name: count, dtype: int64

In [49]:
# Parse the consumption column for LPG listings only.
mask = df['fuel_type'].isin(['LPG'])
lpg_consumption_parsed = df.loc[mask, 'fuel_consumption_l_100km'].apply(clean_fuel_consumption)
df.loc[mask, 'fuel_consumption_l_100km'] = lpg_consumption_parsed

In [50]:
# LPG listings ordered by consumption, highest first.
df.loc[df['fuel_type'] == 'LPG'].sort_values('fuel_consumption_l_100km', ascending=False)

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description,electric_range
51420,51420,dodge,Dodge Durango,black,11/2018,2018,39900,268,364,Automatic,LPG,18.0,387 g/km,46350.0,R/T 5.7 V8 4x4 Blacktop Package + LPG,
87145,87145,jeep,Jeep Grand Cherokee,white,05/2022,2022,91999,268,364,Automatic,LPG,17.7,386 g/km,18000.0,"L 5,7l Summit,ACC,Pano,LPG",
51860,51860,dodge,Dodge Challenger,black,12/2022,2022,103500,535,727,Automatic,LPG,16.5,388 g/km,10.0,"Hellcat 6.2l,Widebody,ACC,LPG",
51833,51833,dodge,Dodge Challenger,red,12/2022,2022,105000,535,727,Automatic,LPG,16.5,388 g/km,10.0,"Hellcat 6.2l,Widebody,ACC,LPG",
51834,51834,dodge,Dodge Challenger,red,12/2022,2022,103500,535,727,Automatic,LPG,16.5,388 g/km,10.0,"Hellcat 6.2l,Widebody,ACC,LPG",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183842,183842,seat,SEAT Tarraco,grey,11/2020,2020,33940,140,190,Automatic,LPG,,166 g/km,40650.0,Xcellence Pano Leder Kameras Led Ahk 20,
214005,214005,volkswagen,Volkswagen Beetle,blue,06/1999,1999,3999,85,116,Manual,LPG,,- (g/km),283000.0,ABT Ausstattung,
219152,219152,volkswagen,Volkswagen Golf Plus,black,06/2008,2008,3499,59,80,Manual,LPG,,0 g/km,166825.0,Edition1.4 Benzin+ LPG-Gas Klimatronic,
224315,224315,volkswagen,Volkswagen Caddy,grey,06/2012,2012,8600,75,102,Manual,LPG,,- (g/km),224000.0,1.6 TDI Maxi 2 Hand Navi Leder AHK,


### CNG

In [51]:
# All listings whose fuel_type is 'CNG'.
df.loc[df['fuel_type'] == 'CNG']

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description,electric_range
9864,9864,audi,Audi A3,blue,01/2015,2015,20930,81,110,Automatic,CNG,5 kg/100 km,115 g/km,98674.0,1.4 TFSI g-tron S-tronic S-Line GRA,
10844,10844,audi,Audi A3,silver,07/2015,2015,17200,110,150,Automatic,CNG,"4,6 kg/100 km",- (g/km),80000.0,1.4 TFSI cylinder on demand ultra Limousine At...,
11169,11169,audi,Audi A3,white,03/2015,2015,13990,81,110,Automatic,CNG,5 kg/100 km,115 g/km,133000.0,Ambiente Sportback Erdgas CNG,
11825,11825,audi,Audi A3,grey,10/2016,2016,16950,81,110,Manual,CNG,"3,6 kg/100 km",98 g/km,82880.0,1.4TFSI g-tron 2x S line/Sportback/VOLL/CNG,
12034,12034,audi,Audi A3,black,12/2016,2016,15950,81,110,Automatic,CNG,"5,4 kg/100 km",128 g/km,78500.0,Sportback g-tron*AHK*Erdgas*Automatik,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240568,240568,volkswagen,Volkswagen up!,yellow,12/2020,2020,13490,50,68,Manual,CNG,"5,8 kg/100 km",105 g/km,22700.0,"eco up! 1.0 EcoFuel, Benzin / CNG Temp BT SHz",
241359,241359,volkswagen,Volkswagen Golf Variant,silver,06/2020,2020,20990,96,131,Automatic,CNG,"4,8 kg/100 km",111 g/km,35700.0,"VII 1,5 TGI BENZIN/CNG DSG LED StHei",
246961,246961,volvo,Volvo V70,black,06/2006,2006,4250,103,140,Manual,CNG,9 kg/100 km,- (g/km),326000.0,V70 Bi-Fuel Momentum,
247037,247037,volvo,Volvo V70,black,12/2010,2010,8888,170,231,Automatic,CNG,"9,7 l/100 km",232 g/km,271000.0,2.5T Aut. Momentum,


In [52]:
# Restrict to CNG listings (variable names are reused from the Electric cells).
electric_df = df[df['fuel_type'] == 'CNG']
cng_consumption_text = electric_df['fuel_consumption_l_100km'].astype(str)

# mask_kwh: value contains 'kg/100 km'; mask_km_ort: value contains 'l/100 km'.
mask_kwh = cng_consumption_text.str.contains(r'kg/100 km', na=False)
mask_km_ort = cng_consumption_text.str.contains(r'l/100 km', na=False)

# Rows matching at least one of the two patterns (no negation in this cell).
mask_other = mask_kwh | mask_km_ort
electric_other = electric_df[mask_other]

# Distinct matching values and their frequencies.
electric_other['fuel_consumption_l_100km'].value_counts()

fuel_consumption_l_100km
3,5 kg/100 km    84
3,4 kg/100 km    39
5,3 kg/100 km    23
3,6 kg/100 km    16
2,9 kg/100 km    16
                 ..
7,7 l/100 km      1
7,6 l/100 km      1
5 l/100 km        1
1 kg/100 km       1
9,7 l/100 km      1
Name: count, Length: 78, dtype: int64

In [53]:
# Restrict to listings with fuel_type == 'CNG'.
# NOTE: variable names are reused from the Hybrid cell and kept in case other
# cells read them. In this cell count_l_per_100km / share_l refer to
# 'kg/100 km' and count_km_ort / share_km refer to 'l/100 km'.
hybrid_df = df[df['fuel_type'] == 'CNG']

# Rows whose value contains 'kg/100 km'.
mask_l_per_100km = hybrid_df['fuel_consumption_l_100km'].astype(str).str.contains(r'kg/100 km', na=False)
count_l_per_100km = mask_l_per_100km.sum()

# Rows whose value contains 'l/100 km'.
mask_km_ort = hybrid_df['fuel_consumption_l_100km'].astype(str).str.contains(r'l/100 km', na=False)
count_km_ort = mask_km_ort.sum()

# Total number of CNG rows.
total_hybrid = len(hybrid_df)

# Shares in percent.
share_l = count_l_per_100km / total_hybrid * 100
share_km = count_km_ort / total_hybrid * 100

# Labels now name the fuel type and the patterns that are actually counted.
print(f"CNG mit 'kg/100 km': {count_l_per_100km} ({share_l:.2f}%)")
print(f"CNG mit 'l/100 km': {count_km_ort} ({share_km:.2f}%)")

Hybrid mit 'l/100 km': 382 (75.20%)
Hybrid mit 'km (Ort)': 32 (6.30%)


### Diesel Hybrid

In [54]:
# All listings whose fuel_type is 'Diesel Hybrid'.
df.loc[df['fuel_type'] == 'Diesel Hybrid']

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description,electric_range
11436,11436,audi,Audi Q7,blue,06/2016,2016,47980,190,258,Automatic,Diesel Hybrid,"1,9 l/100 km",50 g/km,78500.0,"3.0 TDI e-tron quattro/Matrix/Stadt+Tour/20""",
11454,11454,audi,Audi Q7,blue,09/2016,2016,56950,275,374,Automatic,Diesel Hybrid,"1,9 l/100 km",50 g/km,66170.0,3.0 TDI e-tron qu. tiptronic S line 21 BOSE PA...,
11870,11870,audi,Audi Q7,grey,12/2016,2016,47990,190,258,Automatic,Diesel Hybrid,"1,9 l/100 km",50 g/km,101000.0,3.0 TDI e-tron S Line Pano Virtual AHK 21 Zol,
12428,12428,audi,Audi Q7,silver,11/2016,2016,37500,190,258,Automatic,Diesel Hybrid,,- (g/km),169000.0,3.0 TDI e-tron quattro tiptronic,
12985,12985,audi,Audi Q7,blue,06/2017,2017,35000,190,258,Automatic,Diesel Hybrid,"1,9 l/100 km",50 g/km,169000.0,3.0 TDI e-tron quattro**S-LINE PANO AHK**,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250434,250434,volvo,Volvo XC60,black,07/2021,2021,41970,145,197,Automatic,Diesel Hybrid,"5,6 l/100 km",142 g/km,17000.0,B4 D Momentum Pro AHK+Park+Winter uvm. Navi,
250701,250701,volvo,Volvo S90,grey,01/2022,2022,50970,173,235,Automatic,Diesel Hybrid,"4,9 l/100 km",130 g/km,11000.0,Inscription AWD B5 Sunroof 360° Soundsystem Ha...,
250784,250784,volvo,Volvo S90,black,03/2022,2022,47900,173,235,Automatic,Diesel Hybrid,"4,9 l/100 km",128 g/km,10179.0,B5 R Design AWD Mild- Hybrid Diesel Autom.,
250883,250883,volvo,Volvo V60,silver,08/2022,2022,37581,145,197,Automatic,Diesel Hybrid,,125 g/km,2784.0,"B4 MHEV Diesel Geartronic 145 kW (197 PS), Aut...",


### Other

In [55]:
# All listings whose fuel_type is 'Other'.
df.loc[df['fuel_type'] == 'Other']

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description,electric_range
1823,1823,audi,Audi TT,blue,03/2000,2000,7600,132,179,Manual,Other,"6,8 l/100 km",- (g/km),233500.0,S-Line,
1961,1961,audi,Audi TT,silver,07/2001,2001,13900,,,Manual,Other,,- (g/km),154920.0,Roadster 1.8T quattro Mokassin-Naht,
3756,3756,audi,Audi Q7,black,09/2008,2008,20000,250,340,Automatic,Other,"11,1 l/100 km",- (g/km),157000.0,4.2 TDI DPF Quattro S LINE PLUS Exklusiv Voll,
3849,3849,audi,Audi R8,black,01/2008,2008,56900,309,420,Unknown,Other,"13,6 l/100 km",325 g/km,41500.0,4.2 FSI quattro Coupe (423),
13252,13252,audi,Audi A4,grey,05/2017,2017,21799,140,190,Automatic,Other,,0 g/km,118000.0,Avant Quattro TDi Navi Scheckh.1.Hd,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249906,249906,volvo,Volvo XC40,black,12/2020,2020,60890,155,211,Automatic,Other,"1,8 l/100 km",41 g/km,26300.0,Inscription Expression Recharge Plug-In Hybrid,
250021,250021,volvo,Volvo XC40,black,07/2020,2020,37150,132,179,Automatic,Other,,40 km Reichweite,49800.0,XC40 T5 Recharge DKG RDesign Expression,
250585,250585,volvo,Volvo XC40,beige,12/2022,2022,54800,192,261,Automatic,Other,,43 km Reichweite,2.0,T5 Recharge DKG Plus Dark *5 Pakete+H&K+Leder*,
250587,250587,volvo,Volvo XC40,blue,12/2022,2022,54500,192,261,Automatic,Other,,43 km Reichweite,2.0,T5 Recharge DKG Plus Dark *5 Pakete+H&K+Leder*,


In [56]:
# Restrict to listings with fuel_type == 'Other'.
other_df = df[df['fuel_type'] == 'Other']
other_consumption_text = other_df['fuel_consumption_l_100km'].astype(str)

# Despite the count_ prefix, both names hold boolean masks after this cell.
count_l_per_100km_other = other_consumption_text.str.contains(r'l/100 km', na=False)
count_km_ort_other = other_consumption_text.str.contains(r'km \(Ort\)', na=False)

# Share (in percent) of 'Other' rows containing 'l/100 km'.
share_l_other = count_l_per_100km_other.sum() / len(other_df) * 100

# Rows matching neither of the two patterns.
mask_other = ~count_km_ort_other & ~count_l_per_100km_other
other_variations = other_df[mask_other]

# Remaining distinct values and their frequencies.
print(other_variations['fuel_consumption_l_100km'].value_counts())

Series([], Name: count, dtype: int64)


In [57]:
# Restrict to listings with fuel_type == 'Other'.
other_df = df[df['fuel_type'] == 'Other']
other_consumption_text = other_df['fuel_consumption_l_100km'].astype(str)

# Number of rows containing 'l/100 km' / 'km (Ort)'.
count_l_per_100km_other = other_consumption_text.str.contains(r'l/100 km', na=False).sum()
count_km_ort_other = other_consumption_text.str.contains(r'km \(Ort\)', na=False).sum()

# Shares in percent of all 'Other' rows.
share_l_other = count_l_per_100km_other / len(other_df) * 100
share_km_ort_other = count_km_ort_other / len(other_df) * 100

print(f"Anzahl 'Other' mit 'l/100 km' in fuel_consumption_l_100km: {count_l_per_100km_other} ({share_l_other:.2f}%)")
print(f"Anzahl 'Other' mit 'km (Ort)' in fuel_consumption_l_100km: {count_km_ort_other} ({share_km_ort_other:.2f}%)")

Anzahl 'Other' mit 'l/100 km' in fuel_consumption_l_100km: 80 (44.94%)
Anzahl 'Other' mit 'km (Ort)' in fuel_consumption_l_100km: 7 (3.93%)


### Unknown

In [58]:
# All listings whose fuel_type is 'Unknown'.
df.loc[df['fuel_type'] == 'Unknown']

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description,electric_range
20639,20639,audi,Audi A3,blue,05/2022,2022,33990,110,150,Automatic,Unknown,,- (g/km),18227.0,40 TFSi e Sportback S-tronic Navi Allwetter,
22921,22921,bentley,Bentley Continental GT,grey,04/2008,2008,44740,448,609,Automatic,Unknown,,- (g/km),67710.0,Speed 6.0 W12*TOP Fahrzeug*,
23196,23196,bentley,Bentley Continental GTC,black,01/2020,2020,217990,467,635,Automatic,Unknown,,- (g/km),54000.0,"W12 *First Edition, Mulliner*",
23442,23442,bmw,BMW 316,blue,02/1997,1997,9999,,,Manual,Unknown,,- (g/km),91191.0,*Garagenwagen*,
23615,23615,bmw,BMW Z3,silver,02/1998,1998,7990,85,116,Manual,Unknown,,- (g/km),184000.0,/Vollleder/Sitzheizung/Hardtop,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215531,215531,volkswagen,Volkswagen Beetle,blue,11/2004,2004,3999,75,102,Manual,Unknown,,- (g/km),141000.0,1.6-Cabrio-TÜV 06.2024-Garantie,
215868,215868,volkswagen,Volkswagen Golf,silver,02/2004,2004,2990,55,75,Manual,Unknown,,- (g/km),152987.0,V 1.4 /Klima/SitzH/El.Fen/ESP,
230781,230781,volkswagen,Volkswagen Crafter,silver,12/2015,2015,11290,100,136,Manual,Unknown,,- (g/km),350000.0,2.0 TDI 136 Ps 35 L3 H2 Hochdach,
237301,237301,volkswagen,Volkswagen Polo,silver,02/2018,2018,13850,55,75,Manual,Unknown,"4,7 l/100 km",111 g/km,86446.0,Comfortline 1.0 Fl?ssiggasumbau,


In [59]:
# Restrict to listings with fuel_type == 'Unknown'.
# NOTE: the *_other variable names are reused from the 'Other' cells and kept
# in case other cells read them; in this cell they hold 'Unknown' data.
other_df = df[df['fuel_type'] == 'Unknown']

# Number and share (in percent) of rows containing 'l/100 km'.
count_l_per_100km_other = other_df['fuel_consumption_l_100km'].astype(str).str.contains(r'l/100 km', na=False).sum()
share_l_other = count_l_per_100km_other / len(other_df) * 100

# Number and share (in percent) of rows containing 'km (Ort)'.
count_km_ort_other = other_df['fuel_consumption_l_100km'].astype(str).str.contains(r'km \(Ort\)', na=False).sum()
share_km_ort_other = count_km_ort_other / len(other_df) * 100

# Labels now name the fuel type that is actually analysed.
print(f"Anzahl 'Unknown' mit 'l/100 km' in fuel_consumption_l_100km: {count_l_per_100km_other} ({share_l_other:.2f}%)")
print(f"Anzahl 'Unknown' mit 'km (Ort)' in fuel_consumption_l_100km: {count_km_ort_other} ({share_km_ort_other:.2f}%)")

Anzahl 'Other' mit 'l/100 km' in fuel_consumption_l_100km: 22 (22.92%)
Anzahl 'Other' mit 'km (Ort)' in fuel_consumption_l_100km: 0 (0.00%)


In [60]:
# Restrict to listings with fuel_type == 'Unknown'
# (variable names are reused from the 'Other' cells).
other_df = df[df['fuel_type'] == 'Unknown']
unknown_consumption_text = other_df['fuel_consumption_l_100km'].astype(str)

# Despite the count_ prefix, both names hold boolean masks after this cell.
count_l_per_100km_other = unknown_consumption_text.str.contains(r'l/100 km', na=False)
count_km_ort_other = unknown_consumption_text.str.contains(r'km \(Ort\)', na=False)

# Share (in percent) of these rows containing 'l/100 km'.
share_l_other = count_l_per_100km_other.sum() / len(other_df) * 100

# Rows matching neither of the two patterns.
mask_other = ~count_km_ort_other & ~count_l_per_100km_other
other_variations = other_df[mask_other]

# Remaining distinct values and their frequencies.
print(other_variations['fuel_consumption_l_100km'].value_counts())

fuel_consumption_l_100km
3,8 kg/100 km    1
Name: count, dtype: int64


### Hydrogen

In [61]:
# All listings whose fuel_type is 'Hydrogen'.
df.loc[df['fuel_type'] == 'Hydrogen']

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description,electric_range
78514,78514,hyundai,Hyundai iX35,white,12/2015,2015,24900,100,136,Automatic,Hydrogen,,- (g/km),32000.0,Fuel Cell,
78850,78850,hyundai,Hyundai iX35,black,10/2016,2016,11980,100,136,Automatic,Hydrogen,,- (g/km),81000.0,diesel,
79141,79141,hyundai,Hyundai iX35,blue,12/2017,2017,23000,100,136,Automatic,Hydrogen,1 kg/100 km,0 g/km,28000.0,FCEV Wasserstoff ! JUNGER GEBRAUCHTER,
79174,79174,hyundai,Hyundai iX35,blue,12/2017,2017,26990,100,136,Automatic,Hydrogen,"6,8 kg/100 km",0 g/km,17390.0,Wasserstoff Fuel Cell Electro Vollausstattung,
79429,79429,hyundai,Hyundai iX35,silver,12/2017,2017,19990,100,136,Automatic,Hydrogen,,0 g/km,84908.0,Fuel Cell Wasserstoff PDC SHZ LEDER NAVI,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209990,209990,toyota,Toyota Mirai,white,03/2021,2021,53990,134,182,Automatic,Hydrogen,0 kg/100 km,0 g/km,13000.0,Advanced,
210079,210079,toyota,Toyota Mirai,white,09/2021,2021,73900,128,174,Automatic,Hydrogen,"0,9 kg/100 km",0 g/km,565.0,Advance - WASSERSTOFF -,
210084,210084,toyota,Toyota Mirai,white,04/2021,2021,63980,134,182,Automatic,Hydrogen,,0 g/km,26000.0,Advanced Wasserstoffantrieb,
210158,210158,toyota,Toyota Mirai,beige,11/2022,2022,35700,113,154,Automatic,Hydrogen,1 kg/100 km,0 g/km,20.0,FCEV Taxiausführung bis € 14280 Förderung,


In [66]:
# Restrict to listings with fuel_type == 'Hydrogen'.
# NOTE: variable names are reused from the 'Other' cells and kept in case other
# cells read them. In this cell count_l_per_100km_other / share_l_other refer
# to 'kg/100 km' and count_km_ort_other / share_km_ort_other to 'l/100 km'.
other_df = df[df['fuel_type'] == 'Hydrogen']

# Number and share (in percent) of rows containing 'kg/100 km'.
count_l_per_100km_other = other_df['fuel_consumption_l_100km'].astype(str).str.contains(r'kg/100 km', na=False).sum()
share_l_other = count_l_per_100km_other / len(other_df) * 100

# Number and share (in percent) of rows containing 'l/100 km'.
count_km_ort_other = other_df['fuel_consumption_l_100km'].astype(str).str.contains(r'l/100 km', na=False).sum()
share_km_ort_other = count_km_ort_other / len(other_df) * 100

# Labels now name the analysed fuel type and quote the patterns correctly.
print(f"Anzahl 'Hydrogen' mit 'kg/100 km' in fuel_consumption_l_100km: {count_l_per_100km_other} ({share_l_other:.2f}%)")
print(f"Anzahl 'Hydrogen' mit 'l/100 km' in fuel_consumption_l_100km: {count_km_ort_other} ({share_km_ort_other:.2f}%)")

Anzahl 'Other' mit 'kg/100 km in fuel_consumption_l_100km: 38 (46.34%)
Anzahl 'Other' mit 'l/100 km km' in fuel_consumption_l_100km: 0 (0.00%)


### Ethanol

In [67]:
# All listings whose fuel_type is 'Ethanol'.
df.loc[df['fuel_type'] == 'Ethanol']

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description,electric_range
23943,23943,bmw,BMW 325,blue,06/2001,2001,5350,141,192,Manual,Ethanol,"8,9 l/100 km",- (g/km),292000.0,325ti compact,
43265,43265,cadillac,Cadillac Escalade,black,09/2011,2011,25990,301,409,Automatic,Ethanol,,339 g/km,152300.0,6.2 V8 Platinum,
43273,43273,cadillac,Cadillac Escalade,black,03/2013,2013,28900,301,409,Automatic,Ethanol,,339 g/km,184500.0,6.2 V8 PLATINUM Aut. DEUTSCHES FAHRZEUG,
51527,51527,dodge,Dodge Grand Caravan,red,05/2019,2019,21999,211,287,Automatic,Ethanol,,0 g/km,83648.0,GT*FLEXFUEL E85 ETHANOL*7 SITZER*,
58400,58400,ford,Ford Focus,black,05/2006,2006,1999,92,125,Manual,Ethanol,,- (g/km),220000.0,Ford Focus turnier 2,
58513,58513,ford,Ford Focus,silver,06/2006,2006,4900,92,125,Manual,Ethanol,,- (g/km),260000.0,Turnier 1.8 (FFV) flexifuel Sport,
58807,58807,ford,Ford Focus,beige,09/2008,2008,3680,92,125,Manual,Ethanol,7 l/100 km,167 g/km,171000.0,1.8 (FFV) flexifuel Titanium,
59470,59470,ford,Ford C-Max,silver,01/2009,2009,4500,92,125,Manual,Ethanol,"7,1 l/100 km",- (g/km),178000.0,C-MAX 1.8 flexifuel Style+,
60797,60797,ford,Ford C-Max,white,03/2012,2012,9990,88,120,Manual,Ethanol,"6,6 l/100 km",154 g/km,81162.0,"1.6 Ti-VCT , Tempomat, Allwetterreifen, HU/AU neu",
177395,177395,saab,Saab 9-3,blue,06/2008,2008,4500,148,201,Manual,Ethanol,"8,3 l/100 km",184 g/km,304000.0,9-3 2.0t Bio Power Sport-Kombi Vector,


In [68]:
# Restrict to listings with fuel_type == 'Ethanol'.
# NOTE: variable names are reused from the 'Other' cells and kept in case other
# cells read them. In this cell count_l_per_100km_other / share_l_other refer
# to 'kg/100 km' and count_km_ort_other / share_km_ort_other to 'l/100 km'.
other_df = df[df['fuel_type'] == 'Ethanol']

# Number and share (in percent) of rows containing 'kg/100 km'.
count_l_per_100km_other = other_df['fuel_consumption_l_100km'].astype(str).str.contains(r'kg/100 km', na=False).sum()
share_l_other = count_l_per_100km_other / len(other_df) * 100

# Number and share (in percent) of rows containing 'l/100 km'.
count_km_ort_other = other_df['fuel_consumption_l_100km'].astype(str).str.contains(r'l/100 km', na=False).sum()
share_km_ort_other = count_km_ort_other / len(other_df) * 100

# Labels now name the analysed fuel type and quote the patterns correctly.
print(f"Anzahl 'Ethanol' mit 'kg/100 km' in fuel_consumption_l_100km: {count_l_per_100km_other} ({share_l_other:.2f}%)")
print(f"Anzahl 'Ethanol' mit 'l/100 km' in fuel_consumption_l_100km: {count_km_ort_other} ({share_km_ort_other:.2f}%)")

Anzahl 'Other' mit 'kg/100 km in fuel_consumption_l_100km: 0 (0.00%)
Anzahl 'Other' mit 'l/100 km km' in fuel_consumption_l_100km: 5 (50.00%)
