# Data preprocessing

The raw data has been obtained from the Nettiauto API.

After processing, the dataset is exported to a CSV file. The notebook `nettiauto_model.ipynb` uses this CSV file to train XGBoost model.

Running this notebook creates these files:

- `nettiauto_dataset.csv`

In [63]:
ACCESSORIES_TO_COLUMN_NAME = {
    'Airconditioning: Automatic': 'airconditioning',
    'Parking sensors': 'parking_sensors',
    'Satellite navigator': 'satellite_navigator',
    'Cruise control: Adaptive': 'cruise_control_adaptive',
    'Parking camera: Simple camera': 'parking_camera_simple_camera',
    'Seat heaters': 'seat_heaters',
    'Driving assistant': 'driving_assistant', # ei mobile de poistetaan
    'Alloy wheels': 'alloy_wheels',
    'Electric mirrors': 'electric_mirrors',
    'Lane departure warning system': 'lane_departure_warning_system',
    'Electrically operated tailgate': 'electrically_operated_tailgate',
    'Leather upholstery': 'leather_upholstery',
    'Emergency brake assist': 'emergency_brake_assist',
    'Collision avoidance system': 'collision_avoidance_system',
    'Cruise control: Traditional': 'cruise_control_traditional',
    'Parking Assistant': 'parking_assistant',
    'Tow bar': 'tow_bar',
    'Electric seats: With memory': 'electric_seats_with_memory',
    'Heated steering wheel': 'heated_steering_wheel',
    'Sunroof': 'sunroof',
    'Sun hatch: With panorama': 'sun_hatch_with_panorama',
    'Adaptive headlights': 'adaptive_headlights',
    'Sport seats': 'sport_seats',
    'Fuel / battery powered heater': 'fuel_battery_powered_heater',
    'Parking camera: 360-degree camera': 'parking_camera_360-degree_camera',
    'Battery preheating': 'battery_preheating',
    'Electric seats: Without memory': 'electric_seats_without_memory',
    'Air suspension': 'air_suspension',
    'Curve lights': 'curve_lights',
    'Head-Up display': 'head_up_display',
    'Sport base': 'sport_base',
}

DRIVE_TYPE = {'Four wheel': '4wd', 'Front wheel': 'fwd', 'Rear wheel': 'rwd', 'Not available': None}

fields_to_pick = [
    'make',
    'model',
    'modelTypeName',
    'color',
    'driveType',
    'price',
    'totalOwners',
    'kilometers',
    'seats',
    'power',
    'batteryCapacity',
    'electricRange',
]

columns = [
    *fields_to_pick,
    'age',
    'isSuv',
    'metallicColor',
    *ACCESSORIES_TO_COLUMN_NAME.values(),
]

In [64]:
import re

zero_times_s = 'Pro Performance 1ST 150 kW'
one_times_s = 'Pro Performance 1ST 150 kW, akku 77 kWh'
two_times_s = 'Pro Performance 1ST 150 kW, akku 77 kWh Pro Performance 1ST 150 kW, akku 88 kWh'
one_time_no_space = 'Pro Performance 1ST 150 kW, akku 77kWh Pro Performance 1ST 150 kW'
one_time_no_space_with_delimitter = 'Pro Performance 1ST 150 kW, akku 77,8kWh Pro Performance 1ST 150 kW'
one_time_no_space_with_delimitter2 = 'Pro Performance 1ST 150 kW, akku 77.8kWh Pro Performance 1ST 150 kW'



battery_regex = r'(\d{2,3}([.,]+\d)*\s*kWh)'

def parse_battery_capacity_from_free_text(free_text):
    matches = re.findall(battery_regex, free_text, re.IGNORECASE)
    if not len(matches):
        return 0
    return sorted([float(s[0].lower().replace(',', '.').replace('kwh', '').split()[0]) for s in matches], reverse=True)[0]

print(parse_battery_capacity_from_free_text(zero_times_s))
print(parse_battery_capacity_from_free_text(one_times_s))
print(parse_battery_capacity_from_free_text(two_times_s))
print(parse_battery_capacity_from_free_text(one_time_no_space))
print(parse_battery_capacity_from_free_text(one_time_no_space_with_delimitter))
print(parse_battery_capacity_from_free_text(one_time_no_space_with_delimitter2))

0
77.0
88.0
77.0
77.8
77.8


In [65]:
CURRENT_YEAR = 2023
CURRENT_MONTH = 10

def get_age(year, reg_y, reg_m):
    if reg_y and reg_m:
        return CURRENT_YEAR - reg_y + (CURRENT_MONTH - reg_m) / 12
        
        
    return CURRENT_YEAR - year

In [66]:
import os
import json
import pandas as pd
pd.set_option('display.max_columns', None)


QUERY_RESULTS_PATH = './query_results'



def open_nettiauto_file(file_name):
    print(file_name)
    with open(f'{QUERY_RESULTS_PATH}/{file_name}', 'r') as f:
        data = json.load(f)
    return data

def map_nettiauto_entry_to_row(item):
    try:
        acc = item['accessories'] if item['accessories'] else []
        acc_set = set([ ACCESSORIES_TO_COLUMN_NAME[row['en']] for row in acc if row['en'] in ACCESSORIES_TO_COLUMN_NAME])


        result_without_accessories = {key: item[key] for key in fields_to_pick if key in item}
        result_without_accessories['isSuv'] = item['bodyType']['id'] == 5
        result_without_accessories['make'] = result_without_accessories['make']['name'].lower()
        result_without_accessories['model'] = result_without_accessories['model']['name'].lower()
        result_without_accessories['color'] = result_without_accessories['color']['en'].lower()
        result_without_accessories['metallicColor'] = item['colorType']['en'].lower() == 'metallic'
        result_without_accessories['driveType'] = DRIVE_TYPE[result_without_accessories['driveType']['en']]
        result_without_accessories['age'] = get_age(item['year'], item['firstRegistrationYear'], item['firstRegistrationMonth'])
        
        # It's ugly but does its job. I was able to get the missing batteryCapacity from 2109 to 1219
        if not result_without_accessories['batteryCapacity'] and 'modelTypeName' in item and item['modelTypeName']:
            result_without_accessories['batteryCapacity'] = parse_battery_capacity_from_free_text(item['modelTypeName'])
        if not result_without_accessories['batteryCapacity'] and 'description' in item and item['description']:
            result_without_accessories['batteryCapacity'] = parse_battery_capacity_from_free_text(item['description'])
        if not result_without_accessories['modelTypeName']:
            result_without_accessories['modelTypeName'] = ''

        return {
            **result_without_accessories,
            **{key: key in acc_set for key in ACCESSORIES_TO_COLUMN_NAME.values()},
        }
    except Exception as e:
        print(item['id'])
        raise e


dfs = []

for file_name in sorted(os.listdir(QUERY_RESULTS_PATH)):
    results = map(map_nettiauto_entry_to_row, open_nettiauto_file(file_name))
    dfs.append(pd.DataFrame(data=results, columns=columns))

    
df = pd.concat(dfs)
df.head()


2021-01_response_1696071482757.json
2021-02_response_1696071737045.json
2021-03_response_1696072016857.json
2021-04_response_1696072267024.json
2021-05_response_1696072588298.json
2021-06_response_1696072850927.json
2021-07_response_1696073239785.json
2021-08_response_1696073507847.json
2021-09_response_1696073655890.json
2022-01_response_1696074468991.json
2022-02_response_1696074615821.json
2022-03_response_1696074839641.json
2022-04_response_1696075021170.json
2022-05_response_1696075239764.json
2022-07_response_1696075876798.json
2022-07_response_1696079148457.json
2022-08_response_1696076147323.json
2022-09_response_1696076364847.json
2022-10_response_1696076596294.json
2022-11_response_1696076798891.json
2022-12_response_1696076991892.json
2022-13_response_1696077143377.json
2022-14_response_1696077274819.json
2022_06_response_1696075611367.json
2023-01_response_1696078129488.json
2023-02_response_1696078250632.json
2023-03_response_1696078413236.json
2023-04_response_16960785579

Unnamed: 0,make,model,modelTypeName,color,driveType,price,totalOwners,kilometers,seats,power,batteryCapacity,electricRange,age,isSuv,metallicColor,airconditioning,parking_sensors,satellite_navigator,cruise_control_adaptive,parking_camera_simple_camera,seat_heaters,driving_assistant,alloy_wheels,electric_mirrors,lane_departure_warning_system,electrically_operated_tailgate,leather_upholstery,emergency_brake_assist,collision_avoidance_system,cruise_control_traditional,parking_assistant,tow_bar,electric_seats_with_memory,heated_steering_wheel,sunroof,sun_hatch_with_panorama,adaptive_headlights,sport_seats,fuel_battery_powered_heater,parking_camera_360-degree_camera,battery_preheating,electric_seats_without_memory,air_suspension,curve_lights,head_up_display,sport_base
0,porsche,taycan,Taycan sähköauto Heti toimitukseen!! rahoitus ...,black,rwd,101880.0,1.0,2100.0,5.0,300.0,79.0,400.0,2.416667,False,False,True,True,True,False,False,True,False,True,True,False,True,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True
1,opel,zafira-e,"Life L Comfort 136 automaatti 50 **Täyssähkö, ...",other,fwd,47900.0,,2500.0,,100.0,0.0,,2.583333,False,False,True,False,False,False,False,True,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,mercedes-benz,eqs,"580 4Matic ** Muistipenkit, Burmester, nelipyö...",gray,4wd,125900.0,,30000.0,5.0,385.0,108.4,702.0,2.0,False,False,True,True,True,True,False,True,True,True,True,True,True,True,False,True,False,True,False,True,False,False,True,True,True,True,True,True,False,True,False,False,False
3,nissan,leaf,e+ N-Connecta MY21 62 kWh *Isoakkunen Huippuva...,black,fwd,35800.0,,10000.0,5.0,160.0,62.0,,2.0,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,citroen,e-c4,"Full Electric 136 Shine 50 kWh, KORKOTARJOUS 3...",blue,fwd,33800.0,,17000.0,5.0,100.0,50.0,,2.0,False,False,True,True,True,False,True,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False


In [67]:
def update_capacity(df, make, model, capacity, re=".*", reverse=False):
    m = ((df["make"] == make)
        & (df["model"] == model))
    if reverse:
        m = (m & ~((df["modelTypeName"].str.contains(re, case=False, regex=True) == True).values))
    else:
        m = (m & ((df["modelTypeName"].str.contains(re, case=False, regex=True) == True).values))

    df.loc[m,"batteryCapacity"] = capacity
    print(f"Updated {m.sum():3} cars with parameters:\t{make:15}\t{model:10}\t{capacity:4}\t{re}")
    
def update_power(make, model, power, re=""):
    m = ((df["make"] == make)
            & (df["model"] == model)
            & (df["power"] <= 50)
            & ((df["modelTypeName"].str.contains(re, case=False, regex=True) == True).values))
    df.loc[m,"power"] = power
    print(f"Updated {m.sum():3} cars with parameters:\t{make:15}\t{model:10}\t{power:4}\t{re}")

def get_missing_df(df, make, model=''):
        if model:
                return df[(df['batteryCapacity'] == 0) & (df['make'] == make) & (df['model'] == model)][['make', 'model', 'modelTypeName', 'power', 'kilometers', 'electricRange', 'driveType']]
        return df[(df['batteryCapacity'] == 0) & (df['make'] == make)][['make', 'model', 'modelTypeName', 'power', 'kilometers', 'electricRange', 'driveType']]

def print_info(df, make, model, re=".*"):
        mask = (df['make'] == make) & (df['model'] == model) & (df['modelTypeName'].str.contains(re, regex=True))
        mercedes = df[mask]
        print(mercedes['batteryCapacity'].value_counts())
        print(mercedes[['modelTypeName', 'batteryCapacity']])
        
def print_missing_for_make(df, make):
        no_battery_df = df[df['batteryCapacity'] == 0]
        mercedes_no_battery = no_battery_df[no_battery_df['make'] == make]['model']
        print(mercedes_no_battery.value_counts())
        
def print_make_missing_battery(df):
    print(df[df['batteryCapacity'] == 0]['make'].value_counts())

In [68]:
update_power("kia", "ev6", 125, "standard\s*range")
update_power("kia", "ev6", 168, "long\s*range\s*AWD")
update_power("kia", "ev6", 239, "long\s*range")
update_power("kia", "ev6", 430, "gt")
update_power("kia", "niro", 150)


update_power("skoda", "enyaq", 109, "50")
update_power("skoda", "enyaq", 132, "60")
update_power("skoda", "enyaq", 195, "80X")
update_power("skoda", "enyaq", 150, "80")
update_power("skoda", "enyaq", 225, "RS")

update_power("mercedes-benz", "eqe", 180, "300")
update_power("mercedes-benz", "eqe", 215, "350")
update_power("mercedes-benz", "eqe", 300, "500")
update_power("mercedes-benz", "eqe", 350, "43")
update_power("mercedes-benz", "eqe", (460+505)/2, "53")

update_power("mercedes-benz", "eqb", 140, "250")
update_power("mercedes-benz", "eqb", 168, "300")
update_power("mercedes-benz", "eqb", 215, "350")

update_power("mercedes-benz", "eqa", 140, "250")
update_power("mercedes-benz", "eqa", 168, "300")
update_power("mercedes-benz", "eqa", 215, "350")

update_power("nissan", "leaf", 80)

update_power("peugeot", "e-2008", 100)
update_power("peugeot", "e-208", 100)

update_power("toyota", "bz4x", (150+160)/2)

Updated   0 cars with parameters:	kia            	ev6       	 125	standard\s*range
Updated   0 cars with parameters:	kia            	ev6       	 168	long\s*range\s*AWD
Updated   0 cars with parameters:	kia            	ev6       	 239	long\s*range
Updated  10 cars with parameters:	kia            	ev6       	 430	gt
Updated  21 cars with parameters:	kia            	niro      	 150	
Updated   0 cars with parameters:	skoda          	enyaq     	 109	50
Updated  10 cars with parameters:	skoda          	enyaq     	 132	60
Updated   4 cars with parameters:	skoda          	enyaq     	 195	80X
Updated  11 cars with parameters:	skoda          	enyaq     	 150	80
Updated   0 cars with parameters:	skoda          	enyaq     	 225	RS
Updated   3 cars with parameters:	mercedes-benz  	eqe       	 180	300
Updated   5 cars with parameters:	mercedes-benz  	eqe       	 215	350
Updated   1 cars with parameters:	mercedes-benz  	eqe       	 300	500
Updated   0 cars with parameters:	mercedes-benz  	eqe       	

In [69]:
update_capacity(df, "toyota", "bz4x", 64) 

no_battery_df = df[df['batteryCapacity'] == 0]
toyotas_no_battery = no_battery_df[no_battery_df['make'] == 'toyota']['model']
toyotas_no_battery.value_counts()
get_missing_df(df, 'toyota')
print_missing_for_make(df, 'toyota')

Updated  51 cars with parameters:	toyota         	bz4x      	  64	.*
Series([], Name: count, dtype: int64)


In [70]:
mercedes = df[(df['make'] == 'mercedes-benz') & (df['model'] == 'eqe')]
print(mercedes['batteryCapacity'].value_counts())
mercedes['modelTypeName']

batteryCapacity
0.0      77
90.0     25
89.0     24
90.6     12
100.0     7
91.0      3
98.0      2
88.1      1
56.0      1
Name: count, dtype: int64


16    350+ #Suomi-auto  #Panoraamalasikatto #Premium...
17    Mercedes-AMG EQE 43 4MATIC #Suomi-auto #Premiu...
18    350+ #AMG Line #Suomi-auto #Premium #Digital L...
19                      350+ *** Korko alk. 1,95% !!! *
43    350+ #Suomi-auto #Premium #AMG-ulko-/sisä #AMG...
                            ...                        
89    500 4MATIC SUV ** Esittelyauto nopeaan toimitu...
92                                           350 4MATIC
97    350 4matic ** Esittelyauto nopeaan toimituksee...
7                                        350 4MATIC SUV
14    500 4MATIC SUV #Heti toimitus #AMG-Line #Nelip...
Name: modelTypeName, Length: 152, dtype: object

In [71]:
update_capacity(df, "mercedes-benz", "eqe", 89.0, r'(350|300)\s') 
update_capacity(df, "mercedes-benz", "eqe", 90.6, r'(350\+|500|43)\s') 

# let's say rest if 89. I think it could be the best guesss. usually they advertise the better model
m = ((df["make"] == 'mercedes-benz') & (df["model"] == 'eqe') & (df['batteryCapacity'] == 0))
df.loc[m,"batteryCapacity"] = 89

get_missing_df(df, "mercedes-benz", 'eqe')



Updated  67 cars with parameters:	mercedes-benz  	eqe       	89.0	(350|300)\s
Updated  64 cars with parameters:	mercedes-benz  	eqe       	90.6	(350\+|500|43)\s


  m = (m & ((df["modelTypeName"].str.contains(re, case=False, regex=True) == True).values))


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [72]:
update_capacity(df, "mercedes-benz", "eqs", 108.4, r'(450|450\+|580|53)\s') 
update_capacity(df, "mercedes-benz", "eqs", 90.6, r'(350)\s') 

m = ((df["make"] == 'mercedes-benz') & (df["model"] == 'eqs') & (df['batteryCapacity'] == 0) & (df['power'] > 216))
df.loc[m,"batteryCapacity"] = 108.4

m = ((df["make"] == 'mercedes-benz') & (df["model"] == 'eqs') & (df['batteryCapacity'] == 0))
df.loc[m,"batteryCapacity"] = 90.6

get_missing_df(df, "mercedes-benz", 'eqs')

Updated  69 cars with parameters:	mercedes-benz  	eqs       	108.4	(450|450\+|580|53)\s
Updated   8 cars with parameters:	mercedes-benz  	eqs       	90.6	(350)\s


  m = (m & ((df["modelTypeName"].str.contains(re, case=False, regex=True) == True).values))


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [73]:
# https://www.mercedes-benz.fi/content/dam/finland/passengercars/NEW-WLTP-pricelists/EQA/EQA_10.5.2023_1.pdf
print_info(df, 'mercedes-benz', 'eqa', r'\d{3}\+')
update_capacity(df, "mercedes-benz", "eqa", 70.5, r'\d{3}\+') 
update_capacity(df, "mercedes-benz", "eqa", 66.5, r'\d{3}') 
get_missing_df(df, "mercedes-benz", 'eqa')

batteryCapacity
66.0     1
100.0    1
70.0     1
0.0      1
Name: count, dtype: int64
                                        modelTypeName  batteryCapacity
63  250+ suuremmalla akulla Advanced 529km (WLTP) ...             66.0
15  250+ Business / Koukku / AMG Line / EQ-Navigoi...            100.0
26   250+ Business **Distronic, MBUX-inovaatiopaketti             70.0
17                                      250+ Business              0.0
Updated   4 cars with parameters:	mercedes-benz  	eqa       	70.5	\d{3}\+
Updated  73 cars with parameters:	mercedes-benz  	eqa       	66.5	\d{3}


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType
72,mercedes-benz,eqa,* Amg paketoitu ( ! ) *,,29000.0,,fwd
98,mercedes-benz,eqa,,,,,fwd
49,mercedes-benz,eqa,,140.0,23000.0,420.0,fwd
56,mercedes-benz,eqa,,168.0,7000.0,,4wd
72,mercedes-benz,eqa,,,3000.0,,4wd
4,mercedes-benz,eqa,,5.0,,,fwd


In [74]:
# https://www.mercedes-benz.fi/content/dam/finland/passengercars/NEW-WLTP-pricelists/EQA/EQA_10.5.2023_1.pdf
print_info(df, 'mercedes-benz', 'eqb', r'\d{3}\+')
update_capacity(df, "mercedes-benz", "eqb", 70.5, r'\d{3}\+') 
update_capacity(df, "mercedes-benz", "eqb", 66.5, r'\d{3}') 
get_missing_df(df, "mercedes-benz", 'eqb')

Series([], Name: count, dtype: int64)
Empty DataFrame
Columns: [modelTypeName, batteryCapacity]
Index: []
Updated   0 cars with parameters:	mercedes-benz  	eqb       	70.5	\d{3}\+
Updated  78 cars with parameters:	mercedes-benz  	eqb       	66.5	\d{3}


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType
50,mercedes-benz,eqb,,,,,4wd
62,mercedes-benz,eqb,,168.0,8090.0,,4wd
75,mercedes-benz,eqb,,140.0,,,fwd
99,mercedes-benz,eqb,,5.0,6000.0,,4wd
22,mercedes-benz,eqb,,,,,4wd
41,mercedes-benz,eqb,,168.0,3000.0,,4wd
85,mercedes-benz,eqb,,5.0,,,4wd


In [75]:
print_info(df, 'mercedes-benz', 'eqc')
update_capacity(df, "mercedes-benz", "eqc", 80) 
get_missing_df(df, "mercedes-benz", 'eqc')

batteryCapacity
80.0    25
0.0     23
82.0     1
Name: count, dtype: int64
                                        modelTypeName  batteryCapacity
15                                         400 4Matic              0.0
41  400 4Matic AMG Line, Head up !! Distronic !! 3...              0.0
74                 takuu 11/23, Rahoitus alkaen 3,99%             80.0
16         400 AMG 4Matic // ALV / Huippuvarusteet //             80.0
34  400 4Matic AMG / Nahkaverhoilu / Keyless Go / ...              0.0
95                                                                 0.0
0   AMG 400 4Matic / Designo sisusta ja ulkoväri!!...              0.0
34  400 4Matic * AMG Edition, Distronic+, 360 kame...             80.0
7   400 4MATIC AMG / DTR+ / Koukku / WideScreen / ...             80.0
11  400 4Matic AMG-Line*Night Desing*Burmester / D...              0.0
26  400 4MATIC, WLTP 472km, Tunnelmavalaistus, 360...             80.0
28  400 4Matic AMG **Distonic plus / Multibeam LED...             80.0
33

Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [76]:
print_missing_for_make(df, 'mercedes-benz')

model
eqs suv    8
eqb        7
eqa        6
eqv        3
vito       1
Name: count, dtype: int64


In [77]:
print_info(df, 'bmw', 'ix')
update_capacity(df, "bmw", "ix", 76.6, r'xDrive\s*40')
update_capacity(df, "bmw", 'ix', 111.5, r'xDrive\s*40', True)
get_missing_df(df, "bmw", 'ix')


batteryCapacity
0.0      33
71.0      5
111.0     4
76.6      3
77.0      3
76.0      3
111.5     2
105.0     1
Name: count, dtype: int64
                                        modelTypeName  batteryCapacity
92  xDrive40 // BMW Premium Selection -takuu 24kk/...              0.0
20  xDrive40 Fully Charged / Adapt.cruise / HUD / ...             76.6
28  xDrive40 *SUPERVARUSTEET! 22"/LASER/HUD/PANORA...              0.0
88  xDrive40 Fully Charged, WLTP 372km, Harman/Kar...              0.0
55  xDrive50 //H&K/Laser/Ilmajousitus/Nelipyöräohj...              0.0
71  xDrive40 Fully Charged 11CF *** Korko alk. 1,9...              0.0
88  xDrive40, WLTP 372km, Sähkösäätöiset Etuistuim...             71.0
15  xDrive40 Fully Charged *ESITTELYAUTO* Hinta uu...              0.0
42  xDrive40 Fully Charged // Huippuvarusteet  **B...              0.0
44   xDrive50 Signature *** Tämä auto nyt etuhintaan!              0.0
62        xDrive40 *** BMW Rahoitusetu 2,90% (+kulut)              0.0
63  Akku 7

Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [78]:
print_info(df, 'bmw', 'ix')
update_capacity(df, "bmw", "ix", 76.6, r'xDrive\s*40')
update_capacity(df, "bmw", 'ix', 111.5, r'xDrive\s*40', True)
get_missing_df(df, "bmw", 'ix')

batteryCapacity
76.6     31
111.5    23
Name: count, dtype: int64
                                        modelTypeName  batteryCapacity
92  xDrive40 // BMW Premium Selection -takuu 24kk/...             76.6
20  xDrive40 Fully Charged / Adapt.cruise / HUD / ...             76.6
28  xDrive40 *SUPERVARUSTEET! 22"/LASER/HUD/PANORA...             76.6
88  xDrive40 Fully Charged, WLTP 372km, Harman/Kar...             76.6
55  xDrive50 //H&K/Laser/Ilmajousitus/Nelipyöräohj...            111.5
71  xDrive40 Fully Charged 11CF *** Korko alk. 1,9...             76.6
88  xDrive40, WLTP 372km, Sähkösäätöiset Etuistuim...             76.6
15  xDrive40 Fully Charged *ESITTELYAUTO* Hinta uu...             76.6
42  xDrive40 Fully Charged // Huippuvarusteet  **B...             76.6
44   xDrive50 Signature *** Tämä auto nyt etuhintaan!            111.5
62        xDrive40 *** BMW Rahoitusetu 2,90% (+kulut)             76.6
63  Akku 77kWh xDrive40 Fully Charged Aut. 326hv |...             76.6
22  1-om, M

Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [79]:
print_info(df, 'bmw', 'ix1')
update_capacity(df, "bmw", "ix1", 66.5)
get_missing_df(df, "bmw", 'ix1')

batteryCapacity
0.0     33
68.0    13
64.0     2
64.7     1
64.8     1
66.0     1
67.0     1
Name: count, dtype: int64
                                        modelTypeName  batteryCapacity
2            xDrive30 61EF *** Korko alk. 1,95% !!! *              0.0
5            xDrive30 61EF *** Korko alk. 1,95% !!! *              0.0
66  U11 30 xDrive Charged Edition - *Heti toimituk...              0.0
73                              M Sport U11 30 xDrive             64.0
50  U11 30 xDrive M-Sport - Adaptiviinen vakionope...             64.7
43  30 xDrive, WLTP 440km, M-Sport, Kamera, Drivin...              0.0
74  30e xDrive /Driving assistant plus /Harman Kar...              0.0
87           xDrive30 61EF *** Korko alk. 1,95% !!! *              0.0
81           xDrive30 61EF *** Korko alk. 1,95% !!! *              0.0
63  U11 30 xDrive (M Sport) *** BMW Rahoitusetu 2,...              0.0
74  U11 30 xDrive Charged Edition *** Tämä auto ny...              0.0
2        U11 30 xDrive xLine,

Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [80]:
print_info(df, 'bmw', 'i4')
update_capacity(df, "bmw", "i4", 83.9)
get_missing_df(df, "bmw", 'i4')

batteryCapacity
83.0    32
0.0     29
81.0     5
83.9     5
84.0     4
80.0     2
80.7     1
Name: count, dtype: int64
                                        modelTypeName  batteryCapacity
7                   eDrive40 M-Sport *HUD*ACC*Kamera*             81.0
18  eDrive 40 M-Sport, WLTP 590km, Adaptiivinen Va...             81.0
75  eDrive40 M Sport Pro #Jatkotakuu #ACC #Harman/...             84.0
8   M-Sport / Aktiivinen vakionopeudensäädin / Per...             84.0
46  eDrive40 M-Sport * Takuu 02/2024 / Harman Kard...             84.0
..                                                ...              ...
58  M50 Super Charged *** BMW Rahoitusetu 2,90% (+...             83.0
77                                                M50             83.0
78        eDrive40 *** BMW Rahoitusetu 2,90% (+kulut)             83.0
80  M50 // Hinta uutena ~ 78 510€ // Luovutus 12 /...             83.0
90                                   eDrive40 Charged              0.0

[78 rows x 2 columns]
Update

Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [81]:
print_info(df, 'bmw', 'i4 m50')
update_capacity(df, "bmw", "i4 m50", 83.9)
get_missing_df(df, "bmw", 'i4 m50')

batteryCapacity
0.0     23
83.0    13
83.9     7
84.0     4
81.0     4
82.0     1
Name: count, dtype: int64
                                        modelTypeName  batteryCapacity
9   M50 // Driving assistant+/ Laser/ Comfort acce...             83.9
77  xDrive - M-Sport, Adaptiivinen alusta - sähkös...             83.0
80  Laserajovalot, Driving assistant+, Hifi, HUD, ...              0.0
87  G26 M50 * Korko 4,99% / ACC / Harman Kardon / ...             84.0
41                                             M50 i4              0.0
62  i4 M50 - Korkokampanja 3.99%+kulut - Suomiauto...              0.0
99  M50 - *Korko 3,99%+kulut + Kotiintoimitus alka...              0.0
80  X-Drive - M-Sport -Carbon -pack - 20” Multispo...             83.0
83  ** Vetokoukku / HIFI / BMW Live Cockpit Plus /...             83.0
14                               M50 i4 Fully Charged              0.0
76  Huippu varusteet! BMW Individual metalliväri, ...             84.0
35  M50 // Takuu / ACC / Kamera / M Spor

Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [82]:
update_capacity(df, "bmw", "ix3", 80.0)
update_capacity(df, "bmw", "i7", 101.7)
update_capacity(df, "bmw", "ix m50", 111.5)

print_missing_for_make(df, 'bmw')

Updated  39 cars with parameters:	bmw            	ix3       	80.0	.*
Updated   9 cars with parameters:	bmw            	i7        	101.7	.*
Updated   0 cars with parameters:	bmw            	ix m50    	111.5	.*
model
i3        4
i3s       3
ix m60    3
Name: count, dtype: int64


In [83]:
update_capacity(df, "tesla", "model 3", (54+55)/2, "standard[-|\s]*range") 
update_capacity(df, "tesla", "model 3", ((78+79.5)/2), "long[\s|-]range|performance")
get_missing_df(df, "tesla", "model 3")


Updated  37 cars with parameters:	tesla          	model 3   	54.5	standard[-|\s]*range
Updated  90 cars with parameters:	tesla          	model 3   	78.75	long[\s|-]range|performance


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType
23,tesla,model 3,",Refresh, Eap, ilp, Chrome delete, 20""Uber tur...",377.0,25000.0,,4wd
25,tesla,model 3,*Tummennukset*Ilp*,324.0,48000.0,,4wd
37,tesla,model 3,,340.0,20000.0,,4wd
68,tesla,model 3,"ILP, Sähk. takaluukku, Panoramakatto, Autopilo...",211.0,35000.0,,rwd
35,tesla,model 3,*Vaihto *Rahoitus *Takuu,340.0,28991.0,,4wd
87,tesla,model 3,"Facelift, Tehdastakuu, Autopilot, KW alusta, K...",513.0,36000.0,,4wd
29,tesla,model 3,,239.0,28000.0,,rwd
39,tesla,model 3,SR+ / Autopilot / Premium Audio / ILP / Panor...,,123000.0,,rwd
55,tesla,model 3,,,21340.0,,4wd
56,tesla,model 3,,510.0,45285.0,,4wd


In [84]:
update_capacity(df, "tesla", "model y", 78.1, "standard[-|\s]*range")
update_capacity(df, "tesla", "model y", 78.1, "long[\s|-]range|performance")
get_missing_df(df, "tesla", "model y")

Updated   0 cars with parameters:	tesla          	model y   	78.1	standard[-|\s]*range
Updated 111 cars with parameters:	tesla          	model y   	78.1	long[\s|-]range|performance


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType
66,tesla,model y,,,58000.0,,4wd
29,tesla,model y,Model Y Monikäyttöajoneuvo (AF) 5ov - MODEL Y ...,5.0,22000.0,,4wd
99,tesla,model y,Prem. Audio & Nahat / ilmalämpöpumppu/autopilotti,258.0,32100.0,,4wd
0,tesla,model y,"* Valkoinen * 20"" Induction vanteet * AMD Ryze...",258.0,9300.0,,4wd
8,tesla,model y,* Vetokoukku * AMD ryzen * ilmalämpöpumppu *,340.0,18000.0,,4wd
75,tesla,model y,Rear-Wheel Drive RWD Aut.| Vetokoukku | AMD Ry...,,26400.0,,rwd
92,tesla,model y,"Parannettu Autopilot, Teslan irrotettava Vetok...",514.0,49000.0,,4wd
48,tesla,model y,,534.0,11000.0,,4wd
62,tesla,model y,VAIHTO / RAHOITUS!!,336.0,26000.0,520.0,4wd
70,tesla,model y,"Vetokoukku, Midnight Cherry Red",336.0,5900.0,,4wd


In [85]:
print_missing_for_make(df, 'tesla')

model
model 3    19
model y    13
model s     3
model x     1
Name: count, dtype: int64


In [86]:
# https://www.media.volvocars.com/global/en-gb/models/xc40-recharge/2023/specifications
# https://www.media.volvocars.com/global/en-gb/models/c40-recharge/2023/specifications
update_capacity(df, "volvo", "xc40", 78, "recharge\s*twin")
update_capacity(df, "volvo", "xc40", 69, "recharge")
update_capacity(df, "volvo", "c40", 78, "recharge\s*twin")
update_capacity(df, "volvo", "c40", 69, "recharge")

print_missing_for_make(df, 'volvo')

Updated  28 cars with parameters:	volvo          	xc40      	  78	recharge\s*twin
Updated  68 cars with parameters:	volvo          	xc40      	  69	recharge
Updated  57 cars with parameters:	volvo          	c40       	  78	recharge\s*twin
Updated  77 cars with parameters:	volvo          	c40       	  69	recharge
model
xc40    7
c40     4
Name: count, dtype: int64


In [87]:
# https://ev-database.org/car/1586/Opel-Mokka-e 50
# https://ev-database.org/uk/car/1585/Vauxhall-Corsa-e 50
update_capacity(df, "opel", "mokka-e", 50)
update_capacity(df, "opel", "corsa-e", 50)

print_missing_for_make(df, 'opel')

Updated  77 cars with parameters:	opel           	mokka-e   	  50	.*
Updated  28 cars with parameters:	opel           	corsa-e   	  50	.*
model
mokka       4
zafira-e    1
Name: count, dtype: int64


In [88]:
# https://en.wikipedia.org/wiki/Audi_Q4_e-tron
# https://ev-database.org/car/1355/Audi-e-tron-55-quattro 95
# https://en.wikipedia.org/wiki/Audi_Q8_e-tron
# https://ev-database.org/car/1420/Audi-e-tron-GT-quattro 93.4
# https://ev-database.org/car/1153/Audi-e-tron-GT-RS 93.4
update_capacity(df, "audi", "q4 e-tron", 55, "35")
update_capacity(df, "audi", "q4 e-tron", 82, "40|45|50")
update_capacity(df, "audi", "e-tron", 95, "55.*quattro")
update_capacity(df, "audi", "e-tron", 71, "50.*(q|g)uattro")
update_capacity(df, "audi", "e-tron", 95, "Sportback 55")
update_capacity(df, "audi", "e-tron", 95, "Sportback S")
update_capacity(df, "audi", "q8 e-tron", 95, "55.*quattro")
update_capacity(df, "audi", "q8 e-tron", 71, "50.*quattro")
update_capacity(df, "audi", "q8 e-tron", 95, "Sportback 55")
update_capacity(df, "audi", "q8 e-tron", 71, "Sportback 50")
update_capacity(df, "audi", "e-tron gt", 93.4)
update_capacity(df, "audi", "e-tron gt rs", 93.4)

print_missing_for_make(df, 'audi')

Updated  18 cars with parameters:	audi           	q4 e-tron 	  55	35
Updated  98 cars with parameters:	audi           	q4 e-tron 	  82	40|45|50
Updated  28 cars with parameters:	audi           	e-tron    	  95	55.*quattro
Updated  15 cars with parameters:	audi           	e-tron    	  71	50.*(q|g)uattro
Updated   4 cars with parameters:	audi           	e-tron    	  95	Sportback 55
Updated  10 cars with parameters:	audi           	e-tron    	  95	Sportback S
Updated  13 cars with parameters:	audi           	q8 e-tron 	  95	55.*quattro
Updated  20 cars with parameters:	audi           	q8 e-tron 	  71	50.*quattro
Updated   9 cars with parameters:	audi           	q8 e-tron 	  95	Sportback 55
Updated  13 cars with parameters:	audi           	q8 e-tron 	  71	Sportback 50
Updated  26 cars with parameters:	audi           	e-tron gt 	93.4	.*
Updated  14 cars with parameters:	audi           	e-tron gt rs	93.4	.*
model
e-tron       2
q4 e-tron    2
q8 e-tron    1
Name: count, dtype: int64


  m = (m & ((df["modelTypeName"].str.contains(re, case=False, regex=True) == True).values))


In [89]:
print_info(df, 'audi', 'q8 e-tron')


batteryCapacity
71.0     25
95.0     13
106.0     1
0.0       1
Name: count, dtype: int64
                                        modelTypeName  batteryCapacity
8                  Sportback 50 quattro Progress Plus             71.0
42  55 Sportback, WLTP 540km, Musta Optiikka -pake...            106.0
98                   50 quattro S line Launch Edition             71.0
6                 Sportback 50 quattro Launch Edition             71.0
12  Sportback e-tron Launch Edition S line 50 quat...             71.0
45                        Sportback 50 Launch Edition             71.0
46                          50 quattro Launch Edition             71.0
99  Sportback 50 S line Launch Edition *Head-Up Di...             71.0
3                    50 quattro s-line Launch Edition             71.0
9   Sportback 50 quattro 250kw Progress Plus Katta...             71.0
12  Sportback 50 quattro 250kw Progress Plus Todel...             71.0
20                 Sportback 50 S line Launch Edition     

In [90]:
# Kiat
# https://ev-database.org/car/1666/Kia-Niro-EV 68
# https://www.kia.com/content/dam/kwcms/kme/uk/en/assets/vehicles/EV6/Specification/Kia%20EV6%20Spec%20Sheet%20May%202021.pdf (ev6) 77.5
# https://ev-database.org/car/1288/Kia-e-Soul-64-kWh 67.5
# https://ev-database.org/car/1666/Kia-Niro-EV 68
# https://ev-database.org/car/1835/Kia-EV9-998-kWh-AWD 99.8
update_capacity(df, "kia", "niro", 68)
update_capacity(df, "kia", "ev6", 77.5)
update_capacity(df, "kia", "e-soul",  67.5)
update_capacity(df, "kia", "niro ev",  68)
update_capacity(df, "kia", "ev9", 99.8)

print_missing_for_make(df, 'kia')

Updated 108 cars with parameters:	kia            	niro      	  68	.*
Updated 136 cars with parameters:	kia            	ev6       	77.5	.*
Updated  39 cars with parameters:	kia            	e-soul    	67.5	.*
Updated   4 cars with parameters:	kia            	niro ev   	  68	.*
Updated   4 cars with parameters:	kia            	ev9       	99.8	.*
Series([], Name: count, dtype: int64)


In [91]:
update_capacity(df, "skoda", "enyaq", 82, r'4x4')
update_capacity(df, "skoda", "enyaq", 82, r'80')
update_capacity(df, "skoda", "enyaq", 62, r'60')

print_missing_for_make(df, 'skoda')
get_missing_df(df, 'skoda', 'enyaq')


Updated 105 cars with parameters:	skoda          	enyaq     	  82	4x4
Updated 234 cars with parameters:	skoda          	enyaq     	  82	80
Updated  49 cars with parameters:	skoda          	enyaq     	  62	60
model
enyaq    9
Name: count, dtype: int64


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType
92,skoda,enyaq,,150.0,39000.0,,rwd
92,skoda,enyaq,,,29000.0,,rwd
39,skoda,enyaq,,5.0,29000.0,,rwd
5,skoda,enyaq,,,,405.0,rwd
7,skoda,enyaq,,195.0,1000.0,,4wd
50,skoda,enyaq,,195.0,78.0,,4wd
66,skoda,enyaq,,5.0,2000.0,,rwd
88,skoda,enyaq,,,16555.0,,4wd
5,skoda,enyaq,,150.0,5851.0,,rwd


In [92]:
print_missing_for_make(df, 'volkswagen')
update_capacity(df, "volkswagen", "id5", 77, r'GTX')
update_capacity(df, "volkswagen", "id5", 77, r'128 KW')
update_capacity(df, "volkswagen", "id4", 77, r'Pro Performance')
update_capacity(df, "volkswagen", "id3", 58, r'akku 58')
update_capacity(df, "volkswagen", "id3", 42, r'pure')
update_capacity(df, "volkswagen", "id3", 58, r'pro business')
update_capacity(df, "volkswagen", "id3", 77, r'pro s business')


get_missing_df(df, 'volkswagen', 'id3')


model
id5         6
id3         6
id4         2
up!         1
id. buzz    1
Name: count, dtype: int64
Updated  30 cars with parameters:	volkswagen     	id5       	  77	GTX
Updated  22 cars with parameters:	volkswagen     	id5       	  77	128 KW
Updated  93 cars with parameters:	volkswagen     	id4       	  77	Pro Performance
Updated 124 cars with parameters:	volkswagen     	id3       	  58	akku 58
Updated   9 cars with parameters:	volkswagen     	id3       	  42	pure
Updated  23 cars with parameters:	volkswagen     	id3       	  58	pro business
Updated  11 cars with parameters:	volkswagen     	id3       	  77	pro s business


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType
62,volkswagen,id3,"Pro Business Plus 150 kW, akku KORKO ALK 1,95 %",150.0,,,rwd


In [93]:
# https://media.porsche.com/mediakit/taycan/en/porsche-taycan/die-batterie
# performance battery plus = 93.4 ja performance battery = 79.2
print_missing_for_make(df, 'porsche')
update_capacity(df, "porsche", "taycan", 93.4, r'Cross Turismo')
update_capacity(df, "porsche", "taycan", 93.4, r'Sport Turismo')
update_capacity(df, "porsche", "taycan", 93.4, r'Performance Battery Plus')
update_capacity(df, "porsche", "taycan", 93.4, r'turbo')
update_capacity(df, "porsche", "taycan", 93.4, r'gts')
update_capacity(df, "porsche", "taycan", 93.4, r'Perf. Battery\+')
update_capacity(df, "porsche", "taycan", 79.2, r'4s')

m = ((df["make"] == 'porsche') & (df["model"] == 'taycan') & (df['batteryCapacity'] == 0) & (df['power'] == 300))
df.loc[m,"batteryCapacity"] = 79.2

m = ((df["make"] == 'porsche') & (df["model"] == 'taycan') & (df['batteryCapacity'] == 0) & (df['power'] >= 420))
df.loc[m,"batteryCapacity"] = 93.4

get_missing_df(df, 'porsche', 'taycan')


model
taycan    39
Name: count, dtype: int64
Updated  38 cars with parameters:	porsche        	taycan    	93.4	Cross Turismo
Updated  12 cars with parameters:	porsche        	taycan    	93.4	Sport Turismo
Updated   6 cars with parameters:	porsche        	taycan    	93.4	Performance Battery Plus
Updated  10 cars with parameters:	porsche        	taycan    	93.4	turbo
Updated   6 cars with parameters:	porsche        	taycan    	93.4	gts
Updated   2 cars with parameters:	porsche        	taycan    	93.4	Perf. Battery\+
Updated  40 cars with parameters:	porsche        	taycan    	79.2	4s


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [94]:
print_missing_for_make(df, 'hyundai')
# https://www.hyundai.fi/mallisto/ioniq-5/ominaisuudet/
update_capacity(df, "hyundai", "ioniq 5", 77.4)
update_capacity(df, "hyundai", "ioniq 6", 77.4)

get_missing_df(df, 'hyundai', 'ioniq 6')


model
ioniq 5           7
ioniq 6           6
kona              1
ioniq electric    1
Name: count, dtype: int64
Updated  84 cars with parameters:	hyundai        	ioniq 5   	77.4	.*
Updated  45 cars with parameters:	hyundai        	ioniq 6   	77.4	.*


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [95]:
print_missing_for_make(df, 'peugeot')
update_capacity(df, "peugeot", "e-208", 50)
update_capacity(df, "peugeot", "e-2008", 50)

get_missing_df(df, 'peugeot', 'e-208')

model
e-208     8
e-2008    6
Name: count, dtype: int64
Updated  54 cars with parameters:	peugeot        	e-208     	  50	.*
Updated  33 cars with parameters:	peugeot        	e-2008    	  50	.*


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [96]:
print_missing_for_make(df, 'polestar')
update_capacity(df, "polestar", "2", 75, r'Long Range')
update_capacity(df, 'polestar', '2', 78, r'Engineered BST 270')
update_capacity(df, "polestar", "2", 67, r'Standard Range')

m = ((df["make"] == 'polestar') & (df["model"] == '2') & (df['batteryCapacity'] == 0) & (df['driveType'] == '4wd'))
df.loc[m,"batteryCapacity"] = 75

m = ((df["make"] == 'polestar') & (df["model"] == '2') & (df['batteryCapacity'] == 0) & (df['driveType'] == 'fwd'))
df.loc[m,"batteryCapacity"] = 67

get_missing_df(df, 'polestar', '2')

model
2    10
Name: count, dtype: int64
Updated 102 cars with parameters:	polestar       	2         	  75	Long Range
Updated   4 cars with parameters:	polestar       	2         	  78	Engineered BST 270
Updated  22 cars with parameters:	polestar       	2         	  67	Standard Range


Unnamed: 0,make,model,modelTypeName,power,kilometers,electricRange,driveType


In [97]:
print_make_missing_battery(df)

make
tesla            36
mercedes-benz    25
maxus            23
subaru           13
volvo            11
lexus            10
bmw              10
skoda             9
nissan            6
dacia             6
renault           6
fiat              6
mini              5
audi              5
opel              5
seat              5
mg                4
volkswagen        3
jaguar            3
citroen           3
hyundai           2
smart             2
mazda             2
ford              1
ds                1
muu merkki        1
byd               1
Name: count, dtype: int64


In [98]:
no_battery_df = df[df['batteryCapacity'] == 0]
no_battery_df['make'].value_counts()
print(sum(df['batteryCapacity'] > 0) / len(df))
print(sum(df['batteryCapacity'] == 0))

0.9516129032258065
204


In [99]:
# final touches
df = df[df['batteryCapacity'] > 0]
df.loc[ df['power'] < 20,"power"] = 0
df['totalOwners'].fillna(df['totalOwners'].median(), inplace=True)
df['totalOwners'] = df['totalOwners'].astype(int)
df['seats'].fillna(5, inplace=True)
df['seats'] = df['seats'].astype(int)
df['power'].isna().sum()
df['kilometers'].fillna(df['kilometers'].mean(), inplace=True)


In [100]:
m_missing_drive_type = df['driveType'].isna()
m_tesla = (df['make'] == 'tesla')
m_opel = (df['make'] == 'opel')
m_toyota = (df['make'] == 'toyota')

m_fwd = (m_missing_drive_type) & ((m_opel) | (m_toyota))
df.loc[m_fwd, 'driveType'] = 'fwd'
m_rwd = (m_missing_drive_type) & (m_tesla)
df.loc[m_rwd, 'driveType'] = 'rwd'
df[df['driveType'].isna()]


Unnamed: 0,make,model,modelTypeName,color,driveType,price,totalOwners,kilometers,seats,power,batteryCapacity,electricRange,age,isSuv,metallicColor,airconditioning,parking_sensors,satellite_navigator,cruise_control_adaptive,parking_camera_simple_camera,seat_heaters,driving_assistant,alloy_wheels,electric_mirrors,lane_departure_warning_system,electrically_operated_tailgate,leather_upholstery,emergency_brake_assist,collision_avoidance_system,cruise_control_traditional,parking_assistant,tow_bar,electric_seats_with_memory,heated_steering_wheel,sunroof,sun_hatch_with_panorama,adaptive_headlights,sport_seats,fuel_battery_powered_heater,parking_camera_360-degree_camera,battery_preheating,electric_seats_without_memory,air_suspension,curve_lights,head_up_display,sport_base


In [101]:
# infer power
df['price_bucket_5'] = (df['price'] // 5000) * 5000
df['price_bucket_10'] = (df['price'] // 10000) * 10000
df['price_bucket_20'] = (df['price'] // 20000) * 20000
df['price_bucket_30'] = (df['price'] // 30000) * 30000
df['price_bucket_40'] = (df['price'] // 40000) * 40000
df['price_bucket_100'] = (df['price'] // 100000) * 100000
pbucket_cols = list(map(lambda x: 'price_bucket_{}'.format(x), [5, 10, 20, 30, 40, 100]))
for pb in pbucket_cols:
    median_power_bucketed = df.groupby(['make', 'model', pb, 'batteryCapacity'])['power'].median()
    
    def fill_power_bucketed(row):
        if pd.isna(row['power']):
            return median_power_bucketed.get((row['make'], row['model'], row[pb], row['batteryCapacity']), None)
        return row['power']

    df['power'] = df.apply(fill_power_bucketed, axis=1)

df['power'].isna().sum()

31

In [102]:
df[df['power'].isna()]['make'].value_counts()

make
peugeot          6
kia              5
skoda            3
jaguar           3
nissan           3
ford             3
dacia            2
muu merkki       2
seat             1
fiat             1
mercedes-benz    1
lexus            1
Name: count, dtype: int64

In [103]:
df[(df['power'].isna()) & (df['make'] == 'mercedes-benz')][['make', 'model', 'modelTypeName', 'price', 'batteryCapacity', 'power']]

Unnamed: 0,make,model,modelTypeName,price,batteryCapacity,power
58,mercedes-benz,eqs,,178090.0,90.6,


In [104]:
df.head(25)

Unnamed: 0,make,model,modelTypeName,color,driveType,price,totalOwners,kilometers,seats,power,batteryCapacity,electricRange,age,isSuv,metallicColor,airconditioning,parking_sensors,satellite_navigator,cruise_control_adaptive,parking_camera_simple_camera,seat_heaters,driving_assistant,alloy_wheels,electric_mirrors,lane_departure_warning_system,electrically_operated_tailgate,leather_upholstery,emergency_brake_assist,collision_avoidance_system,cruise_control_traditional,parking_assistant,tow_bar,electric_seats_with_memory,heated_steering_wheel,sunroof,sun_hatch_with_panorama,adaptive_headlights,sport_seats,fuel_battery_powered_heater,parking_camera_360-degree_camera,battery_preheating,electric_seats_without_memory,air_suspension,curve_lights,head_up_display,sport_base,price_bucket_5,price_bucket_10,price_bucket_20,price_bucket_30,price_bucket_40,price_bucket_100
0,porsche,taycan,Taycan sähköauto Heti toimitukseen!! rahoitus ...,black,rwd,101880.0,1,2100.0,5,300.0,79.0,400.0,2.416667,False,False,True,True,True,False,False,True,False,True,True,False,True,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,100000.0,100000.0,100000.0,90000.0,80000.0,100000.0
2,mercedes-benz,eqs,"580 4Matic ** Muistipenkit, Burmester, nelipyö...",gray,4wd,125900.0,1,30000.0,5,385.0,108.4,702.0,2.0,False,False,True,True,True,True,False,True,True,True,True,True,True,True,False,True,False,True,False,True,False,False,True,True,True,True,True,True,False,True,False,False,False,125000.0,120000.0,120000.0,120000.0,120000.0,100000.0
3,nissan,leaf,e+ N-Connecta MY21 62 kWh *Isoakkunen Huippuva...,black,fwd,35800.0,1,10000.0,5,160.0,62.0,,2.0,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,35000.0,30000.0,20000.0,30000.0,0.0,0.0
4,citroen,e-c4,"Full Electric 136 Shine 50 kWh, KORKOTARJOUS 3...",blue,fwd,33800.0,1,17000.0,5,100.0,50.0,,2.0,False,False,True,True,True,False,True,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,30000.0,30000.0,20000.0,30000.0,0.0,0.0
5,hyundai,kona,electric 64 kWh 204 hv Comfort. Upea. Adaptive...,white,fwd,27590.0,1,56000.0,5,150.0,64.0,415.0,2.75,False,True,True,True,True,True,True,True,True,True,True,False,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,True,False,True,True,False,25000.0,20000.0,20000.0,0.0,0.0,0.0
6,citroen,e-c4,e-C4 Full Electric 136 Shine 50 kWh / Nahkaver...,beige,fwd,29700.0,1,34000.0,5,100.0,50.0,,2.0,False,False,True,True,True,False,True,False,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,25000.0,20000.0,20000.0,0.0,0.0,0.0
7,polestar,2,"PILOT JA PLUS paketit 78Kw korko 4,99% AWD LO...",black,4wd,60780.0,1,7000.0,5,300.0,75.0,500.0,2.0,False,False,True,True,True,True,False,True,True,True,True,False,True,False,True,True,False,True,False,True,True,True,False,True,True,True,True,False,False,False,True,False,False,60000.0,60000.0,60000.0,60000.0,40000.0,0.0
8,peugeot,e-traveller,50kwh Standard 9-Paikkainen # SYYSTARJOUS!! S...,black,fwd,39900.0,1,3000.0,9,,50.0,,2.833333,False,False,True,True,True,False,False,False,True,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,35000.0,30000.0,20000.0,30000.0,0.0,0.0
9,peugeot,e-2008,ACTIVE PACK 50 kwh 136 Automaatti # Vaihtoehto...,orange,fwd,25790.0,1,13000.0,5,100.0,50.0,,2.5,True,True,True,True,True,False,False,False,True,True,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,25000.0,20000.0,20000.0,0.0,0.0,0.0
10,bmw,ix3,G08 Charged #PANORAMA #NAHKAVERHOILU #MUISTI #...,white,rwd,51800.0,1,17000.0,5,210.0,80.0,,2.166667,True,False,True,True,True,True,True,True,True,True,True,True,True,True,False,True,False,False,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,50000.0,50000.0,40000.0,30000.0,40000.0,0.0


In [105]:
drop_columns = ['electricRange', 'modelTypeName'] + list(pbucket_cols)
for c in drop_columns:
    df = df.drop(c, axis=1)

rows_to_drop_by = ['power', 'price']

# convert bool to int
bool_cols = df.columns[df.dtypes == 'bool']
df[bool_cols] = df[bool_cols].astype(int)

df.dropna(subset=rows_to_drop_by, inplace=True)

df.to_csv('nettiauto_dataset.csv', index=False)

In [106]:
drivetype_df = df[['make', 'model', 'driveType']].drop_duplicates().reset_index(drop=True)
drivetype_df.head(25)

Unnamed: 0,make,model,driveType
0,porsche,taycan,rwd
1,mercedes-benz,eqs,4wd
2,nissan,leaf,fwd
3,citroen,e-c4,fwd
4,hyundai,kona,fwd
5,polestar,2,4wd
6,peugeot,e-2008,fwd
7,bmw,ix3,rwd
8,kia,niro,fwd
9,volkswagen,id3,rwd


In [109]:
# df['make'] = df['make'].str.replace('-', '')
# df['model'] = df['model'].str.replace(r'\-|\.|\s|!', '', regex=True)

# df[df['batteryCapacity'] > 20].groupby('make')['model'].unique()


make
audi                [etron, etrongtrs, q4etron, etrongt, q8etron]
bmw                 [ix3, i3, ix, i3s, i4m50, i4, ix1, i7, ixm60]
byd                                            [tang, han, atto3]
citroen                  [ec4, c4, espacetourer, eberlingo, ec4x]
cupra                                                      [born]
dacia                                                    [spring]
ds                                                            [3]
fiat                                                       [500e]
ford                                      [mustangmache, mustang]
hyundai         [kona, ioniq5, ioniqelectric, konaelectric, io...
jaguar                                                    [ipace]
kia                                [niro, esoul, ev6, eniro, ev9]
lexus                                                        [ux]
maxus                                         [euniq, t90, mifa9]
mazda                                                      [mx30]
merce

In [108]:
drivetype_df.to_csv('../mobile_de/drivetype.csv', index=False)