In [1]:
import pandas as pd
import numpy as np

Einlesen des zuvor erzeugten Datensatzes (siehe Datenaufbereitung.ipynb)

In [10]:
df = pd.read_csv('formula1_daten.csv', sep = ';', decimal = '.')
del df['Unnamed: 0']
df.shape

(157755, 19)

In [11]:
df.head()

Unnamed: 0,raceId,year,circuitId,grandprix_name,driverId,lap_number,lap_position,lap_in_milliseconds,driver_fullname,podium_position,constructorId,constructor_name,stop_binary,total_laps,race_completion,grid,status_clean,total_milliseconds,form
0,841.0,2011.0,1.0,Australian Grand Prix,1.0,1.0,2.0,100573.0,Lewis Hamilton,2.0,1.0,McLaren,0.0,58.0,0.017241,2.0,Finished,5392556.0,0.0
1,841.0,2011.0,1.0,Australian Grand Prix,1.0,2.0,2.0,93774.0,Lewis Hamilton,2.0,1.0,McLaren,0.0,58.0,0.034483,2.0,Finished,5392556.0,0.0
2,841.0,2011.0,1.0,Australian Grand Prix,1.0,3.0,2.0,92900.0,Lewis Hamilton,2.0,1.0,McLaren,0.0,58.0,0.051724,2.0,Finished,5392556.0,0.0
3,841.0,2011.0,1.0,Australian Grand Prix,1.0,4.0,2.0,92582.0,Lewis Hamilton,2.0,1.0,McLaren,0.0,58.0,0.068966,2.0,Finished,5392556.0,0.0
4,841.0,2011.0,1.0,Australian Grand Prix,1.0,5.0,2.0,92471.0,Lewis Hamilton,2.0,1.0,McLaren,0.0,58.0,0.086207,2.0,Finished,5392556.0,0.0


Anzahl der uniquen Ausprägungen in jeder Spalte

In [9]:
for col in df.columns:
    print(col, len(df[col].unique()))

raceId 137
year 7
circuitId 26
grandprix_name 25
driverId 57
lap_number 78
lap_position 24
lap_in_milliseconds 54290
driver_fullname 57
podium_position 24
constructorId 17
constructor_name 17
stop_binary 2
total_laps 18
race_completion 948
grid 25
status_clean 3
total_milliseconds 2902
form 130


In [49]:
def hot_encode_top (column, df, feat_count = 10):
    '''
    hot one encoding, limitiert auf die feat_count häufigsten features
    eines nominalen features um zu Hohe dimensionen zu vermeiden
    
    column: liste mit einem oder mehr Spaltennamen, die hot encoded werden sollen
    df: dataframe der die Datenbasis darstellt
    feat_count: Anzahl Spalten die für jede Spalte encoded werden
    '''
    df_ = df.copy(deep = True)
    
    for col in column:
        
        
        #nur die häufigsten feat_count Featues werden encoded
        encode_features = [x for x in df_[col].value_counts(ascending = False).head(feat_count).index]
        
        for feature in encode_features:
            col_feature = col + '_'+str(feature)
            #dort wo feature nicht dem encode feature entspricht wird eine 0 gesetzt
            df_[col_feature] = df_.where(df_[col] == feature, other = 0)[col]
            #encode feature selbst wird in dataframe durch eine 1 ersetzt
            df_[col_feature].replace(feature, 1, inplace = True)
        
        #löschen der nun "bereinigten" Spalte
        del df_[col]
        
    return df_

In [55]:
encoded_df = hot_encode_top(['constructorId', 'driverId', 'total_laps','year'], df, 5)
encoded_df.shape

(157755, 35)

In [56]:
encoded_df.head(15)

Unnamed: 0,raceId,circuitId,grandprix_name,lap_number,lap_position,lap_in_milliseconds,driver_fullname,podium_position,constructor_name,stop_binary,...,total_laps_56.0,total_laps_71.0,total_laps_53.0,total_laps_70.0,total_laps_57.0,year_2012.0,year_2016.0,year_2011.0,year_2013.0,year_2014.0
0,841.0,1.0,Australian Grand Prix,1.0,2.0,100573.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,841.0,1.0,Australian Grand Prix,2.0,2.0,93774.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,841.0,1.0,Australian Grand Prix,3.0,2.0,92900.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,841.0,1.0,Australian Grand Prix,4.0,2.0,92582.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,841.0,1.0,Australian Grand Prix,5.0,2.0,92471.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,841.0,1.0,Australian Grand Prix,6.0,2.0,92434.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,841.0,1.0,Australian Grand Prix,7.0,2.0,92447.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,841.0,1.0,Australian Grand Prix,8.0,2.0,92310.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,841.0,1.0,Australian Grand Prix,9.0,2.0,92612.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,841.0,1.0,Australian Grand Prix,10.0,2.0,93121.0,Lewis Hamilton,2.0,McLaren,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
