In [6]:
import pandas as pd
import numpy as np
import pandasql as sqldf
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import sys,os
import random

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE

In [3]:
def hot_encode_top (column, df, feat_count = 10):
    '''
    hot one encoding, limitiert auf die feat_count häufigsten features
    eines nominalen features um zu Hohe dimensionen zu vermeiden
    
    column: liste mit einem oder mehr Spaltennamen, die hot encoded werden sollen
    df: dataframe der die Datenbasis darstellt
    feat_count: Anzahl Spalten die für jede Spalte encoded werden
    '''
    df_ = df.copy(deep = True)
    
    for col in column:
        
        
        #nur die häufigsten feat_count Featues werden encoded
        encode_features = [x for x in df_[col].value_counts(ascending = False).head(feat_count).index]
        if col == 'status_clean':
            #encode_features = ['lapped', 'Finished', 'DNF']
            encode_features = ['Finished', 'DNF']
        for feature in encode_features:
            col_feature = col + '_'+str(feature)
            
            #dort wo feature nicht dem encode feature entspricht wird eine 0 gesetzt
            df_[col_feature] = df_.where(df_[col] == feature, other = 0)[col]
            #encode feature selbst wird in dataframe durch eine 1 ersetzt
            df_[col_feature].replace(feature, 1, inplace = True)
            
        
        #löschen der nun "bereinigten" Spalte
        del df_[col]
        
    return df_

In [2]:
rain_id = [847,861,879,910,914,934,942,953,957,967,970,982]
if os.path.exists('sliced_data70'):
    csv_filenames = []
    #auslesen aller csv file dateinamen aus formula 1 datensatz und abspeichern in liste
    for filename in os.listdir(os.getcwd()+'/sliced_data70'):
        typ = filename.split('.')[-1]
        name = filename.split('.')[0]
        if typ == 'csv':
            csv_filenames.append(filename)
    sliced_races = {}
    #einlesen und abspeichern als dataframe aller dateien
    for file in csv_filenames:
        try:
            df = pd.read_csv('sliced_data70/'+file, engine = 'python', sep = ';', decimal = '.')
            del df['Unnamed: 0']
        except Exception as e:
            df = pd.read_csv('sliced_data70/'+file, engine = 'c', sep = ';', decimal = '.')
            del df['Unnamed: 0']
            print(e)
        #print(df.head())
        f = int(file.split('_')[-1].split('.')[0])
        df["rain"] = 0
        if list(df["raceId"])[0] in rain_id:
            df["rain"] = 1
        sliced_races[f] = df
    print('Einlesen der sliced Dateien erfolgreich')
else:
    print('Dateien können nicht eingelesen werden, da kein entsprechendes Verzeichnis existiert!')
    
if os.path.exists('split_data'):
    csv_filenames = []
    #auslesen aller csv file dateinamen aus formula 1 datensatz und abspeichern in liste
    for filename in os.listdir(os.getcwd()+'/split_data'):
        typ = filename.split('.')[-1]
        name = filename.split('.')[0]
        if typ == 'csv':
            csv_filenames.append(filename)
    split_by_race = {}
    #einlesen und abspeichern als dataframe aller dateien
    for file in csv_filenames:
        try:
            df = pd.read_csv('split_data/'+file, engine = 'python', sep = ';', decimal = '.')
            del df['Unnamed: 0']
        except Exception as e:
            df = pd.read_csv('split_data/'+file, engine = 'c', sep = ';', decimal = '.')
            del df['Unnamed: 0']
            print(e)
        #print(df.head())
        f = int(file.split('_')[-1].split('.')[0])
        split_by_race[f] = df
    print('Einlesen der split Dateien erfolgreich')
else:
    print('Dateien können nicht eingelesen werden, da kein entsprechendes Verzeichnis existiert!')

Einlesen der sliced Dateien erfolgreich
Einlesen der split Dateien erfolgreich


### Lineare Regression

In [14]:
print('---shuffle races---')
keys = list(sliced_races.keys())
random.shuffle(keys)
sliced_shuffled = {}
for key in keys:
    sliced_shuffled[key] = sliced_races[key]


#columns werden definiert, die erst einmal nicht beachtet werden sollen
nogo_columns = [#'grid',
                #'race_completion',
                'lap_position','circuitId','lap_number',
                'podium_position', 'raceId',
                'grandprix_name', 'driver_fullname',
               'constructor_name', 'total_laps',
               'status_clean', 'constructorId',
                'total_milliseconds', 'driverId'
               'lap_in_milliseconds','year', 'stop_binary']

#definieren eines linearen Regressionsmodells
reg = LinearRegression()

train_break = 120
category_count = 13
counter = 0

print('---training---')
full_races = pd.DataFrame()
for raceId, race in sliced_shuffled.items():
    full = pd.DataFrame(columns = race.columns)
    if counter == train_break:
        break
    else:
        last_lap = max(race.lap_number.tolist())
        for did in race.driverId.unique():
            stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
            race['stop_sum'] = stop_sum
            driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
            full = full.append(driver_df)
        #for elem in full['driverId'].unique():
        #    full[str(elem)] = full['driverId'] == elem
        #for elem in full['constructorId'].unique():
        #    full[str(elem)] = full['constructorId'] == elem
        #for c in ['driverId','constructorId']:
        #temp = pd.get_dummies(data = full, columns = ['driverId','constructorId'], drop_first = True)
        #temp.replace(np.nan,0,inplace = True)
        #full = temp.copy(deep = True)
        ##for col in temp.columns:
        #    full[col] = temp[col]
        full_races = full_races.append(full)
#full_races = full_races.replace(np.nan,0)   
#full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
cols = [col for col in full_races.columns if col not in nogo_columns] 

#umrechnen in minuten
y = np.array(full_races['total_milliseconds']/60000)
races = full_races[cols]  
#races = races.dropna(how = 'any')
X = np.array(races)
reg = reg.fit(X, y)
A = reg.predict(X)
mse = mean_squared_error(A, y)
mae = MAE(A,y)
print('mse:', mse)
print('mae:',mae)

counter = 0

full_races = pd.DataFrame()
print('---testing---')
for raceId, race in sliced_shuffled.items():
    full = pd.DataFrame(columns = race.columns)
    if counter == train_break:
        last_lap = max(race.lap_number.tolist())
        for did in race.driverId.unique():
            stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
            race['stop_sum'] = stop_sum
            driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
            full = full.append(driver_df)
            
        #temp = pd.get_dummies(data = full, columns = ['driverId','constructorId'], drop_first = True)
        #temp.replace(np.nan,0,inplace = True)
        #full = temp.copy(deep = True)
        #for col in temp.columns:
        #    full[col] = temp[col]
        #full = hot_encode_top(['status_clean'],full)
        #full = hot_encode_top(['status_clean','driverId', 'constructorId'],full)
        full_races = full_races.append(full)
        
    else:
        counter += 1
#full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
full_races = full_races.replace(np.nan,0)   
y = np.array(full_races['total_milliseconds']/60000)
cols = [col for col in full_races.columns if col not in nogo_columns] 
races = full_races[cols]  
X = np.array(races)
A = reg.predict(X)
mse = mean_squared_error(A, y)
mae = MAE(A,y)
print('mse:', mse)
print('mae:',mae)

#hinzufügen der 'wichtigen' spalten zur späteren auswertung der vorhersagen auf dem trainingsdatensatz
full_races['prediction'] = A
full_races['total_milliseconds'] = y
full_races['total_minutes'] = y/60000

---shuffle races---
---training---
mse: 17.299965468326885
mae: 2.475987786306369
---testing---
mse: 8.528937382485418
mae: 1.9438280411415028


In [15]:
treffer = 0
nahdran = 0
weiterweg = 0
for raceId in full_races.raceId.unique():
    
    race = full_races.where(full_races.raceId == raceId).dropna(how = 'all')
    winner = race.where(race.podium_position == 1).dropna(how = 'all')['driver_fullname'].tolist()[0]
    second = race.where(race.podium_position == 2).dropna(how = 'all')['driver_fullname'].tolist()[0]
    third = race.where(race.podium_position == 3).dropna(how = 'all')['driver_fullname'].tolist()[0]
    
    fourth = race.where(race.podium_position == 4).dropna(how = 'all')['driver_fullname'].tolist()[0]
    fifth = race.where(race.podium_position == 5).dropna(how = 'all')['driver_fullname'].tolist()[0]
    sixth = race.where(race.podium_position == 6).dropna(how = 'all')['driver_fullname'].tolist()[0]
    seventh = race.where(race.podium_position == 7).dropna(how = 'all')['driver_fullname'].tolist()[0]
    
    pred_winner = race.where(race.prediction == min(race.prediction)).dropna(how = 'all')['driver_fullname'].tolist()[0]
    
    podium = [second,third]
    seven = [fourth,fifth,sixth,seventh]
    if pred_winner == winner:
        treffer += 1
    if pred_winner in podium:
        nahdran += 1
    if pred_winner in seven:
        weiterweg += 1
    print('winner, second, third:', winner,',' ,second,',' , third)
    print('predicted winner:', pred_winner,'\n')
    
print('genaue treffer:', treffer)
print('nah dran:', nahdran)
print('weiter weg:', weiterweg)

winner, second, third: Sebastian Vettel , Mark Webber , Fernando Alonso
predicted winner: Paul di Resta 

winner, second, third: Lewis Hamilton , Nico Rosberg , Valtteri Bottas
predicted winner: Lewis Hamilton 

winner, second, third: Nico Rosberg , Max Verstappen , Lewis Hamilton
predicted winner: Max Verstappen 

winner, second, third: Kimi RÃŒ_ikkÃŒÂ¦nen , Fernando Alonso , Sebastian Vettel
predicted winner: Kimi RÃŒ_ikkÃŒÂ¦nen 

winner, second, third: Nico Rosberg , Kevin Magnussen , Jenson Button
predicted winner: Daniel Ricciardo 

winner, second, third: Sebastian Vettel , Fernando Alonso , Kimi RÃŒ_ikkÃŒÂ¦nen
predicted winner: Mark Webber 

winner, second, third: Sebastian Vettel , Mark Webber , Nico Rosberg
predicted winner: Sebastian Vettel 

winner, second, third: Sebastian Vettel , Jenson Button , Fernando Alonso
predicted winner: Bruno Senna 

winner, second, third: Sebastian Vettel , Fernando Alonso , Mark Webber
predicted winner: Mark Webber 

winner, second, third: Lewis

### Ridge Regression

In [16]:
alphas = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
print('---shuffle races---')

keys = list(sliced_races.keys())
random.shuffle(keys)
sliced_shuffled = {}
for key in keys:
    sliced_shuffled[key] = sliced_races[key]


#columns werden definiert, die erst einmal nicht beachtet werden sollen
nogo_columns = [#'grid',
                #'race_completion',
                'lap_position','circuitId','lap_number',
                'podium_position', 'raceId',
                'grandprix_name', 'driver_fullname',
               'constructor_name', 'total_laps',
               'status_clean', 'constructorId',
                'total_milliseconds', 'driverId'
               'lap_in_milliseconds','year', 'stop_binary']

#testen verschiedener Generalisierungsstufen für das Regressionsmodell
for alpha in alphas:

    #definieren eines Ridge Regressionsmodells
    ridge = Ridge(alpha = alpha)
    
    train_break = 120
    category_count = 13
    counter = 0
    
    print('---training---')
    full_races = pd.DataFrame()
    for raceId, race in sliced_shuffled.items():
        full = pd.DataFrame(columns = race.columns)
        if counter == train_break:
            break
        else:
            last_lap = max(race.lap_number.tolist())
            for did in race.driverId.unique():
                stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
                race['stop_sum'] = stop_sum
                driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
                full = full.append(driver_df)
            
            full_races = full_races.append(full)
    #full_races = full_races.replace(np.nan,0)   
    #full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
    full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
    cols = [col for col in full_races.columns if col not in nogo_columns] 
    
    #umrechnen in minuten
    y = np.array(full_races['total_milliseconds']/60000)
    races = full_races[cols]  
    #races = races.dropna(how = 'any')
    X = np.array(races)
    ridge = ridge.fit(X, y)
    A = ridge.predict(X)
    mse = mean_squared_error(A, y)
    mae = MAE(A,y)
    print('mse:', mse)
    print('mae:',mae)
    
    counter = 0
    
    full_races = pd.DataFrame()
    print('---testing---')
    for raceId, race in sliced_shuffled.items():
        full = pd.DataFrame(columns = race.columns)
        if counter == train_break:
            last_lap = max(race.lap_number.tolist())
            for did in race.driverId.unique():
                stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
                race['stop_sum'] = stop_sum
                driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
                full = full.append(driver_df)
                
            #temp = pd.get_dummies(data = full, columns = ['driverId','constructorId'], drop_first = True)
            #temp.replace(np.nan,0,inplace = True)
            #full = temp.copy(deep = True)
            #for col in temp.columns:
            #    full[col] = temp[col]
            #full = hot_encode_top(['status_clean'],full)
            #full = hot_encode_top(['status_clean','driverId', 'constructorId'],full)
            full_races = full_races.append(full)
            
        else:
            counter += 1
    #full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
    full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
    full_races = full_races.replace(np.nan,0)   
    y = np.array(full_races['total_milliseconds']/60000)
    cols = [col for col in full_races.columns if col not in nogo_columns] 
    races = full_races[cols]  
    X = np.array(races)
    A = ridge.predict(X)
    mse = mean_squared_error(A, y)
    mae = MAE(A,y)
    print('mse:', mse)
    print('mae:',mae)
    print('alpha:',alpha)
    print('====================')
    
    #hinzufügen der 'wichtigen' spalten zur späteren auswertung der vorhersagen auf dem trainingsdatensatz
    full_races['prediction'] = A
    full_races['total_milliseconds'] = y
    full_races['total_minutes'] = y/60000

---shuffle races---
---training---


  overwrite_a=True).T


mse: 17.29996546832689
mae: 2.4759877864118836
---testing---
mse: 13.463522842807729
mae: 2.351277978922402
alpha: 0
---training---


  overwrite_a=True).T


mse: 17.394417571259474
mae: 2.4663528914954687
---testing---
mse: 13.277933343544102
mae: 2.276961391318
alpha: 0.1
---training---


  overwrite_a=True).T


mse: 17.4495809022525
mae: 2.471165797627999
---testing---
mse: 13.266364294961916
mae: 2.267584161054838
alpha: 0.2
---training---
mse: 17.479147980611017
mae: 2.474222790432166
---testing---
mse: 13.266396065256343
mae: 2.2648438960045154
alpha: 0.3
---training---
mse: 17.49734478491083
mae: 2.4762505598553997
---testing---
mse: 13.268143151374503
mae: 2.2637635321260827
alpha: 0.4
---training---
mse: 17.50964013535708
mae: 2.477683072256026
---testing---
mse: 13.270096330656946
mae: 2.2632869650883713
alpha: 0.5
---training---
mse: 17.51850190687774
mae: 2.4787333551584485
---testing---
mse: 13.27195732718757
mae: 2.2630616342236882
alpha: 0.6
---training---
mse: 17.52519640490124
mae: 2.4795340736273794
---testing---
mse: 13.27367724268673
mae: 2.262986087139394
alpha: 0.7
---training---
mse: 17.530437971239774
mae: 2.4801653671752226
---testing---
mse: 13.27526516626893
mae: 2.263030312797003
alpha: 0.8
---training---
mse: 17.53465950345362
mae: 2.480672561654685
---testing---
mse

bestes alpha zeigt sich bei ungefähr 0.7, im folgenden werden zufällig alphas um den WErt 0.7 erzeugt, um zu sehen welches am besten funktioniert

In [21]:
print('---shuffle races---')

keys = list(sliced_races.keys())
random.shuffle(keys)
sliced_shuffled = {}
for key in keys:
    sliced_shuffled[key] = sliced_races[key]


#columns werden definiert, die erst einmal nicht beachtet werden sollen
nogo_columns = [#'grid',
                #'race_completion',
                'lap_position','circuitId','lap_number',
                'podium_position', 'raceId',
                'grandprix_name', 'driver_fullname',
               'constructor_name', 'total_laps',
               'status_clean', 'constructorId',
                'total_milliseconds', 'driverId'
               'lap_in_milliseconds','year', 'stop_binary']

#testen verschiedener Generalisierungsstufen für das Regressionsmodell
for i in range(10):
    #random erzeugen eines alphas
    
    alpha = random.randint(668,730)
    alpha = alpha/1000
    #definieren eines Ridge Regressionsmodells
    ridge = Ridge(alpha = alpha)
    
    train_break = 120
    category_count = 13
    counter = 0
    
    print('---training---')
    full_races = pd.DataFrame()
    for raceId, race in sliced_shuffled.items():
        full = pd.DataFrame(columns = race.columns)
        if counter == train_break:
            break
        else:
            last_lap = max(race.lap_number.tolist())
            for did in race.driverId.unique():
                stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
                race['stop_sum'] = stop_sum
                driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
                full = full.append(driver_df)
            
            full_races = full_races.append(full)
    #full_races = full_races.replace(np.nan,0)   
    #full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
    full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
    cols = [col for col in full_races.columns if col not in nogo_columns] 
    
    #umrechnen in minuten
    y = np.array(full_races['total_milliseconds']/60000)
    races = full_races[cols]  
    #races = races.dropna(how = 'any')
    X = np.array(races)
    ridge = ridge.fit(X, y)
    A = ridge.predict(X)
    mse = mean_squared_error(A, y)
    mae = MAE(A,y)
    print('mse:', mse)
    print('mae:',mae)
    
    counter = 0
    
    full_races = pd.DataFrame()
    print('---testing---')
    for raceId, race in sliced_shuffled.items():
        full = pd.DataFrame(columns = race.columns)
        if counter == train_break:
            last_lap = max(race.lap_number.tolist())
            for did in race.driverId.unique():
                stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
                race['stop_sum'] = stop_sum
                driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
                full = full.append(driver_df)
                
            #temp = pd.get_dummies(data = full, columns = ['driverId','constructorId'], drop_first = True)
            #temp.replace(np.nan,0,inplace = True)
            #full = temp.copy(deep = True)
            #for col in temp.columns:
            #    full[col] = temp[col]
            #full = hot_encode_top(['status_clean'],full)
            #full = hot_encode_top(['status_clean','driverId', 'constructorId'],full)
            full_races = full_races.append(full)
            
        else:
            counter += 1
    #full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
    full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
    full_races = full_races.replace(np.nan,0)   
    y = np.array(full_races['total_milliseconds']/60000)
    cols = [col for col in full_races.columns if col not in nogo_columns] 
    races = full_races[cols]  
    X = np.array(races)
    A = ridge.predict(X)
    mse = mean_squared_error(A, y)
    mae = MAE(A,y)
    print('mse:', mse)
    print('mae:',mae)
    print('alpha:',alpha)
    print('====================')
    
    #hinzufügen der 'wichtigen' spalten zur späteren auswertung der vorhersagen auf dem trainingsdatensatz
    full_races['prediction'] = A
    full_races['total_milliseconds'] = y
    full_races['total_minutes'] = y/60000

---shuffle races---
---training---
mse: 17.52548853541353
mae: 2.479569310329132
---testing---
mse: 12.646885322223724
mae: 2.4798965104922854
alpha: 0.705
---training---
mse: 17.525662123958107
mae: 2.47959022730646
---testing---
mse: 12.647182076470711
mae: 2.4799532386131125
alpha: 0.708
---training---
mse: 17.5237431759634
mae: 2.4793590527587077
---testing---
mse: 12.643921513329293
mae: 2.4793270583134603
alpha: 0.676
---training---
mse: 17.523429331016544
mae: 2.4793214682734894
---testing---
mse: 12.643392227605966
mae: 2.479224822458353
alpha: 0.671
---training---
mse: 17.524298278885936
mae: 2.479425565203147
---testing---
mse: 12.644860326293621
mae: 2.4795079982200807
alpha: 0.685
---training---
mse: 17.525078554773728
mae: 2.47951984596137
---testing---
mse: 12.646185916163297
mae: 2.4797626002953788
alpha: 0.698
---training---
mse: 17.52435919248803
mae: 2.479432891582909
---testing---
mse: 12.644963556877121
mae: 2.479527862697628
alpha: 0.686
---training---
mse: 17.5261

alphas werden jetzt nur zwischen 0.65 und 0.7 gewählt:

In [22]:
print('---shuffle races---')

keys = list(sliced_races.keys())
random.shuffle(keys)
sliced_shuffled = {}
for key in keys:
    sliced_shuffled[key] = sliced_races[key]


#columns werden definiert, die erst einmal nicht beachtet werden sollen
nogo_columns = [#'grid',
                #'race_completion',
                'lap_position','circuitId','lap_number',
                'podium_position', 'raceId',
                'grandprix_name', 'driver_fullname',
               'constructor_name', 'total_laps',
               'status_clean', 'constructorId',
                'total_milliseconds', 'driverId'
               'lap_in_milliseconds','year', 'stop_binary']

#testen verschiedener Generalisierungsstufen für das Regressionsmodell
for i in range(10):
    #random erzeugen eines alphas
    
    alpha = random.randint(650,700)
    alpha = alpha/1000
    #definieren eines Ridge Regressionsmodells
    ridge = Ridge(alpha = alpha)
    
    train_break = 120
    category_count = 13
    counter = 0
    
    print('---training---')
    full_races = pd.DataFrame()
    for raceId, race in sliced_shuffled.items():
        full = pd.DataFrame(columns = race.columns)
        if counter == train_break:
            break
        else:
            last_lap = max(race.lap_number.tolist())
            for did in race.driverId.unique():
                stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
                race['stop_sum'] = stop_sum
                driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
                full = full.append(driver_df)
            
            full_races = full_races.append(full)
    #full_races = full_races.replace(np.nan,0)   
    #full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
    full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
    cols = [col for col in full_races.columns if col not in nogo_columns] 
    
    #umrechnen in minuten
    y = np.array(full_races['total_milliseconds']/60000)
    races = full_races[cols]  
    #races = races.dropna(how = 'any')
    X = np.array(races)
    ridge = ridge.fit(X, y)
    A = ridge.predict(X)
    mse = mean_squared_error(A, y)
    mae = MAE(A,y)
    print('mse:', mse)
    print('mae:',mae)
    
    counter = 0
    
    full_races = pd.DataFrame()
    print('---testing---')
    for raceId, race in sliced_shuffled.items():
        full = pd.DataFrame(columns = race.columns)
        if counter == train_break:
            last_lap = max(race.lap_number.tolist())
            for did in race.driverId.unique():
                stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
                race['stop_sum'] = stop_sum
                driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
                full = full.append(driver_df)
                
            #temp = pd.get_dummies(data = full, columns = ['driverId','constructorId'], drop_first = True)
            #temp.replace(np.nan,0,inplace = True)
            #full = temp.copy(deep = True)
            #for col in temp.columns:
            #    full[col] = temp[col]
            #full = hot_encode_top(['status_clean'],full)
            #full = hot_encode_top(['status_clean','driverId', 'constructorId'],full)
            full_races = full_races.append(full)
            
        else:
            counter += 1
    #full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
    full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
    full_races = full_races.replace(np.nan,0)   
    y = np.array(full_races['total_milliseconds']/60000)
    cols = [col for col in full_races.columns if col not in nogo_columns] 
    races = full_races[cols]  
    X = np.array(races)
    A = ridge.predict(X)
    mse = mean_squared_error(A, y)
    mae = MAE(A,y)
    print('mse:', mse)
    print('mae:',mae)
    print('alpha:',alpha)
    print('====================')
    
    #hinzufügen der 'wichtigen' spalten zur späteren auswertung der vorhersagen auf dem trainingsdatensatz
    full_races['prediction'] = A
    full_races['total_milliseconds'] = y
    full_races['total_minutes'] = y/60000

---shuffle races---
---training---
mse: 17.524114627639907
mae: 2.4794034739777855
---testing---
mse: 48.451973188113044
mae: 3.3339561471586037
alpha: 0.682
---training---
mse: 17.524601344696265
mae: 2.4794621603785436
---testing---
mse: 48.45229221232572
mae: 3.3341171915337933
alpha: 0.69
---training---
mse: 17.522854299266765
mae: 2.479252482218274
---testing---
mse: 48.451147846836555
mae: 3.3335379463389883
alpha: 0.662
---training---
mse: 17.524960128023157
mae: 2.4795055414106444
---testing---
mse: 48.452527490550516
mae: 3.3342357401846257
alpha: 0.696
---training---
mse: 17.522918844448878
mae: 2.4792602335353586
---testing---
mse: 48.45119008964011
mae: 3.333559404816531
alpha: 0.663
---training---
mse: 17.525019413744843
mae: 2.4795127033342
---testing---
mse: 48.452566377325674
mae: 3.3342553157176766
alpha: 0.697
---training---
mse: 17.522265875637306
mae: 2.4791822822368457
---testing---
mse: 48.45076286272057
mae: 3.333342117925475
alpha: 0.653
---training---
mse: 17.5

In [17]:
treffer = 0
nahdran = 0
weiterweg = 0
for raceId in full_races.raceId.unique():
    
    race = full_races.where(full_races.raceId == raceId).dropna(how = 'all')
    winner = race.where(race.podium_position == 1).dropna(how = 'all')['driver_fullname'].tolist()[0]
    second = race.where(race.podium_position == 2).dropna(how = 'all')['driver_fullname'].tolist()[0]
    third = race.where(race.podium_position == 3).dropna(how = 'all')['driver_fullname'].tolist()[0]
    
    fourth = race.where(race.podium_position == 4).dropna(how = 'all')['driver_fullname'].tolist()[0]
    fifth = race.where(race.podium_position == 5).dropna(how = 'all')['driver_fullname'].tolist()[0]
    sixth = race.where(race.podium_position == 6).dropna(how = 'all')['driver_fullname'].tolist()[0]
    seventh = race.where(race.podium_position == 7).dropna(how = 'all')['driver_fullname'].tolist()[0]
    
    pred_winner = race.where(race.prediction == min(race.prediction)).dropna(how = 'all')['driver_fullname'].tolist()[0]
    
    podium = [second,third]
    seven = [fourth,fifth,sixth,seventh]
    if pred_winner == winner:
        treffer += 1
    if pred_winner in podium:
        nahdran += 1
    if pred_winner in seven:
        weiterweg += 1
    print('winner, second, third:', winner,',' ,second,',' , third)
    print('predicted winner:', pred_winner,'\n')
    
print('genaue treffer:', treffer)
print('nah dran:', nahdran)
print('weiter weg:', weiterweg)

winner, second, third: Sebastian Vettel , Fernando Alonso , Mark Webber
predicted winner: Mark Webber 

winner, second, third: Nico Rosberg , Lewis Hamilton , Valtteri Bottas
predicted winner: Sebastian Vettel 

winner, second, third: Lewis Hamilton , Sebastian Vettel , Mark Webber
predicted winner: Sebastian Vettel 

winner, second, third: Jenson Button , Fernando Alonso , Felipe Massa
predicted winner: Lewis Hamilton 

winner, second, third: Lewis Hamilton , Nico Rosberg , Daniel Ricciardo
predicted winner: Lewis Hamilton 

winner, second, third: Jenson Button , Sebastian Vettel , Kimi RÃŒ_ikkÃŒÂ¦nen
predicted winner: Jenson Button 

winner, second, third: Lewis Hamilton , Kimi RÃŒ_ikkÃŒÂ¦nen , Romain Grosjean
predicted winner: Michael Schumacher 

winner, second, third: Lewis Hamilton , Nico Rosberg , Daniel Ricciardo
predicted winner: Lewis Hamilton 

winner, second, third: Sebastian Vettel , Jenson Button , Fernando Alonso
predicted winner: Bruno Senna 

winner, second, third: Lew

### Lasso Regression

In [18]:
alphas = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
print('---shuffle races---')

keys = list(sliced_races.keys())
random.shuffle(keys)
sliced_shuffled = {}
for key in keys:
    sliced_shuffled[key] = sliced_races[key]


#columns werden definiert, die erst einmal nicht beachtet werden sollen
nogo_columns = [#'grid',
                #'race_completion',
                'lap_position','circuitId','lap_number',
                'podium_position', 'raceId',
                'grandprix_name', 'driver_fullname',
               'constructor_name', 'total_laps',
               'status_clean', 'constructorId',
                'total_milliseconds', 'driverId'
               'lap_in_milliseconds','year', 'stop_binary']

#testen verschiedener Generalisierungsstufen für das Regressionsmodell
for alpha in alphas:

    #definieren eines lasso Regressionsmodells
    lasso = Lasso(alpha = alpha)
    
    train_break = 120
    category_count = 13
    counter = 0
    
    print('---training---')
    full_races = pd.DataFrame()
    for raceId, race in sliced_shuffled.items():
        full = pd.DataFrame(columns = race.columns)
        if counter == train_break:
            break
        else:
            last_lap = max(race.lap_number.tolist())
            for did in race.driverId.unique():
                stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
                race['stop_sum'] = stop_sum
                driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
                full = full.append(driver_df)
            
            full_races = full_races.append(full)
    #full_races = full_races.replace(np.nan,0)   
    #full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
    full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
    cols = [col for col in full_races.columns if col not in nogo_columns] 
    
    #umrechnen in minuten
    y = np.array(full_races['total_milliseconds']/60000)
    races = full_races[cols]  
    #races = races.dropna(how = 'any')
    X = np.array(races)
    lasso = lasso.fit(X, y)
    A = lasso.predict(X)
    mse = mean_squared_error(A, y)
    mae = MAE(A,y)
    print('mse:', mse)
    print('mae:',mae)
    
    counter = 0
    
    full_races = pd.DataFrame()
    print('---testing---')
    for raceId, race in sliced_shuffled.items():
        full = pd.DataFrame(columns = race.columns)
        if counter == train_break:
            last_lap = max(race.lap_number.tolist())
            for did in race.driverId.unique():
                stop_sum = np.sum(race.where(race.driverId == did).dropna(how='all')['stop_binary'])
                race['stop_sum'] = stop_sum
                driver_df = race.where(np.logical_and(race.lap_number == last_lap, race.driverId == did)).dropna(how = 'all')
                full = full.append(driver_df)
                
            #temp = pd.get_dummies(data = full, columns = ['driverId','constructorId'], drop_first = True)
            #temp.replace(np.nan,0,inplace = True)
            #full = temp.copy(deep = True)
            #for col in temp.columns:
            #    full[col] = temp[col]
            #full = hot_encode_top(['status_clean'],full)
            #full = hot_encode_top(['status_clean','driverId', 'constructorId'],full)
            full_races = full_races.append(full)
            
        else:
            counter += 1
    #full_races = hot_encode_top(['status_clean','driverId', 'constructorId'],full_races,15)
    full_races = hot_encode_top(['status_clean','driverId'],full_races,category_count)
    full_races = full_races.replace(np.nan,0)   
    y = np.array(full_races['total_milliseconds']/60000)
    cols = [col for col in full_races.columns if col not in nogo_columns] 
    races = full_races[cols]  
    X = np.array(races)
    A = lasso.predict(X)
    mse = mean_squared_error(A, y)
    mae = MAE(A,y)
    print('mse:', mse)
    print('mae:',mae)
    print('alpha:',alpha)
    print('====================')
    
    #hinzufügen der 'wichtigen' spalten zur späteren auswertung der vorhersagen auf dem trainingsdatensatz
    full_races['prediction'] = A
    full_races['total_milliseconds'] = y
    full_races['total_minutes'] = y/60000

---shuffle races---
---training---


  positive)
  positive)


mse: 17.299965468326885
mae: 2.4759877864118827
---testing---
mse: 14.827476054289448
mae: 2.73470304443648
alpha: 0
---training---
mse: 17.8846096882247
mae: 2.4978737299185285
---testing---
mse: 15.52465420092796
mae: 2.7609354783144138
alpha: 0.1
---training---
mse: 18.360667543129985
mae: 2.515535834264905
---testing---
mse: 16.26206361865759
mae: 2.7776880142138514
alpha: 0.2
---training---
mse: 19.143986913412625
mae: 2.5524060365249275
---testing---
mse: 17.238558385765582
mae: 2.811279375569477
alpha: 0.3
---training---
mse: 20.231202845084375
mae: 2.6015110195439246
---testing---
mse: 18.47981842020273
mae: 2.85675592969557
alpha: 0.4
---training---
mse: 21.622453695948877
mae: 2.6624840885124703
---testing---
mse: 19.98222329317425
mae: 2.9188235014857593
alpha: 0.5
---training---
mse: 23.322871005487816
mae: 2.7376346918667194
---testing---
mse: 21.753544645280343
mae: 2.9916066708523648
alpha: 0.6
---training---
mse: 25.3324552939859
mae: 2.823248437510403
---testing---
mse

In [19]:
treffer = 0
nahdran = 0
weiterweg = 0
for raceId in full_races.raceId.unique():
    
    race = full_races.where(full_races.raceId == raceId).dropna(how = 'all')
    winner = race.where(race.podium_position == 1).dropna(how = 'all')['driver_fullname'].tolist()[0]
    second = race.where(race.podium_position == 2).dropna(how = 'all')['driver_fullname'].tolist()[0]
    third = race.where(race.podium_position == 3).dropna(how = 'all')['driver_fullname'].tolist()[0]
    
    fourth = race.where(race.podium_position == 4).dropna(how = 'all')['driver_fullname'].tolist()[0]
    fifth = race.where(race.podium_position == 5).dropna(how = 'all')['driver_fullname'].tolist()[0]
    sixth = race.where(race.podium_position == 6).dropna(how = 'all')['driver_fullname'].tolist()[0]
    seventh = race.where(race.podium_position == 7).dropna(how = 'all')['driver_fullname'].tolist()[0]
    
    pred_winner = race.where(race.prediction == min(race.prediction)).dropna(how = 'all')['driver_fullname'].tolist()[0]
    
    podium = [second,third]
    seven = [fourth,fifth,sixth,seventh]
    if pred_winner == winner:
        treffer += 1
    if pred_winner in podium:
        nahdran += 1
    if pred_winner in seven:
        weiterweg += 1
    print('winner, second, third:', winner,',' ,second,',' , third)
    print('predicted winner:', pred_winner,'\n')
    
print('genaue treffer:', treffer)
print('nah dran:', nahdran)
print('weiter weg:', weiterweg)

winner, second, third: Nico Rosberg , Lewis Hamilton , Felipe Massa
predicted winner: Nico Rosberg 

winner, second, third: Lewis Hamilton , Valtteri Bottas , Daniel Ricciardo
predicted winner: Lewis Hamilton 

winner, second, third: Nico Rosberg , Lewis Hamilton , Felipe Massa
predicted winner: Nico Rosberg 

winner, second, third: Sebastian Vettel , Lewis Hamilton , Valtteri Bottas
predicted winner: Lewis Hamilton 

winner, second, third: Sebastian Vettel , Fernando Alonso , Kimi RÃŒ_ikkÃŒÂ¦nen
predicted winner: Sebastian Vettel 

winner, second, third: Sebastian Vettel , Daniil Kvyat , Daniel Ricciardo
predicted winner: Carlos Sainz 

winner, second, third: Jenson Button , Fernando Alonso , Felipe Massa
predicted winner: Lewis Hamilton 

winner, second, third: Nico Rosberg , Lewis Hamilton , Daniel Ricciardo
predicted winner: Nico Rosberg 

winner, second, third: Lewis Hamilton , Nico Rosberg , Max Verstappen
predicted winner: Sergio PÃŒÂ©rez 

winner, second, third: Kimi RÃŒ_ikkÃŒÂ