# Machine Learning Model Serving

In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
import time
# Wall-clock reference point; the final cells subtract it from `end` to report the total run time.
start = time.time()

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [4]:
def save_obj(obj, name ):
    """Pickle ``obj`` into the file ``web/<name>`` (relative to the working directory).

    The file is opened in binary write mode, so an existing file of the same
    name is overwritten. The highest pickle protocol available is used.
    """
    target_path = 'web/' + name
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, pickle.HIGHEST_PROTOCOL)

In [5]:
# Load the filtered race data used for the driver/constructor DNF statistics below.
data = pd.read_csv('./data_f1/data_filtered.csv')

In [6]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,year,date,fp1_date,fp1_time,fp2_date,fp2_time,fp3_date,fp3_time,quali_date,quali_time,...,constructor,constructor_nationality,GP_name,country,driver,age_at_gp_in_days,driver_home,constructor_home,driver_dnf,constructor_dnf
0,2011,2011-03-27,\N,\N,\N,\N,\N,\N,\N,\N,...,Alpine F1,Fre,Albert Park Grand Prix Circuit,Aus,Nick Heidfeld,12374,0,0,0,1
1,2010,2010-03-28,\N,\N,\N,\N,\N,\N,\N,\N,...,Alpine F1,Fre,Albert Park Grand Prix Circuit,Aus,Robert Kubica,9242,0,0,0,0
2,2017,2017-03-26,\N,\N,\N,\N,\N,\N,\N,\N,...,Alpine F1,Fre,Albert Park Grand Prix Circuit,Aus,Nico Hülkenberg,10812,0,0,0,1
3,2018,2018-03-25,\N,\N,\N,\N,\N,\N,\N,\N,...,Alpine F1,Fre,Albert Park Grand Prix Circuit,Aus,Nico Hülkenberg,11176,0,0,0,0
4,2019,2019-03-17,\N,\N,\N,\N,\N,\N,\N,\N,...,Alpine F1,Fre,Albert Park Grand Prix Circuit,Aus,Nico Hülkenberg,11533,0,0,0,1


In [7]:
# Per-driver finishing confidence = 1 - (DNF count / races entered).
# The 'driver_dnf' column is selected *before* aggregating so pandas only
# touches that one column instead of summing/counting every column of the
# frame (dates, names, ...), which is slow and fragile on non-numeric data.
driver_dnf_groups = data.groupby('driver')['driver_dnf']
dnf_by_driver = driver_dnf_groups.sum()
# count() ignores missing values, so this is the number of rows per driver
# that actually carry a 'driver_dnf' value.
driver_race_entered = driver_dnf_groups.count()
driver_dnf_ratio = (dnf_by_driver/driver_race_entered)
driver_confidence = 1-driver_dnf_ratio
# Plain {driver name: confidence} mapping used by the following cells.
driver_confidence_dict = driver_confidence.to_dict()

In [8]:
# Check the dtype of the confidence Series.
print(driver_confidence.dtypes)

float64


In [9]:
# Display the per-driver confidence values (indexed by driver name).
driver_confidence

driver
Adrian Sutil          0.868421
Alexander Albon       0.940000
Alexander Rossi       1.000000
André Lotterer        1.000000
Antonio Giovinazzi    0.918033
                        ...   
Valtteri Bottas       0.958333
Vitaly Petrov         0.879310
Vitantonio Liuzzi     0.783784
Will Stevens          1.000000
Yuki Tsunoda          0.942857
Name: driver_dnf, Length: 73, dtype: float64

In [10]:
# Build the lookup that is pickled for the web app: every driver maps to a
# one-element NumPy array holding the (possibly corrected) confidence value.
driver_confidence_dict_str = {}
for key, value in driver_confidence_dict.items():
    # A confidence of exactly 1.0 is overridden with 0.10
    # (the original author's "correct for new drivers" rule).
    value = 0.10 if value == 1.0 else value
    print ("%s: %s" % (key, value))
    driver_confidence_dict_str[key] = np.array([value])

save_obj(driver_confidence_dict_str, 'driver_dict' )

Adrian Sutil: 0.868421052631579
Alexander Albon: 0.94
Alexander Rossi: 0.1
André Lotterer: 0.1
Antonio Giovinazzi: 0.9180327868852459
Brendon Hartley: 0.88
Bruno Senna: 0.8260869565217391
Carlos Sainz: 0.9084967320261438
Charles Leclerc: 0.8829787234042553
Charles Pic: 0.9487179487179487
Christian Klien: 0.1
Daniel Ricciardo: 0.954954954954955
Daniil Kvyat: 0.9196428571428571
Esteban Gutiérrez: 0.847457627118644
Esteban Ocon: 0.9504950495049505
Felipe Massa: 0.9161290322580645
Felipe Nasr: 0.925
Fernando Alonso: 0.9326923076923077
George Russell: 0.9178082191780822
Giedo van der Garde: 0.8421052631578947
Guanyu Zhou: 0.9230769230769231
Heikki Kovalainen: 0.95
Jack Aitken: 0.1
Jaime Alguersuari: 0.9210526315789473
Jarno Trulli: 0.972972972972973
Jean-Éric Vergne: 0.9137931034482758
Jenson Button: 0.9407407407407408
Jolyon Palmer: 0.8571428571428572
Jules Bianchi: 0.9117647058823529
Jérôme d'Ambrosio: 0.95
Kamui Kobayashi: 0.8356164383561644
Karun Chandhok: 0.8181818181818181
Kevin Magnu

In [11]:
# Per-constructor reliability = 1 - (DNF count / entries).
# The 'constructor_dnf' column is selected *before* aggregating so pandas only
# processes that column rather than summing/counting every column of the
# frame, which is slow and fragile on non-numeric data.
constructor_dnf_groups = data.groupby('constructor')['constructor_dnf']
dnf_by_constructor = constructor_dnf_groups.sum()
# count() ignores missing values in 'constructor_dnf'.
constructor_race_entered = constructor_dnf_groups.count()
constructor_dnf_ratio = (dnf_by_constructor/constructor_race_entered)
constructor_reliability = 1-constructor_dnf_ratio
# Plain {constructor name: reliability} mapping used by the following cells.
constructor_reliability_dict = constructor_reliability.to_dict()

In [12]:
# Confirm the result is a pandas Series.
type(constructor_reliability)

pandas.core.series.Series

In [13]:
# Display the per-constructor reliability values (indexed by constructor name).
constructor_reliability

constructor
Alfa Romeo        0.417339
AlphaTauri        0.474849
Alpine F1         0.542339
Aston Martin      0.611222
Caterham          0.178571
Ferrari           0.876000
HRT               0.156522
Haas F1 Team      0.353160
Lotus             0.078947
Manor Marussia    0.121212
Marussia          0.166667
McLaren           0.617706
Mercedes          0.891784
Red Bull          0.837349
Virgin            0.092105
Williams          0.482966
Name: constructor_dnf, dtype: float64

In [14]:
# Wrap every constructor's reliability in a one-element NumPy array and
# pickle the resulting lookup for the web app.
constructor_reliability_dict_str = {
    team: np.array([score])
    for team, score in constructor_reliability_dict.items()
}

save_obj(constructor_reliability_dict_str, 'constructor_dict' )

## Analysis of F1 Team and Driver Probabilities After the Last Race

In [15]:
# Load the 2021-season subset of the race data.
data2021 = pd.read_csv('./data_f1/data_filtered_2021.csv')

In [16]:
# Order the rows by the 'date' column so that `races` lists the circuits in the order they appear by date.
data2021=data2021.sort_values(by='date')
# unique() keeps first-appearance order, so the list follows the sorted rows.
races = data2021['GP_name'].unique().tolist()
print(races)

['Bahrain International Circuit', 'Autodromo Enzo e Dino Ferrari', 'Autódromo Internacional do Algarve', 'Circuit de Barcelona-Catalunya', 'Circuit de Monaco', 'Baku City Circuit', 'Circuit Paul Ricard', 'Red Bull Ring', 'Silverstone Circuit', 'Hungaroring', 'Circuit de Spa-Francorchamps', 'Circuit Park Zandvoort', 'Autodromo Nazionale di Monza', 'Sochi Autodrom', 'Istanbul Park', 'Circuit of the Americas', 'Autódromo Hermanos Rodríguez', 'Autódromo José Carlos Pace', 'Losail International Circuit', 'Jeddah Corniche Circuit', 'Yas Marina Circuit']


In [17]:
# Distinct driver names in the 2021 data, in order of first appearance.
drivers = data2021['driver'].unique().tolist()
print(drivers)

['George Russell', 'Pierre Gasly', 'Nikita Mazepin', 'Mick Schumacher', 'Antonio Giovinazzi', 'Kimi Räikkönen', 'Lance Stroll', 'Sebastian Vettel', 'Max Verstappen', 'Sergio Pérez', 'Yuki Tsunoda', 'Lewis Hamilton', 'Esteban Ocon', 'Fernando Alonso', 'Charles Leclerc', 'Carlos Sainz', 'Lando Norris', 'Daniel Ricciardo', 'Nicholas Latifi', 'Valtteri Bottas', 'Robert Kubica']


In [18]:
# Distinct constructor names in the 2021 data, in order of first appearance.
constructors = data2021['constructor'].unique().tolist()
print(constructors)

['Williams', 'AlphaTauri', 'Haas F1 Team', 'Alfa Romeo', 'Aston Martin', 'Red Bull', 'Mercedes', 'Alpine F1', 'Ferrari', 'McLaren']


In [19]:
# Collect one record per (race, driver, constructor, points) combination found
# in the 2021 data. Column lists are filled in lock-step and later turned
# into a DataFrame.
results = {'race': [],
          'driver':[],
          'constructor':[],
          'points':[]}
for race in races:
    # Filter the race once instead of re-evaluating the race mask for every driver.
    race_rows = data2021.loc[data2021['GP_name']==race]
    for driver in drivers:
        # One filtered view per (race, driver) serves both lookups below;
        # previously the same boolean mask was computed twice.
        entry_rows = race_rows.loc[race_rows['driver']==driver]
        clst = entry_rows['constructor'].unique().tolist()
        plst = entry_rows['points'].unique().tolist()
        # Drivers without an entry in this race yield empty lists and add nothing.
        # Every distinct constructor is paired with every distinct points value.
        for c in clst:
            for p in plst:
                results['race'].append(race)
                results['driver'].append(driver)
                results['constructor'].append(c)
                results['points'].append(p)


In [20]:
# Turn the collected column lists into a DataFrame (columns: race, driver, constructor, points).
df2021= pd.DataFrame(results)
print (df2021)

                              race              driver   constructor  points
0    Bahrain International Circuit      George Russell      Williams     0.0
1    Bahrain International Circuit        Pierre Gasly    AlphaTauri     0.0
2    Bahrain International Circuit      Nikita Mazepin  Haas F1 Team     0.0
3    Bahrain International Circuit     Mick Schumacher  Haas F1 Team     0.0
4    Bahrain International Circuit  Antonio Giovinazzi    Alfa Romeo     0.0
..                             ...                 ...           ...     ...
426             Yas Marina Circuit        Carlos Sainz       Ferrari    15.0
427             Yas Marina Circuit        Lando Norris       McLaren     6.0
428             Yas Marina Circuit    Daniel Ricciardo       McLaren     0.0
429             Yas Marina Circuit     Nicholas Latifi      Williams     0.0
430             Yas Marina Circuit     Valtteri Bottas      Mercedes     8.0

[431 rows x 4 columns]


In [21]:
# Number of distinct circuits found in the 2021 data.
print(f'Number of Races done: {len(races)}')

Number of Races done: 21


In [22]:
# Share of the attainable points each driver scored over the season.
# 'points' is selected before aggregating so the sum is not also computed
# over the text columns of df2021 (race, constructor).
pts_by_driver = df2021.groupby('driver')['points'].sum()
# Number of result rows per driver. NOTE(review): not used by any later visible cell.
driver_race = df2021.groupby('driver')['race'].count()
# Normalise by races * 25.0; 25 is presumably the points for a race win — TODO confirm.
driver_points_ratio = (pts_by_driver/len(races)/25.0)
driver_points_dict = driver_points_ratio.to_dict()

In [23]:
# Display the {driver: points ratio} mapping.
driver_points_dict

{'Antonio Giovinazzi': 0.005714285714285714,
 'Carlos Sainz': 0.31142857142857144,
 'Charles Leclerc': 0.3028571428571428,
 'Daniel Ricciardo': 0.21714285714285717,
 'Esteban Ocon': 0.14095238095238094,
 'Fernando Alonso': 0.15428571428571428,
 'George Russell': 0.030476190476190476,
 'Kimi Räikkönen': 0.019047619047619046,
 'Lance Stroll': 0.06476190476190476,
 'Lando Norris': 0.30476190476190473,
 'Lewis Hamilton': 0.7342857142857143,
 'Max Verstappen': 0.74,
 'Mick Schumacher': 0.0,
 'Nicholas Latifi': 0.013333333333333332,
 'Nikita Mazepin': 0.0,
 'Pierre Gasly': 0.20952380952380953,
 'Robert Kubica': 0.0,
 'Sebastian Vettel': 0.0819047619047619,
 'Sergio Pérez': 0.3619047619047619,
 'Valtteri Bottas': 0.41714285714285715,
 'Yuki Tsunoda': 0.06095238095238095}

In [24]:
# Share of the attainable points each constructor scored over the season.
# 'points' is selected before aggregating so the sum is not also computed
# over the text columns of df2021 (race, driver).
pts_by_constructor = df2021.groupby('constructor')['points'].sum()
# Number of result rows per constructor. NOTE(review): not used by any later visible cell.
constructor_race = df2021.groupby('constructor')['race'].count()
# Normalise by races * (25.0 + 18.0); presumably the points of a 1st + 2nd
# place finish for the team's two cars — TODO confirm.
constructor_pts_ratio = (pts_by_constructor/len(races)/(25.0+18.0))
constructor_points_dict = constructor_pts_ratio.to_dict()

In [25]:
# Display the {constructor: points ratio} mapping.
constructor_points_dict

{'Alfa Romeo': 0.014396456256921373,
 'AlphaTauri': 0.15725359911406422,
 'Alpine F1': 0.17165005537098563,
 'Aston Martin': 0.08527131782945736,
 'Ferrari': 0.35714285714285715,
 'Haas F1 Team': 0.0,
 'McLaren': 0.3034330011074197,
 'Mercedes': 0.6694352159468439,
 'Red Bull': 0.6406423034330011,
 'Williams': 0.025470653377630124}

In [26]:
# Same lookup as driver_points_dict, with each ratio wrapped in a one-element
# NumPy array (the form that is pickled for the web app).
driver_pred_dict_str = {
    name: np.array([ratio]) for name, ratio in driver_points_dict.items()
}

In [27]:
# Pickle the driver lookup to web/driver_pred_dict.
save_obj(driver_pred_dict_str, 'driver_pred_dict' )

In [28]:
# Same lookup as constructor_points_dict, with each ratio wrapped in a
# one-element NumPy array (the form that is pickled for the web app).
constructor_pred_dict_str = {
    name: np.array([ratio]) for name, ratio in constructor_points_dict.items()
}

In [29]:
# Pickle the constructor lookup to web/constructor_pred_dict.
save_obj(constructor_pred_dict_str, 'constructor_pred_dict' )

## Generate Qualifying dataset for Predictor

In [30]:
# Gather the distinct qualifying positions recorded for every (race, driver)
# pair of the 2021 data into three parallel column lists.
qualif = {'race': [], 'driver': [], 'quali_pos': []}
for race in races:
    for driver in drivers:
        is_entry = (data2021['GP_name']==race) & (data2021['driver']==driver)
        qlst = data2021.loc[is_entry, 'quali_pos'].unique().tolist()
        # One output row per distinct position; pairs with no entry add nothing.
        qualif['race'].extend([race] * len(qlst))
        qualif['driver'].extend([driver] * len(qlst))
        qualif['quali_pos'].extend(qlst)


In [31]:
# Turn the collected column lists into a DataFrame (columns: race, driver, quali_pos).
qualif2021= pd.DataFrame(qualif)
print (qualif2021)

                              race              driver  quali_pos
0    Bahrain International Circuit      George Russell         15
1    Bahrain International Circuit        Pierre Gasly          5
2    Bahrain International Circuit      Nikita Mazepin         19
3    Bahrain International Circuit     Mick Schumacher         18
4    Bahrain International Circuit  Antonio Giovinazzi         12
..                             ...                 ...        ...
425             Yas Marina Circuit        Carlos Sainz          5
426             Yas Marina Circuit        Lando Norris          3
427             Yas Marina Circuit    Daniel Ricciardo         10
428             Yas Marina Circuit     Nicholas Latifi         16
429             Yas Marina Circuit     Valtteri Bottas          6

[430 rows x 3 columns]


In [32]:
# Write the qualifying table to CSV without the row index.
qualif2021.to_csv('./data_f1/qualif_filtered.csv', index = False)

In [33]:
# The final entry of `races` (built from the date-sorted 2021 rows) is the last race.
last_race = races[-1]
print(last_race)

Yas Marina Circuit


In [34]:
# Keep only the qualifying rows of the last race.
qualif_last= qualif2021[qualif2021['race']==last_race]
print (qualif_last)

                   race              driver  quali_pos
410  Yas Marina Circuit      George Russell         17
411  Yas Marina Circuit        Pierre Gasly         12
412  Yas Marina Circuit      Nikita Mazepin         20
413  Yas Marina Circuit     Mick Schumacher         19
414  Yas Marina Circuit  Antonio Giovinazzi         14
415  Yas Marina Circuit      Kimi Räikkönen         18
416  Yas Marina Circuit        Lance Stroll         13
417  Yas Marina Circuit    Sebastian Vettel         15
418  Yas Marina Circuit      Max Verstappen          1
419  Yas Marina Circuit        Sergio Pérez          4
420  Yas Marina Circuit        Yuki Tsunoda          8
421  Yas Marina Circuit      Lewis Hamilton          2
422  Yas Marina Circuit        Esteban Ocon          9
423  Yas Marina Circuit     Fernando Alonso         11
424  Yas Marina Circuit     Charles Leclerc          7
425  Yas Marina Circuit        Carlos Sainz          5
426  Yas Marina Circuit        Lando Norris          3
427  Yas M

In [35]:
# Write the last-race qualifying table to CSV without the row index.
qualif_last.to_csv('./data_f1/qualif_lastrace.csv', index = False)

## Generate Machine Learning Model: Random Forest 

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

def save_model(model, model_filepath):
    """
    Export a model as a pickle file so it can be loaded later.

    Parameters
    ----------
    model : object
        Any picklable object (in this notebook, the fitted estimator).
    model_filepath : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file at that path is overwritten.

    Returns
    -------
    None
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises; the previous bare open() left the file handle dangling.
    with open(model_filepath, 'wb') as model_file:
        pickle.dump(model, model_file)

In [37]:
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import cross_val_score,StratifiedKFold,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,precision_score,f1_score,recall_score
from sklearn.neural_network import MLPClassifier, MLPRegressor
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names. Prefer the new name when the
# installed Matplotlib provides it and fall back to the legacy one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')


In [38]:
def position_index(x):
    """Map a finishing position to one of three class labels.

    Returns 1 when ``x`` is below 4, 3 when ``x`` is above 10, and 2 in every
    other case (including values for which both comparisons are false, such
    as NaN).
    """
    if x < 4:
        return 1
    return 3 if x > 10 else 2

In [39]:
# Candidate values for the randomized hyper-parameter search.
# 10 evenly spaced forest sizes from 200 to 2000 trees.
n_estimators = [int(v) for v in np.linspace(start=200,stop=2000,num=10)]
# 'auto' was only an alias of 'sqrt' for random-forest classifiers and was
# removed in scikit-learn 1.3, where it makes the affected candidates fail.
# Only values accepted by both old and new releases are listed.
max_features = ['sqrt','log2']
# 11 evenly spaced depth limits from 10 to 110.
max_depth = [int(v) for v in np.linspace(10,110,num=11)]
min_samples_split = [2,5,8,10,15,20]
min_samples_leaf = [1,2,4,6,8,10]
bootstrap = [True,False]

# Keys are the estimator's constructor parameter names, as expected by
# RandomizedSearchCV's `param_distributions`.
random_parms = {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'bootstrap':bootstrap
                }

In [40]:
# Dataset generated in previous phase
# Dataset generated in previous phase
# NOTE(review): this rebinds `data`, which earlier cells used for data_filtered.csv.
data = pd.read_csv('./data_f1/cleaned_data.csv')

In [41]:
# Keep only the columns needed for modelling: features, the 'position' target and the two 'active_*' filter flags.
x = data[['GP_name','quali_pos','constructor','driver','position','driver_confidence','constructor_reliability','active_driver','active_constructor']]

In [42]:
# Keep only the rows where both the constructor and the driver are flagged as active (== 1).
is_active = (x['active_constructor']==1) & (x['active_driver']==1)
x = x[is_active]

In [43]:
# NOTE(review): `sc` is created but not used by any visible cell.
sc  = StandardScaler()
le = LabelEncoder()
# Work on an explicit copy so the column assignments below do not write into
# a filtered slice of `data`.
x = x.copy()
# Replace each categorical column by integer codes. GP_name used to be encoded
# twice; the second pass mapped the codes onto themselves and is dropped.
# NOTE(review): the single encoder is re-fitted per column, so the
# name -> code mappings are not retained; persist one encoder per column if
# the serving side has to encode raw names — confirm against the web app.
for col in ['GP_name', 'constructor', 'driver']:
    x[col] = le.fit_transform(x[col])
# `columns=` replaces the positional axis argument (`drop(..., 1)`), which
# pandas 2.0 no longer accepts.
X = x.drop(columns=['position','active_driver','active_constructor'])
# Target: finishing position bucketed into the three classes of position_index.
y = x['position'].apply(position_index)

In [None]:
# Randomized hyper-parameter search: 100 sampled candidates, 10-fold CV each.
# Both the forest and the candidate sampling are seeded so that a
# Restart & Run All reproduces the same best parameters.
rf_rand = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(estimator=rf_rand,param_distributions=random_parms,n_iter=100,cv=10,verbose=2,n_jobs=-1,random_state=42)
rf_random.fit(X,y)
# Display the best parameter combination found.
rf_random.best_params_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


In [None]:
# Final model configuration, seeded so the trained (and pickled) forest is reproducible.
# NOTE(review): these values are hard-coded rather than read from
# rf_random.best_params_ — confirm they are the intended ones.
rf = RandomForestClassifier(n_estimators=100,min_samples_split=20,min_samples_leaf=1,max_features='sqrt',max_depth=90,bootstrap=True,random_state=42)
kf = StratifiedKFold(n_splits=10,random_state=None,shuffle=False)
# The previous loop reassigned the train/test variables on every fold, so only
# the last of the 10 folds was ever used. Select that fold explicitly: it
# yields the same hold-out split without the misleading loop.
train_index, test_index = list(kf.split(X, y))[-1]
X_train,X_test = X.iloc[train_index],X.iloc[test_index]
y_train,y_test = y.iloc[train_index],y.iloc[test_index]

In [None]:
# Train on the training partition and predict the held-out partition.
rf.fit(X_train,y_train)
y_pred_rf = rf.predict(X_test)
cnf_mat_rf = confusion_matrix(y_test,y_pred_rf)
# Divide by the total count so the matrix holds fractions of all test samples.
cnf_mat_rf = cnf_mat_rf/cnf_mat_rf.sum()

In [None]:
# Persist the fitted forest to ./models/RandomForestClassifier.pkl.
model_filepath='./models/{}.pkl'.format('RandomForestClassifier')
print('Saving model ...\n    MODEL: {}'.format(model_filepath))
save_model(rf, model_filepath)

In [None]:
# Show the feature columns (and their order) the model was trained on.
print(X.columns)

In [None]:
# Impurity-based feature importances of the fitted forest, one value per column of X.
importances = rf.feature_importances_
importances

# Analysis of Machine Learning Model Serving

In [None]:
import pickle

model_filepath='./models/{}.pkl'.format('RandomForestClassifier')

# Load the pickled model from disk (the same path the training section writes to).
# Unpickling can execute arbitrary code, so only load files you created yourself.
print('Loading model ...\n    MODEL: {}'.format(model_filepath))
# The context manager closes the file handle once loading finishes or fails;
# the previous bare open() never closed it.
with open(model_filepath, 'rb') as model_file:
    loaded_model = pickle.load(model_file)


In [None]:
# Feature importances read back from the reloaded model.
importances = loaded_model.feature_importances_
importances

In [None]:
# Labels for the importance plot and the tree diagrams.
# NOTE(review): hard-coded; must stay in the same order as the columns of X used for training.
feature_names=['GP_name', 'quali_pos', 'constructor', 'driver', 'driver_confidence',
       'constructor_reliability']

In [None]:
import time
import numpy as np

# Spread of each feature's importance across the individual trees of the
# forest (used as error bars in the plot), timed for reference.
start_time = time.time()
per_tree_importances = [
    estimator.feature_importances_ for estimator in loaded_model.estimators_
]
std = np.std(per_tree_importances, axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Pair each importance with its feature label.
forest_importances = pd.Series(importances, index=feature_names)

# Bar chart of the importances with the per-tree standard deviation as error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Number of individual trees in the loaded forest.
len(loaded_model.estimators_)

### Plot of the first decision tree:

In [None]:
from sklearn import tree
# Draw the first tree of the forest on a large canvas; the return value is bound to `_` so it is not echoed.
plt.figure(figsize=(20,20))
_ = tree.plot_tree(loaded_model.estimators_[0], feature_names=feature_names, filled=True)

Let’s check the depth of the first tree from the Random Forest:

In [None]:
# Depth of the first tree in the forest.
loaded_model.estimators_[0].tree_.max_depth

Our first tree has max_depth=14, and the other trees have a similar depth. Change the index in `estimators_[...]` to check the depth of a different tree.

### Plot of the second decision tree:

In [None]:
# Draw the second tree of the forest; relies on `tree` (sklearn.tree) imported in an earlier cell.
plt.figure(figsize=(20,20))
_ = tree.plot_tree(loaded_model.estimators_[1], feature_names=feature_names, filled=True)

In [None]:
# Side-by-side thumbnails of the first five trees, one subplot per estimator.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(10, 2), dpi=900)
for index, panel in enumerate(axes):
    tree.plot_tree(
        loaded_model.estimators_[index],
        feature_names=feature_names,
        filled=True,
        ax=panel,
    )
    panel.set_title('Estimator: ' + str(index), fontsize=11)

In [None]:
# Wall-clock end point; paired with `start` from the top of the notebook.
end = time.time()

In [None]:
import datetime
# Total notebook run time rendered as a timedelta string.
str(datetime.timedelta(seconds=(end - start)))

In [None]:
# Total notebook run time in raw seconds.
print(str(end - start)+" seconds")

## Let's See The Results

To see the results of the lab, you'll need to start the web server using Terminal.

1. In the menu at the top of the page, select **File->New->Terminal**.
2. Enter the following commands, hitting return after each one (feel free to copy and paste)

        cd /home/opc/redbull-analytics-hol/beginners/web
        source /home/opc/redbullenv/bin/activate
        python3 app.py
3. Open a web browser to the public IP of your Jupyter Lab, but use port 8443 instead of port 8001:

        https://xxx.xxx.xxx.xxx:8443