ML model for Formula One.

In [None]:

import sys
import os
import datetime 
import tensorflow                                      as tf
import shutil                                          as shu
import warnings                                        as warn
import pandas                                          as pd
import matplotlib.pyplot                               as plt
from   pymongo                 import MongoClient
from   pprint                  import pprint
from   colorama                import Style            as st
from   colorama                import Fore             
from   colorama                import Back             as bk
from   sklearn.model_selection import train_test_split as tts
from   sklearn.preprocessing   import StandardScaler   as sts
from   tensorflow              import keras            as ker
from   keras.models            import Sequential       as seq
from   keras.layers            import Dense            as den
from   sklearn.metrics         import classification_report

warn.filterwarnings('ignore')

In [None]:
#
# Environment Setup
#

# Remember when the notebook started and how large the terminal is:
# w is the number of columns, h the number of lines.
start_time = datetime.datetime.now()
_term      = shu.get_terminal_size()
w, h       = _term.columns, _term.lines

def printSeparator():
    """Print a green rule of '-' characters, `w` wide, then switch the colour to white."""
    rule = '-' * w
    print(Fore.GREEN + rule + Fore.WHITE)
    
def logStep(msg):
    """Print a timestamped progress line and flush stdout.

    The line is the current timestamp in white, a space, `msg` in yellow,
    then a white run of '-' characters. The run length is `w - len(msg)`,
    so the timestamp is not counted in it.
    """
    filler = "-" * (w - len(msg))
    stamp  = str(datetime.datetime.now())
    print("".join([Fore.WHITE, stamp, " ", Fore.YELLOW, msg, Fore.WHITE, filler]))
    sys.stdout.flush()

# Announce the start of the environment report.
logStep("ENVIRONMENT PREPARATION")
# Silence DeprecationWarning. The blanket warn.filterwarnings('ignore') issued
# right after the imports already suppresses every warning category.
warn.filterwarnings("ignore", category=DeprecationWarning)

def printDFinfo(name,dfName):
    """Print a summary of a DataFrame, with separator rules between sections.

    name   -- label printed in the header.
    dfName -- the DataFrame to describe.

    Sections, in order: the name, the result of dfName.info(), the
    per-column counts (printed after switching to red), and dfName.head().
    """
    printSeparator()
    print('Name: ',name)
    printSeparator()
    info_result = dfName.info()
    print(info_result)
    printSeparator()
    print('Row Count :' + Fore.RED)
    column_counts = dfName.count()
    print(column_counts,Fore.WHITE)
    printSeparator()
    first_rows = dfName.head()
    print(first_rows)
    printSeparator()

def runtime_Diff(step_Number, step_Message1,base_SQL, cached_SQL):
    """Log two query timings and their difference.

    step_Number   -- string label prefixed to every log line.
    step_Message1 -- accepted but not used in the body.
    base_SQL      -- timing of the non-cached query.
    cached_SQL    -- timing of the cached/part query; subtracted from base_SQL.

    Returns the time spent between entering the function and logging DONE
    (the difference of two datetime.datetime.now() readings).
    """
    began_at = datetime.datetime.now()
    logStep(step_Number + " - RUNTIME DIFFERENCE")
    gap = base_SQL - cached_SQL
    logStep(F"{step_Number} - Time required for a non-cached Query : {base_SQL}")
    logStep(F"{step_Number} - Time required for a cached/part Query: {cached_SQL}")
    logStep(F"{step_Number} - Time difference                      : {gap}")
    logStep(step_Number + " - DONE")
    spent = datetime.datetime.now() - began_at
    logStep(F"{step_Number} - ELAPSED TIME: {spent} seconds")
    return spent

# Print a snapshot of the operating system, process and interpreter, then log
# how long the environment setup took.

# os.uname() is queried once and reused for the five fields printed below.
uname_info = os.uname()

# os.getlogin() raises OSError when the process has no controlling terminal,
# which is common for notebook kernels and services. Fall back to the USER
# environment variable instead of aborting the whole cell.
try:
    login_name = os.getlogin()
except OSError:
    login_name = os.environ.get('USER')

print(F"Copyright                              : {sys.copyright}")
print(F"OS Platform                            : {sys.platform}")
print(F"OS Name                                : {os.name}")
print(F"OS HOME                                : {os.environ.get('HOME')}")
print(F"OS uName                               : {uname_info.sysname}")
print(F"OS NodeName                            : {uname_info.nodename}")
print(F"OS Release                             : {uname_info.release}")
print(F"OS Release Ver                         : {uname_info.version}")
print(F"OS Machine                             : {uname_info.machine}")
print(F"Process ID                             : {os.getpid()}")
print(F"Parent Process                         : {os.getppid()}")
print(F"OS User                                : {login_name}")
print(F"OS User ID                             : {os.getuid()}")
print(F"OS Group ID                            : {os.getgid()}")
print(F"OS Effective ID                        : {os.geteuid()}")
print(F"OS Effective GID                       : {os.getegid()}")
print(F"Current dir                            : {os.getcwd()}")
print(F"Python version                         : {sys.version}")
print(F"Version info                           : {sys.version_info}")
print(F"Python API Ver                         : {sys.api_version}")
print(F"Executable                             : {sys.executable}")
print(F"Spark UI                               : http://localhost:4040")
print(F"Spark submit                           : {sys.argv[0]}")
print(F"Hadoop Home                            : {os.environ.get('HADOOP_HOME')}")
print(F"Java Home                              : {os.environ.get('JAVA_HOME')}")
print(F"Current Working Directory              : {os.getcwd()}")

logStep("DONE")
# Elapsed time is measured from start_time, taken at the top of the setup cell.
end_time            = datetime.datetime.now()
step00_elapsed_time = end_time - start_time
logStep(F"ELAPSED TIME: {step00_elapsed_time} seconds")

In [None]:
# Open the MongoDB connection and the handles used by the following cells.
logStep('Environment Preparation')

# Only the port is passed; the host is left to MongoClient's default.
mongo = MongoClient(port=27017)
print(mongo.list_database_names())
# Handle on the 'F1' database.
db    = mongo['F1']
print(db.list_collection_names())
# Handle on the 'Results' collection, queried in the next cell.
Results = db['Results']
logStep('Environment Preparation Completed')

In [None]:
# Load the documents whose Position is greater than 0 from the Results
# collection into a DataFrame, keeping only the projected fields.
logStep('Model Preparation')

# Filter: Position strictly greater than 0.
query  = ({"Position" : {"$gt": 0}})
# Projection: the fields to return for each matching document.
fields = {'Position'  : 1,
          'Driver': 1,
          'Starting Grid': 1,
          'Laps': 1,
          'Time/Retired': 1,
          'Points': 1}
results = Results.find(query,fields)
# Materialise the cursor so pandas can build the frame in one go.
results_df = pd.DataFrame(list(results))
# The '_id' column comes back alongside the projected fields; it is not a
# feature, so remove it.
results_df = results_df.drop(columns=['_id'])
print(results_df.head())
logStep('Model Preparation Completed')

In [None]:
# Print info, per-column counts and the first rows of the loaded results.
printDFinfo('results_df',results_df)

In [None]:
#
# Log the processing progress
#

logStep('Model Preparation')

#
# Report how many distinct values each column of results_df holds.
#

printSeparator()
print('results_df.nunique()')
printSeparator()
unique_counts = results_df.nunique()
print(unique_counts)
printSeparator()

logStep('Model Preparation')
    
# 

In [None]:
#
# Log the processing progress
#

logStep('For columns that have more than 10 unique values, determine the number of data points for each unique value.')

# Columns with 10 or fewer distinct values are skipped; for the others,
# print how often each value occurs.
for column_Name in results_df.columns:
    column_values = results_df[column_Name]
    if column_values.nunique() <= 10:
        continue
    print('Column Name', column_Name)
    printSeparator()
    print(column_values.value_counts())
    printSeparator()

logStep('determined the number of data points for each unique value.')
  

In [None]:
# Add the target column 'Champion' as the first column; every row gets 0.
# NOTE(review): with a constant target the classifier trained below only ever
# sees a single class - confirm where the real Champion labels come from.
# NOTE(review): insert() refuses to add a column that already exists, so
# re-running this cell on the same frame fails - verify this is acceptable.
results_df.insert(0,'Champion',0)
print(results_df)

In [None]:
#
# Log the processing progress
#

logStep('Use  pd.get_dummies()  to encode categorical variables.')
#
# Convert categorical data to numeric with `pd.get_dummies`
# NOTE(review): presumably Driver and Time/Retired are text columns and each
# becomes one indicator column per distinct value - confirm the resulting
# width is intended.
#

results_df_numeric = pd.get_dummies(results_df)
# Print info, per-column counts and the first rows of the encoded frame.
printDFinfo('results_df_numeric',results_df_numeric)

In [None]:
#
# Log the processing progress
#

logStep('Preprocess the Data')

#
# Remove incomplete rows. dropna() returns a new frame instead of modifying
# the original, so the result has to be assigned back for the rows to
# actually be removed.
#
results_df_numeric = results_df_numeric.dropna()

#
# Split the preprocessed data into the feature matrix and the target vector
#
X = results_df_numeric.drop(['Champion'], axis=1)
y = results_df_numeric['Champion']

#
# Split into training and testing datasets; the fixed random_state keeps the
# split reproducible between runs.
#

X_train, X_test, y_train, y_test = tts(X, y, random_state=58)

#
# Report the shape of each of the four pieces
#
for split_name, split_data in (('X_train', X_train),
                               ('X_test',  X_test),
                               ('y_train', y_train),
                               ('y_test',  y_test)):
    print(f'{split_name}.shape')
    print(split_data.shape)
    printSeparator()

In [None]:
#
# Log the processing progress
#

logStep('Scale the training and testing features datasets')

#
# Fit a StandardScaler on the training features only, then apply the same
# fitted scaler to both the training and the testing features.
#

scaler   = sts()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

#
# Report the shapes of the scaled arrays
#

for shape_label, scaled_part in (('X_train_scaled.shape', X_train_scaled),
                                 ('X_test_scaled.shape',  X_test_scaled)):
    print(shape_label)
    print(scaled_part.shape)
    printSeparator()

In [None]:
#
# Log the processing progress
#

logStep('Define the model parameters')

#
# Define the model - a deep neural net with two hidden layers.
# The input width is the number of columns in the scaled training features.
#

number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1   = 12
hidden_nodes_layer2   = 12

nn_model = seq()

#
# First hidden layer (also declares the input width)
#

nn_model.add(den(units=hidden_nodes_layer1,input_dim=number_input_features, activation="tanh"))

#
# Second hidden layer
#

nn_model.add(den(units=hidden_nodes_layer2, activation="tanh"))

#
# Output layer
# binary_crossentropy expects a probability in [0, 1]. tanh produces values
# in (-1, 1), so the single output unit uses sigmoid instead.
#

nn_model.add(den(units=1, activation="sigmoid"))

#
# Check the structure of the model
#

nn_model.summary()
printSeparator()

#
# Compile the model
#

logStep('10 - Compile the model')

nn_model.compile(loss      = 'binary_crossentropy',
                 optimizer = 'adam',
                 metrics   = ['accuracy'])
print('Model compiled')
printSeparator()

#
# Train the model; fit_model keeps the per-epoch history used for plotting
#

logStep('Train the model parameters')

print('Model Training')
printSeparator()

fit_model = nn_model.fit(X_train_scaled,y_train,epochs=50)

printSeparator()
print('Model Training Complete')
printSeparator()

In [None]:
#
# Log the processing progress
#

logStep('Evaluate the model parameters')
# 
# Evaluate the baseline model on the test data. If accuracy reaches 75%,
# plot the training history, save the model and mark the optimisation
# cell as skippable.
#

# Stays False unless the else-branch below runs to completion.
skip_optimization = False
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test)
print(f"Loss: {model_loss:2.2f}, Accuracy: {model_accuracy:2.2f}")
# Convert the fraction returned by evaluate() into a percentage.
accuracy = model_accuracy * 100
if (accuracy < 75):   
    print(F'Accuracy is {accuracy:2.2f}%, less than 75%')
    print("More optimization is required")
else:
    print(F'Accuracy is {accuracy:2.2f}%, greater than or equal to 75%')
    print("Model is optimized")
    # One row per epoch, indexed from 1, built from the fit() history.
    history_df = pd.DataFrame(fit_model.history, index=range(1,len(fit_model.history["loss"])+1))
    history_df.plot(y="accuracy")
    plt.show()
    history_df.plot(y="loss",color='red')
    plt.show()
    #
    # Log the processing progress
    #
    logStep('13 - Export the model to a HDF5 file')
    # 
    # Export our model to HDF5 file
    #
    # `filename` is read again by the cell that reloads the model.
    filename = 'Output/F1_model.h5'
    #
    # Save the model to a HDF5 file
    #

    nn_model.save(filename)
    printSeparator()
    print('Model saved to file : ',filename)
    printSeparator()
    print('End of processing')
    # The baseline is good enough, so the architecture search is skipped.
    skip_optimization = True

In [None]:
#
# Log the processing progress
#
# Architecture search: runs only when the baseline missed the 75% accuracy
# target. Tries five equally sized hidden layers, growing the layer width in
# steps of 2, and stops at the first model that reaches the target.
if not skip_optimization:
    logStep('Attempt 2 - Add more neurons to a hidden layer')
    number_input_features = X_train_scaled.shape[1]
    print('Number of input features : ',number_input_features)
    for nodes in range(14, 140, 2):
        printSeparator()
        print('Number of hidden nodes   : ',nodes)
        hidden_nodes_layer1   = nodes
        hidden_nodes_layer2   = nodes
        hidden_nodes_layer3   = nodes
        hidden_nodes_layer4   = nodes
        hidden_nodes_layer5   = nodes
        nn_model2             = seq(name=f"Optimized_Model_{nodes}")
        nn_model2.add(den(units=hidden_nodes_layer1,input_dim=number_input_features, activation="tanh"))
        nn_model2.add(den(units=hidden_nodes_layer2, activation="tanh"))
        nn_model2.add(den(units=hidden_nodes_layer3, activation="tanh"))
        nn_model2.add(den(units=hidden_nodes_layer4, activation="tanh"))
        nn_model2.add(den(units=hidden_nodes_layer5, activation="tanh"))
        # binary_crossentropy expects a probability in [0, 1], so the output
        # unit uses sigmoid rather than tanh.
        nn_model2.add(den(units=1,activation="sigmoid"))
        nn_model2.summary()
        printSeparator()
        print('Compile the model')
        nn_model2.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics   = ['accuracy'])
        print('Model compiled')
        printSeparator()
        print('Fit the model')
        fit_model2 = nn_model2.fit(X_train_scaled,y_train,epochs=50)
        print('Model fit')
        printSeparator()
        print('Evaluate the model')
        model_loss2, model_accuracy2 = nn_model2.evaluate(X_test_scaled,y_test)
        print(f"Loss: {model_loss2:2.2f}, Accuracy: {model_accuracy2:2.2f}")
        # Convert the fraction returned by evaluate() into a percentage.
        accuracy2 = model_accuracy2 * 100
        if (accuracy2 < 75):
            print(F'Accuracy is {accuracy2:2.2f}, less than 75%')
            print("More optimization is required")
        else:
            print(F'Accuracy is {accuracy2:2.2f}, greater than or equal to 75%')
            print("Model is optimized")
            logStep('15 - Export the model to an HDF5 file')
            filename = 'Output/F1_model.h5'
            # Save the model that just passed, not the baseline nn_model that
            # failed the accuracy check.
            nn_model2.save(filename)
            printSeparator()
            print('Model saved to file : ',filename)
            printSeparator()
            print('End of processing')
            break

In [None]:
# Reload the exported model and run it on the first five scaled test rows.
# NOTE(review): `filename` is only assigned when one of the models above
# reached the accuracy target; confirm what should happen otherwise.
model = tf.keras.models.load_model(filename)
predictions = model.predict(X_test_scaled[:5])

# The train/test split shuffles the rows, so the first five rows of
# results_df are not the rows that were predicted. Look the drivers up
# through the index labels that X_test kept from the original frame.
target_names = results_df.loc[X_test.index[:5], 'Driver']
print(target_names)
# Print the predictions
pprint(predictions)


In [None]:
# Release the MongoDB connection opened at the start of the notebook.
mongo.close()