In [0]:
# Directory holding the F1 CSV files; a later cell aliases it as DIR_PATH and
# builds every pd.read_csv path from it.
bronze_file_path = "/Volumes/catadb360dev/default/f1/"

In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are read from the directory walked below
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under that directory

import os
# Walk the directory stored in bronze_file_path instead of repeating the path
# literal, so changing the variable in the first cell also updates this listing.
# The loop variable name `filenames` is kept because a later cell reads it.
for dirname, _, filenames in os.walk(bronze_file_path):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis

In [0]:
# Directory the CSV files below are read from (alias of bronze_file_path).
DIR_PATH = bronze_file_path

# Define a function to load a dataset and summarise it
def load_and_preview_csv(file_name, dir_path):
    """Read one CSV file and return a small overview of it.

    Parameters
    ----------
    file_name : str
        Name of the CSV file, relative to ``dir_path``.
    dir_path : str
        Directory that contains the file.

    Returns
    -------
    tuple
        ``(head, info, describe)`` where ``head`` is ``df.head()``, ``info`` is
        the text report produced by ``df.info()`` and ``describe`` is
        ``df.describe()``.
    """
    import io  # local import: only needed to capture the df.info() report

    file_path = os.path.join(dir_path, file_name)
    df = pd.read_csv(file_path)

    # DataFrame.info() prints its report and returns None, so write it into a
    # buffer to keep the summary instead of storing None in the result.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)

    return df.head(), info_buffer.getvalue(), df.describe()

# Load and preview all datasets
# List the CSV files in DIR_PATH explicitly: the `filenames` variable left over
# from the os.walk() loop in an earlier cell only holds the entries of the last
# directory visited and may contain files that are not CSVs.
csv_files = sorted(
    entry for entry in os.listdir(DIR_PATH)
    if entry.lower().endswith('.csv') and os.path.isfile(os.path.join(DIR_PATH, entry))
)

data_overview = {}
for file in csv_files:
    data_overview[file] = load_and_preview_csv(file, DIR_PATH)

data_overview.keys()  # Display the keys to check all datasets are loaded


In [0]:
import matplotlib.pyplot as plt

# Read the race calendar
races_df = pd.read_csv(os.path.join(DIR_PATH, 'races.csv'))

# Count the races held in each year, then order the seasons chronologically
races_per_season = (
    races_df['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='number_of_races')
    .sort_values(by='year')
)

# Bar chart with one bar per season
plt.figure(figsize=(12, 6))
plt.bar(
    races_per_season['year'],
    races_per_season['number_of_races'],
    color='skyblue',
)
plt.title('Number of Races per Season')
plt.xlabel('Season')
plt.ylabel('Number of Races')
plt.grid(axis='y')
plt.xticks(rotation=90)

# Render the figure
plt.tight_layout()
plt.show()

In [0]:
# Load the driver_standings.csv dataset
driver_standings_df = pd.read_csv(os.path.join(DIR_PATH, 'driver_standings.csv'))

# Merge with drivers.csv to get driver names
drivers_df = pd.read_csv(os.path.join(DIR_PATH, 'drivers.csv'))
merged_driver_standings = pd.merge(driver_standings_df, drivers_df, on='driverId')

# Convert 'position' to numeric, coercing errors to NaN
merged_driver_standings['position'] = pd.to_numeric(
    merged_driver_standings['position'],
    errors='coerce'
)

# Filter to include only the top 3 positions
top_driver_standings = merged_driver_standings[merged_driver_standings['position'] <= 3]

# Build a label that identifies one driver. Different drivers can share a
# surname, so counting by surname alone would add their standings together.
# NOTE(review): assumes drivers.csv provides a 'forename' column; when it does
# not, the surname is disambiguated with the driverId instead.
if 'forename' in top_driver_standings.columns:
    driver_labels = (
        top_driver_standings['forename'].astype(str)
        + ' '
        + top_driver_standings['surname'].astype(str)
    )
else:
    driver_labels = (
        top_driver_standings['surname'].astype(str)
        + ' ('
        + top_driver_standings['driverId'].astype(str)
        + ')'
    )

# Create a count of top positions by driver
top_driver_counts = driver_labels.value_counts().reset_index()
top_driver_counts.columns = ['driver', 'top_positions']

# Create a bar chart
plt.figure(figsize=(12, 6))
plt.bar(top_driver_counts['driver'], top_driver_counts['top_positions'], color='skyblue')
plt.xlabel('Driver')
plt.ylabel('Number of Top 3 Positions')
plt.title('Top 3 Positions by Driver')
plt.xticks(rotation=90)
plt.grid(axis='y')

# Display the chart
plt.tight_layout()
plt.show()

In [0]:
# Load the constructor_standings.csv dataset
constructor_standings_df = pd.read_csv(os.path.join(DIR_PATH, 'constructor_standings.csv'))

# Merge with constructors.csv to get constructor names
constructors_df = pd.read_csv(os.path.join(DIR_PATH, 'constructors.csv'))
merged_constructor_standings = pd.merge(constructor_standings_df, constructors_df, on='constructorId')

# Convert 'position' to numeric, coercing errors to NaN, exactly as the driver
# standings cell does. Without this the comparison below raises a TypeError
# whenever the column is read as strings (e.g. because of placeholder values).
merged_constructor_standings['position'] = pd.to_numeric(
    merged_constructor_standings['position'],
    errors='coerce'
)

# Filter to include only the top 3 positions
top_constructor_standings = merged_constructor_standings[merged_constructor_standings['position'] <= 3]

# Create a count of top positions by constructor
top_constructor_counts = top_constructor_standings['name'].value_counts().reset_index()
top_constructor_counts.columns = ['constructor', 'top_positions']

# Create a bar chart
plt.figure(figsize=(12, 6))
plt.bar(top_constructor_counts['constructor'], top_constructor_counts['top_positions'], color='skyblue')
plt.xlabel('Constructor')
plt.ylabel('Number of Top 3 Positions')
plt.title('Top 3 Positions by Constructor')
plt.xticks(rotation=90)
plt.grid(axis='y')

# Display the chart
plt.tight_layout()
plt.show()


# Driver Performance Analysis with Regression

In [0]:
# Load the datasets
races_df = pd.read_csv(os.path.join(DIR_PATH, 'races.csv'))
drivers_df = pd.read_csv(os.path.join(DIR_PATH, 'drivers.csv'))
results_df = pd.read_csv(os.path.join(DIR_PATH, 'results.csv'))

# Merge the datasets to create a comprehensive dataset:
# one row per race result, joined to its race and its driver.
merged_df = results_df.merge(races_df, on='raceId').merge(drivers_df, on='driverId')

# Select relevant columns for analysis
selected_columns = [
    'raceId', 'year', 'round', 'circuitId', 'driverId', 'constructorId',
    'grid', 'positionOrder', 'points', 'laps', 'milliseconds', 'fastestLap',
    'fastestLapTime', 'fastestLapSpeed', 'statusId'
]

# Create a new dataframe with selected columns.
# .copy() makes it an independent frame: later cells add columns to it, and
# assigning into a plain column selection of merged_df triggers pandas'
# SettingWithCopyWarning.
driver_performance_df = merged_df[selected_columns].copy()

# Display the first few rows of the dataset
driver_performance_df.head()

In [0]:
import seaborn as sns
import numpy as np

# Convert fastestLapTime to total seconds for analysis
def convert_time_to_seconds(time_str):
    """Convert a colon-separated time string such as ``'1:27.452'`` to seconds.

    Parameters
    ----------
    time_str : str or missing value
        Time written as ``'m:ss.fff'``; a leading hours field
        (``'h:mm:ss.fff'``) is accepted as well. The placeholder ``'\\N'``,
        blank strings and non-string values such as ``None`` or ``NaN`` are
        treated as missing.

    Returns
    -------
    float
        Total number of seconds, or ``np.nan`` when the value is missing.

    Raises
    ------
    ValueError
        If a field of a non-missing string cannot be parsed as a number.
    """
    # Missing values may arrive as None/NaN rather than text; those have no
    # .split() and must not crash the conversion.
    if not isinstance(time_str, str):
        return np.nan

    time_str = time_str.strip()
    if not time_str or time_str == '\\N':
        return np.nan

    # Fold the fields left to right so any number of ':'-separated parts works
    # (seconds only, minutes:seconds, hours:minutes:seconds).
    total_seconds = 0.0
    for field in time_str.split(':'):
        total_seconds = total_seconds * 60 + float(field)
    return total_seconds

# Add the fastest lap time in seconds with assign(): it builds a new frame
# instead of writing into driver_performance_df, which was created as a column
# selection of merged_df (item assignment on it triggers SettingWithCopyWarning).
driver_performance_df = driver_performance_df.assign(
    fastestLapTime_seconds=driver_performance_df['fastestLapTime'].apply(convert_time_to_seconds)
)

# Columns compared in the correlation analysis
correlation_columns = ['points', 'grid', 'laps', 'fastestLapTime_seconds']

# Drop rows with missing values for the purpose of correlation analysis
correlation_df = driver_performance_df.dropna(subset=correlation_columns)

# Calculate correlation matrix
correlation_matrix = correlation_df[correlation_columns].corr()

# Plot the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()


In [0]:
# One histogram (with a KDE overlay) per key feature, stacked vertically
plt.figure(figsize=(16, 12))

distribution_panels = [
    # (column, bar colour, panel title, x-axis label)
    ('grid', 'skyblue', 'Distribution of Grid Position', 'Grid Position'),
    ('laps', 'green', 'Distribution of Laps Completed', 'Laps Completed'),
    (
        'fastestLapTime_seconds',
        'red',
        'Distribution of Fastest Lap Time (in seconds)',
        'Fastest Lap Time (seconds)',
    ),
]

for panel_index, (column, colour, panel_title, x_label) in enumerate(distribution_panels, start=1):
    plt.subplot(3, 1, panel_index)
    sns.histplot(driver_performance_df[column].dropna(), bins=20, kde=True, color=colour)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


## Baseline with Linear Regression and Random Forest Regressor

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Keep only the rows that are complete for the target and the three features
baseline_columns = ['points', 'grid', 'laps', 'fastestLapTime_seconds']
model_df = driver_performance_df.dropna(subset=baseline_columns)

# Features and target
X = model_df[['grid', 'laps', 'fastestLapTime_seconds']]
y = model_df['points']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear regression: fit, predict, score
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)
y_pred_lr = linear_regression_model.predict(X_test)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Random forest: fit, predict, score
random_forest_model = RandomForestRegressor(random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred_rf = random_forest_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Cell output: (MAE, R^2) for linear regression followed by random forest
(mae_lr, r2_lr, mae_rf, r2_rf)

# Feature Engineering

In [0]:
from sklearn.impute import SimpleImputer

# Calculate driver experience (number of races the driver took part in before
# the current one). Race dates are looked up from races_df by raceId.
race_dates = pd.to_datetime(driver_performance_df['raceId'].map(races_df.set_index('raceId')['date']))

# Order each driver's results chronologically. assign() + sort_values() return a
# new frame, so the frame shown by earlier cells is not modified in place.
driver_performance_df = (
    driver_performance_df
    .assign(race_date=race_dates)
    .sort_values(by=['driverId', 'race_date'])
)

# Group by driverId and calculate the cumulative count of races for each driver
# (0 for a driver's first race in the data).
driver_performance_df['driver_experience'] = driver_performance_df.groupby('driverId').cumcount()

# Drop the race_date column as it's no longer needed
driver_performance_df = driver_performance_df.drop(columns=['race_date'])

# Select features and target variable, including driver experience
feature_columns = ['grid', 'laps', 'fastestLapTime_seconds', 'driver_experience']
X = driver_performance_df[feature_columns]
y = driver_performance_df['points']

# Split BEFORE imputing: fitting the imputer on all rows would let the medians
# of the test rows leak into the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values with the median learned from the training rows only,
# then apply the same medians to the test rows.
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Re-initialize the models for the imputed data
linear_regression_model = LinearRegression()
random_forest_model = RandomForestRegressor(random_state=42)

# Train the models
linear_regression_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lr = linear_regression_model.predict(X_test)
y_pred_rf = random_forest_model.predict(X_test)

# Evaluate the models
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

(mae_lr, r2_lr, mae_rf, r2_rf)

### Performance improves with feature engineering

# Race Outcome Prediction 

## Baseline - Logistic Regression

### Predicting top 3 finishers

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Create a binary target variable: True if the driver finished in the top 3, False otherwise
driver_performance_df = driver_performance_df.assign(
    top_3=driver_performance_df['positionOrder'] <= 3
)

# Select features and the binary target variable
X_classification = driver_performance_df[['grid', 'laps', 'fastestLapTime_seconds', 'driver_experience']]
y_classification = driver_performance_df['top_3']

# Split the data into training and testing sets BEFORE imputing, so the medians
# used to fill gaps are learned from the training rows only (no test leakage).
X_train_class_raw, X_test_class_raw, y_train_class, y_test_class = train_test_split(
    X_classification, y_classification, test_size=0.2, random_state=42
)

# Impute missing values with the median (reuses the `imputer` object created in
# the feature-engineering cell; it is re-fitted here on the training rows).
X_train_class = imputer.fit_transform(X_train_class_raw)
X_test_class = imputer.transform(X_test_class_raw)

# Initialize the Logistic Regression model
logistic_regression_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model
logistic_regression_model.fit(X_train_class, y_train_class)

# Make predictions on the test set
y_pred_class = logistic_regression_model.predict(X_test_class)

# Evaluate the model
accuracy = accuracy_score(y_test_class, y_pred_class)
precision = precision_score(y_test_class, y_pred_class)
recall = recall_score(y_test_class, y_pred_class)
f1 = f1_score(y_test_class, y_pred_class)
conf_matrix = confusion_matrix(y_test_class, y_pred_class)
class_report = classification_report(y_test_class, y_pred_class)

# A bare `accuracy, precision, recall, f1` expression in the middle of a cell is
# evaluated and discarded, so print the headline metrics explicitly.
print(f"Accuracy: {accuracy:.4f}  Precision: {precision:.4f}  Recall: {recall:.4f}  F1: {f1:.4f}")
print(conf_matrix)
print(class_report)

## Random Forest and XGBoost

In [0]:
# %pip install xgboost

In [0]:
from sklearn.ensemble import RandomForestClassifier
import mlflow
# import mlflow.sklearn
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import train_test_split
# from sklearn.datasets import load_boston
import xgboost as xgb

# Build both tree-based classifiers up front
random_forest_classifier = RandomForestClassifier(random_state=42)
xgboost_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Fit the Random Forest and predict (this happens before MLflow autologging is switched on)
random_forest_classifier.fit(X_train_class, y_train_class)
y_pred_rf_class = random_forest_classifier.predict(X_test_class)

# Switch on MLflow autologging, then fit XGBoost inside an MLflow run
mlflow.autolog(
    log_input_examples=False,
    log_model_signatures=True,
    log_models=True,
    disable=False,
    exclusive=False,
    disable_for_unsupported_versions=True,
    silent=False
)
with mlflow.start_run():
    xgboost_classifier.fit(X_train_class, y_train_class)
    y_pred_xgb_class = xgboost_classifier.predict(X_test_class)

    # The same six metrics are computed, in the same order, for each model
    metric_functions = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        confusion_matrix,
        classification_report,
    )
    result_labels = (
        'Accuracy',
        'Precision',
        'Recall',
        'F1 Score',
        'Confusion Matrix',
        'Classification Report',
    )

    # Random Forest scores
    (
        accuracy_rf,
        precision_rf,
        recall_rf,
        f1_rf,
        conf_matrix_rf,
        class_report_rf,
    ) = [metric(y_test_class, y_pred_rf_class) for metric in metric_functions]

    # XGBoost scores
    (
        accuracy_xgb,
        precision_xgb,
        recall_xgb,
        f1_xgb,
        conf_matrix_xgb,
        class_report_xgb,
    ) = [metric(y_test_class, y_pred_xgb_class) for metric in metric_functions]

    # Result dictionaries shown by the following cells
    rf_scores = (accuracy_rf, precision_rf, recall_rf, f1_rf, conf_matrix_rf, class_report_rf)
    rf_results = {'Model': 'Random Forest', **dict(zip(result_labels, rf_scores))}

    xgb_scores = (accuracy_xgb, precision_xgb, recall_xgb, f1_xgb, conf_matrix_xgb, class_report_xgb)
    xgb_results = {'Model': 'XGBoost', **dict(zip(result_labels, xgb_scores))}


In [0]:
# Display the Random Forest metrics dictionary built in the training cell
rf_results

In [0]:
# Display the XGBoost metrics dictionary built in the training cell
xgb_results

### Overall, performance improves with XGBoost