# IndyCar Lap Time Analysis

This notebook analyzes and visualizes lap time data for three IndyCar drivers:
- Pato O'Ward (Car #5)
- Alexander Rossi (Car #7)
- Alex Palou (Car #10)

In [None]:
# Import necessary libraries
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats as scipy_stats

# Set plot styles
# Switch matplotlib to its 'ggplot' style sheet, then apply seaborn's
# "notebook" context with fonts scaled by 1.2.
# NOTE(review): both calls presumably write to the same matplotlib rcParams,
# so their relative order may matter -- confirm before reordering.
plt.style.use('ggplot')
sns.set_context("notebook", font_scale=1.2)

# Enable inline plotting
# This is an IPython line magic, not Python syntax: the cell only runs inside
# an IPython/Jupyter kernel.
%matplotlib inline

## Loading the Data

In [None]:
# Read the lap-time CSV (the path is relative to the kernel's working directory)
df = pd.read_csv('indycar-lap-times.csv')

# Quick overview: total number of rows, then number of rows per driver
laps_per_driver = df['Driver'].value_counts()
print(f"Total lap times recorded: {len(df)}")
print("\nCounts by driver:")
print(laps_per_driver)

# Preview the first five rows as the cell's rich output
df.head(n=5)

## Data Preprocessing

In [None]:
# Make sure lap times are numeric. This is a no-op when the column is already
# float and raises if a value cannot be converted, which surfaces bad rows early.
df['T (Time)'] = df['T (Time)'].astype(float)

# Short driver label for plotting: the text before the first comma in 'Driver',
# with surrounding whitespace removed.
# NOTE(review): presumably 'Driver' is formatted "Last, First", so this yields the
# surname -- the later cells filter on "O'Ward", 'Rossi' and 'Palou'; confirm
# against the CSV.
df['DriverName'] = df['Driver'].str.split(',').str[0].str.strip()

# Per-driver summary table.
# It is called `lap_stats` rather than `stats` so that it does not share a name
# with the `scipy.stats` module used later in the notebook. Columns are renamed
# by key instead of by position so the labels cannot drift out of sync with the
# aggregation list.
lap_stats = (
    df.groupby('DriverName')['T (Time)']
    .agg(['mean', 'std', 'min', 'max', 'count'])
    .rename(columns={
        'mean': 'Average Time',
        'std': 'Std Dev',
        'min': 'Fastest Lap',
        'max': 'Slowest Lap',
        'count': 'Lap Count',
    })
)
lap_stats

## Visualization: Lap Times Throughout the Race

In [None]:
# Lap time against lap number, one line per driver, plus a dashed horizontal
# line at each driver's average lap time.
# One colour per driver, used for BOTH the lap-time line and the average line so
# the two can be matched by eye.
driver_colors = {"O'Ward": 'blue', 'Rossi': 'red', 'Palou': 'green'}

fig, ax = plt.subplots(figsize=(14, 8))

for driver, color in driver_colors.items():
    driver_data = df[df['DriverName'] == driver]

    # Markers joined by a thin line; the colour is passed explicitly, otherwise
    # matplotlib would pick one from the active style's colour cycle.
    ax.plot(
        driver_data['Lap'], driver_data['T (Time)'], 'o-',
        color=color, label=driver, alpha=0.7, linewidth=1, markersize=5,
    )

    # Average lap time; given no label so it stays out of the legend.
    avg_time = driver_data['T (Time)'].mean()
    ax.axhline(y=avg_time, color=color, linestyle='--', alpha=0.5)

ax.set_title('Lap Times Throughout the Race', fontsize=16)
ax.set_xlabel('Lap Number', fontsize=14)
ax.set_ylabel('Lap Time (seconds)', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

## Visualization: Lap Time Distribution

In [None]:
# Box plot and violin plot of lap times per driver.
# The category order and the palette are both derived from one driver -> colour
# mapping. A bare colour list without `order=` is assigned by position, so the
# colour each driver receives would depend on the category order seaborn infers
# from the data.
driver_colors = {"O'Ward": 'blue', 'Rossi': 'red', 'Palou': 'green'}
driver_order = list(driver_colors)
driver_palette = [driver_colors[name] for name in driver_order]

# (plotting function, figure title, extra keyword arguments)
distribution_plots = [
    # Box plot
    (sns.boxplot, 'Distribution of Lap Times by Driver', {}),
    # Violin plot for more detailed distribution
    (sns.violinplot, 'Detailed Distribution of Lap Times', {'inner': 'quartile'}),
]

for plot_func, title, extra_kwargs in distribution_plots:
    fig, ax = plt.subplots(figsize=(14, 6))
    plot_func(
        x='DriverName', y='T (Time)', data=df,
        order=driver_order, palette=driver_palette, ax=ax, **extra_kwargs,
    )
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Driver', fontsize=14)
    ax.set_ylabel('Lap Time (seconds)', fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

## Visualization: Histogram of Lap Times

In [None]:
# Overlaid per-driver histograms (15 bins each), each with a KDE curve
hist_fig, hist_ax = plt.subplots(figsize=(14, 6))

driver_shades = (('O\'Ward', 'blue'), ('Rossi', 'red'), ('Palou', 'green'))
for name, shade in driver_shades:
    # Lap times belonging to this driver only
    is_driver = df['DriverName'] == name
    lap_times = df.loc[is_driver, 'T (Time)']
    sns.histplot(lap_times, kde=True, label=name, color=shade, alpha=0.3, bins=15, ax=hist_ax)

# Titles and labels are set after plotting so they replace seaborn's defaults
hist_ax.set_title('Histogram of Lap Times', fontsize=16)
hist_ax.set_xlabel('Lap Time (seconds)', fontsize=14)
hist_ax.set_ylabel('Frequency', fontsize=14)
hist_ax.legend(fontsize=12)
hist_ax.grid(True, linestyle='--', alpha=0.7)
hist_fig.tight_layout()
plt.show()

## Visualization: Lap Time Differences from Personal Best

In [None]:
# Gap of every lap to that driver's own fastest lap, plotted per driver.

# Fastest lap per driver, as {driver name: time}
driver_best = df.groupby('DriverName')['T (Time)'].min().to_dict()

# Look up each row's personal best by driver name and subtract it column-wise
# (vectorised; gives the same values as a row-by-row apply without the Python loop).
df['Gap to Personal Best'] = df['T (Time)'] - df['DriverName'].map(driver_best)

# Same driver -> colour mapping as the other per-driver plots in this notebook
driver_colors = {"O'Ward": 'blue', 'Rossi': 'red', 'Palou': 'green'}

fig, ax = plt.subplots(figsize=(14, 8))

for driver, color in driver_colors.items():
    driver_data = df[df['DriverName'] == driver]
    # The colour is passed explicitly; without it matplotlib picks one from the
    # active style's colour cycle.
    ax.plot(
        driver_data['Lap'], driver_data['Gap to Personal Best'], 'o-',
        color=color, label=driver, alpha=0.7, linewidth=1, markersize=5,
    )

ax.set_title('Gap to Personal Best Lap Throughout Race', fontsize=16)
ax.set_xlabel('Lap Number', fontsize=14)
ax.set_ylabel('Gap (seconds)', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

## Visualization: Heat Map of Lap Times

In [None]:
# Bucket laps into consecutive groups of five: the arithmetic maps laps 1-5 to
# group 1, laps 6-10 to group 2, and so on.
zero_based_lap = df['Lap'] - 1
df['Lap Group'] = zero_based_lap // 5 + 1

# Mean lap time for every (driver, lap group) pair: drivers as rows, groups as columns
pivot_data = df.pivot_table(
    index='DriverName',
    columns='Lap Group',
    values='T (Time)',
    aggfunc='mean',
)

# Annotated heat map of those means
heat_fig, heat_ax = plt.subplots(figsize=(16, 6))
sns.heatmap(pivot_data, annot=True, fmt=".2f", cmap="YlGnBu", linewidths=.5, ax=heat_ax)
heat_ax.set_title('Average Lap Times by 5-Lap Segments', fontsize=16)
heat_ax.set_xlabel('Lap Group (sets of 5 laps)', fontsize=14)
heat_ax.set_ylabel('Driver', fontsize=14)
heat_fig.tight_layout()
plt.show()

## Statistical Comparison

In [None]:
# Statistical comparison of the three drivers' lap-time distributions:
# a one-way ANOVA across all drivers, followed by pairwise Welch t-tests.
# Uses `scipy_stats` and `combinations` from the imports cell at the top of the
# notebook, so no import here rebinds a notebook-level name.

ALPHA = 0.05  # significance level for the ANOVA and the adjusted pairwise tests
drivers = ["O'Ward", 'Rossi', 'Palou']

# Lap times per driver. Missing values are dropped because a NaN in any sample
# makes the test statistic (and therefore the p-value) NaN.
lap_samples = {
    driver: df.loc[df['DriverName'] == driver, 'T (Time)'].dropna()
    for driver in drivers
}
groups = list(lap_samples.values())

# Compare the drivers' lap time distributions
print("Statistical comparison of lap time distributions:")
print("\nANOVA test (comparing all three drivers):")
f_val, p_val = scipy_stats.f_oneway(*groups)
print(f"F-value: {f_val:.4f}, p-value: {p_val:.4f}")
if np.isnan(p_val):
    # A NaN p-value compares False against ALPHA and would otherwise be
    # reported as "no difference"; report it as "not computable" instead.
    print("The test could not be computed (check that every driver has enough lap times).")
elif p_val < ALPHA:
    print("There is a statistically significant difference between at least two drivers' lap times.")
else:
    print("There is no statistically significant difference between the drivers' lap times.")

# Pairwise t-tests.
# Several tests are run on the same data, so each p-value is Bonferroni-adjusted
# (multiplied by the number of comparisons, capped at 1) before it is compared
# with ALPHA; this keeps the overall false-positive rate at ALPHA.
driver_pairs = list(combinations(drivers, 2))
n_comparisons = len(driver_pairs)

print(f"\nPairwise t-tests (Bonferroni-adjusted for {n_comparisons} comparisons):")
for driver1, driver2 in driver_pairs:
    t_val, p_val = scipy_stats.ttest_ind(
        lap_samples[driver1],
        lap_samples[driver2],
        equal_var=False  # Using Welch's t-test which doesn't assume equal variances
    )
    if np.isnan(p_val):
        print(f"{driver1} vs {driver2}: test could not be computed (not enough lap times)")
        continue

    p_adj = min(p_val * n_comparisons, 1.0)
    print(
        f"{driver1} vs {driver2}: t-value = {t_val:.4f}, "
        f"p-value = {p_val:.4f}, adjusted p-value = {p_adj:.4f}"
    )
    if p_adj < ALPHA:
        print(f"  There is a statistically significant difference between {driver1} and {driver2}")
    else:
        print(f"  No statistically significant difference between {driver1} and {driver2}")

## Conclusion

Based on the visualizations and statistical analysis above, we can draw the following conclusions about the three drivers' performances:

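1. **Consistency**: In the box and violin plots, a narrower spread means more consistent lap times; the `Std Dev` column of the summary table gives the same comparison as a number.
2. **Speed**: The driver with the lowest average lap time was fastest overall. Because a few very slow laps (see point 4) pull the average up, also compare each driver's median (the centre line of the box plot) and fastest lap.
3. **Patterns**: The line charts and the 5-lap heat map reveal how each driver's pace evolved throughout the race.
4. **Strategy**: Significant spikes in lap times might indicate pit stops or caution periods rather than a loss of underlying pace.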

These insights could be valuable for race strategists, team managers, and the drivers themselves in understanding their performance and identifying areas for improvement.