## Load Data

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Folder holding the raw CSV exports.
data_path = Path('./data/raw')
# Materialise the glob so the matches can be displayed and iterated again later.
data_files = [csv_path for csv_path in data_path.glob('*.csv')]
data_files

[PosixPath('data/raw/driver_pts.csv'),
 PosixPath('data/raw/team_price.csv'),
 PosixPath('data/raw/team_pts.csv'),
 PosixPath('data/raw/driver_price.csv')]

In [2]:
# Read every raw file (pipe-delimited) into a DataFrame, keyed by the file
# name without its extension.
frames = {
    csv_file.stem: pd.read_csv(csv_file, sep='|')
    for csv_file in data_files
}


In [3]:
# Work on copies so the frames held in `frames` stay as loaded.
pts_driver, pts_team, price_driver, price_team = (
    frames.get(stem).copy()
    for stem in ('driver_pts', 'team_pts', 'driver_price', 'team_price')
)

## Clean and Calculate

### Points Tables

In [4]:
def _is_track_col(col):
    """Return True for a three-letter upper-case name or a name ending in '.1' / '.2'.

    The suffixed names (e.g. 'AUS.1') presumably come from repeated track
    codes in the CSV header.
    """
    return (len(col) == 3 and col.isupper()) or col.endswith(('.1', '.2'))


track_cols = list(filter(_is_track_col, pts_driver.columns))
track_cols

['BAH',
 'SAU',
 'AUS',
 'ITA',
 'USA',
 'SPA',
 'MON',
 'AZE',
 'CAN',
 'BRI',
 'AUS.1',
 'FRA',
 'HUN',
 'BEL',
 'NET',
 'ITA.1',
 'SIN',
 'JAP',
 'USA.1',
 'MEX',
 'BRA',
 'ABU']

#### Tracks with Scores

In [5]:
# Sum each track column once, then keep only the tracks whose driver total is
# positive.
# NOTE(review): a track whose points net out to zero or below would also be
# excluded by this test -- confirm that cannot happen in practice.
track_totals = pts_driver[track_cols].sum()
keep_track_cols = track_totals[track_totals > 0].index

#### Points Metrics

In [6]:
def calc_metrics(df, track_cols=None):
    """Add per-row ``avg``, ``max`` and ``median`` columns over the scored tracks.

    Columns that contain a zero or a missing value are dropped, except for the
    scored track columns themselves, which are kept with their real values.
    Dropping on "any zero" alone meant that a single genuine 0-point result in
    a scored track removed the whole column, after which selecting the scored
    tracks raised ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Points table with one column per track.
    track_cols : list-like of str, optional
        Track columns to aggregate. Defaults to the notebook-level
        ``keep_track_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) without the dropped columns
        and with ``avg``, ``max`` and ``median`` appended.

    Raises
    ------
    KeyError
        If one of ``track_cols`` is not a column of ``df``.
    """
    if track_cols is None:
        track_cols = keep_track_cols
    track_cols = list(track_cols)

    # A column "has a gap" when any cell is 0 or NaN.
    has_gap = df.replace(0, np.nan).isna().any()
    # Scored tracks are exempt: a zero there is a real result, not a gap.
    stale_cols = [col for col in has_gap.index[has_gap] if col not in track_cols]
    df = df.drop(columns=stale_cols)

    # Snapshot the track scores so the metric columns never feed into each other.
    scores = df[track_cols]
    df['avg'] = scores.mean(axis=1)
    df['max'] = scores.max(axis=1)
    df['median'] = scores.median(axis=1)

    return df

# Attach avg / max / median to both points tables.
pts_driver, pts_team = map(calc_metrics, (pts_driver, pts_team))

# Sum of the per-row averages: total points scored "on average" by all
# drivers, and likewise by all teams.
driver_total_avg_points, car_total_avg_points = (
    tbl['avg'].sum() for tbl in (pts_driver, pts_team)
)

### Price Tables

In [7]:
def fix_prices(df):
    """Convert the two price columns from text to floats.

    Every ``'$'`` and ``'m'`` character is stripped from
    ``'Current Price'`` and ``'Season Start PriceSeason Price'`` before the
    values are cast to ``float``. Columns that are already numeric are left
    alone, so re-running the cell is harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Price table containing both price columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    for col in ['Current Price', 'Season Start PriceSeason Price']:
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already converted (e.g. the cell was run twice); nothing to strip.
            continue
        # regex=False: '$' must be removed as a literal character rather than
        # being read as the end-of-string anchor.
        df[col] = (
            df[col]
            .str.replace('$', '', regex=False)
            .str.replace('m', '', regex=False)
            .astype(float)
        )

    return df

# Turn the price columns of both price tables into floats.
price_driver, price_team = map(fix_prices, (price_driver, price_team))

  df[col] = df[col].str.replace('$', '').str.replace('m', '').astype(float)


In [8]:
# Summed current price of all drivers and of all teams.
driver_total_price, car_total_price = (
    tbl['Current Price'].sum() for tbl in (price_driver, price_team)
)

In [9]:
# Display the summed 'Current Price' of the team table.
car_total_price

165.9

In [10]:
# Same team total computed from the season-start price column, for comparison
# with the current-price total.
price_team['Season Start PriceSeason Price'].sum()

167.5

In [11]:
# Fraction of the combined driver + team price that is made up by drivers.
combined_total_price = driver_total_price + car_total_price
driver_total_price / combined_total_price

0.627692998204668

In [12]:
# Ratio of the summed driver price to the summed team price.
driver_total_price / car_total_price

1.6859553948161545

## Performance Prices

Calculate the price a driver or car should fetch based on its actual scoring relative to:

* The total price of all drivers/cars
* Total average points scored by all drivers/cars

In [13]:
# For each table, scale the points metric by (total price / total average
# points). The median-based price uses the same average-based scale.
for table, total_price, total_avg_points in (
    (pts_driver, driver_total_price, driver_total_avg_points),
    (pts_team, car_total_price, car_total_avg_points),
):
    table['pts_price'] = table['avg'] * total_price / total_avg_points
    table['pts_price_med'] = table['median'] * total_price / total_avg_points


In [14]:
# Display the driver points table, including the metric and performance-price columns.
pts_driver

Unnamed: 0,Driver,BAH,SAU,AUS,ITA,USA,Total,avg,max,median,pts_price,pts_price_med
0,Hamilton Mercedes,34,13,26,5,19,97,19.4,34,19.0,17.358221,17.00032
1,Verstappen Red Bull,5,45,4,62,56,172,34.4,62,45.0,30.779527,40.263916
2,Russell Mercedes,28,26,33,33,36,156,31.2,36,33.0,27.916315,29.526871
3,Perez Red Bull,4,22,35,44,28,133,26.6,44,28.0,23.800448,25.053103
4,Alonso Alpine,9,-3,-5,2,11,14,2.8,11,2.0,2.50531,1.789507
5,Leclerc Ferrari,49,41,49,30,50,219,43.8,50,49.0,39.190211,43.84293
6,Gasly AlphaTauri,-4,17,14,13,-1,39,7.8,17,13.0,6.979079,11.631798
7,Vettel Aston Martin,4,15,-7,24,-5,31,6.2,24,4.0,5.547473,3.579015
8,Sainz Ferrari,32,27,-5,7,31,92,18.4,32,27.0,16.463468,24.158349
9,Tsunoda AlphaTauri,19,-9,-1,30,1,40,8.0,30,1.0,7.158029,0.894754


## Merge

### Driver

* Driver names are formatted differently in the points and price tables, so build an index mapping between the two before merging.

In [15]:
# Driver names are formatted differently in the two tables. In the points
# table the first space-separated token is the surname (e.g. "Hamilton
# Mercedes"), so use it as the join key.
name_parts = pts_driver['Driver'].str.split(' ', expand=True)
pts_driver['last_name'] = name_parts[0]

In [16]:
def _price_rows_matching(name):
    """Flag (1/0) each price-table 'Driver' entry that contains ``name``."""
    # NOTE(review): `str.contains` does a pattern/substring search, so a name
    # matching zero or several price rows silently drops or duplicates that
    # driver in the mapping built from this table.
    return price_driver['Driver'].str.contains(name).astype(int)


# Cross product: one row per points-table driver, one column per price-table
# row, with 1 where the surname occurs in the price-table name.
index_map = pts_driver['last_name'].apply(_price_rows_matching)
index_map

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [17]:
# Positions of the 1-cells: row position -> points table, column position ->
# price table.
pts_pos, price_pos = np.where(index_map.eq(1))
driver_map = pd.DataFrame(list(zip(pts_pos, price_pos)), columns=['pts', 'price'])
driver_map

Unnamed: 0,pts,price
0,0,1
1,1,0
2,2,2
3,3,4
4,4,9
5,5,3
6,6,8
7,7,11
8,8,5
9,9,15


In [18]:
# Step 1: attach the mapping to the points rows through the points index.
pts_with_map = pts_driver.merge(driver_map, left_index=True, right_on='pts')
# Step 2: pull in the price rows through the mapped price index. Clashing
# price-table columns get the '_price' suffix.
driver_df = pts_with_map.merge(
    price_driver,
    left_on='price',
    right_index=True,
    suffixes=('', '_price'),
)
driver_df

Unnamed: 0.1,Driver,BAH,SAU,AUS,ITA,USA,Total,avg,max,median,pts_price,pts_price_med,last_name,pts,price,Unnamed: 0,Driver_price,Current Price,Season Start PriceSeason Price,Points/Million
0,Hamilton Mercedes,34,13,26,5,19,97,19.4,34,19.0,17.358221,17.00032,Hamilton,0,1,1,Lewis Hamilton HAM Mercedes,30.2,31.0,0.64
1,Verstappen Red Bull,5,45,4,62,56,172,34.4,62,45.0,30.779527,40.263916,Verstappen,1,0,0,Max Verstappen VER Red Bull,30.3,30.5,1.14
2,Russell Mercedes,28,26,33,33,36,156,31.2,36,33.0,27.916315,29.526871,Russell,2,2,2,George Russell RUS Mercedes,24.2,24.0,1.29
3,Perez Red Bull,4,22,35,44,28,133,26.6,44,28.0,23.800448,25.053103,Perez,3,4,4,Sergio Perez PER Red Bull,18.2,17.5,1.46
4,Alonso Alpine,9,-3,-5,2,11,14,2.8,11,2.0,2.50531,1.789507,Alonso,4,9,9,Fernando Alonso ALO Alpine,12.5,12.5,0.22
5,Leclerc Ferrari,49,41,49,30,50,219,43.8,50,49.0,39.190211,43.84293,Leclerc,5,3,3,Charles Leclerc LEC Ferrari,18.9,18.0,2.32
6,Gasly AlphaTauri,-4,17,14,13,-1,39,7.8,17,13.0,6.979079,11.631798,Gasly,6,8,8,Pierre Gasly GAS AlphaTauri,13.0,13.5,0.6
7,Vettel Aston Martin,4,15,-7,24,-5,31,6.2,24,4.0,5.547473,3.579015,Vettel,7,11,11,Sebastian Vettel VET Aston Martin,11.4,11.5,0.54
8,Sainz Ferrari,32,27,-5,7,31,92,18.4,32,27.0,16.463468,24.158349,Sainz,8,5,5,SAI Carlos Sainz Ferrari,17.2,17.0,1.07
9,Tsunoda AlphaTauri,19,-9,-1,30,1,40,8.0,30,1.0,7.158029,0.894754,Tsunoda,9,15,15,Yuki Tsunoda TSU AlphaTauri,8.3,8.5,0.96


### Car/Team Join

Straightforward: both team tables share a `Team` column, so they can be joined on it directly.

In [19]:
# Both team tables carry a 'Team' column, so join on it directly.
# NOTE(review): this is the default inner join -- a team spelled differently
# in the two tables would drop out silently.
car_df = pd.merge(pts_team, price_team, on='Team')

In [20]:
# Display the merged team points/price table.
car_df

Unnamed: 0.1,Team,BAH,SAU,AUS,ITA,USA,Total,avg,max,median,pts_price,pts_price_med,Unnamed: 0,Current Price,Season Start PriceSeason Price,Points/Million
0,Mercedes,57,34,64,31,50,236.0,47.2,64,50.0,29.81904,31.587966,0,33.8,34.5,1.4
1,Red Bull,4,62,39,99,79,283.0,56.6,99,62.0,35.757578,39.169078,1,32.1,32.5,1.76
2,Alpine,24,13,10,8,19,74.0,14.8,24,13.0,9.350038,8.212871,4,13.9,14.0,1.06
3,Ferrari,76,63,44,30,76,289.0,57.8,76,63.0,36.515689,39.800838,2,25.8,25.0,2.24
4,AlphaTauri,10,3,8,36,-5,52.0,10.4,36,8.0,6.570297,5.054075,6,10.1,10.5,1.03
5,Aston Martin,14,19,3,32,-4,64.0,12.8,32,14.0,8.086519,8.844631,5,11.0,11.5,1.16
6,Alfa Romeo,31,3,24,35,3,96.0,19.2,35,24.0,12.129779,15.162224,7,8.4,8.0,2.29
7,Williams,15,-1,8,17,25,64.0,12.8,25,15.0,8.086519,9.47639,9,6.5,7.0,1.97
8,Haas,27,1,13,8,5,54.0,10.8,27,8.0,6.823001,5.054075,8,6.7,6.0,1.61
9,Mclaren,9,9,37,41,5,101.0,20.2,41,9.0,12.761538,5.685834,3,17.6,18.5,1.15


## Actual to Performance Price Differentials

In [21]:
# Positive diff: the performance-based price is above the current price.
driver_df['diff_price'] = driver_df['pts_price'].sub(driver_df['Current Price'])

driver_report_cols = ['Driver', 'diff_price', 'Current Price', 'avg', 'median']
driver_df.loc[:, driver_report_cols].sort_values(by='avg', ascending=False)

Unnamed: 0,Driver,diff_price,Current Price,avg,median
5,Leclerc Ferrari,20.290211,18.9,43.8,49.0
1,Verstappen Red Bull,0.479527,30.3,34.4,45.0
2,Russell Mercedes,3.716315,24.2,31.2,33.0
3,Perez Red Bull,5.600448,18.2,26.6,28.0
0,Hamilton Mercedes,-12.841779,30.2,19.4,19.0
8,Sainz Ferrari,-0.736532,17.2,18.4,27.0
11,Bottas Alfa Romeo,6.884517,9.4,18.2,20.0
19,Norris Mclaren,-0.352335,16.1,17.6,22.0
10,Ocon Alpine,3.168714,12.4,17.4,20.0
16,Magnussen Haas,5.889699,6.1,13.4,14.0


In [22]:
# Positive diff: the performance-based price is above the current price.
car_df['diff_price'] = car_df['pts_price'].sub(car_df['Current Price'])

car_report_cols = ['Team', 'diff_price', 'Current Price', 'avg', 'median']
car_df.loc[:, car_report_cols].sort_values(by='avg', ascending=False)

Unnamed: 0,Team,diff_price,Current Price,avg,median
3,Ferrari,10.715689,25.8,57.8,63.0
1,Red Bull,3.657578,32.1,56.6,62.0
0,Mercedes,-3.98096,33.8,47.2,50.0
9,Mclaren,-4.838462,17.6,20.2,9.0
6,Alfa Romeo,3.729779,8.4,19.2,24.0
2,Alpine,-4.549962,13.9,14.8,13.0
5,Aston Martin,-2.913481,11.0,12.8,14.0
7,Williams,1.586519,6.5,12.8,15.0
8,Haas,0.123001,6.7,10.8,8.0
4,AlphaTauri,-3.529703,10.1,10.4,8.0


## Save Data

In [23]:
# Output folder for the merged driver and team tables.
save_path = Path('./data/transform')

In [24]:
# Create the output folder first; `to_csv` does not create missing parent
# directories, so a fresh checkout would otherwise fail here.
save_path.mkdir(parents=True, exist_ok=True)

# Written pipe-delimited and without the index column.
driver_df.to_csv(save_path / 'driver.csv', sep='|', index=False)
car_df.to_csv(save_path / 'car.csv', sep='|', index=False)