## Load Data

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Directory holding the raw CSV exports.
data_path = Path('./data/raw')
# glob() yields entries in filesystem order, which differs between machines;
# sort so the listing shown below is reproducible from run to run.
data_files = sorted(data_path.glob('*.csv'))
data_files

[PosixPath('data/raw/driver_pts.csv'),
 PosixPath('data/raw/team_price.csv'),
 PosixPath('data/raw/team_pts.csv'),
 PosixPath('data/raw/driver_price.csv')]

In [2]:
# Read every raw file (pipe-delimited) into a dict keyed by the file stem,
# e.g. 'driver_pts' -> DataFrame.
frames = {
    csv_file.stem: pd.read_csv(csv_file, sep='|')
    for csv_file in data_files
}


In [3]:
# Work on copies so the frames cached in `frames` stay untouched.
# Index with [] rather than .get(): a missing CSV then fails with a KeyError
# naming the table, instead of "'NoneType' object has no attribute 'copy'".
pts_driver = frames['driver_pts'].copy()
pts_team = frames['team_pts'].copy()
price_driver = frames['driver_price'].copy()
price_team = frames['team_price'].copy()

## Clean and Calculate

### Points Tables

In [4]:
# Race columns are either three-letter upper-case codes (e.g. 'BAH') or carry
# a '.1' / '.2' suffix (e.g. 'AUS.1').
track_cols = [
    name
    for name in pts_driver.columns
    if (len(name) == 3 and name.isupper()) or name.endswith(('.1', '.2'))
]
track_cols

['BAH',
 'SAU',
 'AUS',
 'ITA',
 'USA',
 'SPA',
 'MON',
 'AZE',
 'CAN',
 'BRI',
 'AUS.1',
 'FRA',
 'HUN',
 'BEL',
 'NET',
 'ITA.1',
 'SIN',
 'JAP',
 'USA.1',
 'MEX',
 'BRA',
 'ABU']

#### Tracks with Scores

In [5]:
# A race counts as "scored" once any driver has a non-zero score for it.
# Comparing the column *sum* with 0 would drop a race whose scores happen to
# total zero or less (individual scores can be negative).
# fillna(0) keeps an all-missing column from being treated as scored.
has_score = pts_driver[track_cols].fillna(0).ne(0).any()
keep_track_cols = has_score[has_score].index
keep_track_cols

Index(['BAH', 'SAU', 'AUS', 'ITA', 'USA', 'SPA', 'MON', 'AZE', 'CAN', 'BRI',
       'AUS.1', 'FRA', 'HUN', 'BEL', 'NET'],
      dtype='object')

In [6]:
# Driver scores restricted to the races that have been scored so far.
pts_driver.loc[:, keep_track_cols]

Unnamed: 0,BAH,SAU,AUS,ITA,USA,SPA,MON,AZE,CAN,BRI,AUS.1,FRA,HUN,BEL,NET
0,34,13,26,5,19,21,11,26,48,39,39,38,44,4,25
1,5,45,4,62,56,45,28,54,44,16,53,45,55,49,54
2,28,26,33,33,36,33,26,34,27,1,35,30,27,30,53
3,4,22,35,44,28,49,46,38,-8,36,4,22,23,32,20
4,9,-3,-5,2,5,14,19,22,7,37,18,23,12,24,34
5,49,41,49,30,50,5,22,5,23,22,60,5,14,20,31
6,-4,17,14,13,-1,5,15,26,11,-6,8,13,15,16,6
7,4,15,-7,24,-5,17,10,25,14,19,8,11,16,19,12
8,32,27,-5,7,31,25,34,0,42,49,10,32,26,31,11
9,19,-9,-1,30,3,15,-5,-3,-9,4,3,-2,-2,12,-3


In [7]:
# Per-driver mean score over the scored races (one value per row).
pts_driver.loc[:, keep_track_cols].mean(axis='columns')

0     26.133333
1     41.000000
2     30.133333
3     26.333333
4     14.533333
5     28.400000
6      9.866667
7     12.133333
8     23.466667
9      3.466667
10    15.533333
11    11.000000
12    12.133333
13     8.733333
14     4.466667
15     5.000000
16     6.933333
17     5.866667
18     6.000000
19    16.733333
dtype: float64

#### Points Metrics

In [8]:
def calc_metrics(df, track_cols=None):
    """Return a copy of ``df`` with per-row summary columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Points table with one column per race.
    track_cols : list-like of column labels, optional
        Race columns to summarise. Defaults to the notebook-level
        ``keep_track_cols`` when omitted, which is what the original
        single-argument call did.

    Returns
    -------
    pandas.DataFrame
        A new frame with three added (or overwritten) columns: ``avg``,
        ``max`` and ``median``, each computed row-wise over the race columns.
        The input frame is not modified, so re-running the cell is safe.
    """
    cols = keep_track_cols if track_cols is None else track_cols
    scores = df[cols]

    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(
        avg=scores.mean(axis=1),
        max=scores.max(axis=1),
        median=scores.median(axis=1),
    )

# Append the avg / max / median columns to both points tables.
pts_driver, pts_team = calc_metrics(pts_driver), calc_metrics(pts_team)

# Sum of the per-row average scores across all drivers and across all teams.
driver_total_avg_points = pts_driver.loc[:, 'avg'].sum()
car_total_avg_points = pts_team.loc[:, 'avg'].sum()

### Price Tables

In [9]:
def fix_prices(df):
    """Return a copy of ``df`` with its two price columns converted to float.

    The ``'Current Price'`` and ``'Season Start PriceSeason Price'`` columns
    hold strings from which the ``$`` and ``m`` characters are stripped before
    parsing (presumably values such as ``'$30.6m'`` -- confirm against the raw
    files).

    Parameters
    ----------
    df : pandas.DataFrame
        Price table containing both price columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with both columns as floats. The input is left untouched,
        and columns that are already numeric pass through unchanged, so the
        cell can be re-run without error.
    """
    out = df.copy()
    for col in ['Current Price', 'Season Start PriceSeason Price']:
        if pd.api.types.is_numeric_dtype(out[col]):
            # Already converted (e.g. on a re-run); the .str accessor would fail.
            out[col] = out[col].astype(float)
            continue
        # regex=False: '$' is a regex anchor, so match it as a literal character
        # regardless of the pandas version's default.
        out[col] = (
            out[col]
            .str.replace('$', '', regex=False)
            .str.replace('m', '', regex=False)
            .astype(float)
        )

    return out

# Convert the price columns of both price tables from strings to floats.
price_driver, price_team = map(fix_prices, (price_driver, price_team))

  df[col] = df[col].str.replace('$', '').str.replace('m', '').astype(float)


In [10]:
# Combined current price of all drivers, and of all teams.
driver_total_price, car_total_price = (
    table['Current Price'].sum() for table in (price_driver, price_team)
)

In [11]:
# Total current price across all teams.
car_total_price

167.7

In [12]:
# Team total at season-start prices, for comparison with the current total.
price_team['Season Start PriceSeason Price'].sum()

167.5

In [13]:
# Drivers' share of the combined (drivers + teams) current price.
driver_total_price / (driver_total_price + car_total_price)

0.6259201427615436

In [14]:
# Ratio of total driver price to total team price.
driver_total_price / car_total_price

1.673225998807394

## Performance Prices

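Calculate the price a driver or car *should* fetch based on its actual scoring, i.e. its points scaled by the overall price-per-point ratio:

$$\text{pts\_price} = \text{avg points} \times \frac{\text{total price}}{\text{total average points}}$$

* Total price: the sum of the current prices of all drivers/cars
* Total average points: the sum of the average points scored by all drivers/cars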

In [15]:
# Convert points into a price: metric * total price / total average points.
# Both the mean-based ('pts_price') and median-based ('pts_price_med') prices
# are scaled by the same totals for their table.
for table, total_price, total_avg_points in (
    (pts_driver, driver_total_price, driver_total_avg_points),
    (pts_team, car_total_price, car_total_avg_points),
):
    for metric, price_col in (('avg', 'pts_price'), ('median', 'pts_price_med')):
        table[price_col] = table[metric] * total_price / total_avg_points


In [16]:
# Driver points table, now including the metric and points-implied price columns.
pts_driver

Unnamed: 0.1,Unnamed: 0,Driver,BAH,SAU,AUS,ITA,USA,SPA,MON,AZE,...,USA.1,MEX,BRA,ABU,Total,avg,max,median,pts_price,pts_price_med
0,0,Hamilton Mercedes,34,13,26,5,19,21,11,26,...,0,0,0,0,392,26.133333,48,26.0,23.818796,23.697272
1,1,Verstappen Red Bull,5,45,4,62,56,45,28,54,...,0,0,0,0,615,41.0,62,45.0,37.368774,41.014508
2,2,Russell Mercedes,28,26,33,33,36,33,26,34,...,0,0,0,0,452,30.133333,53,30.0,27.46453,27.343006
3,3,Perez Red Bull,4,22,35,44,28,49,46,38,...,0,0,0,0,395,26.333333,49,28.0,24.001083,25.520139
4,4,Alonso Alpine,9,-3,-5,2,5,14,19,22,...,0,0,0,0,218,14.533333,37,14.0,13.246167,12.760069
5,5,Leclerc Ferrari,49,41,49,30,50,5,22,5,...,0,0,0,0,426,28.4,60,23.0,25.884712,20.962971
6,6,Gasly AlphaTauri,-4,17,14,13,-1,5,15,26,...,0,0,0,0,148,9.866667,26,13.0,8.992811,11.848636
7,7,Vettel Aston Martin,4,15,-7,24,-5,17,10,25,...,0,0,0,0,182,12.133333,25,14.0,11.058727,12.760069
8,8,Sainz Ferrari,32,27,-5,7,31,25,34,0,...,0,0,0,0,352,23.466667,49,27.0,21.388307,24.608705
9,9,Tsunoda AlphaTauri,19,-9,-1,30,3,15,-5,-3,...,0,0,0,0,52,3.466667,30,-1.0,3.159636,-0.911434


## Merge

### Driver

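* Driver names are written differently in the points and price tables, so build an index mapping between the two by matching on last name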

In [17]:
# Driver labels differ between the two tables, so derive a join key: the first
# space-separated token of the points-table label, which is the surname in
# labels such as 'Hamilton Mercedes'.
# .str[0] yields a Series; the previous [[0]] selected a one-column DataFrame
# and relied on pandas coercing it during column assignment.
pts_driver['last_name'] = pts_driver['Driver'].str.split(' ', n=1).str[0]

In [18]:
# Cross product: for each points-table surname, flag (1/0) every price-table
# row whose driver label contains it. Rows follow pts_driver, columns follow
# price_driver.
# regex=False: a surname is literal text, so characters like '.' or '(' must
# not be interpreted as regular-expression syntax.
index_map = pts_driver['last_name'].apply(
    lambda surname: price_driver['Driver'].str.contains(surname, regex=False).astype(int)
)
index_map

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [19]:
# Positions of every 1 in index_map, paired up as
# (points-table position, price-table position).
match_positions = np.where(index_map == 1)
driver_map = pd.DataFrame(list(zip(*match_positions)), columns=['pts', 'price'])
driver_map

Unnamed: 0,pts,price
0,0,0
1,1,1
2,2,2
3,3,4
4,4,9
5,5,3
6,6,8
7,7,11
8,8,5
9,9,14


In [20]:
# Step 1: attach the position map to the points table (points index == 'pts').
pts_mapped = pts_driver.merge(driver_map, left_index=True, right_on='pts')
# Step 2: bring in the price row that each 'price' position refers to; column
# names that clash get a '_price' suffix on the price-table side.
driver_df = pts_mapped.merge(
    price_driver,
    left_on='price',
    right_index=True,
    suffixes=['', '_price'],
)
driver_df

Unnamed: 0.1,Unnamed: 0,Driver,BAH,SAU,AUS,ITA,USA,SPA,MON,AZE,...,pts_price,pts_price_med,last_name,pts,price,Unnamed: 0_price,Driver_price,Current Price,Season Start PriceSeason Price,Points/Million
0,0,Hamilton Mercedes,34,13,26,5,19,21,11,26,...,23.818796,23.697272,Hamilton,0,0,0,Lewis Hamilton HAM Mercedes,30.6,31.0,0.85
1,1,Verstappen Red Bull,5,45,4,62,56,45,28,54,...,37.368774,41.014508,Verstappen,1,1,1,Max Verstappen VER Red Bull,30.4,30.5,1.35
2,2,Russell Mercedes,28,26,33,33,36,33,26,34,...,27.46453,27.343006,Russell,2,2,2,George Russell RUS Mercedes,23.8,24.0,1.27
3,3,Perez Red Bull,4,22,35,44,28,49,46,38,...,24.001083,25.520139,Perez,3,4,4,Sergio Perez PER Red Bull,17.9,17.5,1.47
4,4,Alonso Alpine,9,-3,-5,2,5,14,19,22,...,13.246167,12.760069,Alonso,4,9,9,Fernando Alonso ALO Alpine,12.7,12.5,1.14
5,5,Leclerc Ferrari,49,41,49,30,50,5,22,5,...,25.884712,20.962971,Leclerc,5,3,3,Charles Leclerc LEC Ferrari,18.7,18.0,1.52
6,6,Gasly AlphaTauri,-4,17,14,13,-1,5,15,26,...,8.992811,11.848636,Gasly,6,8,8,Pierre Gasly GAS AlphaTauri,13.0,13.5,0.76
7,7,Vettel Aston Martin,4,15,-7,24,-5,17,10,25,...,11.058727,12.760069,Vettel,7,11,11,Sebastian Vettel VET Aston Martin,11.8,11.5,1.03
8,8,Sainz Ferrari,32,27,-5,7,31,25,34,0,...,21.388307,24.608705,Sainz,8,5,5,SAI Carlos Sainz Ferrari,17.2,17.0,1.36
9,9,Tsunoda AlphaTauri,19,-9,-1,30,3,15,-5,-3,...,3.159636,-0.911434,Tsunoda,9,14,14,Yuki Tsunoda TSU AlphaTauri,8.4,8.5,0.41


### Car/Team Join

Straightforward: both tables share a `Team` column, so they can be merged on it directly.

In [21]:
# Join team points and team prices on the shared 'Team' column.
# validate= makes pandas raise a MergeError if a team name is duplicated on
# either side, instead of silently multiplying rows in the result.
car_df = pts_team.merge(price_team, on='Team', validate='one_to_one')

In [22]:
# Merged team table: points, metrics and prices side by side.
car_df

Unnamed: 0,Unnamed: 0_x,Team,BAH,SAU,AUS,ITA,USA,SPA,MON,AZE,...,Total,avg,max,median,pts_price,pts_price_med,Unnamed: 0_y,Current Price,Season Start PriceSeason Price,Points/Million
0,0,Mercedes,57,34,64,31,40,49,42,60,...,760.0,50.666667,76,55.0,33.788971,36.678818,0,34.2,34.5,1.48
1,1,Red Bull,4,62,39,99,69,94,69,77,...,911.0,60.733333,99,63.0,40.502306,42.013918,1,32.7,32.5,1.86
2,2,Alpine,24,13,10,8,19,33,15,27,...,347.0,23.133333,40,23.0,15.427333,15.338415,4,14.1,14.0,1.64
3,3,Ferrari,76,63,44,30,56,30,51,3,...,697.0,46.466667,76,47.0,30.988043,31.343717,2,25.8,25.0,1.8
4,4,AlphaTauri,10,3,8,36,-3,15,5,18,...,121.0,8.066667,36,6.0,5.37956,4.001326,6,10.2,10.5,0.79
5,5,Aston Martin,14,19,3,32,1,20,15,28,...,285.0,19.0,32,20.0,12.670864,13.337752,5,11.8,11.5,1.61
6,6,Alfa Romeo,31,3,24,35,3,10,21,3,...,146.0,9.733333,35,7.0,6.491039,4.668213,7,8.6,8.0,1.13
7,7,Williams,15,-1,8,17,26,14,1,20,...,127.0,8.466667,26,8.0,5.646315,5.335101,9,6.2,7.0,1.37
8,8,Haas,27,1,13,8,5,-6,-16,3,...,116.0,7.733333,38,5.0,5.157264,3.334438,8,6.4,6.0,1.21
9,9,Mclaren,9,9,37,41,1,13,26,24,...,262.0,17.466667,41,16.0,11.648303,10.670201,3,17.7,18.5,0.99


## Actual to Performance Price Differentials

In [23]:
# Positive diff_price => the points-implied price exceeds the current price.
driver_df['diff_price'] = driver_df['pts_price'].sub(driver_df['Current Price'])
(
    driver_df
    .loc[:, ['Driver', 'diff_price', 'Current Price', 'avg', 'median']]
    .sort_values('diff_price', ascending=False)
)

Unnamed: 0,Driver,diff_price,Current Price,avg,median
5,Leclerc Ferrari,7.184712,18.7,28.4,23.0
1,Verstappen Red Bull,6.968774,30.4,41.0,45.0
3,Perez Red Bull,6.101083,17.9,26.333333,28.0
8,Sainz Ferrari,4.188307,17.2,23.466667,27.0
2,Russell Mercedes,3.66453,23.8,30.133333,30.0
12,Stroll Aston Martin,1.958727,9.1,12.133333,11.0
10,Ocon Alpine,1.857601,12.3,15.533333,19.0
4,Alonso Alpine,0.546167,12.7,14.533333,14.0
11,Bottas Alfa Romeo,0.525769,9.5,11.0,14.0
13,Albon Williams,0.259853,7.7,8.733333,10.0


In [24]:
# Same differential for teams; this view is ordered by average points scored.
car_df['diff_price'] = car_df['pts_price'].sub(car_df['Current Price'])
(
    car_df
    .loc[:, ['Team', 'diff_price', 'Current Price', 'avg', 'median']]
    .sort_values('avg', ascending=False)
)

Unnamed: 0,Team,diff_price,Current Price,avg,median
1,Red Bull,7.802306,32.7,60.733333,63.0
0,Mercedes,-0.411029,34.2,50.666667,55.0
3,Ferrari,5.188043,25.8,46.466667,47.0
2,Alpine,1.327333,14.1,23.133333,23.0
5,Aston Martin,0.870864,11.8,19.0,20.0
9,Mclaren,-6.051697,17.7,17.466667,16.0
6,Alfa Romeo,-2.108961,8.6,9.733333,7.0
7,Williams,-0.553685,6.2,8.466667,8.0
4,AlphaTauri,-4.82044,10.2,8.066667,6.0
8,Haas,-1.242736,6.4,7.733333,5.0


## Save Data

In [25]:
# Output directory for the merged tables. Create it up front (including
# parents) so writing the CSVs does not fail on a fresh checkout where
# ./data/transform does not exist yet.
save_path = Path('./data/transform')
save_path.mkdir(parents=True, exist_ok=True)

In [26]:
# Write both merged tables as pipe-delimited CSVs without the index column.
for filename, table in (('driver.csv', driver_df), ('car.csv', car_df)):
    table.to_csv(save_path / filename, sep='|', index=False)