## Load Data

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Directory that holds the raw CSV exports; collect every CSV file in it.
data_path = Path('./data/raw')
data_files = [*data_path.glob('*.csv')]
data_files

In [None]:
# One DataFrame per raw file, keyed by the file name without its extension.
# The raw files are read as pipe-delimited.
frames = {csv_file.stem: pd.read_csv(csv_file, sep='|') for csv_file in data_files}


In [None]:
# Work on copies so the frames held in `frames` stay untouched.
# Index directly instead of using `.get`: a missing input file then fails
# with a KeyError naming the table rather than an AttributeError on None.
pts_driver = frames['driver_pts'].copy()
pts_team = frames['team_pts'].copy()
price_driver = frames['driver_price'].copy()
price_team = frames['team_price'].copy()

## Clean and Calculate

### Points Tables

In [None]:
# A column counts as a track column when its name is a three-character
# upper-case code or ends in '.1' / '.2'.
# NOTE(review): the '.1' / '.2' suffixes presumably come from pandas
# de-duplicating repeated track codes on read — confirm against the raw file.
track_cols = [
    name
    for name in pts_driver.columns
    if (len(name) == 3 and name.isupper()) or name.endswith(('.1', '.2'))
]
track_cols

#### Tracks with Scores

In [None]:
# Keep only the track columns whose driver points sum to more than zero.
keep_track_cols = (
    pts_driver[track_cols]
    .sum()
    .loc[lambda totals: totals > 0]
    .index
)
keep_track_cols

In [None]:
# Preview the driver points restricted to the tracks that have scores.
pts_driver[keep_track_cols]

In [None]:
# Preview the per-row mean across the scored tracks (same value calc_metrics stores as 'avg').
pts_driver[keep_track_cols].mean(axis=1)

#### Points Metrics

In [None]:
def calc_metrics(df, score_cols=None):
    """Add per-row summary statistics of the scored-track columns to ``df``.

    Args:
        df: Points table with one column per label in ``score_cols``.
            The frame is modified in place and also returned.
        score_cols: Ordered column labels to summarise. When omitted, the
            notebook-level ``keep_track_cols`` is used.

    Returns:
        The same ``df`` object with ``avg``, ``avg_8r``, ``avg_4r``, ``max``
        and ``median`` columns added.
    """
    cols = list(keep_track_cols if score_cols is None else score_cols)
    # Select the score columns once and reuse the selection for every metric.
    scores = df[cols]

    df['avg'] = scores.mean(axis=1)
    # Trailing windows: the last 8 / last 4 columns in `cols`.
    # NOTE(review): assumes the columns are in chronological order, so these
    # are the most recent races — confirm against the raw table.
    df['avg_8r'] = scores.iloc[:, -8:].mean(axis=1)
    df['avg_4r'] = scores.iloc[:, -4:].mean(axis=1)
    df['max'] = scores.max(axis=1)
    df['median'] = scores.median(axis=1)

    return df

# Append the summary-metric columns to both points tables.
pts_driver, pts_team = calc_metrics(pts_driver), calc_metrics(pts_team)

# Sum of the per-row 'avg' values, i.e. the total points scored on average
# across all drivers and across all teams respectively.
driver_total_avg_points, car_total_avg_points = (
    pts_driver['avg'].sum(),
    pts_team['avg'].sum(),
)

In [None]:
# Show which columns feed the 'avg_4r' metric (the last four scored tracks).
keep_track_cols[-4:]

In [None]:
# Inspect the driver points table with the metric columns added.
pts_driver

### Price Tables

In [None]:
def fix_prices(df):
    """Convert the price columns from text such as ``'$30.5m'`` to floats.

    Args:
        df: Price table containing the ``'Current Price'`` and
            ``'Season Start PriceSeason Price'`` columns. The frame is
            modified in place and also returned.

    Returns:
        The same ``df`` object with both price columns as floats.
    """
    for col in ['Current Price', 'Season Start PriceSeason Price']:
        # Already numeric (e.g. the cell was re-run): nothing to clean.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        # Strip '$' and 'm' explicitly. A bare '$' pattern is a regex anchor
        # when `regex` is left to its version-dependent default, so spell the
        # pattern out as a character class with regex=True.
        df[col] = df[col].str.replace(r'[$m]', '', regex=True).astype(float)

    return df

# Normalise the price columns of both price tables to floats.
price_driver, price_team = fix_prices(price_driver), fix_prices(price_team)

In [None]:
# Sum of the current prices over all drivers and over all teams.
driver_total_price, car_total_price = (
    price_driver['Current Price'].sum(),
    price_team['Current Price'].sum(),
)

In [None]:
# Inspect the summed current price of all teams.
car_total_price

In [None]:
# Summed season-start price of all teams, for comparison with the current total.
price_team['Season Start PriceSeason Price'].sum()

In [None]:
# Share of the combined (driver + team) current price that is held by drivers.
driver_total_price / (driver_total_price + car_total_price)

In [None]:
# Ratio of total driver price to total team price.
driver_total_price / car_total_price

## Performance Prices

Calculate the price a driver or car should fetch based on its actual scoring relative to:

* The total price of all drivers/cars
* Total average points scored by all drivers/cars

In [None]:
# Performance price = a row's score scaled by (total price / total average points)
# of its own group, computed for both the mean and the median score.
# NOTE(review): the median-based price is also normalised by the total of the
# *average* points, so those prices need not sum to the total price — confirm
# that this is intended.
for table, total_price, total_avg_points in (
    (pts_driver, driver_total_price, driver_total_avg_points),
    (pts_team, car_total_price, car_total_avg_points),
):
    table['pts_price'] = table['avg'] * total_price / total_avg_points
    table['pts_price_med'] = table['median'] * total_price / total_avg_points


In [None]:
# Inspect the driver points table with the performance-price columns added.
pts_driver

## Merge

### Driver

* Driver names are written differently in the points and price tables, so build an explicit row-to-row match before merging.

In [None]:
# Driver names don't match across tables, so keep only the first
# space-delimited token of each name to use as the matching key.
# NOTE(review): the column is called `last_name` but token 0 is taken —
# confirm that the points table lists the surname first.
name_tokens = pts_driver['Driver'].str.split(' ', expand=True)
pts_driver['last_name'] = name_tokens[0]

In [None]:
# Cross product: one row per points-table driver, one column per price-table
# driver, holding 1 where the price-table name contains the points-table token.
# Match the token as a literal substring (regex=False) so characters such as
# '.' are not interpreted as regex metacharacters, and treat missing
# price-table names as "no match" (na=False) so astype(int) cannot fail on NaN.
index_map = pts_driver['last_name'].apply(
    lambda name: price_driver['Driver'].str.contains(name, regex=False, na=False).astype(int)
)
index_map

In [None]:
# Positions of every match in the cross-product matrix: the row position
# refers to the points table ('pts'), the column position to the price
# table ('price').
match_rows, match_cols = np.where(index_map.eq(1))
driver_map = pd.DataFrame(zip(match_rows, match_cols), columns=['pts', 'price'])
driver_map

In [None]:
# Step 1: attach the match table to the points rows via the points index.
pts_with_map = pts_driver.merge(driver_map, left_index=True, right_on='pts')
# Step 2: pull in the price rows via the mapped price index; clashing
# price-table columns get the '_price' suffix.
driver_df = pts_with_map.merge(
    price_driver,
    left_on='price',
    right_index=True,
    suffixes=['', '_price'],
)
driver_df

### Car/Team Join

Straightforward: both tables share a `Team` column, so they can be merged on it directly.

In [None]:
# Points and price tables for teams both carry a 'Team' column; join on it.
car_df = pd.merge(pts_team, price_team, on='Team')

In [None]:
# Inspect the merged team table.
car_df

## Actual to Performance Price Differentials

In [None]:
# diff_price > 0 means the performance price exceeds the current price.
driver_df['diff_price'] = driver_df['pts_price'].sub(driver_df['Current Price'])

driver_view_cols = ['Driver', 'pts_price', 'Current Price', 'diff_price', 'avg', 'median', 'Points/Million']
driver_df[driver_view_cols].sort_values(by='diff_price', ascending=False)

In [None]:
# Render the driver ranking (largest price differential first) as a Markdown table.
driver_ranked = driver_df[
    ['Driver', 'pts_price', 'Current Price', 'diff_price', 'avg', 'median']
].sort_values(by='diff_price', ascending=False)
driver_md = driver_ranked.to_markdown()
print(driver_md)

In [None]:
# diff_price > 0 means the performance price exceeds the current price.
car_df['diff_price'] = car_df['pts_price'].sub(car_df['Current Price'])

# This view is ordered by average score rather than by price differential.
car_view_cols = ['Team', 'pts_price', 'Current Price', 'diff_price', 'avg', 'median']
car_df[car_view_cols].sort_values(by='avg', ascending=False)

In [None]:
# Render the team ranking (largest price differential first) as a Markdown table.
car_ranked = car_df[
    ['Team', 'pts_price', 'Current Price', 'diff_price', 'avg', 'median']
].sort_values(by='diff_price', ascending=False)
car_md = car_ranked.to_markdown()
print(car_md)

## Save Data

In [None]:
def _named_columns(frame):
    """Return the column labels of ``frame`` that do not start with 'Unnamed'."""
    return [label for label in frame.columns if not label.startswith('Unnamed')]


# NOTE(review): 'Unnamed…' columns are presumably header-less columns that
# pandas auto-labelled when reading the raw files — confirm.
keep_cols_driver = _named_columns(driver_df)
keep_cols_car = _named_columns(car_df)

In [None]:
# Output directory for the merged driver and car tables.
save_path = Path('./data/transform')

In [None]:
# to_csv does not create missing parent directories, so make sure the
# output directory exists before writing.
save_path.mkdir(parents=True, exist_ok=True)

# Write both merged tables pipe-delimited and without the index column.
driver_df[keep_cols_driver].to_csv(save_path / 'driver.csv', sep='|', index=False)
car_df[keep_cols_car].to_csv(save_path / 'car.csv', sep='|', index=False)