# Capstone Project
Exported from Filament on Tue, 12 Apr 2022 15:25:13 GMT

---

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix, accuracy_score

# from sklearn import svm
# from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import os

In [None]:
# Load the Formula 1 CSV exports, keeping only the columns used downstream.
f1_csv = [
    'drivers.csv', 'results.csv', 'driver_standings.csv',
    'constructors.csv', 'constructor_results.csv',
    'constructor_standings.csv', 'races.csv',
]

# Column subsets, one list per source file.
drivers_cols = ['driverId', 'forename', 'surname', 'dob', 'nationality']
results_cols = [
    'resultId', 'raceId', 'driverId', 'constructorId',
    'grid', 'positionOrder', 'points',
]
driver_standings_cols = [
    'driverStandingsId', 'raceId', 'driverId',
    'points', 'position', 'wins',
]
constructors_cols = ['constructorId', 'name', 'nationality']
constructor_results_cols = [
    'constructorResultsId', 'raceId', 'constructorId', 'points',
]
constructor_standings_cols = [
    'constructorStandingsId', 'raceId', 'constructorId',
    'points', 'position', 'positionText', 'wins',
]
races_cols = ['raceId', 'year', 'round']

# Lookup tables are indexed by their id so that later
# `.join(..., on=<id column>)` calls can match against the index.
drivers = pd.read_csv('drivers.csv', usecols=drivers_cols).set_index('driverId')
constructors = pd.read_csv(
    'constructors.csv', usecols=constructors_cols
).set_index('constructorId')
races = pd.read_csv('races.csv', usecols=races_cols).set_index('raceId')

# Result / standings tables keep the default integer index.
results = pd.read_csv('results.csv', usecols=results_cols)
driver_standings = pd.read_csv(
    'driver_standings.csv', usecols=driver_standings_cols
)
constructor_results = pd.read_csv(
    'constructor_results.csv', usecols=constructor_results_cols
)
constructor_standings = pd.read_csv(
    'constructor_standings.csv', usecols=constructor_standings_cols
)

In [None]:
# Print the dtypes and the per-column null count of every loaded frame.
f1_dfs = [drivers, results, driver_standings, constructors,
          constructor_results, constructor_standings, races]

for frame in f1_dfs:
    print(frame.dtypes)
    # isna().sum() yields one null count per column, in column order
    for column_name, null_count in frame.isna().sum().items():
        print(f'{column_name}: {null_count}')
    print('\n')

In [None]:
## Sanity check: list the constructor standings rows whose positionText
## is 'E' (noted by the author as marking an expulsion), i.e. the
## erroneous entries that would otherwise feed into the analysis
flagged_rows = constructor_standings['positionText'] == 'E'
constructor_standings.loc[flagged_rows]

In [None]:
## Lists every distinct driver ranking value to ascertain whether odd
## seasons of F1 contain strange rankings. This is also a reason the
## analysis looks at the last 20 years, although 1996 onwards would be
## acceptable as well

driver_standings['position'].unique()

## Working on 1 year of data for proof of concept

In [None]:
# Proof of concept: build the per-driver season summary for 2002 only.
results_race = results.join(races, on='raceId')
# .copy() gives an independent frame, so the flag columns below are not
# written onto a slice of results_race (avoids SettingWithCopyWarning).
results_2002 = results_race.loc[results_race.year == 2002].copy()
results_2002['podium'] = (results_2002.positionOrder <= 3).astype(int)
results_2002['win'] = (results_2002.positionOrder == 1).astype(int)

# One row per (year, driver): season totals, medians, race count and the
# constructor the driver appeared with most often.
results_2002 = results_2002.groupby(['year', 'driverId']).agg({
    'points': 'sum',
    'positionOrder': 'median',
    'podium': 'sum',
    'win': 'sum',
    'grid': 'median',
    'driverId': 'count',
    'constructorId': lambda x: x.value_counts().index[0],
}).rename(columns={
    'positionOrder': 'median_position',
    'grid': 'median_start_position',
    'podium': 'podiums',
    'win': 'wins',
    'driverId': 'num_races',
})
# Share of races entered relative to the busiest driver of the season.
results_2002['percentage_races'] = (
    results_2002['num_races'] / results_2002['num_races'].max()
)
results_2002['full_season'] = (results_2002.percentage_races == 1).astype(int)


## Working on driver rankings for one season
# Standings are cumulative, so the last round holds the final ranking.
ds = driver_standings.join(races, on='raceId')
ds2002 = ds.loc[ds.year == 2002]
ds2002 = ds2002.loc[ds2002['round'] == ds2002['round'].max()]
ds2002 = ds2002[['driverId', 'position']].set_index('driverId')

## Working on team rankings for one season
cs = constructor_standings.join(races, on='raceId')
cs2002 = cs.loc[cs.year == 2002]
cs2002 = cs2002.loc[cs2002['round'] == cs2002['round'].max()]
cs2002 = cs2002[['constructorId', 'points', 'position']].set_index('constructorId')

## join our information
results_2002 = results_2002.join(ds2002, on='driverId')
# 'points' and 'position' exist on both sides; the suffixes tell them apart
results_2002 = results_2002.join(cs2002, on='constructorId',
                                 lsuffix='_driver', rsuffix='_constructor')
# Drivers without a final standings row are ranked one place below the
# worst ranked driver.
results_2002['position_driver'] = results_2002['position_driver'].fillna(
    results_2002['position_driver'].max() + 1
)
results_2002.reset_index()

## Function to enable DF generation for any year

In [None]:
def _final_round_standings(standings, season, columns, id_col):
    '''
    Returns the rows of `standings` recorded at the highest round number
    of `season`, restricted to `columns` and indexed by `id_col`.
    '''
    with_races = standings.join(races, on='raceId')
    in_season = with_races.loc[with_races.year == season]
    last_round = in_season.loc[in_season['round'] == in_season['round'].max()]
    return last_round[columns].set_index(id_col)


def driver_attributes_year(input_year):
    '''
    Generates a data frame for analysis with the following Series
    attributes (one row per driver for the requested season)

    year: season of championship
    driverId: driverId number, information in the drivers.csv file
    points_driver: sum of driver points for that season
    median_position: median final position of driver for the season
    podiums: number of season podiums (position = 1, 2, 3)
    wins: number of season wins (position = 1)
    median_start_position: median qualifying position of the driver
        (this is post sprint race result for 2021 onwards)
    num_races: number of race weekends entered
        (some drivers do not qualify fast enough for the race)
    percentage_races: races entered divided by the highest number of
        races entered by any driver that season
    full_season: driver complete a full season (1 = yes, 0 = no)
    constructorId: or team, Id number, information in the
        constructors.csv file. This is the team the driver raced for
        most in the season (modal class)
    position_driver: season ending rank of the driver; drivers with no
        final standings entry get (worst rank of that season + 1)
    points_constructor: number of constructor points that season
    position_constructor: season ending rank of the constructor

    input_year - format: int, the championship year dataframe
                            to be generated

    Reads the module level frames results, races, driver_standings and
    constructor_standings. Returns a new DataFrame with a default index.
    '''
    # Kept from the original: this is a global pandas switch, so it also
    # silences the slice warning for code run after this function.
    pd.options.mode.chained_assignment = None

    # working on driver result information for the season
    res_race = results.join(races, on='raceId')
    # .copy() so the flag columns are added to an independent frame
    res_year = res_race.loc[res_race.year == input_year].copy()
    res_year['podium'] = (res_year.positionOrder <= 3).astype(int)
    res_year['win'] = (res_year.positionOrder == 1).astype(int)
    res_year = res_year.groupby(['year', 'driverId']).agg(
        {
            'points': 'sum',
            'positionOrder': 'median',
            'podium': 'sum',
            'win': 'sum',
            'grid': 'median',
            'driverId': 'count',
            # most frequent constructor for the driver that season
            'constructorId': lambda x: x.value_counts().index[0],
        }
    ).rename(
        columns={
            'positionOrder': 'median_position',
            'grid': 'median_start_position',
            'podium': 'podiums',
            'win': 'wins',
            'driverId': 'num_races',
        }
    )
    res_year['percentage_races'] = (
        res_year['num_races'] / res_year['num_races'].max()
    )
    res_year['full_season'] = (res_year.percentage_races == 1).astype(int)

    ## Season ending driver and team rankings
    ds_year = _final_round_standings(
        driver_standings, input_year, ['driverId', 'position'], 'driverId'
    )
    cs_year = _final_round_standings(
        constructor_standings, input_year,
        ['constructorId', 'points', 'position'], 'constructorId'
    )

    ## join our information
    res_year = res_year.join(ds_year, on='driverId')
    # 'points' and 'position' exist on both sides; the suffixes tell them apart
    res_year = res_year.join(cs_year,
                             on='constructorId',
                             lsuffix='_driver',
                             rsuffix='_constructor')
    # Fill from THIS season's ranks. The previous code read the global
    # results_2002 frame, so every year used the 2002 maximum and the
    # function failed if that proof-of-concept cell had not been run.
    unranked_position = res_year['position_driver'].max() + 1
    res_year['position_driver'] = res_year['position_driver'].fillna(
        unranked_position
    )
    return res_year.reset_index()


def analysis_years(start=2002, stop=2021):
    '''
    Generates one concatenated dataframe covering a range of seasons.
    Column information is found in the doc string of the
    driver_attributes_year function

    start = format: int, first year of data frame
    stop = format: int, last year of data frame (inclusive)

    Default values are start=2002 and stop=2021, generating a
    current 20 year period

    The start year is always included, so a stop value below start
    returns the start year on its own.
    '''
    seasons = [start, *range(start + 1, stop + 1)]
    # Build every season first and concatenate once, rather than
    # re-copying the accumulated frame on each loop iteration.
    frames = [driver_attributes_year(season) for season in seasons]
    return pd.concat(frames, ignore_index=True)
        

In [None]:
# Build the dataset with analysis_years' default range (2002-2021) and
# cache it as a CSV in the working directory; the next cell reads it back.
twenty_year_results = analysis_years()
twenty_year_results.to_csv('twenty_year_results.csv')

In [None]:
# Reload the cached season table and attach driver / constructor details.
# Both lookup frames have a 'nationality' column, so the second join adds
# the _driver / _constructor suffixes. driver_move starts at 0 for every
# row and is filled in by the churn calculation.
df = (
    pd.read_csv('twenty_year_results.csv', index_col=0)
    .join(drivers, on='driverId')
    .assign(dob=lambda frame: pd.to_datetime(frame['dob'], format="%Y-%m-%d"))
    .join(constructors, on='constructorId',
          lsuffix='_driver', rsuffix='_constructor')
    .assign(driver_move=0)
)
df.info()

In [None]:
# Calculates the churn value: for each pair of consecutive seasons
# (i, i+1), a row of season i gets driver_move = 0 when the same
# (driverId, constructorId) pair appears again in season i+1, else 1.
# Rows of season i+1 are provisionally set to 1 and are overwritten on
# the next iteration, when i+1 becomes the earlier season of the window.
for season in range(df.year.min(), df.year.max() + 1):
    window = df[(df.year >= season) & (df.year <= season + 1)]
    # keep='last' flags the earlier occurrence of a repeated pair as True
    pair_repeats = window.duplicated(keep='last',
                                     subset=['driverId', 'constructorId'])
    # Single label-based write; the previous `df.driver_move[k] = ...`
    # was chained assignment, which can end up writing to a temporary copy.
    df.loc[pair_repeats.index, 'driver_move'] = (~pair_repeats).astype(int)

# The final season has no following year to compare against, so every
# driver in it is marked as leaving by the loop above. These are the row
# labels of the 2021 drivers who remained, set back to 0.
# NOTE(review): the labels are hard-coded for this exact dataset and must
# be regenerated if the year range or the source data changes.
remain_final_year_list = [460, 461, 464, 465, 466, 468, 469, 470,
                    471, 473, 474, 475, 477, 478, 479, 480]
df.loc[remain_final_year_list, 'driver_move'] = 0

In [None]:
# Class balance of the target, computed from the data instead of
# hard-coded counts so it stays correct when the dataset changes.
move_share = df['driver_move'].value_counts(normalize=True)
print(f"Stay 0 = {move_share.get(0, 0.0):.2%}")
print(f"Leave 1 = {move_share.get(1, 0.0):.2%}")
df.groupby('driver_move').agg({'driver_move': 'count'})

Our data is roughly evenly distributed between the 0 and 1 outcomes, so our base metric will be accuracy. The F1 score will also be used as a metric.

## Feature Engineering

In [None]:
# Age in whole calendar years at the season (season year minus birth year).
birth_year = pd.DatetimeIndex(df['dob']).year
df['driver_age'] = df.year - birth_year

# Driver's share of the constructor's points: undefined ratios become 0
# and anything above 1 is capped at 1.
points_share = df.points_driver.div(df.points_constructor).fillna(0)
df['percentage_of_constructor'] = points_share.where(points_share <= 1, 1)

# Positive when the median finish is better (lower) than the median start.
df['position_gain'] = df.median_start_position - df.median_position

same_nationality = df.nationality_driver == df.nationality_constructor
df['nationality_match'] = same_nationality.astype(int)

# Twice the constructor rank minus the driver rank.
df['out_perform_constructor'] = df.position_constructor * 2 - df.position_driver

df['podium_scored'] = (df.podiums >= 1).astype(int)
df.head()

## Functions for modelling

In [None]:
def conf_matrix(matrix):
    '''
    Draws a confusion matrix as an annotated seaborn heatmap.

    Columns are labelled as predictions and rows as actual outcomes, with
    two labels per axis, so a two-class matrix is expected.

    inputs:
        matrix: as type (numpy.ndarray)
    '''
    axes = plt.subplot()
    heatmap_style = dict(annot=True, fmt='g', vmin=0, cmap='crest',
                         annot_kws={"fontsize": 30})
    sns.heatmap(matrix, ax=axes, **heatmap_style)

    axes.xaxis.set_ticklabels(['predicted remainer (0)', 'predicted leaver (1)'])
    row_labels = axes.yaxis.set_ticklabels(['actual remainer (0)', 'actual leaver(1)'])
    # vertically centre the row labels against their cells
    plt.setp(row_labels, va='center')

def a_p_r(y_pred, y_real):
    '''
    Computes four classification metrics, prints them and returns them.
        Accuracy: correct predictions / total predictions
        Precision: true positive / (true positive + false positive)
        Recall: true positive / (true positive + false negative)
        F1 score: 2 * (precision * recall) / (precision + recall)

    inputs:
        y_pred: pd.Series object, predicted labels
        y_real: pd.Series object, true labels

    returns:
        tuple of (accuracy, precision, recall, f1)
    '''
    scorers = (
        ('Accuracy', metrics.accuracy_score),
        ('Precision', metrics.precision_score),
        ('Recall', metrics.recall_score),
        ('F1', metrics.f1_score),
    )
    # every scorer takes the true labels first, then the predictions
    scores = tuple(scorer(y_real, y_pred) for _, scorer in scorers)
    for (label, _), value in zip(scorers, scores):
        print(f"{label}:{value}")
    return scores

In [None]:
# Numeric columns included in the correlation heatmap; commented-out
# entries are columns currently excluded. Note that number_df_cols and
# feature_cols are bound to the SAME list object here (feature_cols is
# reassigned in the modelling cells further down).
number_df_cols = feature_cols = [
                # 'year',
                # 'driverId',
                'points_driver',
                'median_position',
                'podiums',
                'wins', 
                'median_start_position', 
                # 'num_races',
                'percentage_races',
                'full_season',
                # 'constructorId',
                'position_driver',
                'points_constructor',
                'position_constructor',
                # 'forename',
                # 'surname',
                # 'dob',
                # 'nationality_driver',
                'driver_age',
                # 'constructorRef',
                # 'name',
                # 'nationality_constructor',
                # 'driver_move',
                'percentage_of_constructor', 
                'position_gain',
                'nationality_match',
                'podium_scored',
                'out_perform_constructor'
                ]
number_df = df[number_df_cols]
# Pairwise correlation of the selected columns, annotated to two decimals.
plt.figure(figsize=(10,5))
sns.heatmap(number_df.corr(),cbar=True,fmt =' .2f', annot=True, cmap='coolwarm')

## Data Prepped for modelling

In [None]:
# A bare `df.shape` is only displayed when it is the last expression of a
# cell, so it is printed explicitly here.
print(df.shape)
print(df.columns)

In [None]:
# Identifier, free-text and date columns (plus the target itself) that
# are not used as model inputs.
removed_cols = [
    'year', 'driverId', 'constructorId',
    'forename', 'surname', 'dob',
    'nationality_driver', 'name', 'nationality_constructor',
    'driver_move',
]

# Everything else in df, in its original column order, is a feature.
excluded = set(removed_cols)
featured_cols = list(filter(lambda name: name not in excluded, df.columns))

X = df[featured_cols]
y = df.driver_move

# 75 / 25 split with a fixed seed so the split is reproducible.
split_options = {'test_size': 0.25, 'random_state': 22}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Exploratory Data Analysis

In [None]:
# Training rows per outcome (stay / leave), split by whether the driver
# completed a full season.
ax = sns.countplot(data=X_train, x=y_train, hue='full_season')
ax.set_xlabel(None)
ax.set_ylabel("Count of Results")
ax.set_xticklabels(['Driver Remain', 'Driver Leave'])
legend_options = {'title': 'Full Season', 'loc': 'upper right',
                  'labels': ['No', 'Yes']}
plt.legend(**legend_options)
plt.show()

In [None]:
# Training rows per outcome (stay / leave), split by whether the driver
# and constructor share a nationality.
ax = sns.countplot(data=X_train, x=y_train, hue='nationality_match')
ax.set_xlabel(None)
ax.set_ylabel("Count of Results")
ax.set_xticklabels(['Driver Remain', 'Driver Leave'])
legend_options = {'title': 'Nationality Match', 'loc': 'upper right',
                  'labels': ['No', 'Yes']}
plt.legend(**legend_options)
plt.show()

In [None]:
# Distribution of the median finishing position in the training split,
# with the stay / leave outcomes drawn side by side.
x_input = 'median_position'
# The figure must exist BEFORE plotting for figsize to apply; created
# afterwards it only opened a second, empty figure.
plt.figure(figsize=(30,10))
sns.histplot(x = x_input, hue = y_train,
                  data = X_train, multiple='dodge').set(xlabel='Median Position',
                                                ylabel = "Count of Results")
# NOTE(review): the labels are assigned by position to the drawn artists;
# confirm that 'Yes' really lands on the driver_move == 1 bars.
plt.legend(title='Driver Leaves?', loc='upper right', labels=['Yes', 'No'])
plt.show()
# plt.figure(figsize=(35,5))
# ((X_train[y_train==1].position_gain.value_counts().sort_index()/len(y_train))*100).plot(kind='bar',color='r')
# ((X_train[y_train==0].position_gain.value_counts().sort_index()/len(y_train))*100).plot(kind='bar',color='g',alpha=0.4)

## Logistic Regression (Logit sm)

In [None]:
# Feature subset for the statsmodels Logit fit; commented-out entries
# are columns currently excluded from this model.
feature_cols = [
                # 'year',
                # 'driverId',
                'points_driver',
                'median_position',
                'podiums',
                'wins', 
                'median_start_position', 
                'num_races',
                'percentage_races',
                # 'full_season',
                # 'constructorId',
                'position_driver',
                # 'points_constructor',
                # 'position_constructor',
                # 'forename',
                # 'surname',
                # 'dob',
                # 'nationality_driver',
                'driver_age',
                # 'constructorRef',
                # 'name',
                # 'nationality_constructor',
                # 'driver_move',
                # 'percentage_of_constructor', 
                # 'position_gain',
                # 'nationality_match',
                # 'podium_scored',
                # 'out_perform_constructor'
                ]
lg_X_train = X_train[feature_cols]
lg_X_test = X_test[feature_cols]
# Adds an explicit constant (intercept) column to both splits.
lg_X_train = sm.add_constant(lg_X_train)
lg_X_test = sm.add_constant(lg_X_test)

# Fit on the training split; the summary is the cell's displayed output.
driver_model = sm.Logit(y_train, lg_X_train).fit()
driver_model.summary()

In [None]:
def test_accuracy(cut_off):
    '''
    Re-thresholds the stored probabilities in lg_X_train['pred'] at
    cut_off, overwrites lg_X_train['binary_pred'] with the 0/1 result
    and prints the resulting accuracy. Returns nothing.

    NOTE(review): the score is computed against y_train / lg_X_train,
    i.e. the training split, although the message says "Test accuracy".

    cut_off - probability threshold; values strictly above it become 1
    '''
    above_cut_off = lg_X_train['pred'] > cut_off
    lg_X_train['binary_pred'] = np.where(above_cut_off, 1, 0)
    score = accuracy_score(y_train, lg_X_train['binary_pred'])
    print(f'Test accuracy is {score} with cut off of {cut_off}')
    
def accuracy_0to1(list):
    '''
    Runs test_accuracy once for every cut off value in the given
    iterable; each call prints its own result line.

    list - iterable of cut off thresholds. The parameter name shadows
        the builtin but is kept so existing keyword calls still work.
    '''
    for cut_off in list:
        # test_accuracy prints its own message and returns None, so
        # wrapping it in print() only added a stray "None" line.
        test_accuracy(cut_off)

In [None]:
# Score both splits with the fitted Logit model and threshold at 0.50.
# Only the columns the model was fitted on are passed to predict(), so
# re-running this cell after 'pred' / 'binary_pred' have been added does
# not feed those extra columns back into the model.
model_cols = driver_model.params.index
lg_X_train['pred'] = driver_model.predict(lg_X_train[model_cols])
lg_X_train['binary_pred'] = np.where(lg_X_train['pred'] > 0.50, 1, 0)
lg_X_test['pred'] = driver_model.predict(lg_X_test[model_cols])
lg_X_test['binary_pred'] = np.where(lg_X_test['pred'] > 0.50, 1, 0)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are actual classes and
# columns are predictions, which is how conf_matrix labels its axes.
# The arguments were previously swapped, transposing the plotted matrix.
driver_matrix = confusion_matrix(y_test, lg_X_test.binary_pred)
print(type(driver_matrix))
conf_matrix(driver_matrix)

In [None]:
# classification_report expects (y_true, y_pred); with the arguments
# swapped, precision and recall are reported in each other's place.
print(metrics.classification_report(y_train, lg_X_train.binary_pred))
print(metrics.classification_report(y_test, lg_X_test.binary_pred))

## Logistic Regression (Sklearn)

In [None]:
# Feature subset for the scikit-learn logistic regression; commented-out
# entries are columns currently excluded from this model.
feature_cols = [
                # 'points_driver',
                'median_position',
                # 'podiums',
                # 'wins',
                # 'median_start_position',
                # 'num_races',
                # 'percentage_races',
                'full_season',
                # 'position_driver',
                # 'points_constructor',
                # 'position_constructor',
                # 'driver_age',
                # 'driver_move',
                'percentage_of_constructor',
                # 'position_gain',
                # 'nationality_match',
                'podium_scored',
                'out_perform_constructor'
                ]

lg_X_train = X_train[feature_cols]
lg_X_test = X_test[feature_cols]

# The grid is split per penalty so only solver/penalty pairs that can
# actually be fitted are searched:
#   * l2 works with every solver listed
#   * l1 is only supported by 'saga' among these solvers
#   * 'none' ignores C, so C is not swept for it
#   * 'elasticnet' is left out: without an l1_ratio every such fit failed,
#     so it never contributed a score
# NOTE(review): newer scikit-learn releases spell the unpenalised option
# as None rather than the string 'none'; adjust to the installed version.
c_values = np.logspace(-4, 4, 20)
iteration_caps = [100, 500, 1000]
all_solvers = ['lbfgs', 'newton-cg', 'sag', 'saga']
params = [
    {
        'penalty': ['l2'],
        'C': c_values,
        'solver': all_solvers,
        'max_iter': iteration_caps
    },
    {
        'penalty': ['l1'],
        'C': c_values,
        'solver': ['saga'],
        'max_iter': iteration_caps
    },
    {
        'penalty': ['none'],
        'solver': all_solvers,
        'max_iter': iteration_caps
    }
]

lg = GridSearchCV(LogisticRegression(), param_grid = params, scoring='accuracy')
lg.fit(lg_X_train, y_train)

# GridSearchCV refits the best candidate by default, so predict() uses it.
train_pred = lg.predict(lg_X_train)
test_pred = lg.predict(lg_X_test)

In [None]:
# Winning hyper-parameters, their cross-validated score and the refit model.
for search_result in (lg.best_params_, lg.best_score_, lg.best_estimator_):
    print(search_result)

In [None]:
# Both sklearn helpers expect (y_true, y_pred). With the arguments swapped,
# precision/recall trade places and the confusion matrix is transposed
# relative to the actual/predicted axis labels drawn by conf_matrix.
print(metrics.classification_report(y_train, train_pred))
tr_lg_matrix = confusion_matrix(y_train, train_pred)
conf_matrix(tr_lg_matrix)

In [None]:
# a_p_r takes (y_pred, y_real); confusion_matrix takes (y_true, y_pred).
# The matrix call previously had its arguments swapped, which transposed
# it relative to the actual/predicted labels drawn by conf_matrix.
a_p_r(test_pred, y_test)
te_lg_matrix = confusion_matrix(y_test, test_pred)
conf_matrix(te_lg_matrix)

## SVM

## Decision Trees

In [None]:
# Feature subset for the decision tree; commented-out entries are
# columns currently excluded from this model.
dt_cols = [
                # 'year',
                # 'driverId',
                # 'points_driver',
                'median_position',
                # 'podiums',
                # 'wins', 
                # 'median_start_position', 
                # 'num_races',
                # 'percentage_races',
                'full_season',
                # 'constructorId',
                'position_driver',
                # 'points_constructor',
                # 'position_constructor',
                # 'forename',
                # 'surname',
                # 'dob',
                # 'nationality_driver',
                # 'driver_age',
                # 'constructorRef',
                # 'name',
                # 'nationality_constructor',
                # 'driver_move',
                'percentage_of_constructor', 
                # 'position_gain',
                # 'nationality_match',
                'podium_scored'
                ]
dt_X_train = X_train[dt_cols]
dt_X_test = X_test[dt_cols]

# 5-fold grid search over tree depth and split/leaf sizes, scored on
# accuracy. The tree is created without a random_state.
dt = GridSearchCV(estimator = DecisionTreeClassifier(),
                    param_grid = {'max_depth': [2, 3, 4, 5, 6], # all the parameters we are testing for
                                  'min_samples_split': [2, 3, 4],
                                  'min_samples_leaf': [2, 3, 4]},
                    cv = 5,
                    refit = True,
                    verbose = 1, # getting an output
                    scoring = 'accuracy')

dt.fit(dt_X_train, y_train)
# refit=True means predict() uses the best tree refitted on all of dt_X_train
dt_train_pred = dt.predict(dt_X_train)
dt_test_pred = dt.predict(dt_X_test)

In [None]:
# Winning hyper-parameters, their cross-validated score and the refit tree.
for search_result in (dt.best_params_, dt.best_score_, dt.best_estimator_):
    print(search_result)

In [None]:
# Rebinds `dt` from the grid search object to a single tree with fixed
# settings, fitted on the full training split.
tree_settings = {
    'max_depth': 3,
    'min_samples_leaf': 4,
    'min_samples_split': 2,
}
dt = DecisionTreeClassifier(**tree_settings)
dt.fit(dt_X_train, y_train)

In [None]:
# Render the fitted tree on a very large canvas, with nodes coloured by class.
fig = plt.figure(figsize=(175, 140))
plot_options = dict(
    feature_names=dt_cols,
    class_names=['Stay', 'Left'],
    filled=True,
)
tree_plot = tree.plot_tree(dt, **plot_options)

In [None]:
# Mean accuracy of the fixed tree on each split, then (feature, importance)
# pairs in dt_cols order as the cell's displayed output.
print(f'Score on training set: {dt.score(dt_X_train, y_train)}')
print(f'Score on testing set: {dt.score(dt_X_test, y_test)}')
importance = [
    (feature, weight)
    for feature, weight in zip(dt_cols, dt.feature_importances_)
]
importance

In [None]:
# Test-split features alongside the tree's prediction, the true label
# and the predicted probability of class 1 (second predict_proba column).
test_results = dt_X_test.assign(
    y_pred=dt.predict(dt_X_test),
    y_real=y_test,
    y_prob=dt.predict_proba(dt_X_test)[:, 1],
)
a_p_r(test_results.y_pred, test_results.y_real)

## Random Forest

In [None]:
# Feature subset for the random forest; commented-out entries are
# columns currently excluded from this model.
rf_cols = [
                # 'year',
                # 'driverId',
                # 'points_driver',
                'median_position',
                # 'podiums',
                # 'wins', 
                # 'median_start_position', 
                # 'num_races',
                # 'percentage_races',
                'full_season',
                # 'constructorId',
                # 'position_driver',
                # 'points_constructor',
                # 'position_constructor',
                # 'forename',
                # 'surname',
                # 'dob',
                # 'nationality_driver',
                # 'driver_age',
                # 'constructorRef',
                # 'name',
                # 'nationality_constructor',
                # 'driver_move',
                'percentage_of_constructor', 
                'position_gain',
                # 'nationality_match',
                'podium_scored',
                'out_perform_constructor'
                ]
rf_X_train = X_train[rf_cols]
rf_X_test = X_test[rf_cols]
# Seed for the forest, so repeated runs build the same trees.
random = 28
rfc = RandomForestClassifier(random_state=random)

# Search space: number of trees and maximum tree depth.
rf_params = {
    'n_estimators': [7, 8, 9, 10, 11, 12, 13, 14, 15],
    'max_depth': [2, 3, 4, 5, 6],
            }

# gs = grid search (5-fold cross-validation, scored on accuracy)
gs = GridSearchCV(rfc, param_grid=rf_params,
                  scoring = 'accuracy', cv=5)
gs.fit(rf_X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Build the final forest from the grid search winner (same seed as the
# search) and fit it on the whole training split.
best_forest_settings = {
    'n_estimators': gs.best_params_.get('n_estimators'),
    'max_depth': gs.best_params_.get('max_depth'),
}
driver_rfc = RandomForestClassifier(random_state=random, **best_forest_settings)
driver_rfc.fit(rf_X_train, y_train)

In [None]:
# Training-split features alongside the forest's prediction, the true
# label and the predicted probability of class 1.
rfc_tr_results = rf_X_train.assign(
    y_pred=driver_rfc.predict(rf_X_train),
    y_real=y_train,
    y_prob=driver_rfc.predict_proba(rf_X_train)[:, 1],
)
a_p_r(rfc_tr_results.y_pred, rfc_tr_results.y_real)

In [None]:
# Test-split features alongside the forest's prediction, the true label
# and the predicted probability of class 1.
rfc_te_results = rf_X_test.assign(
    y_pred=driver_rfc.predict(rf_X_test),
    y_real=y_test,
    y_prob=driver_rfc.predict_proba(rf_X_test)[:, 1],
)
a_p_r(rfc_te_results.y_pred, rfc_te_results.y_real)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are actual classes and
# columns are predictions, which is how conf_matrix labels its axes.
# The arguments were previously swapped, transposing the plotted matrix.
rfc_matrix = confusion_matrix(rfc_te_results.y_real,
                              rfc_te_results.y_pred)
conf_matrix(rfc_matrix)

In [None]:
# Training rows the forest got wrong, ordered by predicted probability.
misclassified = rfc_tr_results.y_pred != rfc_tr_results.y_real
rfc_tr_results[misclassified].sort_values('y_prob')

In [None]:
# Horizontal bar chart of the random forest's feature importances.
# Labels come from the frame the forest was fitted on, so they cannot
# drift out of step with feature_importances_ when rf_cols is edited.
col = list(rf_X_train.columns)
y1 = driver_rfc.feature_importances_

# Size the figure when it is created. The previous plt.figure(figsize=...)
# call after plotting only opened a second, empty figure.
fig, ax = plt.subplots(figsize=(6.5, 4.5))
width = 0.4 # the width of the bars
ind = np.arange(len(y1)) # the y locations of the bars
ax.barh(ind, y1, width, color='#0064dc')
ax.set_yticks(ind+width/10)
ax.set_yticklabels(col, minor=False)
ax.set_title('Feature importance in RandomForest Classifier')
ax.set_xlabel('Relative importance')