# Data Acquisition

## Import Relevant Modules

In [1]:
# import modules
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import requests

## Request Data
# ========================================
### Men

In [2]:
path = r'https://iusf.indiana.edu/little500/results.html?raceType=Individual+Time+Trials&year=All&gender=M&teamName=&RiderName=&l500submit=Search#results'

In [3]:
html = requests.get(path)

In [4]:
mITT = pd.read_html(html.text)[0]
mITT = mITT.astype(dtype={'Year': 'str'})

In [5]:
def getSecs(time):
    """given a string in form mm:ss.ms, returns seconds as float
    
    str -> float"""
    if type(time) != str:
        return time
    else:
        try:
            time = '00:' + time.strip()
            secs = pd.to_timedelta(time).total_seconds()
            return secs
        except:
            return -1

In [6]:
# convert 'Finish Time' col to seconds
mITT['Finish Time'] = mITT['Finish Time'].apply(lambda x: getSecs(x))
mITT = mITT.rename(columns={'Finish Time': 'secITT'})

In [7]:
# Drop corrupted rows
mITT = mITT.drop(mITT[mITT['secITT'] == -1].index)
mITT = mITT.dropna()

In [8]:
# add column for 'rank within each team'
mITT['RankWithinTeam'] = mITT.groupby(by=['Team', 'Year'])['secITT'].transform(lambda x: np.arange(1, len(x) + 1))

In [9]:
# add column for z-score by year
zscore = zscore = lambda x: (x - x.mean()) / x.std()
mITT['zscore'] = mITT.groupby('Year')['secITT'].transform(zscore)

In [10]:
mITT.head()

Unnamed: 0,Year,Gender,Place,Name,Team,secITT,RankWithinTeam,zscore
0,2018,M,1,Joseph Krahulik,Sigma Alpha Epsilon,137.893,1.0,-2.061474
1,2018,M,2,Xavier Martinez,Black Key Bulls,139.437,1.0,-1.884708
2,2018,M,3,Ben Harris,Sigma Phi Epsilon,142.467,1.0,-1.537817
3,2018,M,4,Tom Settle,Sigma Phi Epsilon,142.503,2.0,-1.533695
4,2018,M,5,Matthew Thompson,3PH Cycling,142.63,1.0,-1.519155


# ========================================

### Women

In [11]:
fpath = r'https://iusf.indiana.edu/little500/results.html?raceType=Individual+Time+Trials&year=All&gender=F&teamName=&RiderName=&l500submit=Search#results'

In [12]:
fhtml = requests.get(fpath)

In [13]:
fITT = pd.read_html(fhtml.text)[0]
fITT = fITT.astype(dtype={'Year': 'str'})

In [14]:
# convert 'Finish Time' col to seconds
fITT['Finish Time'] = fITT['Finish Time'].apply(lambda x: getSecs(x))
fITT = fITT.rename(columns={'Finish Time': 'secITT'})

In [15]:
# Drop corrupted rows
fITT = fITT.drop(fITT[fITT['secITT'] == -1].index)
fITT = fITT.dropna()

In [16]:
# add column for 'rank within each team'
fITT['RankWithinTeam'] = fITT.groupby(by=['Team', 'Year'])['secITT'].transform(lambda x: np.arange(1, len(x) + 1))

In [17]:
# add column for z-score by year
zscore = zscore = lambda x: (x - x.mean()) / x.std()
fITT['zscore'] = fITT.groupby('Year')['secITT'].transform(zscore)

In [18]:
fITT.head()

Unnamed: 0,Year,Gender,Place,Name,Team,secITT,RankWithinTeam,zscore
0,2018,F,1,Brooke Hannon,Melanzana,153.083,1.0,-1.824415
1,2018,F,2,Rachel Brown,Kappa Alpha Theta,153.637,1.0,-1.788392
2,2018,F,3,Caitlin Kamplain,Theta Phi Alpha,157.197,1.0,-1.556911
3,2018,F,4,Hanna Coppens,Delta Gamma,157.235,1.0,-1.55444
4,2018,F,5,Audrey Morlan,Delta Gamma,159.583,2.0,-1.401767


# ========================================

### Men Team Pursuit

In [19]:
mppath = r'https://iusf.indiana.edu/little500/results.html?raceType=Team+Pursuit&year=All&gender=M&teamName=&l500submit=Search#results'
mphtml = requests.get(mppath)

In [20]:
mTP = pd.read_html(mphtml.text)[0]
mTP = mTP.drop(labels='Finals Time', axis=1)
mTP = mTP.astype(dtype={'Year': 'str'})

In [21]:
# convert 'Finish Time' col to seconds
mTP['Finish Time'] = mTP['Finish Time'].apply(lambda x: getSecs(x))
mTP = mTP.rename(columns={'Finish Time': 'secTP'})

In [22]:
# clean up trailing numbers from team names and drop 2nd teams

def removeNum(team):
    """removes trailing number '1' from teams that enter two teams
    
    str -> str"""
    if team[-1] == '1':
        return team[:-2]
    else:
        return team
    
mTP['Team'] = mTP['Team'].transform(lambda x: removeNum(x))

In [23]:
# add z-score to each row based on entries from the same year
mTP['zscore'] = mTP.groupby('Year')['secTP'].transform(lambda x: zscore(x))

In [24]:
mTP.head()

Unnamed: 0,Year,Gender,Place,Team,secTP,zscore
0,2018,M,1,Sigma Phi Epsilon,550.4,-0.910551
1,2018,M,2,Black Key Bulls,556.2,-0.827738
2,2018,M,3,Sigma Alpha Epsilon,564.11,-0.714798
3,2018,M,4,Bears,571.87,-0.604001
4,2018,M,5,Beta Theta Pi,573.51,-0.580585


# ========================================

### Women Team Pursuit

In [25]:
fppath = r'https://iusf.indiana.edu/little500/results.html?raceType=Team+Pursuit&year=All&gender=F&teamName=&l500submit=Search#results'
fphtml = requests.get(fppath)

In [26]:
fTP = pd.read_html(fphtml.text)[0]
fTP = fTP.drop(labels='Finals Time', axis=1)
fTP = fTP.astype(dtype={'Year': 'str'})

In [27]:
# convert 'Finish Time' col to seconds
fTP['Finish Time'] = fTP['Finish Time'].apply(lambda x: getSecs(x))
fTP = fTP.rename(columns={'Finish Time': 'secTP'})

In [28]:
fTP['Team'] = fTP['Team'].transform(lambda x: removeNum(x))

In [29]:
# add z-score to each row based on entries from the same year
fTP['zscore'] = fTP.groupby('Year')['secTP'].transform(lambda x: zscore(x))

In [30]:
fTP.head()

Unnamed: 0,Year,Gender,Place,Team,secTP,zscore
0,2018,F,1,Melanzana,500.05,-1.655863
1,2018,F,2,Kappa Alpha Theta,511.28,-1.37666
2,2018,F,3,SKI,523.62,-1.069859
3,2018,F,4,Teter,523.65,-1.069113
4,2018,F,5,Alpha Chi Omega,524.13,-1.057179


# ========================================

### Men Race

In [31]:
mrpath = r'https://iusf.indiana.edu/little500/results.html?raceType=Little+500+Race&year=All&gender=M&teamName=&l500submit=Search#results'
mrhtml = requests.get(mrpath)

In [32]:
mR = pd.read_html(mrhtml.text)[0]
mR = mR.astype(dtype={'Year': 'int64'})

In [33]:
mR[(mR['Laps']>=197) & (mR['Year'] >2009)].groupby('Year')['Laps'].count().mean()

14.555555555555555

# ========================================

### Women Race

In [34]:
wrpath = r'https://iusf.indiana.edu/little500/results.html?raceType=Little+500+Race&year=All&gender=F&teamName=&l500submit=Search#results'
wrhtml = requests.get(wrpath)

In [35]:
wR = pd.read_html(wrhtml.text)[0]
wR = wR.astype(dtype={'Year': 'int64'})

In [36]:
wR[(wR['Laps']>=97) & (wR['Year'] >2009)].groupby('Year')['Laps'].count().mean()

15.88888888888889

# ========================================

### Men Quals

In [37]:
mqualsPath = r'https://iusf.indiana.edu/little500/results.html?raceType=Qualifications&year=All&gender=M&teamName=&l500submit=Search#results'
mqualtshtml = requests.get(mqualsPath)

In [38]:
mQ = pd.read_html(mqualtshtml.text)[0]
mQ = mQ.astype(dtype={'Year': 'int64'})
mQ = mQ.rename(columns={'Finish Time': 'secQuals'})

In [39]:
mQ['secQuals'] = mQ['secQuals'].apply(lambda x: getSecs(x))
mQ['zscore'] = mQ.groupby('Year')['secQuals'].apply(lambda x: zscore(x))

In [40]:
mQ.head()

Unnamed: 0,Year,Gender,Place,Team,secQuals,zscore
0,2018,M,1,Cutters,154.973,-1.741349
1,2018,M,2,Kappa Sigma,155.052,-1.731198
2,2018,M,3,Sigma Alpha Epsilon,157.798,-1.378366
3,2018,M,4,Sigma Phi Epsilon,159.377,-1.175481
4,2018,M,5,Pi Lambda Phi,159.491,-1.160833


# ========================================

### Women Quals

In [41]:
fqualsPath = r'https://iusf.indiana.edu/little500/results.html?raceType=Qualifications&year=All&gender=F&teamName=&l500submit=Search#results'
fqualtshtml = requests.get(fqualsPath)

In [42]:
fQ = pd.read_html(fqualtshtml.text)[0]
fQ = fQ.astype(dtype={'Year': 'int64'})
fQ = fQ.rename(columns={'Finish Time': 'secQuals'})

In [43]:
fQ['secQuals'] = fQ['secQuals'].apply(lambda x: getSecs(x))
fQ['zscore'] = fQ.groupby('Year')['secQuals'].apply(lambda x: zscore(x))

In [44]:
fQ.head()

Unnamed: 0,Year,Gender,Place,Team,secQuals,zscore
0,2018,F,1,Delta Gamma,171.359,-1.648757
1,2018,F,2,Kappa Alpha Theta,177.661,-1.248071
2,2018,F,3,Alpha Gamma Delta,178.694,-1.182392
3,2018,F,4,Alpha Chi Omega,180.349,-1.077166
4,2018,F,5,Teter,180.452,-1.070617


# Merging the DataFrames

In [45]:
# DataFrame will contain 'aITT', 'bITT', 'cITT', 'TP', 'Result'

# mR only select years after 2000
# Create TeamIndex as primary key
# Create TeamIndex in mTP and mITT as foreign key

wrk_Race = mR.iloc[:, [0,3,2]]
wrk_Race = wrk_Race[wrk_Race['Year']>=2000]
wrk_Race['TeamID'] = np.arange(0,len(wrk_Race),1)
wrk_Race = wrk_Race[['TeamID', 'Year', 'Team', 'Place']]

In [48]:
wrk_ITT.dtypes

ID          int64
Year       object
Team       object
zscore    float64
dtype: object

In [49]:
wrk_Race.dtypes

TeamID     int64
Year       int64
Team      object
Place      int64
dtype: object

In [50]:
mITT.head()
wrk_ITT = mITT.iloc[:, [0,4,7]]
wrk_ITT['ID'] = np.arange(0, len(wrk_ITT), 1)
wrk_ITT = wrk_ITT.iloc[:, [3,0,1,2]]
wrk_ITT = wrk_ITT.astype(dtype={'Year': 'int64'})

wrk_ITT = wrk_ITT.merge(right=wrk_Race[['TeamID','Year', 'Team']],
              on=['Team', 'Year'], how='left').dropna().astype(dtype={'TeamID': 'int64'})
wrk_ITT = wrk_ITT.iloc[:, [0,4,1,2,3]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [51]:
mTP['ID'] = np.arange(0,len(mTP), 1)
wrk_TP = mTP.iloc[:, [6,0,3,5]]
wrk_TP = wrk_TP.astype(dtype={'Year': 'int64'})
wrk_TP = wrk_TP.merge(right=wrk_Race.iloc[:, [0,1,2]], 
                      on=['Year', 'Team'], how='left').dropna().astype(dtype={'TeamID': 'int64'})
wrk_TP = wrk_TP.iloc[:, [0,4,1,2,3]]

DataFrames are all stripped down to include only relevant information.  Now need to merge team pursuit data into race frame as well as merge ITT data into race frame.  I will do team pursuit first since it is an easier composition.

Merging Team Pursuit

In [52]:
wrk_TP.head()

Unnamed: 0,ID,TeamID,Year,Team,zscore
0,0,3,2018,Sigma Phi Epsilon,-0.910551
1,1,2,2018,Black Key Bulls,-0.827738
2,2,5,2018,Sigma Alpha Epsilon,-0.714798
3,3,6,2018,Bears,-0.604001
4,4,7,2018,Beta Theta Pi,-0.580585


In [53]:
wrk_table = wrk_Race.merge(right=wrk_TP.iloc[:,[1,4]], on='TeamID', how='left')
wrk_table = wrk_table.rename(columns={'zscore': 'TP'})
# Success

Merging ITT results

In [54]:
# Start by grouping ITT results by TeamID
grouped_ITT = wrk_ITT.groupby('TeamID')
grouped_ITT = grouped_ITT['zscore'].apply(lambda x: x.reset_index(drop=True)).unstack().reset_index()

wrk_table = wrk_table.merge(right=grouped_ITT.iloc[:, [0,1,2,3,4]], on='TeamID', how='left')
wrk_table = wrk_table.rename(columns={0: 'aITT', 1: 'bITT', 2: 'cITT', 3: 'dITT'})

In [55]:
wrk_table.head()

Unnamed: 0,TeamID,Year,Team,Place,TP,aITT,bITT,cITT,dITT
0,0,2018,Cutters,1,,-1.431803,-0.897155,-0.892919,0.56872
1,1,2018,Gray Goat,2,-0.391686,-1.477139,-0.931042,-0.905512,-0.833043
2,2,2018,Black Key Bulls,3,-0.827738,-1.884708,-1.358875,-1.212677,-0.790683
3,3,2018,Sigma Phi Epsilon,4,-0.910551,-1.537817,-1.533695,-1.308845,0.103107
4,4,2018,Jetblach,5,,-1.103572,-0.809001,-0.704017,-0.607849


# ML Classifiers

In [56]:
# Split data into X, y
X = wrk_table.iloc[:, 4:9].values
y = wrk_table.iloc[:, 3].values

In [57]:
# Preprocess
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [58]:
# Setup numeric transformer
# This will usually entail handling missing variables then scaling
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0))])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, [0,1,2,3,4])])

X = preprocessor.fit_transform(X)

In [59]:
# Split data into train and test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Logistic Regression

In [61]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

# Convert y into binary column where 1=win, 0=else
y_win_train = (y_train == 1).astype('int64')
y_win_test = (y_test == 1).astype('int64')

log_regressor = LogisticRegression(solver='lbfgs')
log_regressor.fit(X_train, y_win_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [62]:
import statsmodels.formula.api as sm

In [64]:
X_train_2 = np.append(np.ones(shape=(len(X_train), 1)), X_train, axis=1)

In [65]:
logit_reg = sm.Logit(y_win_train, X_train_2[:, [0,1,3,4,5]]).fit()

Optimization terminated successfully.
         Current function value: 0.088921
         Iterations 10


In [66]:
logit_reg.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,501.0
Model:,Logit,Df Residuals:,496.0
Method:,MLE,Df Model:,4.0
Date:,"Sun, 10 Mar 2019",Pseudo R-squ.:,0.4255
Time:,19:26:18,Log-Likelihood:,-44.55
converged:,True,LL-Null:,-77.545
,,LLR p-value:,1.591e-13

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-6.9813,1.236,-5.648,0.000,-9.404,-4.559
x1,-0.5412,0.593,-0.912,0.362,-1.704,0.621
x2,-3.2678,1.230,-2.657,0.008,-5.679,-0.857
x3,0.1381,1.192,0.116,0.908,-2.198,2.474
x4,-0.5972,1.046,-0.571,0.568,-2.648,1.454


In [67]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [73]:
print('Training Accuracy:', accuracy_score(y_win_train, log_regressor.predict(X_train)))
print('Test Accuracy:', accuracy_score(y_win_test, log_regressor.predict(X_test)))

Training Accuracy: 0.9700598802395209
Test Accuracy: 0.9920634920634921


The tremendous accuracy achieved by this model is a bit of a red herring. There are very few winners in the sample (1 per year, or 1/33). The model therefore achieves pretty high accuracy simply assigning 0 outcome to every case.

In [77]:
print('Training Accuracy:', balanced_accuracy_score(y_win_train, log_regressor.predict(X_train)))
print('Test Accuracy:', balanced_accuracy_score(y_win_test, log_regressor.predict(X_test)))

Training Accuracy: 0.5833333333333334
Test Accuracy: 0.5


In [91]:
log_regressor.predict_proba(X_test)[log_regressor.predict_proba(X_test)[:, 1] > .10]

array([[0.88618761, 0.11381239],
       [0.89274303, 0.10725697],
       [0.65150493, 0.34849507],
       [0.89230586, 0.10769414],
       [0.88273647, 0.11726353],
       [0.89737216, 0.10262784],
       [0.79951802, 0.20048198],
       [0.73126076, 0.26873924],
       [0.87931223, 0.12068777],
       [0.8678081 , 0.1321919 ],
       [0.83633711, 0.16366289],
       [0.54497899, 0.45502101]])

Looking at the probabilities assigned to to the predictions shows just how heavily skewed towards zero this model is. Within the training set only two teams achieved a winning probability great enough to be assigned a 1 outcome. This may not necessarily be a shortcoming of the model. Very rarely are there teams that have realistically more than 1/5 odds of winning. If that wasn't reflected here then that would indicate bigger problems.

## Model Top-5

In [92]:
y_train_5 = (y_train <= 5).astype('int64')
y_test_5 = (y_test <= 5).astype('int64')

log_regressor_5 = sm.Logit(y_train_5, X_train).fit()
log_regressor_5.summary()

Optimization terminated successfully.
         Current function value: 0.457195
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,501.0
Model:,Logit,Df Residuals:,496.0
Method:,MLE,Df Model:,4.0
Date:,"Sun, 10 Mar 2019",Pseudo R-squ.:,-0.08292
Time:,19:41:36,Log-Likelihood:,-229.05
converged:,True,LL-Null:,-211.52
,,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-0.9708,0.250,-3.889,0.000,-1.460,-0.482
x2,1.8409,0.280,6.581,0.000,1.293,2.389
x3,-0.8441,0.363,-2.324,0.020,-1.556,-0.132
x4,-0.8559,0.314,-2.724,0.006,-1.472,-0.240
x5,-1.1368,0.293,-3.886,0.000,-1.710,-0.563


In [93]:
log_regressor_5 = LogisticRegression(solver='lbfgs')
log_regressor_5.fit(X_train, y_train_5)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [97]:
print('Training Accuracy:', accuracy_score(y_train_5, log_regressor_5.predict(X_train)))
print('Test Accuracy:', accuracy_score(y_test_5, log_regressor_5.predict(X_test)))

Training Accuracy: 0.8862275449101796
Test Accuracy: 0.873015873015873


In [98]:
print('Training Accuracy:', balanced_accuracy_score(y_train_5, log_regressor_5.predict(X_train)))
print('Test Accuracy:', balanced_accuracy_score(y_test_5, log_regressor_5.predict(X_test)))

Training Accuracy: 0.7298591549295774
Test Accuracy: 0.7216981132075472


Using a logistic classification model for the top 5 seems to yield pretty reliable results. Accuracy scores hovering in the upper 80%s for both test and training data, and low-70% for balanced accuracy metrics.

In [99]:
from sklearn.metrics import brier_score_loss

In [102]:
brier_score_loss(y_test_5, log_regressor_5.predict_proba(X_test)[:, 1])

0.07874906334552223

Here we can see that this model is pretty good represented by the brier loss score of 0.07.

## Random Forest Classifier

In [121]:
from sklearn.ensemble import RandomForestClassifier

In [141]:
RFR_classifier = RandomForestClassifier(n_estimators=300)
RFR_classifier.fit(X_train, y_win_train)

print('Training Accuracy (Winner):', accuracy_score(y_win_train, RFR_classifier.predict(X_train)))
print('Test Accuracy (Winner):', accuracy_score(y_win_test, RFR_classifier.predict(X_test)), '\n')
print('Training Balanced Accuracy (Winner):', balanced_accuracy_score(y_win_train, RFR_classifier.predict(X_train)))
print('Test Balanced Accuracy (Winner):', balanced_accuracy_score(y_win_test, RFR_classifier.predict(X_test)), '\n')

print('Brier Score (Winner):', brier_score_loss(y_win_test, RFR_classifier.predict_proba(X_test)[:, 1]))

Training Accuracy (Winner): 1.0
Test Accuracy (Winner): 0.9920634920634921 

Training Balanced Accuracy (Winner): 1.0
Test Balanced Accuracy (Winner): 0.5 

Brier Score (Winner): 0.012486155202821871


The above model has a lot of promise. Still, the P(win) is adequatly small that there should be room for skepticism. Next, I'll make a similar Random Forest Classifer for Top-3 finishers.

In [142]:
y_train_3 = (y_train <= 3).astype('int64')
y_test_3 = (y_test <= 3).astype('int64')

RFR_classifier.fit(X_train, y_train_3)

print('Training Accuracy (Top-3):', accuracy_score(y_train_3, RFR_classifier.predict(X_train)))
print('Test Accuracy (Top-3):', accuracy_score(y_test_3, RFR_classifier.predict(X_test)), '\n')
print('Training Balanced Accuracy (Top-3):', balanced_accuracy_score(y_train_3, RFR_classifier.predict(X_train)))
print('Test Balanced Accuracy (Top-3):', balanced_accuracy_score(y_test_3, RFR_classifier.predict(X_test)), '\n')

print('Brier Score (Top-3):', brier_score_loss(y_test_3, RFR_classifier.predict_proba(X_test)[:, 1]))

Training Accuracy (Top-3): 0.998003992015968
Test Accuracy (Top-3): 0.8968253968253969 

Training Balanced Accuracy (Top-3): 0.9891304347826086
Test Balanced Accuracy (Top-3): 0.5324110671936759 

Brier Score (Top-3): 0.06600264892437474


More promising Results here! Overall score is down slightly in the test set, but the model is more generally applicable.

# ML Regressors

In [103]:
from sklearn.ensemble import RandomForestRegressor

In [104]:
RFR_regressor = RandomForestRegressor(n_estimators=300)
RFR_regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [105]:
y_pred = RFR_regressor.predict(X_test)

In [108]:
from sklearn.metrics import mean_squared_error

In [120]:
RMSE = (mean_squared_error(y_test, y_pred))**(1/2)

print('RMSE:', RMSE)

RMSE: 5.348691730695017


# Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.scatter(log_regressor.predict_proba(X_test)[:, 1], y_win_test, color='red')
plt.title('Test Parameters')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
RFR_regressor = RandomForestRegressor(n_estimators=100)
RFR_regressor.fit(X, y)

In [None]:
RFR_classifier = RandomForestClassifier(n_estimators=100)
RFR_classifier.fit(X_train, y_train_5)

a_train = (y_train <= 3).astype('int64')
a_test = (y_test <= 3).astype('int64')

RFR_classifier.fit(X_train, a_train)
RFR_classifier.score(X_test, a_test)

In [None]:
import sklearn.metrics as met

In [None]:
met.roc_auc_score(a_test, RFR_classifier.predict(X_test))

In [None]:
y_win_test

In [None]:
RFR_regressor.predict(X)

In [None]:
from sklearn.metrics import mean_squared_error

# Export to MySQL

In [None]:
import sqlalchemy
import pymysql
import pandas as pd

In [None]:
engine = sqlalchemy.create_engine('mysql+pymysql://root:Purpsqlle01@localhost:3306/exercises')
# engine = sqlalchemy.create_engine('mysql+pymysql://user:password@server:port/schema')

engine.connect()

In [None]:
mITT.to_sql('ITT', con=engine)

In [None]:
mR.to_sql('Race', con=engine)

In [None]:
mTP.to_sql('TP', con=engine)