In [10]:
import os
import sys
import warnings
from joblib import dump, load

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split,RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor,StackingRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer


# Suppress every warning for the rest of the session. Note that this also
# hides library deprecation notices.
warnings.filterwarnings('ignore')
# Seed passed as random_state to the train/validation split and the estimators.
random_seed = 100


In [33]:
# Read the train and test CSV files into DataFrames.
# NOTE(review): both paths are absolute Windows paths, so this cell only runs
# on a machine that has the data under D:\Envision Racing\data.
train_df = pd.read_csv(
    r'D:\Envision Racing\data\train.csv'
)
test_df = pd.read_csv(
    r'D:\Envision Racing\data\test.csv'
)

In [3]:
# Hold out 3% of the training rows as a validation set. LAP_TIME is the
# target; the split is stratified on the raw LOCATION column and seeded with
# random_seed so it is repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_df.drop(columns=["LAP_TIME"]),
    train_df["LAP_TIME"],
    test_size=0.03,
    random_state=random_seed,
    stratify=train_df['LOCATION'],
)


In [4]:

# normalise column names
def mod_names(df):
    """Rewrite the column labels of ``df`` in place.

    Leading whitespace is stripped from every label and the result is
    lower-cased. Trailing whitespace is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` attribute is replaced.

    Returns
    -------
    None
    """
    df.columns = df.columns.str.lstrip().str.lower()

# remove the features that are not used for modelling
def drop_cols(df):
    """Drop the unused feature columns from ``df`` in place.

    The column names are the lower-cased ones, so ``mod_names`` has to be
    applied to the frame first. pandas raises ``KeyError`` if any of the
    listed columns is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.

    Returns
    -------
    None
    """
    unwanted = [
        "number",
        "driver_number",
        "lap_improvement",
        "s1",
        "s2",
        "s3",
        "s1_improvement",
        "s2_improvement",
        "s3_improvement",
        "crossing_finish_line_in_pit",
        "pit_time",
        "group",
        "team",
        "power",
        "kph",
        "hour",
        "elapsed",
    ]
    df.drop(columns=unwanted, inplace=True)

# fill missing values
def drop_na_rows(df):
    """Replace every missing value in ``df`` with 0, in place.

    Despite the name, no rows are removed. An earlier variant, now disabled,
    dropped the rows with a missing ``s1_large``/``s2_large``/``s3_large``
    value instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.

    Returns
    -------
    None
    """
    df.fillna(value=0, inplace=True)

# convert a "minutes:seconds" string to a single number
def convert_time_to_seconds(x):
    """Turn a ``"minutes:seconds"`` string into one float; pass numbers through.

    Parameters
    ----------
    x : int, float, numpy number or str
        A numeric value (for example the 0 that replaces a missing entry) is
        returned unchanged. A string is split on ``":"``; the first field is
        read as minutes and the second as seconds.

    Returns
    -------
    int or float
        ``x`` itself for numeric input, otherwise ``minutes + seconds / 60``.

    Raises
    ------
    IndexError
        If a string input contains no ``":"``.
    ValueError
        If either field cannot be parsed as a float.

    Notes
    -----
    NOTE(review): despite the function name, a string input is converted to
    *minutes* (``minutes + seconds / 60``), not seconds. The unit is kept
    as-is so downstream features do not change; confirm the intended unit.
    """
    # The previous test `type(x) is (int or float)` reduced to `type(x) is int`,
    # so every float fell through to `.split` and raised AttributeError.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return x

    fields = x.split(":")
    minutes = float(fields[0])
    seconds = float(fields[1])
    return minutes + seconds / 60


In [5]:
# run the full cleaning sequence on one frame
def cleanse(df):
    """Clean ``df`` in place so it can be fed to the preprocessing pipeline.

    Steps, in order: normalise the column names, drop the unused columns,
    fill missing values, then run ``convert_time_to_seconds`` over each of
    the three ``s*_large`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it is modified in place.

    Returns
    -------
    None
    """
    mod_names(df)
    drop_cols(df)
    drop_na_rows(df)
    for sector_col in ("s3_large", "s2_large", "s1_large"):
        df[sector_col] = df[sector_col].apply(convert_time_to_seconds)


In [6]:
# Clean the training and validation features in place. cleanse() drops
# columns, so running this cell a second time on the same frames fails.
for _split_frame in (X_train, X_val):
    cleanse(_split_frame)

In [7]:
# Sanity check: shapes of the four parts of the split after cleaning.
print(*(part.shape for part in (X_train, X_val, Y_train, Y_val)))


(9967, 7) (309, 7) (9967,) (309,)


In [8]:
# Column groups fed to the preprocessing step.
cat_fts = ["driver_name","location","event"]
num_fts = ["lap_number","s1_large","s2_large","s3_large"]

# handle_unknown='ignore' encodes a category that was not present at fit time
# as an all-zero block instead of raising at predict time. The train/validation
# split is stratified on location only, so a driver or event can be absent
# from the training part.
# NOTE(review): `sparse` is spelled `sparse_output` from scikit-learn 1.2 on;
# it is kept as-is to match the version this notebook was run with.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
std_scaler = StandardScaler()

# Standard-scale the numeric columns, one-hot encode the categorical ones and
# pass any other column through unchanged.
ct = make_column_transformer(
    (std_scaler, num_fts),
    (ohe, cat_fts),
    remainder='passthrough',
    n_jobs=-1,
    verbose=True,
)

In [12]:
# Hyper-parameters for the linear support-vector regressor, kept in a dict so
# they can be inspected or swapped in one place.
lsvr_params = {
    'C': 1,
    'dual': True,
    'epsilon': 0.1,
    'loss': 'epsilon_insensitive',
    'tol': 0.001,
}

# First base learner of the stack. The dict above is unpacked as keyword
# arguments; the remaining settings are fixed here.
linear_svr = LinearSVR(
    verbose=1,
    max_iter=100000,
    random_state=random_seed,
    **lsvr_params,
)

In [13]:
# Second base learner of the stack: 200 shallow trees (depth <= 5).
# NOTE(review): newer scikit-learn releases spell the "mae" criterion
# "absolute_error"; check against the installed version before upgrading.
rf = RandomForestRegressor(
    n_estimators=200,
    criterion="mae",
    max_depth=5,
    n_jobs=-1,
    random_state=random_seed,
)


In [15]:
# (name, estimator) pairs used as the base learners of the stack.
estimators = [('lsvr', linear_svr), ('rf', rf)]

In [27]:
# Stack the base learners under a small random forest. With passthrough=False
# the final estimator is trained only on the predictions of lsvr and rf, not
# on the original features.
final_rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=3,
    random_state=random_seed,
)
stk = StackingRegressor(
    estimators=estimators,
    final_estimator=final_rf,
    passthrough=False,
    n_jobs=-1,
)

In [28]:
# Chain the column transformer and the stacking regressor into one pipeline,
# so preprocessing is fitted and applied together with the model.
stk_pipeline = make_pipeline(ct,stk)

In [29]:
# Fit preprocessing and the stacked model on the cleaned training split.
stk_pipeline.fit(X_train,Y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['lap_number', 's1_large',
                                                   's2_large', 's3_large']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['driver_name', 'location',
                                                   'event'])],
                                   verbose=True)),
                ('stackingregressor',
                 StackingRegressor(estimators=[('lsvr',
                                                LinearSVR(C=1, epsilon=0.1,
                                                          max_iter=100000,
                  

In [30]:
# Persist the fitted pipeline (preprocessing + stacked model) with joblib;
# the file is written relative to the current working directory.
dump(stk_pipeline,'rf_lsvr_pipeline.joblib')

['rf_lsvr_pipeline.joblib']

In [31]:
# Validate on the held-out split: the reported score is the square root of
# the mean squared log error (RMSLE) between true and predicted lap times.
y_pred = stk_pipeline.predict(X_val)
val_rmsle = np.sqrt(metrics.mean_squared_log_error(Y_val, y_pred))

print("validation score: ", val_rmsle)

validation score:  0.4903479528510646


In [34]:
# Apply the same in-place cleaning to the test frame. cleanse() drops columns,
# so re-running this cell without reloading test_df fails.
cleanse(test_df)

In [35]:
# Predict lap times for the cleaned test frame. Its lap_time column is
# removed first so the features match what the pipeline was fitted on.
test_features = test_df.drop(columns=["lap_time"])
preds = stk_pipeline.predict(test_features)

# Load the submission file, set its LAP_TIME column to the predictions and
# save the result (without the index) in the current working directory.
# NOTE(review): the input path is an absolute Windows path.
submission = pd.read_csv(r'D:\Envision Racing\data\submission.csv')
submission['LAP_TIME'] = preds
submission.to_csv('rf_lsvr_ensemble_preds.csv', index=False)

# Quick look at the first rows and the distribution of the predictions.
print(submission.head())
print(submission.describe())

    LAP_TIME
0  96.172533
1  96.172533
2  96.683025
3  97.280423
4  97.280423
         LAP_TIME
count  420.000000
mean    90.541840
std     10.287115
min     60.966906
25%     76.099256
50%     96.172533
75%     97.280423
max    100.340029
