In [1]:
import os
import sys
import warnings
from joblib import dump, load

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVR,LinearSVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import  make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV


# Silence every warning for the rest of the session. Note that this is a
# blanket filter: it hides library deprecation and convergence warnings too.
warnings.filterwarnings(action='ignore')

# Single seed reused by the train/validation split and the LinearSVR model.
random_seed = 999


In [2]:

# Read the training and test CSV files into data frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'D:\Envision Racing\data\train.csv',
        r'D:\Envision Racing\data\test.csv',
    )
)


In [3]:
# Hold out 3% of the training rows as a validation set. The split is
# stratified on LOCATION and seeded so it is reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_df.drop(columns="LAP_TIME"),   # features: everything except the target
    train_df.loc[:, "LAP_TIME"],         # target
    stratify=train_df.loc[:, "LOCATION"],
    test_size=0.03,
    random_state=random_seed,
)


In [4]:

# modify column names
def mod_names(df):
    """Normalise the column labels of ``df`` in place.

    Leading whitespace is stripped from every label and the result is
    lower-cased. Trailing whitespace is left untouched. Returns ``None``.
    """
    df.columns = df.columns.str.lstrip().str.lower()

# drop unwanted features
def drop_cols(df):
    """Drop the unwanted columns from ``df`` in place.

    Every listed column must be present in ``df``; pandas raises
    ``KeyError`` otherwise. Returns ``None``.
    """
    unwanted = [
        "number",
        "driver_number",
        "lap_improvement",
        "s1",
        "s2",
        "s3",
        "s1_improvement",
        "s2_improvement",
        "s3_improvement",
        "crossing_finish_line_in_pit",
        "pit_time",
        "group",
        "team",
        "power",
        "kph",
        "hour",
        "elapsed",
    ]
    df.drop(columns=unwanted, inplace=True)

# fill missing values with 0 (the row-dropping variant below is disabled)
# nan_ft = ['s3_large','s1_large','s2_large']
def drop_na_rows(df):
    """Replace every missing value in ``df`` with 0, in place.

    Despite the name, no rows are removed: the ``dropna`` variant is kept
    below only as a disabled alternative. Returns ``None``.
    """
    df.fillna(value=0, inplace=True)
    # df.dropna(subset=nan_ft, axis=0, inplace=True)

# convert a colon-separated time string (e.g. "minutes:seconds") to seconds
def convert_time_to_seconds(x):
    """Convert a time value to seconds.

    Parameters
    ----------
    x : int, float or str
        A real number is treated as already being in seconds and is returned
        unchanged (this is how the ``0`` placeholders written by ``fillna``
        pass through). A string is parsed as colon-separated fields, most
        significant first, e.g. ``"1:23.456"`` (minutes:seconds) or
        ``"1:02:03"`` (hours:minutes:seconds). A string without a colon is
        read as plain seconds.

    Returns
    -------
    int or float
        ``x`` itself for numeric input, otherwise the parsed total as a float.

    Raises
    ------
    ValueError
        If a field of the string is not a valid number.
    AttributeError
        If ``x`` is neither a real number nor a string-like object.
    """
    import numbers  # local import keeps this cell self-contained

    # The previous check, `type(x) is (int or float)`, reduced to
    # `type(x) is int`, so floats fell through to `.split` and crashed.
    # numbers.Real covers int, float and the NumPy scalar types.
    if isinstance(x, numbers.Real):
        return x

    total = 0.0
    for field in x.split(":"):
        # Each field to the left is worth 60 times the one to its right.
        total = total * 60 + float(field)
    return total


In [5]:

# cleanse data
def cleanse(df):
    """Run the full cleaning sequence on ``df`` in place.

    Steps, in order: normalise the column names, drop the unwanted columns,
    fill missing values, then convert the three ``*_large`` time columns
    to seconds. Returns ``None``.
    """
    for step in (mod_names, drop_cols, drop_na_rows):
        step(df)

    for time_col in ("s3_large", "s2_large", "s1_large"):
        df[time_col] = df[time_col].apply(convert_time_to_seconds)


In [6]:
# Cleaning is applied to the split feature frames rather than to the raw
# frames; the raw-frame calls are kept here, disabled:
# cleanse(train_df)
# cleanse(test_df)

# cleanse() mutates its argument in place, so re-running this cell on
# already-cleaned frames will fail when the dropped columns are missing.
cleanse(X_train)
cleanse(X_val)

In [7]:
# Sanity check: shapes of the feature and target splits after cleaning.
print(*(part.shape for part in (X_train, X_val, Y_train, Y_val)))

(9967, 7) (309, 7) (9967,) (309,)


In [8]:
# Feature groups routed through the column transformer.
cat_fts = ["driver_name","location","event"]
num_fts = ["lap_number","s1_large","s2_large","s3_large"]

# handle_unknown='ignore': a category that was not seen while fitting
# (possible in a CV fold, the validation split or the test file) is encoded
# as an all-zero row instead of raising an error at transform time.
# NOTE: `sparse` matches the scikit-learn version this notebook was run
# with; from scikit-learn 1.2 onwards the parameter is named `sparse_output`.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
std_scaler = StandardScaler()

# Scale the numeric columns, one-hot encode the categorical ones, and pass
# any remaining column through unchanged.
ct = make_column_transformer(
                        (std_scaler, num_fts),
                        (ohe, cat_fts),
                        remainder='passthrough',
                        n_jobs=-1,
                        verbose=True)

In [8]:
# linsvr = LinearSVR(random_state=random_seed,
#                    verbose =1,
#                    max_iter = 100000)
# linearsvr_pipeline = make_pipeline(ct,linsvr)

In [22]:
# params = {
#     "linearsvr__epsilon": [0.1,0.5,1,2,5,10,20],
#     "linearsvr__C": [0.1,1, 10],
#     "linearsvr__loss": ["epsilon_insensitive","squared_epsilon_insensitive"],
#     "linearsvr__dual": [True,False],
#     "linearsvr__tol": [1e-3,1e-4,1e-5],
# }

In [21]:
# linsvr_gs.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__columntransformer', 'estimator__linearsvr', 'estimator__columntransformer__n_jobs', 'estimator__columntransformer__remainder', 'estimator__columntransformer__sparse_threshold', 'estimator__columntransformer__transformer_weights', 'estimator__columntransformer__transformers', 'estimator__columntransformer__verbose', 'estimator__columntransformer__standardscaler', 'estimator__columntransformer__onehotencoder', 'estimator__columntransformer__standardscaler__copy', 'estimator__columntransformer__standardscaler__with_mean', 'estimator__columntransformer__standardscaler__with_std', 'estimator__columntransformer__onehotencoder__categories', 'estimator__columntransformer__onehotencoder__drop', 'estimator__columntransformer__onehotencoder__dtype', 'estimator__columntransformer__onehotencoder__handle_unknown', 'estimator__columntransformer__onehotencoder__sparse', 'estimator__linearsvr__C',

In [23]:
# linsvr_gs = GridSearchCV(
#                         linearsvr_pipeline,
#                         cv=5,
#                         scoring="neg_mean_squared_log_error",
#                         n_jobs=-1,
#                         param_grid=params,
#                         verbose=1,
#                         return_train_score=True,
#                     )

In [24]:
# linsvr_gs.fit(X_train,Y_train)
# print('===============gs finished=================')
# linsvr_gs_results = pd.DataFrame(linsvr_gs.cv_results_)
# linsvr_gs_results.to_csv(r'D:\Envision Racing\Svr\linsvr_gs_results.csv')

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


In [25]:
# with open('linear_svr_gridsearch.txt','a+') as f:
#     f.write('=================gridsearch finished=================\n')
#     f.write(str(linsvr_gs.best_params_)+'\n')
#     f.write(str(linsvr_gs.best_score_)+'\n')
#     f.write(str(linsvr_gs.best_estimator_)+'\n')
#     f.write('=================gridsearch finished=================\n')
# print(linsvr_gs.best_params_)
# print(linsvr_gs.best_score_)
# print(linsvr_gs.best_estimator_)

{'linearsvr__C': 1, 'linearsvr__dual': True, 'linearsvr__epsilon': 0.1, 'linearsvr__loss': 'epsilon_insensitive', 'linearsvr__tol': 0.001}
-0.3073265719090502
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['lap_number', 's1_large',
                                                   's2_large', 's3_large']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['driver_name', 'location',
                                                   'event'])],
                                   verbose=True)),
                ('linearsvr',
                 LinearSVR(C=1, epsilon=0.1, max_iter=100000, random_state

In [9]:
# Best LinearSVR hyper-parameters reported by the grid search output above.
params = dict(
    C=1,
    dual=True,
    epsilon=0.1,
    loss='epsilon_insensitive',
    tol=0.001,
)

In [10]:
# Build the final estimator from the tuned values in `params`; the remaining
# settings (verbosity, iteration cap, seed) are fixed here.
linear_svr = LinearSVR(
    **{name: params[name] for name in ('C', 'epsilon', 'tol', 'loss', 'dual')},
    verbose=1,
    max_iter=100000,
    random_state=random_seed,
)

In [11]:
# Chain the column transformer and the tuned LinearSVR into one pipeline so
# preprocessing and model are fitted, saved and applied together.
linear_svr_finetuned_pipeline = make_pipeline(ct,linear_svr)

In [12]:
# Fit on the training split, predict the validation split, and report the
# root of the mean squared log error (RMSLE).
y_preds = linear_svr_finetuned_pipeline.fit(X_train, Y_train).predict(X_val)
print(
    'Validation Error:',
    np.sqrt(metrics.mean_squared_log_error(y_true=Y_val, y_pred=y_preds)),
)

[LibLinear]Validation Error: 0.6456068873436835


In [13]:
# Persist the fitted pipeline to the working directory; the prediction cell
# further down reloads it from this file.
dump(linear_svr_finetuned_pipeline,'linear_svr_finetuned_pipeline.joblib')

['linear_svr_finetuned_pipeline.joblib']

In [None]:
# naive_svr_pipeline = load(r'D:\Envision Racing\Svr\naive_svr_ppline.joblib')
# naive_svr_pipeline.get_params()
# """
# 'svr__C': 1.0,
#  'svr__cache_size': 200,
#  'svr__coef0': 0.0,
#  'svr__degree': 3,
#  'svr__epsilon': 0.1,
#  'svr__gamma': 'scale',
#  'svr__kernel': 'rbf',
#  'svr__max_iter': -1,
#  'svr__shrinking': True,
#  'svr__tol': 0.001,
#  'svr__verbose': True
# """

In [15]:
# Apply the same in-place cleaning to the test frame before predicting.
# Not idempotent: a second run fails because the dropped columns are gone.
cleanse(test_df)

In [16]:
# Reload the saved pipeline and predict lap times for the cleaned test frame.
predictor = load('linear_svr_finetuned_pipeline.joblib')
preds = predictor.predict(test_df.drop(columns="lap_time"))

# Put the predictions into the submission template's LAP_TIME column and
# write the result next to the notebook.
submission = pd.read_csv(r'D:\Envision Racing\data\submission.csv').assign(LAP_TIME=preds)
submission.to_csv('linear_svr_finetuned_submission_file.csv', index=False)

In [17]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,LAP_TIME
0,91.199936
1,91.08278
2,90.914063
3,91.166867
4,91.042211


In [38]:
# Summary statistics of the predicted lap times, as a quick sanity check.
submission.describe()

Unnamed: 0,LAP_TIME
count,420.0
mean,85.935999
std,9.734873
min,67.880315
25%,71.671951
50%,91.134917
75%,92.024341
max,94.263886
