In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

#model import 
from sklearn.svm import LinearSVR

#splitting and scaling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#parameter tuning
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn import linear_model, metrics
from sklearn.metrics import (mean_squared_error, r2_score, mean_absolute_error, 
mean_squared_log_error, explained_variance_score, max_error)
from sklearn.model_selection import LeaveOneOut, cross_val_score, cross_val_predict

#from sklearn.svm import LinearSVR
#from sklearn.svm import SVR
#from sklearn.datasets import make_regression
#from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import QuantileTransformer
#from sklearn.compose import TransformedTargetRegressor
#from sklearn.pipeline import Pipeline
#from sklearn.model_selection import GridSearchCV

import warnings
# We silence warnings concerning future version updates
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [2]:
# Load training and testing data
X_train = np.loadtxt("X_train.csv", delimiter=',', skiprows=1)
X_test = np.loadtxt("X_test.csv", delimiter=',', skiprows=1)
y_train = np.loadtxt("y_train.csv", delimiter=',', skiprows=1)[:,1]

In [3]:
#scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [4]:
def saveFile(y_pred,name):
    test_header = "Id,PRP"
    n_points = X_test.shape[0]
    y_pred_pp = np.ones((n_points, 2))
    y_pred_pp[:, 0] = range(n_points)
    y_pred_pp[:, 1] = y_pred
    np.savetxt(name, y_pred_pp, fmt='%d,%f', delimiter=",",
               header=test_header, comments="")


In [5]:
#split the dataset for training
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

#scaling split training dataset
scaler = StandardScaler()
X_train1_scaled = scaler.fit_transform(X_train1)
X_test1_scaled = scaler.fit_transform(X_test1)

In [6]:
# Defining basic untuned Linear SVR regression model

def basicLinearSVR(X_train1, X_test1, y_train1, y_test1):
    svrReg = LinearSVR(max_iter = 100000)
    svrReg.fit(X_train1, y_train1)
    y_pred = svrReg.predict(X_test1)
    print("R2 score: " + str(svrReg.score(X_train1, y_train1)))
    print("Explained variance: " + str(explained_variance_score(y_test1, y_pred)))
    print("Max error: " + str(max_error(y_test1, y_pred)))
    print("Mean absolute error: " + str(mean_absolute_error(y_test1, y_pred)))
    print("Mean squared error: " + str(mean_squared_error(y_test1, y_pred)))
    return y_pred

In [7]:
basicLinearSVR(X_train1, X_test1, y_train1, y_test1)

R2 score: 0.7099560324371561
Explained variance: 0.5053430782493595
Max error: 279.9663561598935
Mean absolute error: 53.431755635995934
Mean squared error: 6620.305402472979




array([397.5010276 ,  50.17019483, 242.85427521,  56.6625412 ,
        31.36063715,  31.2753753 ,  28.88870874,  24.6181763 ,
       143.10704282,  16.28791559,  32.71076266, 111.92098834,
        48.46385281,  60.07059431,  29.55359442,  25.54883122,
       315.49410162, 108.49838541, 278.33592534, 102.41181945,
        22.72165873,  17.28284905, 125.25996225,  40.96616396,
        40.29374014, 201.31318422,  14.34453753, 112.34127777,
       229.64339764,  30.84765221,  60.07059431,  15.47676649,
        81.1481393 , 166.91446068,  18.68477904, 419.96635616,
       239.35664087,  56.24916715,  76.67599737, 137.61777523,
        26.24878034, 102.1046307 , 164.78786694,  32.24502589,
       179.44554488,  85.48931019,  11.07390507,  63.71479319,
       395.7896106 ,  42.55568476,  10.76975242])

In [8]:
basicLinearSVR(X_train1_scaled, X_test1_scaled, y_train1, y_test1)

R2 score: 0.4985793364344304
Explained variance: 0.7009071887115044
Max error: 239.03036681910095
Mean absolute error: 32.45594189860802
Mean squared error: 4073.3840376162925


array([177.53568235,  26.11616007,  97.72609414,  36.55182149,
        14.62642806,  11.72016042,  12.89543134,  14.87393761,
        67.47040352,  22.97954656,  50.2740813 ,  67.54378803,
        55.35260278,  20.84020923,  31.05198004,  39.24134663,
       111.73585746,  45.01441047, 250.14172331,  41.9928992 ,
         8.07269795,  66.27775851,  66.15004463,  19.72407891,
        24.2483988 ,  95.52719127,   0.30712776,  57.98523896,
       225.96963318,  63.5370714 ,  20.84020923,  19.86144239,
        53.1807604 ,  86.38319459,  30.04515691, 188.47023489,
       177.42487197,  36.30124351,  67.13020309,  85.93340504,
        23.57990564,  47.20273757,  96.83660516,  22.96383988,
       123.01788658,  48.07026398,   8.19204255,  26.50014536,
       281.05908739,  17.66694513,  -2.36108015])

In [9]:
# Linear Support Vector Regression - Further Tuning with Grid Search
lsvr = LinearSVR()
# Look at parameters used by our regression
print('Parameters currently in use:\n')
print(lsvr.get_params())

#Creating the Random Grid
#pipe = Pipeline([('regression model' , LinearSVR())])

param_grid = [
    {'epsilon' : [0,75, 0.5, 0.25, 0],
    'tol' : [1e-4, 2e-4],
    'C' : np.linspace(0.5,100,200),
    'fit_intercept' : [True, False],
    'verbose' : [0, 1],
    'max_iter' : [500, 1000, 1500, 2000, 2500, 3000]}
]

Parameters currently in use:

{'C': 1.0, 'dual': True, 'epsilon': 0.0, 'fit_intercept': True, 'intercept_scaling': 1.0, 'loss': 'epsilon_insensitive', 'max_iter': 1000, 'random_state': None, 'tol': 0.0001, 'verbose': 0}


In [10]:
#fit the model with the parameter
reg = GridSearchCV(LinearSVR(), param_grid = param_grid, cv = 10, verbose=True, n_jobs=-1)
# Fit the random search model
best_reg= reg.fit(X_train1_scaled, y_train1)


best_reg.best_params_

Fitting 10 folds for each of 48000 candidates, totalling 480000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 11147 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 33147 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 63947 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 103547 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done 151947 tasks      | elapsed:   58.7s
[Parallel(n_jobs=-1)]: Done 209147 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 275147 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 349947 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 433547 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 480000 out of 480000 | elapsed:  3.7min finished


[LibLinear]

{'C': 6.0,
 'epsilon': 0.5,
 'fit_intercept': True,
 'max_iter': 500,
 'tol': 0.0002,
 'verbose': 1}

In [11]:
def tunedLinearSVR(X_train1, X_test1, y_train1, y_test1):
    svrReg = LinearSVR(C = 6, epsilon = 0.5, fit_intercept = True, max_iter = 500, tol = 0.0002, verbose = 0)
    svrReg.fit(X_train1, y_train1)
    y_pred = svrReg.predict(X_test1)
    print("R2 score: " + str(svrReg.score(X_train1, y_train1)))
    print("Explained variance: " + str(explained_variance_score(y_test1, y_pred)))
    print("Max error: " + str(max_error(y_test1, y_pred)))
    print("Mean absolute error: " + str(mean_absolute_error(y_test1, y_pred)))
    print("Mean squared error: " + str(mean_squared_error(y_test1, y_pred)))
    
    return y_pred

In [12]:
tunedLinearSVR(X_train1, X_test1, y_train1, y_test1)

R2 score: 0.8386051184696204
Explained variance: 0.7456951402196036
Max error: 297.87651413323573
Mean absolute error: 48.9273285414242
Mean squared error: 4830.738847872142




array([331.41145413,  45.39571904, 199.24750965,  73.65211504,
        32.83868394,  26.53410499,  28.89874658,  20.57687623,
       136.89316109,  48.38295707,  75.00455096, 125.76839079,
       101.84788136,  55.31565487,  46.27419591,  83.14438906,
       239.63108463,  90.02453853, 421.26917723,  89.75873625,
        27.23388218,  94.78516471, 112.842588  ,  42.47898697,
        39.32771627, 218.45590955,  28.78348597, 105.13789261,
       419.14275902, 110.46706195,  55.31565487,  30.19786019,
        84.8590965 , 144.37709624,  64.84604023, 437.87651413,
       300.50782915,  76.13234728, 121.17731556, 153.61526524,
        55.67896496, 110.82008974, 139.07304782,  51.98140268,
       255.41753412,  92.00745305,  25.38594039,  57.45901823,
       522.23198715,  42.96194258,  38.47742467])

In [13]:
tunedLinearSVR(X_train1_scaled, X_test1_scaled, y_train1, y_test1)

R2 score: 0.8047055236061277
Explained variance: 0.8111766799125693
Max error: 180.2690673365624
Mean absolute error: 34.29550627950622
Mean squared error: 2439.4827138373203


array([288.27186615,  23.72168481, 144.04693774,  41.75932823,
         4.060322  ,  -1.48865612,   1.27641316,   3.61561212,
        89.38215523,  19.35446902,  68.55547891,  95.67897803,
        75.64880481,  19.01969437,  35.13692449,  56.5445864 ,
       180.75290464,  54.87585899, 428.82169546,  49.99926243,
        -2.96422876,  90.83033389,  84.9736402 ,  11.49648314,
        21.54628031, 147.45654925,  -5.55412689,  74.8947261 ,
       389.80892295,  88.35372099,  19.01969437,  12.33343412,
        71.40934725, 119.73024717,  34.44786437, 320.26906734,
       278.98411316,  43.0367963 ,  92.37936423, 129.56394016,
        33.0482864 ,  62.92363656, 142.61806262,  19.91328441,
       205.41676502,  57.76242344,  -1.50224243,  23.32463219,
       487.91742932,  10.77671263,  -4.0092307 ])

In [14]:
#run our final prediction
LinearSVR_tuned = LinearSVR(C = 6, epsilon = 0.5, fit_intercept = True, max_iter = 500, tol = 0.0002, verbose = 0)
LinearSVR_tuned.fit(X_train_scaled, y_train)
y_pred = LinearSVR_tuned.predict(X_test_scaled)
saveFile(y_pred, "LinearSVR_submission.csv")