In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

df_train = pd.read_csv('illinois_basing_train.csv')
df_train.drop('Avg_PLT_CO2InjRate_TPH', axis=1, inplace = True)

df_train[df_train.columns[-1]].fillna(0, inplace = True)
y = df_train[df_train.columns[-1]]
df_train.drop(df_train.columns[-1], axis=1, inplace = True)

df_train['Month'] = pd.to_datetime(df_train['SampleTimeUTC']).dt.month
df_train['Day'] = pd.to_datetime(df_train['SampleTimeUTC']).dt.day
df_train['Hour'] = pd.to_datetime(df_train['SampleTimeUTC']).dt.hour
df_train['Year'] = pd.DatetimeIndex(df_train['SampleTimeUTC']).year
df_train.drop('SampleTimeUTC', axis=1, inplace = True)


cols = [i for i in df_train.columns if df_train[i].isnull().any()]
for i in cols:
    df_train[i].fillna(df_train[i].mean(), inplace=True)
    

    
# scaler = StandardScaler()
# # transform data
# df_train = scaler.fit_transform(df_train)


In [2]:
X_train, X_val, y_train, y_val = train_test_split(
    df_train, y, test_size=0.1, random_state=13
)

param_grid = {'C': [0.1, 1, 10, 100], 
              'gamma': [1, 0.1, 0.01, 0.001],
              'kernel': ['rbf']} 


regr = SVR()

  
grid = GridSearchCV(regr, param_grid, cv = 3, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)



Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV 1/3] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.000 total time=  15.2s
[CV 2/3] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.000 total time=  15.1s
[CV 3/3] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.000 total time=  15.2s
[CV 1/3] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.000 total time=  15.2s
[CV 2/3] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.000 total time=  15.3s
[CV 3/3] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.000 total time=  16.6s
[CV 1/3] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.000 total time=  16.6s
[CV 2/3] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.000 total time=  16.9s
[CV 3/3] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.000 total time=  17.5s
[CV 1/3] END ...C=0.1, gamma=0.001, kernel=rbf;, score=-0.000 total time=  18.3s
[CV 2/3] END ...C=0.1, gamma=0.001, kernel=rbf;, score=-0.000 total time=  17.0s
[CV 3/3] END ...C=0.1, gamma=0.001, kernel=rbf;,

GridSearchCV(cv=3, estimator=SVR(),
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf']},
             verbose=3)

In [3]:
print('Best Score', grid.best_score_)
print('Best Params', grid.best_params_)

Best Score 0.000126847592766303
Best Params {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}


In [2]:
# convert array into dataframe
X_train, X_val, y_train, y_val = train_test_split(
    df_train, y, test_size=0.1, random_state=13
)

# best_model = SVR(C = grid.best_params_['C'], gamma = grid.best_params_['gamma'], kernel = grid.best_params_['kernel'])
best_model = SVR(C = 10, gamma = 0.001, kernel = 'rbf')
best_model.fit(X_train, y_train)
mse = mean_squared_error(y_val, best_model.predict(X_val))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))


The mean squared error (MSE) on test set: 13.8118


In [3]:
df_test = pd.read_csv('illinois_basing_test.csv')
df_test['Month'] = pd.to_datetime(df_test['SampleTimeUTC']).dt.month
df_test['Day'] = pd.to_datetime(df_test['SampleTimeUTC']).dt.day
df_test['Hour'] = pd.to_datetime(df_test['SampleTimeUTC']).dt.hour
df_test['Year'] = pd.DatetimeIndex(df_test['SampleTimeUTC']).year
df_test.drop('SampleTimeUTC', axis=1, inplace = True)
cols = [i for i in df_test.columns if df_test[i].isnull().any()]
for i in cols:
    df_test[i].fillna(df_test[i].mean(), inplace=True)
    

In [4]:
print(df_train.columns)

Index(['Avg_PLT_CO2VentRate_TPH', 'Avg_CCS1_WHCO2InjPs_psi',
       'Avg_CCS1_WHCO2InjTp_F', 'Avg_CCS1_ANPs_psi', 'Avg_CCS1_DH6325Ps_psi',
       'Avg_CCS1_DH6325Tp_F', 'Avg_VW1_WBTbgPs_psi', 'Avg_VW1_WBTbgTp_F',
       'Avg_VW1_ANPs_psi', 'Avg_VW1_Z11D4917Ps_psi', 'Avg_VW1_Z11D4917Tp_F',
       'Avg_VW1_Z10D5001Ps_psi', 'Avg_VW1_Z10D5001Tp_F',
       'Avg_VW1_Z09D5653Ps_psi', 'Avg_VW1_Z09D5653Tp_F',
       'Avg_VW1_Z08D5840Ps_psi', 'Avg_VW1_Z08D5840Tp_F',
       'Avg_VW1_Z07D6416Ps_psi', 'Avg_VW1_Z07D6416Tp_F',
       'Avg_VW1_Z06D6632Ps_psi', 'Avg_VW1_Z06D6632Tp_F',
       'Avg_VW1_Z05D6720Ps_psi', 'Avg_VW1_Z05D6720Tp_F',
       'Avg_VW1_Z04D6837Ps_psi', 'Avg_VW1_Z04D6837Tp_F',
       'Avg_VW1_Z03D6945Ps_psi', 'Avg_VW1_Z03D6945Tp_F',
       'Avg_VW1_Z02D6982Ps_psi', 'Avg_VW1_Z02D6982Tp_F',
       'Avg_VW1_Z01D7061Ps_psi', 'Avg_VW1_Z01D7061Tp_F',
       'Avg_VW1_Z0910D5482Ps_psi', 'Avg_VW1_Z0910D5482Tp_F', 'Month', 'Day',
       'Hour', 'Year'],
      dtype='object')


In [5]:

    
test_predict = best_model.predict(df_test.values)
preds = pd.DataFrame(test_predict, columns=['inj_diff'])
preds.to_csv("preds_svr.csv", index = False)

  "X does not have valid feature names, but"
