In [1]:
# ! wget "https://he-public-data.s3.ap-southeast-1.amazonaws.com/shell_dataset.zip"
# ! unzip -q shell_dataset.zip
# ! unzip -q dataset/train.zip
# ! unzip -q dataset/test.zip

In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler,PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold,RandomizedSearchCV,GridSearchCV,train_test_split
from sklearn.metrics import mean_absolute_error as mae
from joblib import dump,load
import os
import sys

import warnings
warnings.filterwarnings('ignore','warnings ignored')
random_seed=42

In [3]:
path = r'data/train.csv'
df = pd.read_csv(path)
df

Unnamed: 0,DATE (MM/DD),MST,Global CMP22 (vent/cor) [W/m^2],Direct sNIP [W/m^2],Azimuth Angle [degrees],Tower Dry Bulb Temp [deg C],Tower Wet Bulb Temp [deg C],Tower Dew Point Temp [deg C],Tower RH [%],Total Cloud Cover [%],Peak Wind Speed @ 6ft [m/s],Avg Wind Direction @ 6ft [deg from N],Station Pressure [mBar],Precipitation (Accumulated) [mm],Snow Depth [cm],Moisture,Albedo (CMP11)
0,1/1,00:00,-0.962276,0.000000,356.85640,7.216,0.988,-7.312,32.33,-1,9.95,271.3,806.779,0.0,0.219,0.0,0.0
1,1/1,00:01,-0.937921,0.000000,357.65505,7.251,1.040,-7.260,32.40,-1,8.20,272.9,806.840,0.0,0.206,0.0,0.0
2,1/1,00:02,-0.944395,0.000000,358.45438,7.256,1.093,-7.207,32.54,-1,6.70,288.8,806.876,0.0,0.148,0.0,0.0
3,1/1,00:03,-0.951350,-0.029673,359.25416,7.254,1.060,-7.440,31.89,-1,7.70,294.0,806.823,0.0,0.235,0.0,0.0
4,1/1,00:04,-0.934976,-0.054401,0.05415,7.331,1.081,-7.419,31.78,-1,7.20,285.5,806.762,0.0,0.182,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527035,12/31,23:55,-1.360910,-0.340704,352.62902,-0.469,-3.940,-10.140,43.61,-1,0.00,0.0,816.186,0.0,2.899,0.0,0.0
527036,12/31,23:56,-1.342520,-0.325891,353.41779,-0.499,-3.927,-10.127,43.77,-1,0.00,0.0,816.185,0.0,2.866,0.0,0.0
527037,12/31,23:57,-1.341260,-0.320952,354.20842,-0.522,-3.958,-10.158,43.73,-1,0.00,0.0,816.198,0.0,2.882,0.0,0.0
527038,12/31,23:58,-1.334130,-0.320953,355.00071,-0.558,-3.979,-10.079,44.17,-1,0.00,0.0,816.194,0.0,2.805,0.0,0.0


In [4]:
# preprocessing
def preprocess(df,mode = 'train'):
  if mode=='train':
    # fill missing data
    df['Total Cloud Cover [%]'].replace(-7999,np.nan,inplace = True)
    df['Total Cloud Cover [%]'].replace(-6999,np.nan,inplace = True)
    df['Total Cloud Cover [%]'].interpolate(limit = 10,limit_direction = 'both',inplace = True)  

    #  create targets
    df['t_30'] = df.groupby('DATE (MM/DD)')['Total Cloud Cover [%]'].shift(periods = -30,fill_value = -1)
    df['t_60'] = df.groupby('DATE (MM/DD)')['Total Cloud Cover [%]'].shift(periods = -60,fill_value = -1)
    df['t_90'] = df.groupby('DATE (MM/DD)')['Total Cloud Cover [%]'].shift(periods = -90,fill_value = -1)
    df['t_120'] = df.groupby('DATE (MM/DD)')['Total Cloud Cover [%]'].shift(periods = -120,fill_value = -1)

    cond = (df['Total Cloud Cover [%]'] == -1)
    req_samples = df[cond].sample(frac = 0.10,random_state = random_seed)
    not_req_samples = df[cond].drop(req_samples.index)
    df.drop(not_req_samples.index,inplace=True)
    print(df.shape)

    # drop unwanted features
    df.drop([
            'DATE (MM/DD)',
            'MST',
            'Direct sNIP [W/m^2]',                # this feature is highly correlated with cmp22
            'Tower Wet Bulb Temp [deg C]',        # highly correlated with other temperature readings
            'Tower Dew Point Temp [deg C]',
            'Snow Depth [cm]',
            'Moisture',
            'Albedo (CMP11)',
            'Precipitation (Accumulated) [mm]',
            'Azimuth Angle [degrees]'
    ],axis =1,inplace = True)

    return df
  if mode == 'test':
    df['Total Cloud Cover [%]'].replace(-7999,np.nan,inplace = True)
    df['Total Cloud Cover [%]'].replace(-6999,np.nan,inplace = True)
    df['Total Cloud Cover [%]'].interpolate(limit = 10,limit_direction = 'both',inplace = True)  

    df.drop(columns={
      'Time [Mins]',
      'Direct sNIP [W/m^2]',                # this feature is highly correlated with cmp22
      'Tower Wet Bulb Temp [deg C]',        # highly correlated with other temperature readings
      'Tower Dew Point Temp [deg C]',
      'Snow Depth [cm]',
      'Moisture',
      'Albedo (CMP11)',
      'Precipitation (Accumulated) [mm]',
      'Azimuth Angle [degrees]' 
    },inplace = True)
    return df

In [6]:
# path = './train/train.csv'
# data = pd.read_csv(path)
# unused_df = data.drop([idx for idx in df.index],axis = 0)
# unused_df.to_csv('unused_df.csv',index=False)

In [7]:
df = preprocess(df,mode='train')

(255008, 21)


In [8]:
# def split(x,y,train_size=0.80):
#   return train_test_split(x,y,train_size=train_size,random_state=random_seed)

# X_train_30,X_val_30,Y_train_30,Y_val_30 = split(df.iloc[:,:-4].values,df['t_30'].values)
# X_train_60,X_test_60,Y_train_60,Y_test_60 = split(df.iloc[:,:-3].values,df['t_60'].values)
# X_train_90,X_test_90,Y_train_90,Y_test_90 = split(df.iloc[:,:-2].values,df['t_90'].values)
# X_train_120,X_test_120,Y_train_120,Y_test_120 = split(df.iloc[:,:-1].values,df['t_120'].values)

In [9]:
X,y = df.iloc[:,:-4].values,df['t_30'].values
X.shape,y.shape

((255008, 7), (255008,))

In [10]:
# model_30 = LinearSVR(verbose=1)
# model_30.fit(X_train_30,Y_train_30)
# preds_30  = model_30.predict(X_test_30)
# score_30 = mae(Y_test_30,preds_30)
# print(f"model_30 mae score: {score_30}")
# dump(model_30,'new_LinearSVRmodel_30.joblib')


# model_60 = LinearSVR(verbose=1)
# model_60.fit(X_train_60,Y_train_60)
# preds_60  = model_60.predict(X_test_60)
# score_60 = mae(Y_test_60,preds_60)
# print(f"model_60 mae score: {score_60}")
# dump(model_60,'new_LinearSVRmodel_60.joblib')


# model_90 = LinearSVR(verbose=1)
# model_90.fit(X_train_90,Y_train_90)
# preds_90  = model_90.predict(X_test_90)
# score_90 = mae(Y_test_90,preds_90)
# print(f"model_90 mae score: {score_90}")
# dump(model_90,'new_LinearSVRmodel_90.joblib')


# model_120 = LinearSVR(verbose=1)
# model_120.fit(X_train_120,Y_train_120)
# preds_120  = model_120.predict(X_test_120)
# score_120 = mae(Y_test_120,preds_120)
# print(f"model_120 mae score: {score_120}")
# dump(model_120,'new_LineraSVRmodel_120.joblib')

In [11]:
kf = KFold(n_splits = 5,random_state=random_seed,shuffle=True)
params = {
    'sgd__epsilon': [5,1,0,0.1,0.01,0.001],
    'sgd__loss' : ['epsilon_insensitive','squared_epsilon_insensitive'],
    'lsvr__C' : [0.01,0.1,0.0,1,5,10],
    'lsvr__dual' : [False,True],
    'lsvr__fit_intercept' : [False,True],
    'lsvr__random_state': [random_seed]
}

model = LinearSVR()
pipeline = Pipeline([
    ('scaler',StandardScaler()),
    ('transform',PowerTransformer()),
    ('lsvr',model)
])
# have to give list of tuples

In [12]:
search = RandomizedSearchCV(pipeline,params,verbose=1,cv=kf,scoring='neg_mean_absolute_error')

In [13]:
search

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('transform', PowerTransformer()),
                                             ('lsvr', LinearSVR())]),
                   param_distributions={'lsvr__C': [0.01, 0.1, 0.0, 1, 5, 10],
                                        'lsvr__dual': [False, True],
                                        'lsvr__epsilon': [5, 1, 0, 0.1, 0.01,
                                                          0.001],
                                        'lsvr__fit_intercept': [False, True],
                                        'lsvr__loss': ['epsilon_insensitive',
                                                       'squared_epsilon_insensitive'],
                                        'lsvr__random_state': [42]},
                   scoring='neg_mean_absolute_error', verbose=1)

In [None]:
search.fit(X,y)

In [15]:
search

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('transform', PowerTransformer()),
                                             ('lsvr', LinearSVR())]),
                   param_distributions={'lsvr__C': [0.01, 0.1, 0.0, 1, 5, 10],
                                        'lsvr__dual': [False, True],
                                        'lsvr__epsilon': [5, 1, 0, 0.1, 0.01,
                                                          0.001],
                                        'lsvr__fit_intercept': [False, True],
                                        'lsvr__loss': ['epsilon_insensitive',
                                                       'squared_epsilon_insensitive'],
                                        'lsvr__random_state': [42]},
                   scoring='neg_mean_absolute_error', verbose=1)

In [16]:
search_results = pd.DataFrame(search.cv_results_)
search_results.to_csv('LinearSVR_random_search.csv')

In [17]:
search.best_params_

{'lsvr__random_state': 42,
 'lsvr__loss': 'squared_epsilon_insensitive',
 'lsvr__fit_intercept': True,
 'lsvr__epsilon': 0,
 'lsvr__dual': False,
 'lsvr__C': 1}

In [18]:
search.best_score_

-12.073207178007063

In [None]:
grid_search = GridSearchCV(pipeline,params,verbose=1,cv=kf,scoring='neg_mean_absolute_error')
grid_search.fit(X,y)

In [None]:
# GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
#              estimator=Pipeline(steps=[('scaler', StandardScaler()),
#                                        ('transform', PowerTransformer()),
#                                        ('lsvr', LinearSVR())]),
#              param_grid={'lsvr__C': [0.01, 0.1, 0.0, 1, 5, 10],
#                          'lsvr__dual': [False, True],
#                          'lsvr__epsilon': [5, 1, 0, 0.1, 0.01, 0.001],
#                          'lsvr__fit_intercept': [False, True],
#                          'lsvr__loss': ['epsilon_insensitive',
#                                         'squared_epsilon_insensitive'],
#                          'lsvr__random_state': [42]},
#              scoring='neg_mean_absolute_error', verbose=1)

In [20]:
gs_results = pd.DataFrame(grid_search.cv_results_)
gs_results.to_csv('LinearSVR_grid_search.csv')

In [21]:
grid_search.scoring

'neg_mean_absolute_error'

In [22]:
grid_search.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()), ('transform', PowerTransformer()),
                ('lsvr', LinearSVR(C=1, epsilon=0.1, random_state=42))])

In [23]:
grid_search.best_params_

{'lsvr__C': 1,
 'lsvr__dual': True,
 'lsvr__epsilon': 0.1,
 'lsvr__fit_intercept': True,
 'lsvr__loss': 'epsilon_insensitive',
 'lsvr__random_state': 42}

In [None]:
grid_search.best_score_

In [5]:
import pandas as pd
df = pd.read_csv('LinearSVR_grid_search.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lsvr__C,param_lsvr__dual,param_lsvr__epsilon,param_lsvr__fit_intercept,param_lsvr__loss,param_lsvr__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0,1.996103,0.153652,0.0,0.0,0.01,False,5.0,False,epsilon_insensitive,42,"{'lsvr__C': 0.01, 'lsvr__dual': False, 'lsvr__...",,,,,,,,201
1,1,1.901642,0.106663,0.021856,0.007634,0.01,False,5.0,False,squared_epsilon_insensitive,42,"{'lsvr__C': 0.01, 'lsvr__dual': False, 'lsvr__...",-48.102049,-47.938597,-47.954553,-48.007025,-47.860815,-47.972608,0.0799,173
2,2,1.95951,0.249866,0.0,0.0,0.01,False,5.0,True,epsilon_insensitive,42,"{'lsvr__C': 0.01, 'lsvr__dual': False, 'lsvr__...",,,,,,,,225
3,3,2.117734,0.122968,0.029672,0.010341,0.01,False,5.0,True,squared_epsilon_insensitive,42,"{'lsvr__C': 0.01, 'lsvr__dual': False, 'lsvr__...",-12.87008,-12.823488,-12.814745,-12.741429,-12.797944,-12.809537,0.041618,89
4,4,2.025976,0.094849,0.0,0.0,0.01,False,1.0,False,epsilon_insensitive,42,"{'lsvr__C': 0.01, 'lsvr__dual': False, 'lsvr__...",,,,,,,,226


In [6]:
df[df.rank_test_score < 10]

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lsvr__C,param_lsvr__dual,param_lsvr__epsilon,param_lsvr__fit_intercept,param_lsvr__loss,param_lsvr__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
82,82,2.130192,0.245311,0.021821,0.003098,0.1,True,0.0,True,epsilon_insensitive,42,"{'lsvr__C': 0.1, 'lsvr__dual': True, 'lsvr__ep...",-11.156603,-11.150818,-11.055314,-11.003444,-11.081771,-11.08959,0.058132,8
86,86,2.092999,0.224931,0.028697,0.002404,0.1,True,0.1,True,epsilon_insensitive,42,"{'lsvr__C': 0.1, 'lsvr__dual': True, 'lsvr__ep...",-11.156634,-11.150613,-11.054917,-11.003441,-11.081252,-11.089371,0.058158,2
90,90,2.331249,0.213456,0.019507,0.007989,0.1,True,0.01,True,epsilon_insensitive,42,"{'lsvr__C': 0.1, 'lsvr__dual': True, 'lsvr__ep...",-11.156677,-11.15113,-11.055341,-11.003422,-11.081131,-11.08954,0.058236,7
94,94,2.30914,0.215666,0.027349,0.00533,0.1,True,0.001,True,epsilon_insensitive,42,"{'lsvr__C': 0.1, 'lsvr__dual': True, 'lsvr__ep...",-11.15682,-11.151678,-11.054968,-11.003687,-11.081381,-11.089707,0.058344,9
182,182,2.515053,0.198841,0.021848,0.007652,1.0,True,0.1,True,epsilon_insensitive,42,"{'lsvr__C': 1, 'lsvr__dual': True, 'lsvr__epsi...",-11.156268,-11.150852,-11.054909,-11.003477,-11.080993,-11.0893,0.058122,1
186,186,2.380799,0.091354,0.015623,1.6e-05,1.0,True,0.01,True,epsilon_insensitive,42,"{'lsvr__C': 1, 'lsvr__dual': True, 'lsvr__epsi...",-11.156765,-11.150401,-11.055306,-11.003307,-11.081112,-11.089378,0.058142,3
230,230,4.328947,0.130003,0.031222,1.7e-05,5.0,True,0.1,True,epsilon_insensitive,42,"{'lsvr__C': 5, 'lsvr__dual': True, 'lsvr__epsi...",-11.156589,-11.150649,-11.054845,-11.00354,-11.08152,-11.089429,0.058127,4
274,274,5.497769,0.156345,0.0194,0.006058,10.0,True,0.0,True,epsilon_insensitive,42,"{'lsvr__C': 10, 'lsvr__dual': True, 'lsvr__eps...",-11.15681,-11.150943,-11.054882,-11.00414,-11.080779,-11.089511,0.05808,6
278,278,5.500415,0.115298,0.023065,0.007046,10.0,True,0.1,True,epsilon_insensitive,42,"{'lsvr__C': 10, 'lsvr__dual': True, 'lsvr__eps...",-11.156272,-11.150672,-11.055212,-11.00364,-11.081603,-11.08948,0.057984,5
