In [15]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler,PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold,RandomizedSearchCV,GridSearchCV,train_test_split
from sklearn.metrics import mean_absolute_error as mae
from joblib import dump,load
import os
import sys

import warnings
# NOTE(review): this only escalates warnings whose message starts with
# "warnings ignored" into exceptions; every other warning is left untouched.
# If the intent was to silence (or to escalate) all warnings, the arguments
# need changing -- confirm with the author.
warnings.filterwarnings(action='error', message='warnings ignored')

# Single seed reused for row sampling, the CV splitter and the SGD search space.
random_seed = 42

In [10]:
# Read the raw training table; the path is relative to the notebook's
# working directory.
path = 'train.csv'
df = pd.read_csv(filepath_or_buffer=path)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,DATE (MM/DD),MST,Global CMP22 (vent/cor) [W/m^2],Direct sNIP [W/m^2],Azimuth Angle [degrees],Tower Dry Bulb Temp [deg C],Tower Wet Bulb Temp [deg C],Tower Dew Point Temp [deg C],Tower RH [%],Total Cloud Cover [%],Peak Wind Speed @ 6ft [m/s],Avg Wind Direction @ 6ft [deg from N],Station Pressure [mBar],Precipitation (Accumulated) [mm],Snow Depth [cm],Moisture,Albedo (CMP11)
0,1/1,00:00,-0.962276,0.0,356.8564,7.216,0.988,-7.312,32.33,-1,9.95,271.3,806.779,0.0,0.219,0.0,0.0
1,1/1,00:01,-0.937921,0.0,357.65505,7.251,1.04,-7.26,32.4,-1,8.2,272.9,806.84,0.0,0.206,0.0,0.0
2,1/1,00:02,-0.944395,0.0,358.45438,7.256,1.093,-7.207,32.54,-1,6.7,288.8,806.876,0.0,0.148,0.0,0.0
3,1/1,00:03,-0.95135,-0.029673,359.25416,7.254,1.06,-7.44,31.89,-1,7.7,294.0,806.823,0.0,0.235,0.0,0.0
4,1/1,00:04,-0.934976,-0.054401,0.05415,7.331,1.081,-7.419,31.78,-1,7.2,285.5,806.762,0.0,0.182,0.0,0.0


In [11]:
# preprocessing
def _fill_cloud_cover_gaps(df):
    """Turn the -7999 / -6999 codes in the cloud-cover column into NaN and
    linearly interpolate runs of up to 10 missing values from either side.

    The cleaned column is assigned back onto ``df``.  The previous chained
    ``df[col].replace(..., inplace=True)`` form operated on a temporary Series,
    which is a silent no-op under pandas Copy-on-Write.
    Gaps longer than the interpolation limit keep NaN in their interior.
    """
    cloud_col = 'Total Cloud Cover [%]'
    df[cloud_col] = (
        df[cloud_col]
        .replace([-7999, -6999], np.nan)
        .interpolate(limit=10, limit_direction='both')
    )


def preprocess(df, mode='train'):
    """Clean the cloud-cover column and drop unused columns, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame.  It is modified in place and the same object is returned.
    mode : {'train', 'test'}
        'train' additionally creates the targets ``t_30``, ``t_60``, ``t_90``
        and ``t_120`` (cloud cover 30/60/90/120 rows ahead within the same
        'DATE (MM/DD)' group, -1 where no such row exists), keeps only a 10%
        sample of the rows whose cloud cover equals -1, prints the frame
        shape and drops the date/time and redundant sensor columns.
        'test' only cleans the cloud-cover column and drops 'Time [Mins]'
        plus the same redundant sensor columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after modification.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'train' nor 'test' (previously this silently
        returned ``None``).

    Notes
    -----
    Train mode reads the module-level ``random_seed`` for the row sampling.
    """
    cloud_col = 'Total Cloud Cover [%]'
    # Sensor columns removed in both modes.
    redundant_features = [
        'Direct sNIP [W/m^2]',                # this feature is highly correlated with cmp22
        'Tower Wet Bulb Temp [deg C]',        # highly correlated with other temperature readings
        'Tower Dew Point Temp [deg C]',
        'Snow Depth [cm]',
        'Moisture',
        'Albedo (CMP11)',
        'Precipitation (Accumulated) [mm]',
        'Azimuth Angle [degrees]',
    ]

    if mode == 'train':
        # fill missing data
        _fill_cloud_cover_gaps(df)

        # create targets: future cloud cover, never crossing a day boundary
        for horizon in (30, 60, 90, 120):
            df[f't_{horizon}'] = (
                df.groupby('DATE (MM/DD)')[cloud_col]
                .shift(periods=-horizon, fill_value=-1)
            )

        # Down-sample rows whose current cloud cover is -1: keep a 10% sample.
        # NOTE(review): -1 presumably marks "no reading" -- confirm.
        flagged = df[df[cloud_col] == -1]
        kept = flagged.sample(frac=0.10, random_state=random_seed)
        df.drop(flagged.drop(kept.index).index, inplace=True)
        print(df.shape)

        # drop unwanted features
        df.drop(columns=['DATE (MM/DD)', 'MST'] + redundant_features, inplace=True)
        return df

    if mode == 'test':
        _fill_cloud_cover_gaps(df)
        df.drop(columns=['Time [Mins]'] + redundant_features, inplace=True)
        return df

    raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")

In [12]:
# preprocess() modifies the frame in place and drops 'DATE (MM/DD)', so this
# cell cannot be re-run on an already-processed df -- reload the CSV first.
df = preprocess(df=df, mode='train')

(279738, 21)


In [13]:
# Features: every column except the last four, which are the t_30 .. t_120
# targets appended by preprocess(). Only the 30-step-ahead target is used here.
X = df.iloc[:, :-4].values
y = df['t_30'].values
X.shape, y.shape

((279738, 7), (279738,))

In [16]:
# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)

# Hyper-parameter search space; the 'sgd__' prefix targets the pipeline step
# named 'sgd'.
# Fixed two misspelled option values that SGDRegressor rejects, which made
# every candidate using them fail to fit:
#   'epsilon_insenstive' -> 'epsilon_insensitive'
#   'adapting'           -> 'adaptive'
params = {
    'sgd__loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'sgd__penalty': ['l1', 'l2', 'elasticnet'],
    'sgd__alpha': [0.01, 0.001, 0.0001],
    'sgd__max_iter': [10_000],
    'sgd__epsilon': [1, 0.1, 0.001],
    'sgd__random_state': [random_seed],
    'sgd__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'sgd__n_iter_no_change': [10],
    'sgd__early_stopping': [True],
}

model = SGDRegressor()

# Pipeline takes a list of (step_name, estimator) tuples, applied in order.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('transform', PowerTransformer()),
    ('sgd', model),
])

In [None]:
# Randomized search over `params`: 150 sampled candidates, scored by negative
# MAE on the `kf` folds, using all available cores.
# random_state is now passed so the sampled candidates (and therefore the
# reported best parameters) are reproducible across runs.
search = RandomizedSearchCV(
    pipeline,
    params,
    n_iter=150,
    scoring='neg_mean_absolute_error',
    cv=kf,
    n_jobs=-1,
    verbose=1,
    random_state=random_seed,
)
search.fit(X, y)

In [None]:
# Show the parameter combination with the best mean cross-validated score.
print(search.best_params_)

In [None]:
# Save the full per-candidate cross-validation table for later inspection.
search_results = pd.DataFrame(data=search.cv_results_)
search_results.to_csv(path_or_buf='SGD_finetuned_results.csv')

In [None]:
# Disabled alternative: exhaustive grid search over the same `params`
# (324 combinations x 5 folds) instead of the randomized search.
# grid_search = GridSearchCV(pipeline,params,verbose=1,cv=kf,scoring='neg_mean_absolute_error')
# grid_search.fit(X,y)