In [40]:
from sklearn.ensemble import VotingRegressor 
from lightgbm import LGBMRegressor
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import StandardScaler,PowerTransformer
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.pipeline import Pipeline
from joblib import dump,load
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Shared seed: used for the row subsampling in preprocess(), for train_test_split
# in split(), and as the LightGBM random_state (the LinearSVR params set their own).
random_seed = 1000

In [7]:
# Forward slashes are accepted on Windows and POSIX alike; the previous
# backslash-separated literal only resolved on Windows.
path = 'data/train.csv'
# load data
df = pd.read_csv(path)

In [8]:
# preprocessing
def preprocess(df, mode='train', frac=0.1, seed=None):
  """Clean a raw weather frame and shape it for training or inference.

  The input frame is never modified; a new object is returned, so callers
  must use the return value (``df = preprocess(df, ...)``).

  Parameters
  ----------
  df : pandas.DataFrame
    Raw readings. Must contain 'Total Cloud Cover [%]' plus every column
    that the selected mode drops (a missing column raises ``KeyError``).
  mode : {'train', 'test'}
    'train' appends the targets t_30/t_60/t_90/t_120 as the LAST four
    columns and returns a DataFrame; 'test' returns only the last row as a
    Series.
  frac : float, default 0.1
    Train mode only: fraction of the rows whose cloud cover equals -1 that
    is kept; the remaining -1 rows are discarded.
  seed : int or None, default None
    Train mode only: random_state for that subsampling. ``None`` falls back
    to the module-level ``random_seed``.

  Returns
  -------
  pandas.DataFrame (train) or pandas.Series (test)

  Raises
  ------
  ValueError
    If ``mode`` is neither 'train' nor 'test'.
  IndexError
    In test mode when ``df`` has no rows.
  """
  if mode not in ('train', 'test'):
    # The old code fell through and returned None for an unknown mode.
    raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")

  cloud_col = 'Total Cloud Cover [%]'
  # Columns removed in both modes.
  shared_drop = [
      'Direct sNIP [W/m^2]',                # this feature is highly correlated with cmp22
      'Tower Wet Bulb Temp [deg C]',        # highly correlated with other temperature readings
      'Tower Dew Point Temp [deg C]',
      'Snow Depth [cm]',
      'Moisture',
      'Albedo (CMP11)',
      'Precipitation (Accumulated) [mm]',
      'Azimuth Angle [degrees]',
  ]

  # fill missing data: -7999 / -6999 are treated as missing and interpolated
  # (at most 10 consecutive gaps, in both directions). The result is assigned
  # back explicitly: calling replace/interpolate with inplace=True on
  # df[col] is chained assignment, which warns on pandas 2.x and has no
  # effect once Copy-on-Write is enabled.
  cloud = (
      df[cloud_col]
      .replace([-7999, -6999], np.nan)
      .interpolate(limit=10, limit_direction='both')
  )
  out = df.assign(**{cloud_col: cloud})   # new frame; caller's df is untouched

  if mode == 'test':
    if out.empty:
      raise IndexError("preprocess(mode='test') received an empty frame")
    # NOTE(review): the test schema drops 'Time [Mins]'/'scenario_set' while the
    # train schema drops 'DATE (MM/DD)'/'MST' - confirm both leave the same features.
    trimmed = out.drop(columns=['Time [Mins]'] + shared_drop + ['scenario_set'])
    return trimmed.iloc[-1]

  # create targets: cloud cover 30/60/90/120 rows ahead within the same date;
  # positions that run past the end of the date's rows get -1.
  by_date = out.groupby('DATE (MM/DD)')[cloud_col]
  targets = {
      f't_{ahead}': by_date.shift(periods=-ahead, fill_value=-1)
      for ahead in (30, 60, 90, 120)
  }
  out = out.assign(**targets)             # appended in order, so they stay last

  # Down-sample rows whose current cloud cover is -1: keep a `frac` share.
  # NOTE(review): -1 is presumably a "no reading" marker in the raw data - confirm.
  # Rows whose *targets* are the -1 fill value are not filtered here.
  if seed is None:
    seed = random_seed
  minus_one_rows = out[out[cloud_col] == -1]
  kept = minus_one_rows.sample(frac=frac, random_state=seed)
  out = out.drop(index=minus_one_rows.index.difference(kept.index))

  # drop unwanted features
  return out.drop(columns=['DATE (MM/DD)', 'MST'] + shared_drop)

# Rebind df to the training-ready frame: cloud cover cleaned, targets t_30..t_120
# appended as the last four columns, unused columns dropped.
df = preprocess(df,mode='train')


In [9]:

def split(x,y,train_size=0.70):
  """Randomly partition features `x` and targets `y` into train and hold-out parts.

  `train_size` is forwarded to train_test_split; the shuffle is seeded with the
  module-level `random_seed`. Returns train_test_split's result unchanged
  (x_train, x_test, y_train, y_test).
  """
  partitions = train_test_split(
      x,
      y,
      train_size=train_size,
      random_state=random_seed,
  )
  return partitions

# The targets are the last four columns of df (t_30, t_60, t_90, t_120). Each
# horizon uses every column to the left of its own target as features, so the
# later horizons also see the earlier targets (t_30 is a feature for t_60, ...).
X_train_30, X_test_30, Y_train_30, Y_test_30 = split(
    x=df.iloc[:, :-4].to_numpy(),
    y=df['t_30'].to_numpy(),
)
X_train_60, X_test_60, Y_train_60, Y_test_60 = split(
    x=df.iloc[:, :-3].to_numpy(),
    y=df['t_60'].to_numpy(),
)
X_train_90, X_test_90, Y_train_90, Y_test_90 = split(
    x=df.iloc[:, :-2].to_numpy(),
    y=df['t_90'].to_numpy(),
)
X_train_120, X_test_120, Y_train_120, Y_test_120 = split(
    x=df.iloc[:, :-1].to_numpy(),
    y=df['t_120'].to_numpy(),
)


In [15]:
# Hyper-parameters shared by the four LinearSVR estimators.
params = dict(
    C=1,
    dual=True,
    epsilon=0.1,
    fit_intercept=True,
    loss='epsilon_insensitive',
    random_state=42,
)


def _make_lsvr_pipeline():
  """Build one unfitted StandardScaler -> PowerTransformer -> LinearSVR pipeline.

  The final step keeps its existing name 'sgd' (it holds a LinearSVR) so that
  step lookups and any previously saved pipelines continue to match.
  """
  steps = [
      ('scaler', StandardScaler()),
      ('transformer', PowerTransformer()),
      ('sgd', LinearSVR(**params)),
  ]
  return Pipeline(steps)


# One independent pipeline per forecast horizon.
pipeline_30, pipeline_60, pipeline_90, pipeline_120 = (
    _make_lsvr_pipeline() for _ in range(4)
)

In [21]:
# pipeline_30 = load(r'LinearSVR\new_LinearSVRmodel_30.joblib')
# pipeline_60 = load(r'LinearSVR\new_LinearSVRmodel_60.joblib')
# pipeline_90 = load(r'LinearSVR\new_LinearSVRmodel_90.joblib')
# pipeline_120 = load(r'LinearSVR\new_LinearSVRmodel_120.joblib')

In [16]:
# Settings shared by the four LightGBM regressors (seeded with random_seed).
lgb_params = dict(
    n_estimators=500,
    max_depth=500,
    random_state=random_seed,
    num_leaves=60,
)

# One independent, unfitted regressor per forecast horizon.
lgb_30, lgb_60, lgb_90, lgb_120 = (
    LGBMRegressor(**lgb_params) for _ in range(4)
)

In [22]:
# Per-horizon ensemble: a VotingRegressor over the LinearSVR pipeline and the
# LightGBM regressor built for that horizon.
model_30 = VotingRegressor(estimators=[
    ('lsvr_30', pipeline_30),
    ('lgb_30', lgb_30),
])
model_60 = VotingRegressor(estimators=[
    ('lsvr_60', pipeline_60),
    ('lgb_60', lgb_60),
])
model_90 = VotingRegressor(estimators=[
    ('lsvr_90', pipeline_90),
    ('lgb_90', lgb_90),
])
model_120 = VotingRegressor(estimators=[
    ('lsvr_120', pipeline_120),
    ('lgb_120', lgb_120),
])

In [35]:
def train_and_validate(model,x_tr,x_ts,y_tr,y_ts):
  """Fit `model` on the training split and report its hold-out performance.

  Prints the value of `model.score(x_ts, y_ts)` followed by the mean absolute
  error of the hold-out predictions, then returns those predictions. The model
  is fitted in place.
  """
  model.fit(x_tr, y_tr)
  holdout_preds = model.predict(x_ts)
  holdout_score = model.score(x_ts, y_ts)
  holdout_mae = mae(y_ts, holdout_preds)
  print(holdout_score)
  print(f"mae score: {holdout_mae}")
  return holdout_preds

In [36]:
# Fit each horizon's ensemble on its training split; each call prints the
# hold-out score and MAE and returns the hold-out predictions.
pred_30 = train_and_validate(
    model_30, x_tr=X_train_30, x_ts=X_test_30, y_tr=Y_train_30, y_ts=Y_test_30)
pred_60 = train_and_validate(
    model_60, x_tr=X_train_60, x_ts=X_test_60, y_tr=Y_train_60, y_ts=Y_test_60)
pred_90 = train_and_validate(
    model_90, x_tr=X_train_90, x_ts=X_test_90, y_tr=Y_train_90, y_ts=Y_test_90)
pred_120 = train_and_validate(
    model_120, x_tr=X_train_120, x_ts=X_test_120, y_tr=Y_train_120, y_ts=Y_test_120)

0.873476319995545
mae score: 8.040225855719612
0.8831877203270491
mae score: 7.600077843828635
0.8887363839923138
mae score: 7.400704962243625
0.8885908540177627
mae score: 7.350021842519031


In [38]:

# Cross-validated MAE of every horizon's ensemble on the full training file.
# Forward slashes keep the path usable on both Windows and POSIX.
path = 'data/train.csv'
# load data
# The file is read into a temporary instead of rebinding the global `df`, so the
# frame that the earlier split cell depends on is not silently replaced.
# NOTE: despite its name, test_df holds the preprocessed TRAINING data.
test_df = preprocess(pd.read_csv(path),mode='train')

# cross_val_score fits clones, so model_30..model_120 themselves are left as they were.
X_30,Y_30 = test_df.iloc[:,:-4].values,test_df['t_30'].values
c_30 = cross_val_score(model_30,X_30,Y_30,scoring='neg_mean_absolute_error',verbose=1,n_jobs=-1)
print(f"c_30:{c_30} ,mean---------> {c_30.mean()}")

X_60,Y_60 = test_df.iloc[:,:-3].values,test_df['t_60'].values
c_60 = cross_val_score(model_60,X_60,Y_60,scoring='neg_mean_absolute_error',verbose=1,n_jobs=-1)
print(f"c_60:{c_60} ,mean---------> {c_60.mean()}")

X_90,Y_90 = test_df.iloc[:,:-2].values,test_df['t_90'].values
c_90 = cross_val_score(model_90,X_90,Y_90,scoring='neg_mean_absolute_error',verbose=1,n_jobs=-1)
print(f"c_90:{c_90} ,mean---------> {c_90.mean()}")

X_120,Y_120 = test_df.iloc[:,:-1].values,test_df['t_120'].values
c_120 = cross_val_score(model_120,X_120,Y_120,scoring='neg_mean_absolute_error',verbose=1,n_jobs=-1)
print(f"c_120:{c_120} ,mean---------> {c_120.mean()}")


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   30.4s remaining:   45.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.1s finished


c_30:[-10.57141988 -11.0783422  -11.79991664 -10.38384739  -8.54953601] ,mean---------> -10.476612425443701


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   27.5s remaining:   41.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   29.4s finished


c_60:[ -9.69485112 -10.4175476  -11.32968203  -9.89830408  -7.66627878] ,mean---------> -9.801332723181336


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   31.6s remaining:   47.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   32.3s finished


c_90:[ -9.22395403 -10.39519477 -11.39379841  -9.96785403  -7.41326992] ,mean---------> -9.678814231570067


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   32.0s remaining:   48.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   32.5s finished


c_120:[ -8.84545155 -10.15009291 -11.3586893   -9.9183558   -7.2088214 ] ,mean---------> -9.496282190693952


In [None]:

# wd_test = pd.read_csv(r'data\shell_test.csv')
# test = pd.read_csv(r'data\test.csv')

# file_count = 0
# for set in range(1,301):
#   wd = wd_test[wd_test['scenario_set'] == set]
#   # preprocessing test data
#   last_sample = preprocess(wd,mode='test')

#   # predicting test samples
#   pred_30 = model_30.predict(last_sample.values.reshape(1,-1))
#   last_sample['t_30'] = pred_30.item()
#   pred_60 = model_60.predict(last_sample.values.reshape(1,-1))
#   last_sample['t_60'] = pred_60.item()
#   pred_90 = model_90.predict(last_sample.values.reshape(1,-1))
#   last_sample['t_90'] = pred_90.item()
#   pred_120 = model_120.predict(last_sample.values.reshape(1,-1))
  
#   # fill in test data using above predictions
#   test.iloc[set-1,test.columns.get_indexer(['30_min_horizon'])] = np.round(pred_30.item())
#   test.iloc[set-1,test.columns.get_indexer(['60_min_horizon'])] = np.round(pred_60.item())
#   test.iloc[set-1,test.columns.get_indexer(['90_min_horizon'])] = np.round(pred_90.item())
#   test.iloc[set-1,test.columns.get_indexer(['120_min_horizon'])] = np.round(pred_120.item())

#   file_count += 1
#   if file_count%30 == 0 :
#       print(file_count)


In [None]:

# Predict the four horizons for every test scenario with the ALREADY FITTED
# ensembles (model_30 .. model_120 must have been trained by the cells above).
wd_test = pd.read_csv('data/shell_test.csv')   # forward slashes: portable path
test = pd.read_csv('data/test.csv')

file_count = 0
# `scenario` instead of `set`, which shadowed the builtin.
for scenario in range(1,301):
  # .copy() so the per-scenario rows are an independent frame, not a view/slice.
  wd = wd_test[wd_test['scenario_set'] == scenario].copy()
  # preprocessing test data -> last row of the scenario as a Series
  last_sample = preprocess(wd,mode='test').copy()

  # predicting test samples
  # Use .predict on the fitted models. cross_val_predict is a training-time
  # utility: it needs y and refits the estimator per fold, so calling it on a
  # single unlabeled row cannot work.
  # Each prediction is appended to the sample so the next horizon's model
  # receives it as a feature, matching the column layout used in training.
  pred_30 = model_30.predict(last_sample.values.reshape(1,-1))
  last_sample['t_30'] = pred_30.item()
  pred_60 = model_60.predict(last_sample.values.reshape(1,-1))
  last_sample['t_60'] = pred_60.item()
  pred_90 = model_90.predict(last_sample.values.reshape(1,-1))
  last_sample['t_90'] = pred_90.item()
  pred_120 = model_120.predict(last_sample.values.reshape(1,-1))

  # fill in test data using above predictions
  # NOTE(review): assumes row i of test.csv belongs to scenario i+1 - confirm.
  test.iloc[scenario-1,test.columns.get_indexer(['30_min_horizon'])] = np.round(pred_30.item())
  test.iloc[scenario-1,test.columns.get_indexer(['60_min_horizon'])] = np.round(pred_60.item())
  test.iloc[scenario-1,test.columns.get_indexer(['90_min_horizon'])] = np.round(pred_90.item())
  test.iloc[scenario-1,test.columns.get_indexer(['120_min_horizon'])] = np.round(pred_120.item())

  file_count += 1
  if file_count%30 == 0 :
      print(file_count)

In [33]:
# test = test.applymap(int)
# Persist the filled-in prediction table (without the index column), then show
# its summary statistics followed by the table itself.
output_csv = 'voting_regressor_preds.csv'
test.to_csv(output_csv, index=False)

summary = test.describe()
print(summary)
print(test)

       scenario_set  30_min_horizon  60_min_horizon  90_min_horizon  \
count    300.000000      300.000000      300.000000      300.000000   
mean     150.500000       57.243333       57.903333       58.186667   
std       86.746758       30.752178       27.706549       24.684060   
min        1.000000        6.000000        5.000000        6.000000   
25%       75.750000       24.000000       29.750000       36.750000   
50%      150.500000       62.500000       66.000000       66.500000   
75%      225.250000       86.000000       82.000000       78.000000   
max      300.000000       99.000000       98.000000       97.000000   

       120_min_horizon  
count       300.000000  
mean         58.336667  
std          21.939793  
min           6.000000  
25%          42.750000  
50%          64.500000  
75%          75.000000  
max          98.000000  
     scenario_set  30_min_horizon  60_min_horizon  90_min_horizon  \
0               1            93.0            94.0            95.0 