In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Download the dataset archive, then quietly (-q) unpack it and the
# train/test archives found under dataset/.
! wget "https://he-public-data.s3.ap-southeast-1.amazonaws.com/shell_dataset.zip"
! unzip -q shell_dataset.zip
! unzip -q dataset/train.zip
! unzip -q dataset/test.zip

In [4]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import Normalizer,PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error as mae
from sklearn.pipeline import Pipeline
from joblib import load,dump
import pickle as pkl
import sys
import os
import warnings
warnings.filterwarnings('ignore')

random_seed = 42

In [5]:
# Load the training CSV into a DataFrame and display it.
train  = pd.read_csv('./train/train.csv')
train

In [6]:
# Fill in 'bad' values of the cloud-cover column.
# -7999 and -6999 are treated as invalid readings: they become NaN and are then
# filled by linear interpolation, at most 10 consecutive rows, in both directions.
#
# The cleaned column is assigned back to the frame. Calling replace/interpolate
# with inplace=True on train[...] is a chained in-place update, which does not
# modify `train` under pandas Copy-on-Write; warnings are silenced in this
# notebook, so that failure would go unnoticed.
#
# NOTE(review): interpolation happens before the train/test split and uses rows
# on both sides of a gap, so it is a possible source of data leakage.
cloud_col = 'Total Cloud Cover [%]'
train[cloud_col] = (
    train[cloud_col]
    .replace([-7999, -6999], np.nan)
    .interpolate(limit=10, limit_direction='both')
)

In [7]:
# Sanity check: among rows whose MST string ends in '0', count how many
# distinct cloud-cover values are still below -1.
below_minus_one = train['Total Cloud Cover [%]'].lt(-1)
mst_ends_in_zero = train['MST'].str.endswith('0')
filt = below_minus_one & mst_ends_in_zero
train.loc[filt, 'Total Cloud Cover [%]'].nunique()

In [8]:
# Build the forecast targets. For each horizon h, column t_<h> holds the cloud
# cover found h rows further down within the same 'DATE (MM/DD)' group; rows
# with nothing h steps ahead in their group get -1.
cloud_by_date = train.groupby('DATE (MM/DD)')['Total Cloud Cover [%]']
for horizon in (30, 60, 90, 120):
    train[f't_{horizon}'] = cloud_by_date.shift(periods=-horizon, fill_value=-1)
train

In [9]:
# Down-sample rows where at least one target equals -1: keep a seeded random
# 2% of them and drop the rest (original note: aims for a well-balanced sample).
cond = train[['t_30', 't_60', 't_90', 't_120']].eq(-1).any(axis=1)
flagged_rows = train[cond]
req_samples = flagged_rows.sample(frac=0.02, random_state=random_seed)
not_req_samples = flagged_rows.drop(req_samples.index)
train.drop(not_req_samples.index, inplace=True)
train.head()

In [10]:
# Show the (rows, columns) size of the training frame at this point.
train.shape

In [11]:
# Remove the columns that are not used as model inputs.
unwanted_columns = [
    'DATE (MM/DD)',
    'MST',
    'Direct sNIP [W/m^2]',            # highly correlated with cmp22
    'Tower Wet Bulb Temp [deg C]',    # highly correlated with other temperature readings
    'Snow Depth [cm]',
]
train.drop(columns=unwanted_columns, inplace=True)
train.head()

In [12]:
# Preview the first rows of the training frame.
train.head()

In [13]:
# <a href="Your file path"> Download File </a>

In [15]:
# 60/40 train/hold-out split for each horizon, all with the same seed.
# The slices below treat the last four columns as the targets
# (t_30, t_60, t_90, t_120): each horizon's inputs are every column to the left
# of its own target, so later horizons also receive the earlier targets as inputs.
split_kwargs = dict(train_size=0.60, random_state=random_seed)

X_train_30, X_test_30, Y_train_30, Y_test_30 = train_test_split(
    train.iloc[:, :-4].values, train['t_30'].values, **split_kwargs
)
X_train_60, X_test_60, Y_train_60, Y_test_60 = train_test_split(
    train.iloc[:, :-3].values, train['t_60'].values, **split_kwargs
)
X_train_90, X_test_90, Y_train_90, Y_test_90 = train_test_split(
    train.iloc[:, :-2].values, train['t_90'].values, **split_kwargs
)
X_train_120, X_test_120, Y_train_120, Y_test_120 = train_test_split(
    train.iloc[:, :-1].values, train['t_120'].values, **split_kwargs
)

In [16]:
# Inspect the array flags of the 30-minute training matrix.
X_train_30.flags

In [17]:
def _make_svr_pipeline(model_step_name):
    """Return an unfitted PowerTransformer -> SVR pipeline.

    The SVR step is registered under `model_step_name`, which is the prefix
    used when addressing its hyper-parameters (e.g. 'model_30__kernel').
    """
    return Pipeline([
        ('pwr_transformer', PowerTransformer()),
        (model_step_name, SVR()),
    ])


# One independent pipeline per forecast horizon.
ppline_30 = _make_svr_pipeline('model_30')
ppline_60 = _make_svr_pipeline('model_60')
ppline_90 = _make_svr_pipeline('model_90')
ppline_120 = _make_svr_pipeline('model_120')

In [18]:
# List the parameter names the 30-minute pipeline exposes (used as search-space keys).
ppline_30.get_params()

In [21]:
# Scratch check: draw one random value from an exponential distribution with scale 0.1.
scipy.stats.expon(scale=.1).rvs()

In [25]:
# Search space for the 30-minute pipeline. Keys use the '<step>__<param>' form
# so they are routed to the SVR step named 'model_30'.
param_dist = {
    # 'precomputed' is deliberately not listed: that kernel expects X to be a
    # square kernel matrix rather than a feature matrix, so every candidate
    # sampled with it would fail to fit and waste a search iteration.
    'model_30__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],   # the kernel to use
    'model_30__gamma': [3, 1, 0.1, 0.01, 0.001],
    # 'model_30__degree': [3, 4, 5],
    # 'model_30__C': [0.5, 0.7, 1, 5, 10],   # regularisation parameter (low C corresponds to strong regularisation for noisy data)
    'model_30__epsilon': [0.1, 0.3, 0.2],    # epsilon-tube parameter (no loss is associated with samples inside the tube)
    'model_30__cache_size': [3000],          # cache size for the kernel
    'model_30__verbose': [True],
    'model_30__shrinking': [False],
}

In [26]:
# 5-fold cross-validation for the hyper-parameter search.
# shuffle=True is needed for the seed to be accepted: scikit-learn rejects a
# random_state on a non-shuffling KFold (ValueError in current releases).
kf = KFold(n_splits=5, shuffle=True, random_state=random_seed)

# The search is seeded too, so the sampled candidates are the same on every run.
random_search_30 = RandomizedSearchCV(
    ppline_30,
    param_distributions=param_dist,
    n_jobs=-1,
    cv=kf,
    verbose=1,
    random_state=random_seed,
)

In [None]:
# Run the hyper-parameter search on the 30-minute training split.
random_search_30.fit(X_train_30,Y_train_30)

In [None]:
# Print the winning pipeline and its parameters; the bare last expression
# displays the best cross-validated score.
for search_result in (random_search_30.best_estimator_, random_search_30.best_params_):
    print(search_result)
random_search_30.best_score_

In [None]:
# Save the full cross-validation results table of the 30-minute search to CSV.
rs_30_finetuning_results = pd.DataFrame(random_search_30.cv_results_)
rs_30_finetuning_results.to_csv('rs_30_finetuning_results.csv',index=False)

<a href="./rs_30_finetuning_results.csv"> Download File </a>

In [None]:
def preprocess(df,mode = 'test'):
    """Clean a weather frame in place and return its last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. In 'test' mode it must contain 'Total Cloud Cover [%]'
        and the four columns dropped below. It is modified in place.
    mode : str, default 'test'
        Only 'test' triggers cleaning; any other value leaves `df` untouched.

    Returns
    -------
    pandas.Series
        The last row of `df` (after cleaning when mode == 'test').

    Raises
    ------
    KeyError
        In 'test' mode, if an expected column is missing.
    IndexError
        If `df` has no rows.
    """
    if mode == 'test':
        # Treat -7999 / -6999 as invalid readings and fill the gaps (up to 10
        # consecutive rows, both directions) by linear interpolation.
        # The result is assigned back to the column: a chained
        # df[col].replace(..., inplace=True) does not update `df` under pandas
        # Copy-on-Write.
        cloud_col = 'Total Cloud Cover [%]'
        df[cloud_col] = (
            df[cloud_col]
            .replace([-7999, -6999], np.nan)
            .interpolate(limit=10, limit_direction='both')
        )

        # Drop the columns that are not used as model inputs.
        df.drop(columns=[
            'Time [Mins]',
            'Direct sNIP [W/m^2]',            # highly correlated with cmp22
            'Tower Wet Bulb Temp [deg C]',    # highly correlated with other temperature readings
            'Snow Depth [cm]',
        ], inplace=True)

    return df.iloc[-1]

In [None]:
# Predict the four horizons for every scenario folder and write the rounded
# values into the submission frame `test`.
path = os.getcwd()
name = "weather_data.csv"
test = pd.read_csv('./dataset/test.csv')

# Every model must be fitted before it can predict. The hyper-parameter search
# leaves ppline_30 itself unfitted, so the tuned 30-minute model is taken from
# the search object; the other pipelines are fitted here on their own splits.
horizon_models = {
    30: random_search_30.best_estimator_,
    60: ppline_60.fit(X_train_60, Y_train_60),
    90: ppline_90.fit(X_train_90, Y_train_90),
    120: ppline_120.fit(X_train_120, Y_train_120),
}

file_count = 0
for root, dirs, files in os.walk(path):
    if name not in files:
        continue

    # The folder that holds the file is named after the scenario number.
    wd = pd.read_csv(os.path.join(root, name))
    scenario_set = int(os.path.basename(root))

    # Last (cleaned) observation of the scenario = model input.
    last_sample = preprocess(wd)

    # Chained forecasts: each prediction is appended to the feature vector so
    # the next horizon's model sees it, mirroring how the training matrices
    # include the earlier targets.
    for horizon, model in horizon_models.items():
        prediction = model.predict(last_sample.to_numpy().reshape(1, -1))[0]

        # get_loc raises KeyError on a missing column instead of silently
        # pointing at the last column (get_indexer would return -1).
        column_pos = test.columns.get_loc(f'{horizon}_min_horizon')
        test.iloc[scenario_set - 1, column_pos] = np.round(prediction)

        last_sample[f't_{horizon}'] = prediction

    file_count += 1
    if file_count % 30 == 0:
        print(file_count)

In [None]:
# Cast every submission value to integer and display the frame.
# astype(int) is the vectorised equivalent of applymap(int), which is
# deprecated since pandas 2.1.
test = test.astype(int)
test