In [1]:
import os
import numpy as np
import pandas as pd
import decimal

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Input (cleaned) and output (preprocessed) data directories.
DATADIR = 'data-CLEAN'
EXPORT_DIR = 'data-PREPROCESSED'

# Locations of the cleaned train/test CSV files inside DATADIR.
training_file, testing_file = (
    os.path.join(DATADIR, csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv')
)

In [3]:
# function courtesy of techbeamers.com/python-float-range/
def float_range(start, stop, step):
    """Yield evenly spaced floats from ``start`` up to, but excluding, ``stop``.

    The running value is kept as a ``decimal.Decimal``. All three arguments
    are converted through ``str`` first, so a float ``start`` is accepted and
    the binary representation error of a float ``step`` (e.g. ``0.1``) is not
    carried into the accumulated sum.

    Parameters
    ----------
    start : int, float or Decimal
        First value produced (if it is below ``stop``).
    stop : int, float or Decimal
        Exclusive upper bound.
    step : int, float or Decimal
        Increment between consecutive values; must be positive.

    Yields
    ------
    float
        ``start``, ``start + step``, ... while the value is below ``stop``.

    Raises
    ------
    ValueError
        If ``step`` is not positive (the loop could never terminate).
    """
    increment = decimal.Decimal(str(step))
    if increment <= 0:
        raise ValueError("step must be positive")

    current = decimal.Decimal(str(start))
    upper = decimal.Decimal(str(stop))
    while current < upper:
        yield float(current)
        current += increment

In [4]:
# code courtesy of: https://stackoverflow.com/questions/28465633/easy-way-to-apply-transformation-from-pandas-get-dummies-to-new-data
class GetDummies(BaseEstimator, TransformerMixin):
    """Dummy-encode selected columns with ``pd.get_dummies``, reusing the
    column layout seen at fit time.

    ``fit`` records the columns that ``pd.get_dummies`` produces for the data
    it is given. ``transform`` encodes new data the same way and reindexes the
    result to those recorded columns: dummy columns missing from the new data
    are added and filled with 0, and columns not seen at fit time are dropped.

    Parameters
    ----------
    dummy_columns : list of str
        Names of the columns to expand into dummy variables.

    Attributes
    ----------
    columns : pandas.Index or None
        Output column layout recorded by ``fit``; ``None`` until fitted.
    """

    def __init__(self, dummy_columns):
        # Output column layout; populated by fit().
        self.columns = None
        self.dummy_columns = dummy_columns

    def fit(self, X, y=None):
        """Record the dummy-encoded column layout of ``X`` and return ``self``.

        ``y`` is accepted for scikit-learn API compatibility and is ignored.
        """
        self.columns = pd.get_dummies(X, columns=self.dummy_columns).columns
        return self

    def transform(self, X):
        """Dummy-encode ``X`` and align it to the columns recorded by ``fit``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. Without this guard the reindex below
            would receive ``columns=None`` and return the frame unaligned.
        """
        if self.columns is None:
            # Local import: sklearn is already a dependency of this notebook.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This GetDummies instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )
        encoded = pd.get_dummies(X, columns=self.dummy_columns)
        return encoded.reindex(columns=self.columns, fill_value=0)

In [5]:
# Load the cleaned training split and preview its first ten rows.
train_data = pd.read_csv(training_file)
train_data.head(10)

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Power Generated
0,210,2009,7,29,10,True,0.16,65,27,13.6,1,10.0,68,14.0,29.88,10750
1,92,2009,4,2,16,True,0.298292,54,27,25.1,2,10.0,64,38.0,29.89,18575
2,353,2008,12,18,16,True,0.511344,47,20,8.7,4,10.0,69,11.0,30.14,3407
3,173,2009,6,22,10,True,0.148816,67,29,10.7,0,10.0,70,5.0,29.83,29010
4,243,2009,8,31,10,True,0.166453,63,27,13.9,4,10.0,75,10.0,29.93,6995
5,74,2009,3,15,4,False,0.695955,55,18,8.2,4,6.0,93,5.0,30.04,0
6,320,2008,11,15,16,True,0.504119,69,14,5.0,1,10.0,27,0.0,30.05,6614
7,231,2009,8,19,4,False,0.610149,65,30,8.6,4,10.0,87,6.0,29.82,0
8,358,2008,12,23,22,False,1.136126,47,13,3.6,4,10.0,77,6.0,29.89,0
9,53,2009,2,22,10,True,0.124438,56,15,9.4,4,4.0,93,7.0,29.99,133


In [6]:
# Load the cleaned test split and preview its first ten rows.
test_data = pd.read_csv(testing_file)
test_data.head(10)

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Power Generated
0,183,2009,7,2,16,True,0.255946,64,29,12.3,1,10.0,59,21.0,29.86,21804
1,187,2009,7,6,19,True,0.459705,62,27,15.8,1,10.0,72,18.0,29.97,2065
2,173,2009,6,22,22,False,0.662909,67,29,10.7,0,10.0,72,13.0,29.75,0
3,343,2008,12,8,16,True,0.515571,50,33,4.9,2,10.0,71,0.0,30.07,5654
4,84,2009,3,25,7,True,0.425876,55,27,16.1,1,10.0,86,5.0,30.16,3621
5,269,2008,9,25,13,True,0.081944,67,29,9.3,2,10.0,47,15.0,29.91,27072
6,71,2009,3,12,22,False,0.819464,51,28,9.5,1,10.0,86,13.0,30.15,0
7,37,2009,2,6,19,True,0.72381,53,13,7.5,4,10.0,86,6.0,29.8,0
8,188,2009,7,7,4,True,0.56314,62,27,12.9,4,10.0,80,9.0,29.93,0
9,138,2009,5,18,4,True,0.565774,59,29,15.6,2,10.0,90,10.0,29.87,0


In [7]:
# finalizing the feature engineering before transformations

In [8]:
def engineer_features(df: pd.DataFrame):
    """Return a copy of ``df`` with the engineered feature columns.

    * ``'Is Daylight'`` is converted to integers (``True`` -> 1, ``False`` -> 0).
    * ``'Hours from Noon'`` is appended as ``abs(12 - 'First Hour of Period')``.

    The computation is vectorised and index-aligned, so it works for any
    index (not only a default ``RangeIndex``). The input frame is left
    unmodified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the ``'Is Daylight'`` and ``'First Hour of Period'``
        columns.

    Returns
    -------
    pd.DataFrame
        New frame with the same rows, index and column order as ``df``, plus
        ``'Hours from Noon'`` as the last column.
    """
    return df.assign(**{
        # int64 matches what a Series of Python ints would produce.
        'Is Daylight': df['Is Daylight'].astype('int64'),
        'Hours from Noon': (12 - df['First Hour of Period']).abs(),
    })

In [9]:
# Baseline, before feature engineering: each column's correlation with
# 'Power Generated', sorted from most positive to most negative.
train_data.corr()['Power Generated'].sort_values(ascending=False)

Power Generated                         1.000000
Is Daylight                             0.537624
Average Wind Speed (Period)             0.276229
Average Wind Direction (Day)            0.148886
Average Wind Speed (Day)                0.140443
Average Temperature (Day)               0.133346
Year                                    0.120728
First Hour of Period                    0.110388
Visibility                              0.086513
Day                                    -0.003208
Month                                  -0.032273
Day of Year                            -0.033956
Average Barometric Pressure (Period)   -0.036025
Sky Cover                              -0.195783
Relative Humidity                      -0.533692
Distance to Solar Noon                 -0.746085
Name: Power Generated, dtype: float64

In [10]:
# Add the engineered columns, then discard the raw calendar/time fields.
train_data = engineer_features(train_data).drop(
    columns=['Year', 'Day', 'Day of Year', 'First Hour of Period']
)
train_data.head(10)

Unnamed: 0,Month,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Power Generated,Hours from Noon
0,7,1,0.16,65,27,13.6,1,10.0,68,14.0,29.88,10750,2
1,4,1,0.298292,54,27,25.1,2,10.0,64,38.0,29.89,18575,4
2,12,1,0.511344,47,20,8.7,4,10.0,69,11.0,30.14,3407,4
3,6,1,0.148816,67,29,10.7,0,10.0,70,5.0,29.83,29010,2
4,8,1,0.166453,63,27,13.9,4,10.0,75,10.0,29.93,6995,2
5,3,0,0.695955,55,18,8.2,4,6.0,93,5.0,30.04,0,8
6,11,1,0.504119,69,14,5.0,1,10.0,27,0.0,30.05,6614,4
7,8,0,0.610149,65,30,8.6,4,10.0,87,6.0,29.82,0,8
8,12,0,1.136126,47,13,3.6,4,10.0,77,6.0,29.89,0,10
9,2,1,0.124438,56,15,9.4,4,4.0,93,7.0,29.99,133,2


In [11]:
# Apply the same feature engineering and column removal to the test split.
test_data = engineer_features(test_data).drop(
    columns=['Year', 'Day', 'Day of Year', 'First Hour of Period']
)
test_data.head(10)

Unnamed: 0,Month,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Power Generated,Hours from Noon
0,7,1,0.255946,64,29,12.3,1,10.0,59,21.0,29.86,21804,4
1,7,1,0.459705,62,27,15.8,1,10.0,72,18.0,29.97,2065,7
2,6,0,0.662909,67,29,10.7,0,10.0,72,13.0,29.75,0,10
3,12,1,0.515571,50,33,4.9,2,10.0,71,0.0,30.07,5654,4
4,3,1,0.425876,55,27,16.1,1,10.0,86,5.0,30.16,3621,5
5,9,1,0.081944,67,29,9.3,2,10.0,47,15.0,29.91,27072,1
6,3,0,0.819464,51,28,9.5,1,10.0,86,13.0,30.15,0,10
7,2,1,0.72381,53,13,7.5,4,10.0,86,6.0,29.8,0,7
8,7,1,0.56314,62,27,12.9,4,10.0,80,9.0,29.93,0,8
9,5,1,0.565774,59,29,15.6,2,10.0,90,10.0,29.87,0,8


In [12]:
# Correlations with 'Power Generated' again, after feature engineering
# (the frame now includes the new 'Hours from Noon' column).
train_data.corr()['Power Generated'].sort_values(ascending=False)

Power Generated                         1.000000
Is Daylight                             0.537624
Average Wind Speed (Period)             0.276229
Average Wind Direction (Day)            0.148886
Average Wind Speed (Day)                0.140443
Average Temperature (Day)               0.133346
Visibility                              0.086513
Month                                  -0.032273
Average Barometric Pressure (Period)   -0.036025
Sky Cover                              -0.195783
Relative Humidity                      -0.533692
Distance to Solar Noon                 -0.746085
Hours from Noon                        -0.759695
Name: Power Generated, dtype: float64

In [13]:
# Preprocessing

In [14]:
# Separate the training predictors from the target column ('Power Generated').
x_train = train_data.drop(columns='Power Generated')
y_train = train_data['Power Generated']
type(y_train)

pandas.core.series.Series

In [15]:
# Separate the test predictors from the target column ('Power Generated').
x_test = test_data.drop(columns='Power Generated')
y_test = test_data['Power Generated']
type(y_test)

pandas.core.series.Series

In [16]:
x_train.head(10)

Unnamed: 0,Month,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Hours from Noon
0,7,1,0.16,65,27,13.6,1,10.0,68,14.0,29.88,2
1,4,1,0.298292,54,27,25.1,2,10.0,64,38.0,29.89,4
2,12,1,0.511344,47,20,8.7,4,10.0,69,11.0,30.14,4
3,6,1,0.148816,67,29,10.7,0,10.0,70,5.0,29.83,2
4,8,1,0.166453,63,27,13.9,4,10.0,75,10.0,29.93,2
5,3,0,0.695955,55,18,8.2,4,6.0,93,5.0,30.04,8
6,11,1,0.504119,69,14,5.0,1,10.0,27,0.0,30.05,4
7,8,0,0.610149,65,30,8.6,4,10.0,87,6.0,29.82,8
8,12,0,1.136126,47,13,3.6,4,10.0,77,6.0,29.89,10
9,2,1,0.124438,56,15,9.4,4,4.0,93,7.0,29.99,2


In [17]:
x_test.head(10)

Unnamed: 0,Month,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Hours from Noon
0,7,1,0.255946,64,29,12.3,1,10.0,59,21.0,29.86,4
1,7,1,0.459705,62,27,15.8,1,10.0,72,18.0,29.97,7
2,6,0,0.662909,67,29,10.7,0,10.0,72,13.0,29.75,10
3,12,1,0.515571,50,33,4.9,2,10.0,71,0.0,30.07,4
4,3,1,0.425876,55,27,16.1,1,10.0,86,5.0,30.16,5
5,9,1,0.081944,67,29,9.3,2,10.0,47,15.0,29.91,1
6,3,0,0.819464,51,28,9.5,1,10.0,86,13.0,30.15,10
7,2,1,0.72381,53,13,7.5,4,10.0,86,6.0,29.8,7
8,7,1,0.56314,62,27,12.9,4,10.0,80,9.0,29.93,8
9,5,1,0.565774,59,29,15.6,2,10.0,90,10.0,29.87,8


In [18]:
# Visibility is tricky: it looks like an ordinal category, but I can't encode it
# the way I want to. The list of categories I provide has to have a length equal
# to the number of distinct values contained in the column, which is obviously
# not the full range the feature exists on. Some possible values for the feature
# did not appear in the training data.
# NOTE(review): the original note stopped mid-sentence ("It should work to have
# it as a"). 'Visibility' is listed in `for_scaling` in a later cell, so the
# intent was presumably to treat it as a scaled numeric feature -- confirm.
# Candidate visibility levels: 0 to 10 inclusive, in steps of 0.25.
visibility_cats = list(float_range(0, 10.25, .25))

# Categorical columns handed to the encoders in later cells.
for_encoding = [
    'Month',
    'Sky Cover'
]

In [19]:
# Numeric columns handed to MinMaxScaler in later cells.
# 'Hours from Noon' could be treated as ordinal or simply as numeric data. I think
# it leans more toward numeric data, so it is listed here.
for_scaling = [
    'Distance to Solar Noon', 
    'Average Temperature (Day)',
    'Average Wind Direction (Day)',
    'Average Wind Speed (Day)',
    'Visibility',
    'Relative Humidity',
    'Average Wind Speed (Period)',
    'Average Barometric Pressure (Period)',
    'Hours from Noon'
]

In [20]:
# Both pipelines work as intended, but because of the way the pipeline's
# fit_transform() method works, I cannot recover the newly added columns for later use.
# I am therefore going to apply each transformation separately, as that will allow
# me to do just that.

In [21]:
# First attempt: ordinal-encode the categorical columns, min-max scale the
# numeric ones, and pass every remaining column through unchanged.
ct = make_column_transformer(
    (OrdinalEncoder(), for_encoding),
    (MinMaxScaler(), for_scaling),
    remainder='passthrough'
)

In [22]:
x_train_transformed = ct.fit_transform(x_train)
x_train_transformed

array([[ 6.        ,  1.        ,  0.10046111, ...,  0.38095238,
         0.1       ,  1.        ],
       [ 3.        ,  2.        ,  0.22722256, ...,  0.39047619,
         0.3       ,  1.        ],
       [11.        ,  4.        ,  0.42251113, ...,  0.62857143,
         0.3       ,  1.        ],
       ...,
       [ 0.        ,  1.        ,  0.35386615, ...,  0.63809524,
         0.4       ,  1.        ],
       [ 1.        ,  1.        ,  0.60997145, ...,  0.53333333,
         0.6       ,  1.        ],
       [11.        ,  1.        ,  0.13616624, ...,  0.45714286,
         0.        ,  1.        ]])

In [23]:
x_test_transformed = ct.transform(x_test)
x_test_transformed

array([[ 6.        ,  1.        ,  0.18840715, ...,  0.36190476,
         0.3       ,  1.        ],
       [ 6.        ,  1.        ,  0.37517768, ...,  0.46666667,
         0.6       ,  1.        ],
       [ 5.        ,  0.        ,  0.56143908, ...,  0.25714286,
         0.9       ,  0.        ],
       ...,
       [ 5.        ,  2.        ,  0.38246072, ...,  0.44761905,
         0.6       ,  1.        ],
       [ 0.        ,  1.        ,  0.35550271, ...,  0.63809524,
         0.4       ,  1.        ],
       [10.        ,  1.        ,  0.6714793 , ...,  0.60952381,
         0.6       ,  0.        ]])

In [24]:
# Second attempt: dummy-encode the categorical columns (instead of ordinal
# encoding them) and min-max scale the numeric ones.
# NOTE(review): no `remainder` argument is passed here, unlike `ct`; columns in
# neither list (e.g. 'Is Daylight') are presumably dropped -- confirm.
dummies = GetDummies(for_encoding)
minmax = MinMaxScaler()
ct1 = make_column_transformer(
    (dummies, for_encoding),
    (minmax, for_scaling)
)

In [25]:
new_transform = ct1.fit_transform(x_train)
new_transform

array([[0.        , 0.        , 0.        , ..., 0.36842105, 0.38095238,
        0.1       ],
       [0.        , 0.        , 0.        , ..., 1.        , 0.39047619,
        0.3       ],
       [0.        , 0.        , 0.        , ..., 0.28947368, 0.62857143,
        0.3       ],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.63809524,
        0.4       ],
       [0.        , 1.        , 0.        , ..., 0.55263158, 0.53333333,
        0.6       ],
       [0.        , 0.        , 0.        , ..., 0.13157895, 0.45714286,
        0.        ]])

In [26]:
ct1.transform(x_test)

array([[0.        , 0.        , 0.        , ..., 0.55263158, 0.36190476,
        0.3       ],
       [0.        , 0.        , 0.        , ..., 0.47368421, 0.46666667,
        0.6       ],
       [0.        , 0.        , 0.        , ..., 0.34210526, 0.25714286,
        0.9       ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.28947368, 0.44761905,
        0.6       ],
       [1.        , 0.        , 0.        , ..., 0.13157895, 0.63809524,
        0.4       ],
       [0.        , 0.        , 0.        , ..., 0.39473684, 0.60952381,
        0.6       ]])

In [27]:
# Fresh transformer instances, to be applied one step at a time in the cells
# below rather than through a column transformer.
dummies = GetDummies(for_encoding)
minmax = MinMaxScaler()

In [28]:
# Learn the dummy-column layout from the training features, then encode them.
dummies.fit(x_train)
x_train = dummies.transform(x_train)
x_train

Unnamed: 0,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Hours from Noon,...,Month_8,Month_9,Month_10,Month_11,Month_12,Sky Cover_0,Sky Cover_1,Sky Cover_2,Sky Cover_3,Sky Cover_4
0,1,0.160000,65,27,13.6,10.0,68,14.0,29.88,2,...,0,0,0,0,0,0,1,0,0,0
1,1,0.298292,54,27,25.1,10.0,64,38.0,29.89,4,...,0,0,0,0,0,0,0,1,0,0
2,1,0.511344,47,20,8.7,10.0,69,11.0,30.14,4,...,0,0,0,0,1,0,0,0,0,1
3,1,0.148816,67,29,10.7,10.0,70,5.0,29.83,2,...,0,0,0,0,0,1,0,0,0,0
4,1,0.166453,63,27,13.9,10.0,75,10.0,29.93,2,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2331,1,0.546685,52,27,10.7,10.0,80,22.0,30.17,7,...,0,0,0,0,0,0,0,1,0,0
2332,0,1.084602,56,13,3.2,10.0,59,3.0,30.15,10,...,0,0,0,0,0,1,0,0,0,0
2333,1,0.436455,55,13,1.5,10.0,86,0.0,30.15,5,...,0,0,0,0,0,0,1,0,0,0
2334,1,0.715856,48,29,16.2,10.0,68,21.0,30.04,7,...,0,0,0,0,0,0,1,0,0,0


In [29]:
dummies.columns

Index(['Is Daylight', 'Distance to Solar Noon', 'Average Temperature (Day)',
       'Average Wind Direction (Day)', 'Average Wind Speed (Day)',
       'Visibility', 'Relative Humidity', 'Average Wind Speed (Period)',
       'Average Barometric Pressure (Period)', 'Hours from Noon', 'Month_1',
       'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7',
       'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Sky Cover_0',
       'Sky Cover_1', 'Sky Cover_2', 'Sky Cover_3', 'Sky Cover_4'],
      dtype='object')

In [30]:
# Remove one dummy column from each of the newly one-hot encoded categories,
# then show which fitted columns are no longer present.
x_train = x_train.drop(columns=['Month_1', 'Sky Cover_0'])
set(dummies.columns).difference(x_train.columns)

{'Month_1', 'Sky Cover_0'}

In [31]:
# Fit the min-max scaler on the encoded training features and scale them.
x_train_transformed = minmax.fit_transform(x_train)
x_train_transformed

array([[1.        , 0.10046111, 0.63888889, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.22722256, 0.33333333, ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.42251113, 0.13888889, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [1.        , 0.35386615, 0.36111111, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.60997145, 0.16666667, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.13616624, 0.02777778, ..., 0.        , 0.        ,
        0.        ]])

In [32]:
# Encode the test features using the column layout learned from the training data.
x_test = dummies.transform(x_test)
x_test

Unnamed: 0,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Hours from Noon,...,Month_8,Month_9,Month_10,Month_11,Month_12,Sky Cover_0,Sky Cover_1,Sky Cover_2,Sky Cover_3,Sky Cover_4
0,1,0.255946,64,29,12.3,10.0,59,21.0,29.86,4,...,0,0,0,0,0,0,1,0,0,0
1,1,0.459705,62,27,15.8,10.0,72,18.0,29.97,7,...,0,0,0,0,0,0,1,0,0,0
2,0,0.662909,67,29,10.7,10.0,72,13.0,29.75,10,...,0,0,0,0,0,1,0,0,0,0
3,1,0.515571,50,33,4.9,10.0,71,0.0,30.07,4,...,0,0,0,0,1,0,0,1,0,0
4,1,0.425876,55,27,16.1,10.0,86,5.0,30.16,5,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,0,0.849367,71,29,9.8,10.0,93,10.0,30.05,11,...,1,0,0,0,0,0,0,1,0,0
580,0,0.665914,64,26,9.6,10.0,75,6.0,29.96,10,...,0,0,0,0,0,0,0,0,1,0
581,1,0.467650,64,26,8.0,10.0,72,11.0,29.95,7,...,0,0,0,0,0,0,0,1,0,0
582,1,0.438240,56,13,3.2,10.0,83,5.0,30.15,5,...,0,0,0,0,0,0,1,0,0,0


In [33]:
# Remove the same dummy columns from the test features as from the training
# features, then show which fitted columns are no longer present.
x_test = x_test.drop(columns=['Month_1', 'Sky Cover_0'])
set(dummies.columns).difference(x_test.columns)

{'Month_1', 'Sky Cover_0'}

In [34]:
# Scale the test features with the scaler already fitted on the training data
# (transform only, no refit).
x_test_transformed = minmax.transform(x_test)
x_test_transformed

array([[1.        , 0.18840715, 0.61111111, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.37517768, 0.55555556, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.56143908, 0.69444444, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.38246072, 0.61111111, ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.35550271, 0.38888889, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.6714793 , 0.55555556, ..., 0.        , 0.        ,
        0.        ]])

In [35]:
# Make sure the export directory exists so a fresh checkout can run this cell.
os.makedirs(EXPORT_DIR, exist_ok=True)

# Persist the preprocessed features and the targets as NumPy arrays.
np.save(os.path.join(EXPORT_DIR, 'x_train.npy'), x_train_transformed)
np.save(os.path.join(EXPORT_DIR, 'y_train.npy'), y_train.to_numpy())
np.save(os.path.join(EXPORT_DIR, 'x_test.npy'), x_test_transformed)
np.save(os.path.join(EXPORT_DIR, 'y_test.npy'), y_test.to_numpy())

In [36]:
# Also export the scaled training features as CSV, keeping the column names.
# The path is built from EXPORT_DIR so it stays in sync with the .npy export.
pd.DataFrame(data=x_train_transformed, columns=x_train.columns).to_csv(
    os.path.join(EXPORT_DIR, 'x_train.csv'), index=False
)

In [37]:
# Also export the scaled test features as CSV, keeping the column names.
# The path is built from EXPORT_DIR so it stays in sync with the .npy export.
pd.DataFrame(data=x_test_transformed, columns=x_test.columns).to_csv(
    os.path.join(EXPORT_DIR, 'x_test.csv'), index=False
)