In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read saved csv files
TrainVal     = pd.read_csv('01TrainVal.csv')

In [3]:
TrainVal['index'] = pd.to_datetime(TrainVal['index'])
TrainVal.rename(columns = {'index':'Timestamp'}, inplace=True)

In [4]:
TrainVal['Timestamp']

0       2019-03-01 00:00:00
1       2019-03-01 00:05:00
2       2019-03-01 00:10:00
3       2019-03-01 00:15:00
4       2019-03-01 00:20:00
                ...        
35131   2019-06-30 23:35:00
35132   2019-06-30 23:40:00
35133   2019-06-30 23:45:00
35134   2019-06-30 23:50:00
35135   2019-06-30 23:55:00
Name: Timestamp, Length: 35136, dtype: datetime64[ns]

In [5]:
TrainVal

Unnamed: 0,Timestamp,f1,f2,f3,f4,f5,f6,f7
0,2019-03-01 00:00:00,104.0,98.0,132.0,125.0,128.0,38.0,45.0
1,2019-03-01 00:05:00,100.0,95.0,135.0,125.0,119.0,72.0,77.0
2,2019-03-01 00:10:00,110.0,124.0,132.0,112.0,118.0,73.0,73.0
3,2019-03-01 00:15:00,110.0,102.0,103.0,100.0,106.0,88.0,90.0
4,2019-03-01 00:20:00,101.0,98.0,117.0,114.0,109.0,66.0,62.0
...,...,...,...,...,...,...,...,...
35131,2019-06-30 23:35:00,141.0,143.0,150.0,142.0,153.0,110.0,84.0
35132,2019-06-30 23:40:00,132.0,128.0,139.0,116.0,114.0,98.0,89.0
35133,2019-06-30 23:45:00,106.0,111.0,123.0,127.0,146.0,94.0,85.0
35134,2019-06-30 23:50:00,137.0,140.0,141.0,103.0,124.0,109.0,95.0


# Split to Training and Test Data

In [6]:
len(TrainVal)

35136

In [7]:
test_percent = 0.11475     # 11.475 % used as test data (final 2 weeks)
len(TrainVal)*test_percent
test_point = np.round(len(TrainVal)*test_percent)
test_ind = int(len(TrainVal) - test_point)

In [8]:
test_ind

31104

In [9]:
train = TrainVal.iloc[:test_ind].drop('Timestamp', axis=1)      # From 1st element to 31575
test = TrainVal.iloc[test_ind:].drop('Timestamp', axis=1)       # From 31575 to end

In [10]:
test.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7
31104,144.0,155.0,130.0,122.0,136.0,84.0,75.0
31105,130.0,132.0,157.0,109.0,133.0,95.0,80.0
31106,109.0,121.0,133.0,114.0,135.0,109.0,104.0
31107,123.0,133.0,123.0,104.0,120.0,100.0,96.0
31108,121.0,130.0,135.0,119.0,129.0,73.0,65.0


In [11]:
test_percen = 0.12962     # 11.475 % used as test data (final 2 weeks)
len(train)*test_percen
test_p = np.round(len(train)*test_percen)
test_ind = int(len(train) - test_p)

In [12]:
test_ind

27072

In [13]:
t = train.iloc[:test_ind]      # From 1st element to 31575

In [14]:
t

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7
0,104.0,98.0,132.0,125.0,128.0,38.0,45.0
1,100.0,95.0,135.0,125.0,119.0,72.0,77.0
2,110.0,124.0,132.0,112.0,118.0,73.0,73.0
3,110.0,102.0,103.0,100.0,106.0,88.0,90.0
4,101.0,98.0,117.0,114.0,109.0,66.0,62.0
...,...,...,...,...,...,...,...
27067,115.0,110.0,127.0,115.0,116.0,61.0,69.0
27068,98.0,121.0,123.0,126.0,142.0,87.0,82.0
27069,115.0,112.0,125.0,128.0,131.0,75.0,80.0
27070,113.0,113.0,123.0,103.0,107.0,71.0,75.0


In [15]:
TrainVal.iloc[27071]

Timestamp    2019-06-02 23:55:00
f1                         104.0
f2                         109.0
f3                         104.0
f4                          83.0
f5                          86.0
f6                          68.0
f7                          68.0
Name: 27071, dtype: object

# Feature Scaling

In [16]:
# The scaler - for min-max scaling
from sklearn.preprocessing import MinMaxScaler

The MinMaxScaler from scikit-learn subtracts the minimum value and re-scales the the value range, rendering the variable values between 0 and 1. 

In [17]:
# set up the scaler
scaler = MinMaxScaler()

# fit the scaler to the train set, it will learn the parameters
scaler.fit(train)

# transform train and test sets
train_scaled = scaler.transform(train)
test_scaled = scaler.transform(test)

In [18]:
# the scaler stores the maximum values of the features, learned from train set
scaler.data_max_

array([463., 512., 540., 470., 600., 534., 466.])

In [19]:
# the scaler stores the minimum values of the features, learned from train set
scaler.data_min_

array([14., 18., 10.,  0., 17.,  6.,  6.])

In [20]:
l = ['f1','f2','f3','f4','f5','f6','f7']

In [21]:
train_scaled = pd.DataFrame(train_scaled, columns=l)
test_scaled = pd.DataFrame(test_scaled, columns=l)

In [22]:
train_scaled['Timestamp'] = TrainVal['Timestamp'].iloc[:test_ind]
test_scaled['Timestamp'] = TrainVal.iloc[test_ind:].reset_index()['Timestamp']

In [23]:
train_scaled

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,Timestamp
0,0.200445,0.161943,0.230189,0.265957,0.190395,0.060606,0.084783,2019-03-01 00:00:00
1,0.191537,0.155870,0.235849,0.265957,0.174957,0.125000,0.154348,2019-03-01 00:05:00
2,0.213808,0.214575,0.230189,0.238298,0.173242,0.126894,0.145652,2019-03-01 00:10:00
3,0.213808,0.170040,0.175472,0.212766,0.152659,0.155303,0.182609,2019-03-01 00:15:00
4,0.193764,0.161943,0.201887,0.242553,0.157804,0.113636,0.121739,2019-03-01 00:20:00
...,...,...,...,...,...,...,...,...
31099,0.298441,0.253036,0.288679,0.291489,0.255575,0.204545,0.184783,NaT
31100,0.407572,0.382591,0.358491,0.312766,0.286449,0.195076,0.191304,NaT
31101,0.276169,0.285425,0.309434,0.293617,0.257290,0.231061,0.247826,NaT
31102,0.271715,0.246964,0.335849,0.270213,0.217839,0.159091,0.173913,NaT


In [24]:
test_scaled

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,Timestamp
0,0.289532,0.277328,0.226415,0.259574,0.204117,0.147727,0.150000,2019-06-03 00:00:00
1,0.258352,0.230769,0.277358,0.231915,0.198971,0.168561,0.160870,2019-06-03 00:05:00
2,0.211581,0.208502,0.232075,0.242553,0.202401,0.195076,0.213043,2019-06-03 00:10:00
3,0.242762,0.232794,0.213208,0.221277,0.176672,0.178030,0.195652,2019-06-03 00:15:00
4,0.238307,0.226721,0.235849,0.253191,0.192110,0.126894,0.128261,2019-06-03 00:20:00
...,...,...,...,...,...,...,...,...
4027,0.282851,0.253036,0.264151,0.302128,0.233276,0.196970,0.169565,2019-06-16 23:35:00
4028,0.262806,0.222672,0.243396,0.246809,0.166381,0.174242,0.180435,2019-06-16 23:40:00
4029,0.204900,0.188259,0.213208,0.270213,0.221269,0.166667,0.171739,2019-06-16 23:45:00
4030,0.273942,0.246964,0.247170,0.219149,0.183533,0.195076,0.193478,2019-06-16 23:50:00


In [25]:
type(test_scaled.iloc[0]['Timestamp'])

pandas._libs.tslibs.timestamps.Timestamp

# Save Data and Scaler

In [26]:
import joblib

In [27]:
scaler_filename = "scaler.save"
joblib.dump(scaler, scaler_filename) 

['scaler.save']

In [28]:
train.to_csv('01train.csv')
test.to_csv('01test.csv')

In [29]:
train_scaled.to_csv('01train_scaled.csv')
test_scaled.to_csv('01test_scaled.csv')