In [1]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
import numpy as np

In [5]:
import pandas as pd
import os
# Directory that holds the input CSV; it is joined with the file name
# via os.path.join when the data is loaded.
PATH = "data/"

In [6]:
# Location of the ';'-separated sunspot file inside PATH.
filename = os.path.join(PATH, "SN_d_tot_V2.0.csv")

# The file has no header row, so the column names are supplied explicitly.
names = [
    'year',
    'month',
    'day',
    'dec_year',
    'sn_value',
    'sn_error',
    'obs_num',
]

# Entries equal to '-1' are read as NaN; index_col=False keeps pandas from
# using the first column as the index.
df = pd.read_csv(
    filename,
    sep=';',
    header=None,
    names=names,
    na_values=['-1'],
    index_col=False,
)


In [11]:
# Show the (rows, columns) tuple of the current frame.
# NOTE(review): the recorded row count equals the train + test counts printed
# further down, and this cell's execution count (11) follows the trimming
# cell's (10), so the saved output appears to describe the already-trimmed
# frame, not the raw file. Re-run top to bottom to confirm.
df.shape

(62525, 7)

In [10]:
# Rows with obs_num == 0 have missing observations. Keep only the data that
# comes after the last such row so the remaining series has no gaps.
missing_idx = df.index[df['obs_num'] == 0]
if len(missing_idx) > 0:
    start_id = int(missing_idx.max()) + 1  # Find the last zero and move one beyond
else:
    # Nothing to trim (e.g. this cell is re-run on an already trimmed frame);
    # the previous max()-of-empty-list form raised ValueError here.
    start_id = int(df.index[0]) if len(df) else 0
print(start_id)
# Label-based slice: unlike the positional df[start_id:], this does not cut
# rows a second time when the index no longer starts at 0.
df = df.loc[start_id:]  # Trim the rows that have missing observations

11314


In [13]:
# First year that belongs to the test set; everything earlier is training data.
SPLIT_YEAR = 2000

# Build a new frame instead of assigning a column on `df`, which at this point
# is a slice of the originally loaded frame and would trigger pandas'
# SettingWithCopyWarning.
df = df.assign(sn_value=df["sn_value"].astype(float))

# Chronological split, so the test period lies strictly after the training period.
df_train = df[df["year"] < SPLIT_YEAR]
df_test = df[df["year"] >= SPLIT_YEAR]

# Plain Python lists of sunspot values, in row order.
spots_train = df_train['sn_value'].tolist()
spots_test = df_test['sn_value'].tolist()

print("Training set has {} observations.".format(len(spots_train)))
print("Test set has {} observations.".format(len(spots_test)))

Training set has 55160 observations.
Test set has 7365 observations.


In [14]:
SEQUENCE_SIZE=10
import numpy as np
def to_Seq(seq_size,obs):
    """Turn a 1-D series into sliding windows and next-step targets.

    For every position ``i`` with a full window available, the input sample is
    ``obs[i:i + seq_size]`` and the target is the value right after it,
    ``obs[i + seq_size]``.

    Parameters
    ----------
    seq_size : int
        Number of consecutive observations in each window (must be >= 1).
    obs : sequence
        Observations in order; must support ``len``, slicing and indexing.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray
        ``x`` has shape ``(n, seq_size, 1)`` and ``y`` has shape ``(n,)`` with
        ``n = max(len(obs) - seq_size, 0)``.

    Raises
    ------
    ValueError
        If ``seq_size`` is smaller than 1.
    """
    if seq_size < 1:
        raise ValueError("seq_size must be >= 1, got {}".format(seq_size))

    # Use the seq_size argument; the earlier version silently ignored it and
    # always read the module-level SEQUENCE_SIZE.
    n_windows = max(len(obs) - seq_size, 0)
    windows = [obs[i:i + seq_size] for i in range(n_windows)]
    targets = [obs[i + seq_size] for i in range(n_windows)]

    # Explicit reshape adds the trailing feature axis and keeps the 3-D shape
    # even when there are no windows at all.
    x = np.array(windows).reshape(n_windows, seq_size, 1)
    y = np.array(targets)
    return x, y
        

In [16]:
# Window each split separately so that no window straddles the train/test boundary.
x_train, y_train = to_Seq(seq_size=SEQUENCE_SIZE, obs=spots_train)
x_test, y_test = to_Seq(seq_size=SEQUENCE_SIZE, obs=spots_test)

# Report the shapes of the windowed inputs, one line per split.
print(
    "Shape of training set: {}".format(x_train.shape),
    "Shape of test set: {}".format(x_test.shape),
    sep="\n",
)

Shape of training set: (55150, 10, 1)
Shape of test set: (7355, 10, 1)


In [17]:
from tensorflow.keras.callbacks import EarlyStopping

# Recurrent encoder followed by a 32-unit dense layer; the output layer is
# appended to this model in a later cell.
# input_shape=[None, 1]: windows of any length with one feature per step.
# NOTE(review): Dense(32) is created without an activation, so it is a purely
# linear layer -- confirm that this is intended.
model = Sequential([
    LSTM(
        units=64,
        dropout=0.1,
        recurrent_dropout=0,
        input_shape=[None, 1],
    ),
    Dense(units=32),
])

In [18]:
# Single-unit output layer for the regression target.
# NOTE: this mutates `model`; running the cell again appends another layer.
model.add(Dense(units=1))

In [19]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 64)                16896     
_________________________________________________________________
dense (Dense)                (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 19,009
Trainable params: 19,009
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Regression set-up: mean squared error minimised with Adam.
model.compile(loss='mean_squared_error', optimizer='adam')

# Stop once val_loss has not improved by at least min_delta for `patience`
# consecutive epochs, then roll back to the best weights seen.
monitor = EarlyStopping(monitor="val_loss",min_delta=1e-3,patience=4,verbose=1,
                        mode="auto",restore_best_weights=True)

print("Train")
# validation_data is passed as a tuple, the form documented by Keras; a list
# is not unpacked into (x, y) by every TensorFlow version.
# NOTE(review): the test split doubles as the early-stopping validation set,
# so the reported val_loss is not an unbiased estimate -- consider holding
# out a separate validation slice of the training period.
# The History object is kept (and no longer echoed as the cell's output) so
# the loss curves can be inspected afterwards.
history = model.fit(x_train,y_train,validation_data=(x_test,y_test),
                    callbacks=[monitor],verbose=2,epochs=2000)

Train
Train on 55150 samples, validate on 7355 samples
Epoch 1/2000
55150/55150 - 12s - loss: 1404.5584 - val_loss: 288.8022
Epoch 2/2000
55150/55150 - 8s - loss: 573.6276 - val_loss: 387.5332
Epoch 3/2000
55150/55150 - 9s - loss: 575.8376 - val_loss: 195.4014
Epoch 4/2000
55150/55150 - 9s - loss: 574.0914 - val_loss: 281.9924
Epoch 5/2000
55150/55150 - 9s - loss: 566.3224 - val_loss: 351.0579
Epoch 6/2000
55150/55150 - 9s - loss: 565.6538 - val_loss: 298.0930
