# Final Project, Part 2: Forecasting Weather via LSTM Network

Time series datasets are datasets that contain a sequence of datapoints recorded over time. Examples include stock prices (e.g., price per day), weather (e.g., degrees Celsius per day), and sales figures (e.g., net profit by quarter), among others.

We'll be using LSTMs to predict the weather of Melbourne, Australia.

<div style="text-align: center;"> <img src = "res/final_project/weather_forecasting_icon.png" width="25%"/> </div>

However, our current LSTM neural network performs terribly! Thus, you'll be improving it and presenting your findings to the class.

# 0 | Google Colab Setup

In [None]:
import os
import shutil
import stat

In [None]:
def copy_safe(src, dst, max_len=200):
    """Recursively copy ``src`` into ``dst``, skipping over-long destination paths.

    Args:
        src: Directory to copy from.
        dst: Directory to copy into; created (with parents) as needed.
        max_len: A destination directory or file path whose length is
            ``max_len`` characters or more is not copied.

    Returns:
        int: Number of files that were NOT copied, either because their
        destination path was too long or because the copy failed.
    """
    skipped = 0
    for root, _dirs, files in os.walk(src):
        rel_path = os.path.relpath(root, src)
        dst_root = dst if rel_path == '.' else os.path.join(dst, rel_path)

        # The directory itself is too deep: none of its files can be copied.
        if len(dst_root) >= max_len:
            skipped += len(files)
            continue

        os.makedirs(dst_root, exist_ok=True)
        for name in files:
            dst_file = os.path.join(dst_root, name)
            if len(dst_file) >= max_len:
                skipped += 1
                continue
            try:
                shutil.copy2(os.path.join(root, name), dst_file)
            except OSError:
                # Best effort: a file that cannot be copied is counted, not fatal.
                # (shutil.Error is an OSError subclass.)  A bare ``except`` would
                # also have swallowed KeyboardInterrupt / SystemExit.
                skipped += 1
    return skipped

In [None]:
# Setup resources if needed
setup_ran = False
if not os.path.exists('res'):
    print("Setting up resources...")
    setup_ran = True
    
    # Cleanup, clone, copy
    repo = 'deep_learning_resources'
    if os.path.exists(repo):
        shutil.rmtree(repo, onerror=lambda f,p,e: os.chmod(p, stat.S_IWRITE) or f(p))
    
    !git clone --depth=1 https://github.com/jjv31/deep_learning_resources
    
    if os.path.exists(f'{repo}/res'):
        skipped = copy_safe(f'{repo}/res', 'res')
        print(f"Setup complete! {'(' + str(skipped) + ' long filenames skipped)' if skipped else ''}")
    
    shutil.rmtree(repo, onerror=lambda f,p,e: os.chmod(p, stat.S_IWRITE) or f(p))

In [None]:
# Only refresh if we just downloaded resources
# (`setup_ran` is set by the setup cell above; it is True only when `res` did not exist yet.)
if setup_ran:
    from IPython.display import Javascript, display
    import time
    
    print("Refreshing images...")
    
    # Try browser refresh + aggressive image reload
    # The script below (i) schedules a page reload after 2000 ms and (ii) re-assigns the `src` of
    # every <img> whose URL contains 'res/', appending a '?v=<timestamp>_<i>' query string.
    # Doubled braces ({{ }}) are literal braces in the f-string; {int(time.time())} is the only substitution.
    display(Javascript(f'''
    try {{ setTimeout(() => window.location.reload(true), 2000); }} catch(e) {{}}
    
    const t = {int(time.time())};
    document.querySelectorAll('img').forEach((img, i) => {{
        if (img.src.includes('res/')) {{
            const src = img.src.split('?')[0];
            setTimeout(() => img.src = src + '?v=' + t + '_' + i, i * 50);
        }}
    }});
    '''))
    
    print("If images don't appear, press Ctrl+Shift+R to hard refresh!")
else:
    print("Resources already exist, skipping setup.")

# 1 | Loads & Inspects Dataset

### 1.1 | Imports

In [None]:
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Other
from statsmodels.tsa.seasonal import seasonal_decompose

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Neural Nets
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Input, Flatten
from keras.optimizers import Adam
from keras import metrics

In [None]:
import warnings
# Suppresses every FutureWarning for the rest of the session to keep cell output readable
warnings.simplefilter(action='ignore', category=FutureWarning)

### 1.2 | Aux functions. Just run

In [None]:
#Function to facilitate evaluating our models
def print_score(clf, X, y_true):
    """Print the F1 score, recall and precision of a classifier on ``(X, y_true)``.

    Args:
        clf: A fitted model exposing ``predict``. For a Keras ``Sequential``
            model the predictions are thresholded at 0.5 to obtain 0/1 labels;
            for any other model ``predict`` is assumed to return labels directly.
        X: Features passed to ``clf.predict``.
        y_true: Ground-truth labels compared against the predictions.

    Returns:
        None. The scores are printed.
    """
    # Imported here because the notebook's import cell does not bring these names
    # into scope; without this the function raised NameError on first use.
    from sklearn.metrics import f1_score, precision_score, recall_score

    # Gets predicted labels
    if isinstance(clf, keras.models.Sequential): # If the model is a Keras neural network
        y_pred = (clf.predict(X) >= 0.5).astype(int)
    else: # Normal scikit-learn model
        y_pred = clf.predict(X)

    # Gets key performance indicators
    recall = round(recall_score(y_true, y_pred), 4)
    precision = round(precision_score(y_true, y_pred), 4)
    f1 = round(f1_score(y_true, y_pred), 4)

    # Displays them
    print(f"F1 = {f1:.4f} | Recall = {recall* 100:.2f}% | Precision = {precision*100:.2f}%")

In [None]:
# Plots the performance of the neural network
def plot_performance(training_values, validation_values, metric_name = "Recall"):
    """Plot a per-epoch training curve against its validation curve.

    Args:
        training_values: Sequence with one metric value per epoch (training).
        validation_values: Sequence with one metric value per epoch (validation);
            plotted against the same epoch axis as ``training_values``.
        metric_name: Name used in the title, y-axis label and legend entries.

    Returns:
        None. The figure is displayed.
    """
    # Epochs are numbered from 1 for display
    epochs = range(1, len(training_values) + 1)

    sns.set()
    plt.plot(epochs, training_values, '-', label=f'Training {metric_name}')
    plt.plot(epochs, validation_values, ':', label=f'Validation {metric_name}')

    plt.title(f'Training and Validation {metric_name}')
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend(loc='lower right')

    # Render the figure. The original called plt.plot() with no arguments, which draws nothing.
    plt.show()

In [None]:
# Takes a time series dataframe and returns a list of missing dates
# Assumes one day increments
def get_missing_dates(df):
    """Return the calendar days absent from ``df['Date']``.

    The expected calendar runs in one-day steps from the earliest to the
    latest value of ``df['Date']``; every day of that calendar that does not
    appear in the column is returned, in chronological order.
    """
    # Set membership keeps each look-up cheap
    present = set(df['Date'])

    # Every day between the first and the last recorded date, inclusive
    calendar = pd.date_range(start=df['Date'].min(), end=df['Date'].max(), freq='D')

    gaps = []
    for day in calendar:
        if day in present:
            continue
        gaps.append(day)
    return gaps

In [None]:
def print_univariates_metric(data, nameToPrint=None):
    """Print descriptive statistics of a pandas Series on two lines.

    The first line is a header containing ``nameToPrint``; the second lists the
    mean, median, mode, min, max, standard deviation and the 25th / 75th
    percentiles. When several values tie for the mode only the first one
    returned by ``Series.mode`` is shown; ``None`` is shown when there is none.
    """
    # Series.mode returns every tied value; keep the first, or None if empty
    modes = data.mode()
    first_mode = modes.iloc[0] if len(modes) > 0 else None

    print(f"Descriptives for {nameToPrint}")

    stats = [
        f"Mean = {round(data.mean(),2)}",
        f"Median = {round(data.median(),2)}",
        f"Mode = {first_mode}",
        f"Min = {data.min()}",
        f"Max = {data.max()}",
        f"SD = {round(data.std(),2)}",
        f"IQR(25) = {data.quantile(0.25)}",
        f"IQR(75) = {data.quantile(0.75)}",
    ]
    print(" | ".join(stats))

In [None]:
# Plots the results of a time series.
# model is the neural network
# generator_to_evaluate is either the generator trained on your training or testing set
# scaler is the min-max scaler trained in §2.
# date_index is the date index of the data the generator was built from
def plot_time_series_results(model, generator_to_evaluate, scaler, date_index):
    """Plot a model's predictions against the actual values and return both.

    Args:
        model: Object exposing ``predict(x_batch, verbose=0)``.
        generator_to_evaluate: Indexable batch generator yielding
            ``(x_batch, y_batch)`` pairs (the training or the testing generator).
        scaler: Fitted scaler whose ``inverse_transform`` maps scaled values
            back to the original units.
        date_index: Index of the data the generator was built from. The first
            ``window`` entries have no prediction and are dropped.

    Returns:
        pandas.DataFrame with the de-scaled columns ``Actual`` and
        ``Predicted``, indexed by ``date_index[window:]``.
    """

    # Returns pandas dataframe that contains (i) actual value (temperature) and (ii) predicted value. Both descaled.
    def get_results_df():

        # Creates list to store (i) predicted and (ii) actual values
        all_predictions = []
        all_actuals = []

        # Batches are fetched by position so exactly len(generator) batches are read
        for i in range(len(generator_to_evaluate)):
            # Gets a batch of (i) X features and (ii) the target they're trying to predict
            x_batch, y_batch = generator_to_evaluate[i]

            # Make predictions on the current batch
            batch_predictions = model.predict(x_batch, verbose=0)

            # Extend our lists with the current batch's predictions and actuals
            # Flatten them if they are in shape (batch_size, 1) to (batch_size,)
            all_predictions.extend(batch_predictions.flatten())
            all_actuals.extend(y_batch.flatten())

        # Convert lists to NumPy arrays for easier manipulation
        all_predictions = np.array(all_predictions).reshape(-1, 1) # Reshape back to (n_samples, 1) for inverse_transform
        all_actuals = np.array(all_actuals).reshape(-1, 1)

        # Descales predictions (via the scaler) so they're intelligible again (i.e., not approximately 0-1)
        all_predictions = scaler.inverse_transform(all_predictions)
        all_actuals = scaler.inverse_transform(all_actuals)

        # Number of leading dates without a prediction. Prefer the generator's own
        # window length (TimeseriesGenerator stores it as `.length`); fall back to
        # the notebook-level `n_input` for generators that do not expose it.
        window = getattr(generator_to_evaluate, "length", None)
        if window is None:
            window = n_input

        # Create a DataFrame for easy viewing
        results_df = pd.DataFrame({'Actual': all_actuals.flatten(), 'Predicted': all_predictions.flatten()},
                                  index=date_index[window:])
        return results_df

    # Plots the results df. Takes the results_df returned in the above subfunction.
    def plot_results_df(results_df):

        # Defines plot size
        plt.figure(figsize=(7, 5))

        # Plots vals. The labels no longer say "(Test Set)": this function is also
        # called with the training generator, where that label was wrong.
        plt.plot(results_df['Actual'], label='Actual Values', color='blue', linewidth=0.5, marker='o', markersize=1)
        plt.plot(results_df['Predicted'], label='Predicted Values', color='red', linestyle='--', linewidth=0.5, marker='o', markersize=1)

        # Labels
        plt.title('LSTM Model Predictions vs. Actuals')
        plt.xlabel('Date')
        plt.ylabel('Temperature (Celsius)')

        # Other
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    # Main functions
    results_df = get_results_df()
    plot_results_df(results_df)
    return results_df

### 1.3 | Loads & Inspects Data

In [None]:
# Loads the temperature dataset into a pandas DataFrame
df = pd.read_csv('res/final_project/temperature.csv')

# Convert 'Date' column to datetime objects with the correct format
# (month/day/two-digit-year, e.g. 01/31/81)
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y')

# Convert 'Temp' column to numeric, coercing errors to NaN
# (unparseable entries become NaN and are imputed in §1.4)
df['Temp'] = pd.to_numeric(df['Temp'], errors='coerce')

df.head(5)

### 1.4 | Imputes Missing Dates

In [None]:
# Stores imputed values (date -> inferred temperature)
imputed_dates_dict = {}

# Gets (i) dates missing from Pandas and (ii) dates with a null/no temperature
missing_dates = get_missing_dates(df)
null_dates = df[df['Temp'].isnull()]["Date"].tolist()

# Temperature look-up by date (first row wins if a date is duplicated).
# Using .get() below means a neighbouring day that is itself absent (two missing
# days in a row, or a gap at the very start/end of the data) yields NaN instead
# of raising IndexError.
temp_by_date = df.drop_duplicates(subset="Date").set_index("Date")["Temp"]
one_day = pd.Timedelta(days=1)

for ind_date in (missing_dates + null_dates):

    # Gets temp before & after the missing date
    date_before, date_after = ind_date - one_day, ind_date + one_day
    temp_before = temp_by_date.get(date_before, np.nan)
    temp_after = temp_by_date.get(date_after, np.nan)

    # Calculates average temp; falls back to whichever neighbour is available
    if np.isnan(temp_before) and np.isnan(temp_after):
        # No usable neighbour: the value stays NaN and is caught by the null check in the next cells
        average_temp = np.nan
    elif np.isnan(temp_before):
        average_temp = round(temp_after, 1)
    elif np.isnan(temp_after):
        average_temp = round(temp_before, 1)
    else:
        average_temp = round((temp_before + temp_after) / 2, 1)

    # Displays the imputation process to the user
    print(f"The following date [{ind_date}] has a missing temperature.")
    print(f"--> The temperature the day before [{date_before}] was [{temp_before}]")
    print(f"--> The temperature the day after [{date_after}] was [{temp_after}]")
    print(f"--> Thus, we'll infer the temperature of that date is the average of the two: {average_temp}\n")

    # Stores imputation
    imputed_dates_dict[ind_date] = average_temp

In [None]:
# Removes the rows whose temperature was null; they are re-added below with imputed values
has_null_temp = df['Date'].isin(null_dates)
df = df[~has_null_temp]

# Turns the {date: temperature} imputations into rows and appends them to our pandas df
imputed_df = pd.DataFrame(imputed_dates_dict.items(), columns=['Date', 'Temp'])
df = pd.concat([df, imputed_df], ignore_index=True)

# Restores chronological order and renumbers the rows
df = df.sort_values(by='Date')
df = df.reset_index(drop=True)

In [None]:
# Ensures all missing dates are imputed
remaining_gaps = get_missing_dates(df)
assert len(remaining_gaps) == 0
print("Congratulations! There are no more missing dates!")

# Ensures no row is left without a temperature
rows_without_temp = df[df['Temp'].isnull()]
assert len(rows_without_temp) == 0
print("Congratulations! There are no more null dates!")

### 1.5 | Visualizes Dataset

In [None]:
# Uses the date as the index; the plots, decomposition and train/test split below all work off df.index
df = df.set_index('Date')

In [None]:
# Plots temperature against date
df.plot(figsize=(20, 6), linewidth=0.25)

In [None]:
# Same as above, except it plots seasonality, trends, and noise as separate panels.
# Noise is defined as time series datapoint - trend - seasonality
seasonal_decompose(df['Temp']).plot()

# 2 | Preprocessing

### 2.1 | Scales Temperature

In [None]:
# Min-max scaler; kept in a variable because later cells call scaler.inverse_transform
# to turn scaled values back into degrees.
scaler = MinMaxScaler()


# Prints descriptives before and after scaling so the effect is visible
print_univariates_metric(df["Temp"], "Temperature before scaling")
# NOTE(review): the scaler is fit on the whole series, i.e. before the train/test split in §2.2,
# so the test period's min/max influence the scaling. Consider fitting on the training rows only.
df["Temp"] = scaler.fit_transform(df[["Temp"]])
print_univariates_metric(df["Temp"], "Temperature after scaling")

### 2.2 | Train/Test Split

In [None]:
# Holds out the final 365 days as the test set; everything up to and
# including `start_date` is used for training.
end_date = df.index.max()
start_date = end_date - pd.Timedelta(days=365)

# Train/Test split via boolean masks over the date index
in_train_period = df.index <= start_date
in_test_period = df.index > start_date
train = df.loc[in_train_period]
test = df.loc[in_test_period]

# Output
for set_name, subset in (("Training", train), ("Testing", test)):
    print(f"{set_name} set size = {subset.shape[0]} days")

### 2.3 | Converts our data to a format conducive to LSTMs

LSTMs take a sequence as input. In other words, we feed the network the temperatures of X consecutive days, and it predicts the temperature of day X+1. To feed the LSTM such sequences, we use Keras's "TimeseriesGenerator" to convert our data into (sequence, next value) pairs.

In [None]:

# Creates the training & testing set.
# Each sample is a window of `n_input` consecutive (scaled) temperatures;
# its target is the temperature that follows the window.
n_input = 30

train_temps = train['Temp'].values
test_temps = test['Temp'].values

generator_train = TimeseriesGenerator(
    data=train_temps.reshape(-1, 1),
    targets=train_temps,
    length=n_input,
    batch_size=32,
)
generator_test = TimeseriesGenerator(
    data=test_temps.reshape(-1, 1),
    targets=test_temps,
    length=n_input,
    batch_size=32,
)

In [None]:
# Displays the raw input & output
# Takes the first batch of the training generator and shows its first sample:
# X[0] is one input window, y[0] is the value the network must predict from it.
X, y = generator_train[0]

print("Here is how the neural network will work.\n")

print(f'Given the Array: \n{X[0]}')
print(f'Predict this y: \n {y[0]}')

# Same sample, mapped back to the original units with the scaler fitted in §2.1
print("\nKeep in mind these values are SCALED. Here's what the unscaled looks like")
print("Here's the X: ")
print(scaler.inverse_transform( X[0] ) )
print("Here's the y (next day's temperature):")
# inverse_transform expects a 2-D array, hence the reshape of the single target value
print(scaler.inverse_transform( y[0].reshape(-1, 1) ))

# 3 | LSTM

### 3.0 | Section Overview

You will create a neural network that uses LSTM cells in order to handle this time series data (§3.1) before plotting the output (§3.2).

Here are a few parameters that may help you when creating an LSTM layer:
<ul>
  <li> <strong>activation.</strong> This is the activation function that's responsible for creating the new memory. This activation function occurs twice per cell: in the input gate (long term memory) and the output gate (short term memory). </li>
  <li> <strong>recurrent activation.</strong> This is the activation function that's responsible for the percentage of memory to remember. This activation function occurs thrice per cell: in the forget gate (updating the long term memory), in the input gate (long term memory) and the output gate (short term memory). </li>
    <li> <strong>return sequences.</strong> By default, this is False, which means an LSTM layer outputs only its result for the FINAL time step of the input sequence. That is fine when the next layer is a Dense layer, but a second LSTM layer needs a whole sequence as its input (one value per time step). Thus, when you stack multiple LSTM layers, set return_sequences = True on every LSTM layer until you get to the final LSTM layer. </li>
</ul>

Here is an LSTM cell that may help you better understand how the activation functions interact with the cell.

<div style="text-align: center;"> <img src = "res/model_building/lstms_lstm_cell.jpg" width="40%"/> </div>

<strong>The loss (error) on the testing set will be the primary means by which we evaluate your model.</strong>

Do not modify the loss function. You must use mean squared error (MSE).

### 3.1 | LSTM: Construct & Train

In [None]:
# Creates model
your_lstm_neural_network = Sequential()

# Input Layer
# Each sample is a window of n_input time steps with 1 feature (the scaled temperature)
your_lstm_neural_network.add( Input( shape= (n_input,1) ) )

# Hidden (LSTM) Layers
# Remember: you must specify the parameter "return_sequences = True" for all LSTM layers except the final LSTM layer
# The first argument is the number of units in the layer (1 here). This is the starting point you are asked to improve.
your_lstm_neural_network.add(LSTM(1, activation="tanh", recurrent_activation="sigmoid", return_sequences = True)) 
your_lstm_neural_network.add(LSTM(1, activation="tanh", recurrent_activation="sigmoid",)) 

# Output Layer
your_lstm_neural_network.add(Dense(1, activation = "linear", )) # Linear activation b/c we're predicting temperature (c.f., probability)

# Compiles model
# Loss must stay mean squared error (see §3.0); the 'mse' metric reports the same quantity alongside it
your_lstm_neural_network.compile(loss='mse', optimizer=Adam(learning_rate=.001), 
             metrics=[metrics.MeanSquaredError(name='mse'),])
your_lstm_neural_network.summary()

In [None]:
# Unlike previous exercises, you're free to modify the epochs.
# Just be careful. Too many epochs might cost you a lot of time and not produce spectacular results.
# Note: validation_data is the TEST generator, so the 'val_loss' recorded in `hist` is the testing loss.

hist = your_lstm_neural_network.fit(generator_train, validation_data = generator_test, epochs=20)

### 3.2 | Results 

In [None]:
# Losses after the final epoch (last entry of each per-epoch history list), rounded to 8 decimals
print(f"Training Loss = {round (hist.history['loss'][-1], 8)}")
print(f"Testing Loss  = {round(hist.history['val_loss'][-1], 8)}")

In [None]:
# Per-epoch training and testing loss, plotted on the same axes
loss, val_loss = hist.history["loss"], hist.history["val_loss"]
plot_performance(loss, val_loss, "Loss")

In [None]:
# Training set: predicted vs. actual temperatures (the returned DataFrame is discarded)
_ = plot_time_series_results(your_lstm_neural_network, generator_train, scaler, train.index)

In [None]:
# Testing set: predicted vs. actual temperatures (the returned DataFrame is discarded)
_ = plot_time_series_results(your_lstm_neural_network, generator_test, scaler, test.index)

# 4 | Diagram Your Final LSTM Neural Network

You can use the same tool as a feedforward neural network to diagram an LSTM neural network: https://alexlenail.me/NN-SVG/index.html

When presenting your diagram, just be sure to explicitly identify any/all LSTM layers you used (as opposed to "normal" feedforward hidden layers).