<a href="https://colab.research.google.com/github/jtiggs/Crowdfunding_ETL/blob/main/p4_LSTM3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [68]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as test_train_split
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import make_scorer, r2_score
from google.colab import files
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

In [5]:
# Open the Colab upload dialog. The file chosen here must be
# `processed_storm_data.csv`, which a later cell reads from the working directory.
# NOTE(review): `uploaded` is presumably a mapping of file name -> contents;
# it is not used again in the cells shown here.
uploaded = files.upload()

Saving processed_storm_data.csv to processed_storm_data.csv


In [69]:
# Custom callback to print R^2 value during training
class R2Callback(Callback):
    """Report R^2 after every epoch and stop training once it stops improving.

    At the end of each epoch the model predicts on the data handed to the
    constructor, the R^2 score of those predictions is printed, and training
    is halted when the score has not exceeded its best value for ``patience``
    consecutive epochs.

    Parameters:
        X_train: Inputs passed to ``model.predict`` at the end of each epoch.
        y_train: True targets compared against those predictions.
        patience (int): Number of consecutive non-improving epochs tolerated
            before training is stopped.
    """

    def __init__(self, X_train, y_train, patience=5):
        super().__init__()
        self.X_train = X_train
        self.y_train = y_train
        self.patience = patience
        self.best_r2 = -float('inf')  # Best R^2 score seen so far
        self.wait = 0  # Epochs elapsed since the last improvement

    def on_train_begin(self, logs=None):
        # Start every fit() from a clean slate so an instance that is reused
        # (e.g. when the training cell is re-run) does not inherit the best
        # score or patience counter of the previous run.
        self.best_r2 = -float('inf')
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        # verbose=0 keeps predict()'s own progress bar out of the training log.
        y_pred = self.model.predict(self.X_train, verbose=0)
        r2 = r2_score(self.y_train, y_pred)
        print(f"Epoch {epoch+1}, R^2: {r2}")

        if r2 > self.best_r2:
            # New best score: remember it and restart the patience window.
            self.best_r2 = r2
            self.wait = 0
            return

        self.wait += 1
        if self.wait >= self.patience:
            print(f"Stopping training as R^2 score hasn't improved for {self.patience} epochs.")
            self.model.stop_training = True


In [None]:
def calculate_r_squared(y_true, y_pred):
    """
    Calculate the R^2 score (coefficient of determination).

    Both inputs are converted to float arrays and flattened before they are
    compared, so a column vector of shape (n, 1) and a flat vector of shape
    (n,) are paired element by element instead of being broadcast into an
    (n, n) matrix. For multi-column inputs the score is computed over all
    elements together, using a single global mean.

    Parameters:
        y_true (array-like): The true values.
        y_pred (array-like): The predicted values; must contain the same
            number of elements as ``y_true``.

    Returns:
        float: R^2 score. When ``y_true`` is constant (zero total variance)
        the score is 1.0 for a perfect prediction and 0.0 otherwise, rather
        than the result of a division by zero.

    Raises:
        ValueError: If the inputs are empty or differ in number of elements.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if y_true.size == 0:
        raise ValueError("y_true and y_pred must not be empty")
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got shapes {y_true.shape} and {y_pred.shape}"
        )

    # Flatten so that differing-but-compatible shapes cannot trigger broadcasting.
    y_true = y_true.ravel()
    y_pred = y_pred.ravel()

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)

    if ss_tot == 0:
        # Constant target: the usual ratio is undefined.
        return 1.0 if ss_res == 0 else 0.0
    return float(1 - ss_res / ss_tot)

In [40]:
# Load the pre-processed storm events from the CSV uploaded above.
# The preview shows two leftover index columns ('Unnamed: 0.1', 'Unnamed: 0');
# they are discarded by the column selection in the next cell.
df=pd.read_csv('processed_storm_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,BEGIN_DATE_TIME,DAMAGE_PROPERTY,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,DURATION_SEC,REGION_NORTHEAST,REGION_NORTHERN ROCKIES AND PLAINS,...,EVENT_TYPE_Strong Wind,EVENT_TYPE_Thunderstorm Wind,EVENT_TYPE_Tornado,EVENT_TYPE_Tropical Depression,EVENT_TYPE_Tropical Storm,EVENT_TYPE_Tsunami,EVENT_TYPE_Waterspout,EVENT_TYPE_Wildfire,EVENT_TYPE_Winter Storm,EVENT_TYPE_Winter Weather
0,0,1994-03-27 11:32:00,5000000.0,34.43,-85.98,34.47,-85.78,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1,1994-05-15 19:30:00,0.0,,,,,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2,1994-06-26 22:20:00,500000.0,,,,,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3,1994-05-15 13:47:00,0.0,,,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1994-03-27 15:50:00,0.0,,,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Keep only the event timestamp, the one-hot region indicators and the target
# (DAMAGE_PROPERTY), then order the rows chronologically. The sort is chained
# rather than done with inplace=True so the freshly sliced frame is never
# mutated in place.
df = (
    df[['BEGIN_DATE_TIME', 'REGION_NORTHEAST',
        'REGION_NORTHERN ROCKIES AND PLAINS', 'REGION_NORTHWEST',
        'REGION_OHIO VALLEY', 'REGION_SOUTH', 'REGION_SOUTHEAST',
        'REGION_SOUTHWEST', 'REGION_UPPER MIDWEST', 'REGION_WEST', 'DAMAGE_PROPERTY']]
    .sort_values(by=['BEGIN_DATE_TIME'])
)
df.head()


Unnamed: 0,BEGIN_DATE_TIME,REGION_NORTHEAST,REGION_NORTHERN ROCKIES AND PLAINS,REGION_NORTHWEST,REGION_OHIO VALLEY,REGION_SOUTH,REGION_SOUTHEAST,REGION_SOUTHWEST,REGION_UPPER MIDWEST,REGION_WEST,DAMAGE_PROPERTY
12482,1994-01-03 12:50:00,0,0,0,0,0,1,0,0,0,500.0
14103,1994-01-03 14:45:00,0,0,0,0,0,1,0,0,0,500000.0
14104,1994-01-03 14:45:00,0,0,0,0,0,1,0,0,0,5000.0
15018,1994-01-03 17:05:00,0,0,0,0,0,1,0,0,0,5000.0
15133,1994-01-03 17:08:00,0,0,0,0,0,1,0,0,0,5000.0


In [48]:
# Collapse every event timestamp to the first instant of its calendar month and
# store it as seconds since the Unix epoch (float), so the column can be scaled
# together with the other numeric features.
#
# The dtype guard makes the cell safe to re-run: once the column is numeric it
# is left alone. Without it, a second run would reinterpret the epoch seconds as
# nanoseconds and silently collapse every row to 1970-01.
if not pd.api.types.is_numeric_dtype(df['BEGIN_DATE_TIME']):
    month_start = (
        pd.to_datetime(df['BEGIN_DATE_TIME'])
        .dt.to_period('M')      # truncate to calendar month
        .dt.to_timestamp()      # back to a timestamp at the start of that month
    )
    # Vectorized replacement for mapping pd.Timestamp.timestamp over each row.
    epoch_seconds = (month_start - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
    df = df.assign(BEGIN_DATE_TIME=epoch_seconds)
df.head()

Unnamed: 0,BEGIN_DATE_TIME,REGION_NORTHEAST,REGION_NORTHERN ROCKIES AND PLAINS,REGION_NORTHWEST,REGION_OHIO VALLEY,REGION_SOUTH,REGION_SOUTHEAST,REGION_SOUTHWEST,REGION_UPPER MIDWEST,REGION_WEST,DAMAGE_PROPERTY
12482,757382400.0,0,0,0,0,0,1,0,0,0,500.0
14103,757382400.0,0,0,0,0,0,1,0,0,0,500000.0
14104,757382400.0,0,0,0,0,0,1,0,0,0,5000.0
15018,757382400.0,0,0,0,0,0,1,0,0,0,5000.0
15133,757382400.0,0,0,0,0,0,1,0,0,0,5000.0


In [60]:
# Drop any Keras state left over from a previous run of this notebook.
# `model` is not defined until a later cell, so calling `model.reset_states()`
# here raises NameError on a fresh kernel; clearing the backend session works
# whether or not a model already exists.
tf.keras.backend.clear_session()

In [61]:
# Scale every column to the [0, 1] range learned from the training rows only.
# The rows are already in chronological order and the split below keeps the
# first 80% for training, so the scaler is fit on exactly that prefix. Fitting
# on the full frame would leak the min/max of the held-out test rows into
# training. Values in the test rows may therefore fall outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
n_fit_rows = int(len(df) * 0.8)  # keep in sync with the train fraction used for the split
scaler.fit(df.iloc[:n_fit_rows])
scaled_df = pd.DataFrame(scaler.transform(df), columns=df.columns)


In [62]:
# Chronological hold-out: the first 80% of the rows form the training
# partition and the remaining rows form the test partition (no shuffling).
train_size = int(0.8 * len(scaled_df))
test_size = len(scaled_df) - train_size
train = scaled_df.iloc[:train_size]
test = scaled_df.iloc[train_size:]

In [63]:
def create_sequences(X, y, time_steps=1):
    """
    Build sliding-window samples for sequence models.

    Window ``i`` consists of rows ``i .. i + time_steps - 1`` of ``X`` and is
    paired with the target found at row ``i + time_steps`` of ``y`` (the row
    immediately after the window).

    Parameters:
        X (DataFrame or array-like): Feature rows, one row per observation.
        y (DataFrame, Series or array-like): Targets aligned row-by-row with X.
        time_steps (int): Number of consecutive rows in each window.

    Returns:
        tuple of np.ndarray: ``(Xs, ys)`` holding ``len(X) - time_steps``
        samples each. ``Xs`` stacks the windows; ``ys`` stacks the matching
        targets. Both are empty when ``X`` has no more than ``time_steps`` rows.
    """
    # Convert once up front: slicing NumPy arrays inside the loop is far cheaper
    # than building a new pandas object with .iloc on every iteration.
    features = np.asarray(X)
    targets = np.asarray(y)

    n_windows = len(features) - time_steps
    Xs = [features[start:start + time_steps] for start in range(n_windows)]
    ys = [targets[start + time_steps] for start in range(n_windows)]
    return np.array(Xs), np.array(ys)

In [64]:
# Preview the first rows of the training partition after scaling.
train.head()

Unnamed: 0,BEGIN_DATE_TIME,REGION_NORTHEAST,REGION_NORTHERN ROCKIES AND PLAINS,REGION_NORTHWEST,REGION_OHIO VALLEY,REGION_SOUTH,REGION_SOUTHEAST,REGION_SOUTHWEST,REGION_UPPER MIDWEST,REGION_WEST,DAMAGE_PROPERTY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.793296e-08
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.793296e-05
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.793296e-07
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.793296e-07
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.793296e-07


In [65]:
# Number of consecutive rows fed to the network for each prediction.
time_steps = 4

# Turn each partition into (window, next-row target) pairs. Features are every
# column except the target; the target is kept as a one-column frame.
(X_train, y_train), (X_test, y_test) = (
    create_sequences(
        part.drop(columns='DAMAGE_PROPERTY'),
        part[['DAMAGE_PROPERTY']],
        time_steps,
    )
    for part in (train, test)
)

In [66]:
# Three stacked LSTM layers followed by a single-unit regression head.
# The first two LSTMs return the full sequence so the next LSTM receives one
# vector per time step. The last LSTM returns only its final state, so the
# network emits one value per sample and matches the targets built above.
# With return_sequences=True on the last LSTM the output would instead carry
# one value per time step.
model = Sequential([
    LSTM(
        units=128,
        activation='relu',
        input_shape=(X_train.shape[1], X_train.shape[2]),  # (time_steps, n_features)
        return_sequences=True,
    ),
    LSTM(units=64, return_sequences=True),
    LSTM(units=32),
    Dense(units=1),
])

In [67]:
# Train with mean squared error as the loss and Adam as the optimizer.
model.compile(
    loss='mean_squared_error',
    optimizer=Adam(learning_rate=0.001),
)

In [70]:
# Two independent stopping criteria, each with a patience of 5 epochs:
#  - EarlyStopping watches the validation loss and restores the best weights;
#  - R2Callback watches R^2 computed on the arrays it is given.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
r2_callback = R2Callback(X_train, y_train, patience=5)

In [None]:
# Fit for at most 10 epochs, holding out 10% of the training samples for
# validation; either callback may end training earlier.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
    callbacks=[r2_callback, early_stopping],
)

Epoch 1/10
Epoch 1, R^2: -0.0011896380913614113
Epoch 2/10
Epoch 2, R^2: -1.424045584652589e-05
Epoch 3/10
Epoch 3, R^2: -3.410167590400803e-05
Epoch 4/10
  571/29864 [..............................] - ETA: 6:55 - loss: 9.8344e-07

In [None]:
# Score the trained model on the held-out test sequences.
r_squared = calculate_r_squared(
    y_true=y_test,
    y_pred=model.predict(X_test),
)
print(f"R^2: {r_squared}")