# Creating a Model 

In [4]:
import importlib, os, utils, urllib
from datetime import datetime, timedelta
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import pandas as pd
import numpy as np

In [84]:
importlib.reload(utils)
all_zones_df = utils.clean_raw_webscraper_data('output_data/All_Zones_2021-22_Season_reports_data.csv')
all_zones_df = utils.add_weather_to_reports(all_zones_df)
all_zones_df.to_csv('output_data/All_Zones_2021-22_Season_weather_and_reports_data.csv', index=False)

In [5]:
all_zones_df = pd.read_csv('output_data/all_zones_all_data.csv')

data = all_zones_df.drop(columns=['bottom_line_text', 'problem_type_text', 'forecast_discussion_text'])

data['combined_text'] = data['combined_text'].astype(str)
data['combined_text'] = data['combined_text'].apply(lambda x: word_tokenize(x.lower()))
data.rename(columns={'combined_text': 'tokens'}, inplace=True)

In [6]:
# Training the word to vec model:
model = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=1, workers=4)
model.save('models/first_word2vec.model')

# Create word embeddings lookup dictionary
word_embeddings = {}

for word in model.wv.index_to_key:
    word_embeddings[word] = model.wv[word]

word = 'storm'
model.wv.most_similar(word)

[('since', 0.8743607997894287),
 ('in', 0.8464764952659607),
 ('total', 0.8372876048088074),
 ('prior', 0.8291876912117004),
 ('backed', 0.8236643075942993),
 ('associated', 0.8191325068473816),
 ('received', 0.8190154433250427),
 ('blew', 0.8183265328407288),
 ('brought', 0.8003273010253906),
 ('wrapped', 0.7978445887565613)]

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from bokeh.plotting import figure, show, output_file

# Load the dataframe
# df = pd.read_csv('your_dataframe.csv')
df = data

# Select the features you want to use for training
features = ['tavg', 'tmin', 'tmax', 'prcp', 'wdir', 'pres', 'tsun']

# Preprocess the data
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])  # Normalize the numerical features

# Convert text tokens to numerical sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['tokens'])

sequences = tokenizer.texts_to_sequences(df['tokens'])
max_seq_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_seq_length)

tokenized_text = [str(tokens).split() for tokens in df['tokens']]
# Train Word2Vec embeddings
model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Save the trained model
model.save('models/first_word2vec.model')

# Load pre-trained word embeddings (Word2Vec or GloVe)
word_embeddings = Word2Vec.load('models/first_word2vec.model').wv

# Create an embedding matrix
embedding_dim = word_embeddings.vector_size
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word_embeddings.key_to_index:
        embedding_matrix[i] = word_embeddings[word]

# Prepare the input and output data
X = sequences
y = df['overall_risk'].values

# Define the LSTM model
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_seq_length, trainable=False))
model.add(LSTM(64))
model.add(Dense(1, activation='linear'))

# Compile and train the model
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X, y, epochs=10, batch_size=32)

# Predict for the next 7 days
last_sequence = sequences[-1]  # Get the last sequence from the input data

predictions = []
for _ in range(7):
    next_sequence = np.concatenate([last_sequence[1:], np.zeros(1)])  # Shift the sequence by one position
    next_sequence = next_sequence.reshape(1, -1)  # Reshape to match the input shape
    prediction = model.predict(next_sequence)[0][0]  # Make the prediction
    predictions.append(prediction)

    # Update the last sequence with the predicted value
    last_sequence[-1] = prediction
    last_sequence = last_sequence.reshape(1, -1)

# Print the predictions for the next 7 days
print("Predictions for the next 7 days:")
for i, prediction in enumerate(predictions, 1):
    print(f"Day {i}: {prediction}")

# Chart the predictions
days = range(1, 8)
risk_levels = predictions

p = figure(title="Risk Level Predictions for Next 7 Days", x_axis_label="Day", y_axis_label="Risk")




2023-05-09 00:08:34.059120: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-09 00:08:34.064534: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-09 00:08:34.068463: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/10


2023-05-09 00:08:34.831501: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-09 00:08:34.835589: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-09 00:08:34.840904: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


2023-05-09 00:09:53.295316: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-09 00:09:53.299922: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-09 00:09:53.303582: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)

In [8]:
# Convert text data to sequences of word embeddings
max_sequence_length = max(data['tokens'].apply(len))
embedding_dimension = 100

def convert_text_to_embeddings(text):
    embeddings = []
    
    for word in text:
        if word in word_embeddings:
            embeddings.append(word_embeddings[word])
        else:
            embeddings.append([0.0] * embedding_dimension)
    
    return embeddings

sequences = data['tokens'].apply(convert_text_to_embeddings)

# Update the padding step
padded_sequences = pad_sequences(sequences.apply(lambda x: [elem[:embedding_dimension] for elem in x]),
                                maxlen=max_sequence_length, padding='post')

# padded_sequences = padded_sequences.reshape(padded_sequences.shape[0], padded_sequences.shape[1], embedding_dimension)

In [15]:
# data = data.drop(columns=['tokens'])
shape = data.shape
shape

(1180, 13)

In [20]:
# Define LSTM model
num_lstm_units = 64
num_classes = 5

model = Sequential()
# model.add(Embedding(input_dim=len(word_embeddings), output_dim=embedding_dimension,
#                     input_length=max_sequence_length, trainable=False))
model.add(LSTM(units=num_lstm_units))
model.add(Dense(units=num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [22]:
data.head()

Unnamed: 0,date,zone,overall_risk,above_treeline_risk,near_treeline_risk,below_treeline_risk,tavg,tmin,tmax,prcp,wdir,pres,tsun
0,2022-04-23,east slopes north,3.0,3.0,3.0,2.0,7.6,-0.9,15.9,0.0,225.3,1024.0,0.0
1,2022-04-23,west slopes central,3.0,3.0,3.0,2.0,9.9,4.6,15.4,0.0,242.4,1026.2,0.0
2,2022-04-23,west slopes south,3.0,3.0,3.0,2.0,9.9,4.4,16.3,0.0,279.4,1026.0,0.0
3,2022-04-23,east slopes central,2.0,2.0,2.0,2.0,10.6,5.3,16.3,0.0,282.1,1025.4,0.0
4,2022-04-23,snoqualmie pass,3.0,3.0,3.0,2.0,10.2,4.8,16.3,0.0,274.1,1025.8,0.0


In [21]:

# Split data into training and validation sets
train_size = int(len(data) * 0.8)

X_train = padded_sequences[:train_size]
y_train = data['overall_risk'].values[:train_size]
X_val = padded_sequences[train_size:]
y_val = data['overall_risk'].values[train_size:]

X_train = X_train.astype(np.int32)
X_val = X_val.astype(np.int32)

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

Epoch 1/10


TypeError: in user code:

    File "/home/jaymin/Desktop/Spring 2023/CPSC325/SnowPackPrediction/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/home/jaymin/Desktop/Spring 2023/CPSC325/SnowPackPrediction/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/jaymin/Desktop/Spring 2023/CPSC325/SnowPackPrediction/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "/home/jaymin/Desktop/Spring 2023/CPSC325/SnowPackPrediction/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "/home/jaymin/Desktop/Spring 2023/CPSC325/SnowPackPrediction/.venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/jaymin/Desktop/Spring 2023/CPSC325/SnowPackPrediction/.venv/lib/python3.10/site-packages/keras/backend.py", line 2464, in dot
        out = tf.matmul(x, y)

    TypeError: Exception encountered when calling layer 'lstm_3' (type LSTM).
    
    Input 'b' of 'MatMul' Op has type float32 that does not match type int32 of argument 'a'.
    
    Call arguments received by layer 'lstm_3' (type LSTM):
      • inputs=tf.Tensor(shape=(None, 435, 100), dtype=int32)
      • mask=None
      • training=True
      • initial_state=None


In [None]:
# Train the LSTM model


# Example prediction
new_text = ['Slabs of snow are sitting on top of a weak layer of sugary snow.']
new_tokens = word_tokenize(new_text[0].lower())
new_sequence = convert_text_to_embeddings(new_tokens)
new_padded_sequence = pad_sequences([new_sequence], maxlen=max_sequence_length, padding='post')

# prediction = model.predict(new_padded_sequence)
# predicted_label = prediction.argmax(axis=-1)[0]
# print(f"Predicted label: {predicted_label}")