In [4]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dropout, Dense, CuDNNLSTM, Activation, Masking, LSTM, Conv1D
from keras.optimizers import Adam
from keras.utils import to_categorical
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Data setup

In [166]:
# Load the Denver weather series and separate the feature columns from the
# target column.
series_data = pd.read_csv("../data/denver_series_data.csv")
X_variables = [
    "humidity_Denver",
    "pressure_Denver",
    "temperature_Denver",
    "wind_direction_Denver",
    "wind_speed_Denver",
]
y_variables = ["weather_description_Denver"]

# Take explicit copies: later cells edit these frames in place, and doing that
# on a bare column selection of `series_data` is what raises pandas'
# SettingWithCopyWarning.
X_data = series_data[X_variables].copy()
y_data = series_data[y_variables].copy()

# Prediction horizon: how many consecutive target rows each sample covers.
number_of_hours_to_predict = 1

## Normalize Data

In [5]:
# Fit the min/max scaling only on rows whose temperature is non-zero
# (presumably a zero temperature marks a missing reading -- confirm).
has_temperature = X_data["temperature_Denver"] != 0
non_zero_X_data = X_data.loc[has_temperature]
min_X_data = non_zero_X_data.min()
max_X_data = non_zero_X_data.max()

# Scale every row (including the excluded ones) with the fitted min/max,
# append the label column, then clip: anything that lands below the fitted
# minimum becomes 0.
feature_range = max_X_data - min_X_data
scaled_features = (X_data - min_X_data) / feature_range
normalized_X_data = pd.concat([scaled_features, y_data], axis=1).clip(lower=0)

NameError: name 'X_data' is not defined

# Merge similar weather types

In [168]:
def merge_similar(data, similarities, new_value, column="weather_description_Denver"):
    """Collapse several label codes into a single code.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``. It is left unmodified.
    similarities : iterable
        Label codes that should be replaced.
    new_value
        Code written in place of every value found in ``similarities``.
    column : str, optional
        Name of the label column to rewrite.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which only ``column`` has been rewritten.
    """
    # Work on a copy so the caller's frame (possibly a selection of a larger
    # frame) is never written to -- this is what avoids SettingWithCopyWarning.
    merged = data.copy()
    matches = merged[column].isin(list(similarities))
    # Name the column explicitly: a row-only ``.loc[mask] = value`` assignment
    # would overwrite *every* column of the matching rows.
    merged.loc[matches, column] = new_value
    return merged
    
# Groups of weather-description codes that are folded into one code each.
light_rain = [3, 16, 19, 22, 23, 30, 32, 33]
mist = [13]
haze = [24, 26, 7]
snow = [9, 31]
moderate_rain = [12, 17, 18, 20, 25, 27, 28, 29, 34]

# (codes to replace, replacement code) pairs, applied in this order.
merge_plan = [
    (light_rain, 1),
    (mist, 10),
    (haze, 11),
    (snow, 15),
    (moderate_rain, 21),
]
for similar_codes, merged_code in merge_plan:
    y_data = merge_similar(y_data, similar_codes, merged_code)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Generate All Possible Sequences

In [169]:
# Build one (history window, target) pair for every row that carries a
# weather label (label != 0).
sequence_length = 24  # number of preceding rows fed to the model per sample
total_rows = series_data.shape[0]

# Convert to NumPy once instead of re-slicing DataFrames on every iteration;
# the sliced values are the same.
feature_values = normalized_X_data[X_variables + y_variables].values
target_values = y_data[y_variables].values

X_data_arrays = []
y_data_arrays = []

# NOTE(review): index labels are used as positions below, which assumes the
# frames keep their default 0..n-1 index -- confirm.
labelled_rows = series_data.loc[series_data["weather_description_Denver"] != 0].index
for index in labelled_rows:
    # Require a full history window before `index` AND a full prediction
    # horizon starting at `index`; a short target slice near the end of the
    # series would otherwise make np.array() build a ragged result.
    if index >= sequence_length and index + number_of_hours_to_predict <= total_rows:
        X_data_arrays.append(feature_values[index - sequence_length: index])
        y_data_arrays.append(target_values[index: index + number_of_hours_to_predict])

X_data_arrays = np.array(X_data_arrays)
y_data_arrays = np.array(y_data_arrays)

## Convert Y Targets to Categorical One-Hot Encoded Vectors

In [183]:
# np.squeeze returns a view of its input. The relabelling helper below rewrites
# values in place, so take a copy to keep y_data_arrays (original label codes)
# intact.
squeezed_y_data = np.squeeze(y_data_arrays, axis=1).copy()
# True labels (before re-indexing): [ 1,  2,  4,  5,  6,  8, 10, 11, 14, 15, 21]
def transform_labels_0_n(y_data):
    """Re-index the labels in column 0 to consecutive integers starting at 0.

    Each distinct value is given the next unused index, in order of first
    appearance. ``y_data`` is rewritten in place and the same object is
    returned.
    """
    index_of = {}
    for row in y_data:
        original = row[0]
        if original not in index_of:
            # The next free index equals the number of labels seen so far.
            index_of[original] = len(index_of)
        row[0] = index_of[original]
    return y_data
# transform_labels_0_n re-indexes the labels to 0..n-1 and returns the result.
# Encode that returned array explicitly instead of relying on the helper's
# in-place side effect on squeezed_y_data.
transformed_y_data = transform_labels_0_n(squeezed_y_data)
categorical_y_data = to_categorical(transformed_y_data)

## Split Data Into Training and Testing Sets

In [186]:
# Fix the shuffle seed so the train/test partition (and every metric derived
# from it) is reproducible across kernel restarts.
# NOTE(review): the samples are overlapping windows of one time series, so a
# random split puts near-duplicate windows on both sides -- consider a
# chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    X_data_arrays, categorical_y_data, test_size=0.2, random_state=42
)

In [132]:
# Weather description text -> integer code (presumably the encoding used in
# the weather_description_Denver column -- confirm against the data prep).
t = {
    'broken clouds': 2,
    'drizzle': 19,
    'dust': 26,
    'few clouds': 5,
    'fog': 13,
    'haze': 11,
    'heavy intensity drizzle': 32,
    'heavy intensity rain': 18,
    'heavy snow': 9,
    'light intensity drizzle': 16,
    'light intensity shower rain': 23,
    'light rain': 1,
    'light rain and snow': 31,
    'light snow': 14,
    'mist': 10,
    'moderate rain': 21,
    'overcast clouds': 8,
    'proximity shower rain': 3,
    'proximity thunderstorm': 12,
    'proximity thunderstorm with rain': 28,
    'ragged thunderstorm': 34,
    'scattered clouds': 4,
    'shower rain': 29,
    'sky is clear': 6,
    'smoke': 24,
    'snow': 15,
    'squalls': 7,
    'thunderstorm': 20,
    'thunderstorm with heavy drizzle': 33,
    'thunderstorm with heavy rain': 25,
    'thunderstorm with light drizzle': 30,
    'thunderstorm with light rain': 22,
    'thunderstorm with rain': 17,
    'very heavy rain': 27,
}

# Reverse lookup: integer code -> description text.
res = {code: description for description, code in t.items()}


In [189]:
# values, counts = np.unique(y_data_arrays, return_counts=True)
# for count, value in zip(counts, values):
#     if count >= 400:
#         print(count, res[value], value)
    

# Model Creation

In [228]:
def create_model2(timesteps, features, num_classes=11):
    """Build and compile the convolutional + recurrent classifier.

    Layer stack: Conv1D(128, kernel 3, stride 1) -> ReLU -> LSTM(128,
    returning sequences) -> ReLU -> LSTM(128) -> ReLU -> Dense(256) -> ReLU
    -> Dense(256) -> ReLU -> softmax over ``num_classes``.

    Parameters
    ----------
    timesteps : int
        Length of each input sequence.
    features : int
        Number of values per timestep.
    num_classes : int, optional
        Width of the softmax output layer. Defaults to 11, the value that
        was previously hard-coded.

    Returns
    -------
    A compiled ``Sequential`` model (categorical cross-entropy loss, Adam
    optimizer with rate 1e-3, accuracy metric).
    """
    model = Sequential([
        Conv1D(filters=128, kernel_size=3, strides=1, input_shape=(timesteps, features)),
        Activation('relu'),
        LSTM(128, return_sequences=True),
        Activation('relu'),
        LSTM(128),
        Activation('relu'),
        Dense(256),
        Activation('relu'),
        Dense(256),
        Activation('relu'),
        Dense(num_classes, activation='softmax', name="OutputLayer"),
    ])

    # NOTE(review): `lr` is kept to match the Keras version this notebook
    # imports; newer Keras releases name this argument `learning_rate`.
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])
    return model

# Original author's note: 0.694, starting to overfit 20 epochs
def create_model(timesteps, features, num_classes=11):
    """Build and compile the masked stacked-LSTM classifier.

    The training cell calls ``create_model(24, 6)``, so this definition must
    be live for the notebook to run from a fresh kernel.

    Layer stack: Masking(mask_value=0) -> LSTM(128, returning sequences)
    -> ReLU -> LSTM(128) -> ReLU -> Dense(256) -> ReLU -> Dense(256) -> ReLU
    -> softmax over ``num_classes``.

    Parameters
    ----------
    timesteps : int
        Length of each input sequence.
    features : int
        Number of values per timestep.
    num_classes : int, optional
        Width of the softmax output layer. Defaults to 11, the value that
        was previously hard-coded.

    Returns
    -------
    A compiled ``Sequential`` model (categorical cross-entropy loss, Adam
    optimizer with rate 1e-3, accuracy metric).
    """
    model = Sequential()
    # Timesteps whose features are all 0 are skipped by the downstream LSTMs.
    model.add(Masking(mask_value=0., input_shape=(timesteps, features)))
    model.add(LSTM(128, return_sequences=True))
    model.add(Activation('relu'))
    model.add(LSTM(128))
    model.add(Activation('relu'))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dense(256))
    model.add(Activation('relu'))
    model.add(Dense(num_classes, activation='softmax', name="OutputLayer"))

    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])
    return model

## Train Model

In [205]:
# Training hyperparameters (`epochs` is reused by the later model2.fit cell).
epochs = 20
batch_size = 64

# 24 timesteps per sample, 6 values per timestep.
model = create_model(24, 6)

# The held-out split doubles as validation data during training.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=batch_size,
    epochs=epochs,
)

Train on 34558 samples, validate on 8640 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f55cdd3ce48>

# Grab temporal data with no gaps in sequence.

In [231]:
# Keep only the test windows that contain no gap, where a gap is a timestep
# whose values are all exactly zero.
# Testing "all zero along the last axis" avoids hard-coding the number of
# values per timestep (previously the literal 6) and replaces the nested
# Python loops with array operations.
is_gap_step = (X_test == 0).all(axis=-1)   # one flag per (sample, timestep)
has_gap = is_gap_step.any(axis=-1)         # one flag per sample

# Boolean indexing returns new arrays and keeps the trailing dimensions even
# when no sample survives the filter.
non_zero_temporal_X_data = X_test[~has_gap]
non_zero_temporal_y_data = y_test[~has_gap]


In [232]:
# Display the shape of the gap-free subset built from the test split.
non_zero_temporal_X_data.shape

(7318, 24, 6)

### Evaluate model on non-zero temporal data.

In [227]:
# Score the trained model on the gap-free subset of the test split
# (presumably returns [loss, accuracy] given the compile settings -- confirm).
model.evaluate(non_zero_temporal_X_data, non_zero_temporal_y_data)



[0.786207084455466, 0.7354466661954263]

### Train RCNN.

In [230]:
# Build the Conv1D + LSTM model and fit it for `epochs` epochs.
# NOTE(review): this fits on the gap-free subset of the *test* split (it was
# built from X_test / y_test), and the evaluation cell below scores on the
# same arrays, so that score is a training-set score -- confirm whether a
# filtered training split was intended here.
model2 = create_model2(24, 6)
model2.fit(non_zero_temporal_X_data, non_zero_temporal_y_data, epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

KeyboardInterrupt: 

In [233]:
# NOTE(review): these are the same arrays model2 was fitted on, so this
# measures fit to the training data rather than held-out performance.
model2.evaluate(non_zero_temporal_X_data, non_zero_temporal_y_data)



[1.0037649653263514, 0.6832467887075132]