In [42]:
import os
import random
import pandas as pd
import datetime
import numpy as np
from sklearn.preprocessing import LabelEncoder
from etl.data_processor import FeatureProcessor, VERB_MAP, OBSERVABLE
import tensorflow as tf
import matplotlib.pyplot as plt
# NOTE(review): bare attribute access with no assignment or call. Its only
# effect is to evaluate to the Conv1D class (which a notebook would echo as the
# cell output) -- looks like a leftover availability check; confirm and remove.
tf.keras.layers.Conv1D

In [43]:
# --- Load raw event logs, fit the label encoders, build feature arrays ------
RAW_DATA_DIR = "../data/raw/"
HIDDEN_FILES = (
    "data_a_hidden_10.csv",
    "data_a_hidden_20.csv",
    "data_a_hidden_30.csv",
)


def _read_raw(filename):
    """Read one CSV file located in RAW_DATA_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(RAW_DATA_DIR, filename))


def _normalise_verbs(events):
    """Replace OBSERVABLE values that are keys of VERB_MAP by their mapped value.

    Values without an entry in VERB_MAP are kept as they are. The frame is
    modified in place and returned for convenience.
    """
    events[OBSERVABLE] = events[OBSERVABLE].apply(lambda verb: VERB_MAP.get(verb, verb))
    return events


data_a_train = _normalise_verbs(_read_raw("data_a_train.csv"))
data_a_hidden = _normalise_verbs(
    pd.concat([_read_raw(name) for name in HIDDEN_FILES], axis=0, sort=False)
)

# Fit each encoder on train + hidden together so that every category seen at
# prediction time already has a code. The verb column is addressed through the
# OBSERVABLE constant everywhere (it used to be mixed with a hard-coded name).
verb_ln = LabelEncoder()
verb_ln.fit(data_a_train[OBSERVABLE].tolist() + data_a_hidden[OBSERVABLE].tolist())
question_type_ln = LabelEncoder()
question_type_ln.fit(data_a_train["ItemType"].tolist() + data_a_hidden["ItemType"].tolist())

# get_multi_dim_verb returns an index plus an array; later cells index that
# array with three subscripts -- presumably (student, step, feature); confirm
# against etl.data_processor.
hidden_index, hidden = FeatureProcessor().get_multi_dim_verb(data_a_hidden, question_type_ln, verb_ln)
train_index, train = FeatureProcessor().get_multi_dim_verb(data_a_train, question_type_ln, verb_ln)

# Put the labels in the same order as the rows of `train`.
label = pd.read_csv(os.path.join(RAW_DATA_DIR, "data_train_label.csv")).set_index("STUDENTID")
label = label.reindex(train_index)
# reindex() inserts NaN rows for ids that have no label; those would turn into
# meaningless integers when the labels are cast to int later on.
assert not label.isnull().values.any(), "some training students have no label"

train = train.astype(np.int32)
hidden = hidden.astype(np.int32)

In [44]:
# Inspect the first training sequence: raw verb codes, then the decoded verbs.
# Code 0 is filtered out and the remaining codes are shifted by -1 before
# decoding, so 0 appears to be the padding value -- TODO confirm.
print(train[0, :, 0])
print(verb_ln.inverse_transform(train[0, train[0, :, 0] != 0, 0]-1))
print(train.shape)
print(label.shape)

# The previous zip()-based pairing silently dropped rows on a length mismatch.
assert len(label) == len(train), "labels and training sequences are misaligned"

# Shuffle labels and sequences with one seeded permutation so the
# validation_split used by model.fit (which takes the tail of the data) is
# reproducible across kernel restarts.
# NOTE: `train` is overwritten here while `label` keeps its original order, so
# re-run the loading cell before re-running this one.
SHUFFLE_SEED = 42
shuffle_order = np.random.RandomState(SHUFFLE_SEED).permutation(len(train))
label_values = label.values[shuffle_order].astype(int)
train = np.asarray(train)[shuffle_order]

# Bring `hidden` to the model's input layout (steps x features of `train`) by
# zero-padding. Only the step/feature axes are resized: the number of hidden
# rows is kept, so predictions stay one-to-one with `hidden_index`. Sequences
# longer than the training length are truncated at the end.
n_steps = min(hidden.shape[1], train.shape[1])
n_features = min(hidden.shape[2], train.shape[2])
hidden_padded = np.zeros((hidden.shape[0],) + train.shape[1:], dtype=hidden.dtype)
hidden_padded[:, :n_steps, :n_features] = hidden[:, :n_steps, :n_features]
hidden = hidden_padded

print(train.shape)
print(hidden.shape)
print(train[:, :, 0].shape)

(1232, 2599, 3)
(1232, 1)


In [45]:
# Size constants derived from the two fitted encoders: the larger class count
# plus 2. NOTE(review): neither name is referenced by the model cell visible
# in this notebook -- confirm whether they are still needed.
question_type_count = len(question_type_ln.classes_)
verb_count = len(verb_ln.classes_)
input_dim = 2 + max(question_type_count, verb_count)
embedding_output_dim = 10

(1232, 1042, 3)
(1232, 1042, 3)
(1232, 1042)


In [46]:
# Re-order the label frame by `train_index`, then recompute the size constants
# (larger encoder class count + 2, and a fixed width of 10).
label = label.reindex(train_index)
encoder_class_counts = [
    len(encoder.classes_) for encoder in (question_type_ln, verb_ln)
]
input_dim = max(encoder_class_counts) + 2
embedding_output_dim = 10

In [47]:
# --- Model definition --------------------------------------------------------
# Three parallel integer sequences (feature channels 0, 1 and 2 of `train`, fed
# as verb / question type / duration level by the fit cell) are embedded
# separately, concatenated per step, passed through a Conv1D stack, averaged
# over the step axis and mapped to a single sigmoid output.
# The earlier commented-out (Bi)LSTM experiments were removed from this cell.
sequence_length = train.shape[1]

verb_input = tf.keras.layers.Input(shape=(sequence_length,))
question_type_input = tf.keras.layers.Input(shape=(sequence_length,))
duration_level_input = tf.keras.layers.Input(shape=(sequence_length,))

# Embedding tables need input_dim > largest code they will ever look up.
# The encoders were fitted on train + hidden, so their class counts already
# cover both sets; the extra +1 keeps a slot for code 0.
verb_embedding = tf.keras.layers.Embedding(
    output_dim=20,
    input_dim=len(verb_ln.classes_)+1,
    input_length=sequence_length,
)(verb_input)
question_type_embedding = tf.keras.layers.Embedding(
    output_dim=10,
    input_dim=len(question_type_ln.classes_)+1,
    input_length=sequence_length,
)(question_type_input)
# The duration level has no encoder, so size its table from the data. Use the
# maximum over BOTH sets: sizing from `train` alone makes any larger code in
# `hidden` an out-of-range lookup at prediction time.
duration_level_vocab = int(max(np.max(train[:, :, 2]), np.max(hidden[:, :, 2]))) + 1
duration_level_embedding = tf.keras.layers.Embedding(
    output_dim=10,
    input_dim=duration_level_vocab,
    input_length=sequence_length,
)(duration_level_input)
merged_input = tf.keras.layers.concatenate(
    [verb_embedding, question_type_embedding, duration_level_embedding],
    axis=-1,
)

# Convolutional encoder over the step axis.
conv1d_1 = tf.keras.layers.Conv1D(100, 10, activation='relu')(merged_input)
conv1d_2 = tf.keras.layers.Conv1D(100, 10, activation='relu')(conv1d_1)
max_pool = tf.keras.layers.MaxPooling1D(3)(conv1d_2)
conv1d_3 = tf.keras.layers.Conv1D(160, 10, activation='relu')(max_pool)
conv1d_4 = tf.keras.layers.Conv1D(160, 10, activation='relu')(conv1d_3)
global_average = tf.keras.layers.GlobalAveragePooling1D()(conv1d_4)
dropout = tf.keras.layers.Dropout(0.5)(global_average)

# The former Dense(50) -> Flatten branch hung off conv1d_4 but never reached
# the output (which is wired from `dropout`), so it was not part of the model
# and has been dropped.
output = tf.keras.layers.Dense(
    1,
    activation='sigmoid',
)(dropout)

model = tf.keras.Model(
    inputs=[verb_input, question_type_input, duration_level_input],
    outputs=output
)
optimizer = tf.keras.optimizers.Adam()
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['acc']
)
# summary() prints by itself and returns None; wrapping it in print() only
# added a stray "None" line.
model.summary()

# Timestamped log directory consumed by the Keras TensorBoard callback.
logdir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [48]:
# TensorBoard logging: written to `logdir` once per epoch, with histograms,
# the graph and gradients enabled and image summaries disabled.
tensorboard_options = dict(
    log_dir=logdir,
    histogram_freq=1,
    write_graph=True,
    write_grads=True,
    write_images=False,
    update_freq="epoch",
)
tensorboard_callback = tf.keras.callbacks.TensorBoard(**tensorboard_options)

In [None]:
# Train the model. Each feature channel of `train` feeds one model input, in
# the order the inputs were declared (channels 0, 1, 2).
train_inputs = [train[:, :, channel] for channel in range(3)]
fit_options = dict(
    # batch_size=100,
    shuffle=True,
    epochs=3,
    validation_split=0.33,
    callbacks=[tensorboard_callback],
)
history = model.fit(train_inputs, label_values, **fit_options)

Train on 985 samples, validate on 247 samples
Epoch 1/50

In [None]:
# Score the hidden set.
# NOTE(review): `hidden_label` is loaded but not used by any cell visible here.
hidden_label = pd.read_csv(os.path.join("../data/raw/", "hidden_label.csv"))

# Predict only for rows that belong to a real hidden student. `hidden` may have
# been zero-padded along the sample axis by an earlier cell; scoring those
# padding rows makes the result longer than `hidden_index` and the DataFrame
# construction below fail. Assumes the first len(hidden_index) rows of `hidden`
# are in `hidden_index` order -- TODO confirm against FeatureProcessor.
n_hidden = len(hidden_index)
hidden_inputs = [hidden[:n_hidden, :, channel] for channel in range(3)]
hidden_result = model.predict(hidden_inputs)
hidden_result_value = hidden_result.ravel()
print(hidden_result_value)

# The frame is built directly on hidden_index, so the former
# .reindex(hidden_index) was redundant.
predict_result = pd.DataFrame(hidden_result_value, index=hidden_index)
print(predict_result)

In [None]:
# Export the hidden-set scores as a single comma-terminated line of values
# (no index, no header) in result.csv.
print(hidden_result.ravel())
hidden_result_value = hidden_result.ravel()
# Built directly on hidden_index; the former .reindex(hidden_index) was redundant.
predict_result = pd.DataFrame(hidden_result_value, index=hidden_index)
print(predict_result)

csv_options = dict(index=False, header=False)
try:
    # pandas >= 1.5 spells the argument `lineterminator`; the old
    # `line_terminator` spelling was removed in pandas 2.0.
    predict_result.to_csv("result.csv", lineterminator=",", **csv_options)
except TypeError:
    # Older pandas only knows the original spelling. The TypeError is raised
    # while binding arguments, i.e. before anything is written.
    predict_result.to_csv("result.csv", line_terminator=",", **csv_options)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (as returned by ``model.fit``).
        string: metric name; the series ``string`` and ``'val_' + string``
            are both drawn on the current pyplot figure.
    """
    series_names = [string, 'val_'+string]
    for series_name in series_names:
        plt.plot(history.history[series_name])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_names)
    plt.show()

In [None]:
# Show the per-epoch validation accuracy, then the accuracy and loss curves.
print(history.history['val_acc'])
for metric_name in ('acc', 'loss'):
    plot_graphs(history, metric_name)
