In [21]:
import numpy as np
import pandas as pd
import os
import re

import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import Model, optimizers
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Lambda, Input, Dense, Dropout, Concatenate, BatchNormalization, Activation

import matplotlib.pyplot as plt

In [22]:
#!ls ../input

DATA_DIR = '../kaggle/input/google-quest-challenge'

# EDA

In [23]:
train = pd.read_csv(f"{DATA_DIR}/train.csv")


In [24]:
test = pd.read_csv(f"{DATA_DIR}/test.csv")


In [25]:
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
targets = list(submission.columns)
targets.pop(0)


'qa_id'

In [26]:
print('train', train.shape)
print('test', test.shape)
print('sample_submission', submission.shape)

train (6079, 41)
test (476, 11)
sample_submission (476, 31)


In [27]:
all_train_columns = list(train.columns)
question_answer_cols = all_train_columns[:11]
question_target_cols = all_train_columns[11:32]
answer_target_cols  = all_train_columns[32:41]
target_cols = question_target_cols + answer_target_cols

# Model

In [28]:
EPOCHS = 20
BATCH_SIZE = 32
LEARNING_RATE = 3e-4
EMBEDDDING_SIZE = 512
N_CLASS = len(target_cols)
ES_PATIENCE = 3
RLROP_PATIENCE = 2
DECAY_DROP = 0.3
N_FOLD = 5
module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
MODEL_PATH = '../working/model_%d.h5'

es = EarlyStopping(monitor='val_loss', mode='min', patience=ES_PATIENCE, restore_best_weights=True, verbose=1)
rlrop = ReduceLROnPlateau(monitor='val_loss', mode='min', patience=RLROP_PATIENCE, factor=DECAY_DROP, min_lr=1e-6, verbose=1)

In [29]:
use_embed = hub.load(module_url)



In [30]:
def USEEmbedding(x):
    return use_embed(tf.squeeze(tf.cast(x, tf.string)))

def swish(x):
    return K.sigmoid(x) * x

In [31]:
def model_fn():    
    input_title = Input(shape=(1,), dtype=tf.string, name='input_title')
    embedding_title = Lambda(USEEmbedding, output_shape=(EMBEDDDING_SIZE,))(input_title)

    input_body = Input(shape=(1,), dtype=tf.string, name='input_body')
    embedding_body = Lambda(USEEmbedding, output_shape=(EMBEDDDING_SIZE,))(input_body)
    
    input_answer = Input(shape=(1,), dtype=tf.string, name='input_answer')
    embedding_answer = Lambda(USEEmbedding, output_shape=(EMBEDDDING_SIZE,))(input_answer)

    x = Concatenate()([embedding_title, embedding_body, embedding_answer])
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(512, activation=swish)(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    output = Dense(N_CLASS, activation='sigmoid', name='output')(x)
    model = Model(inputs=[input_title, input_body, input_answer], outputs=[output])

    optimizer = optimizers.Adam(LEARNING_RATE)
    model.compile(optimizer=optimizer, loss='binary_crossentropy')
    
    return model

In [32]:
#model_fn().summary()

# Train

In [33]:
x_labels = ['question_title', 'question_body', 'answer']
x_train = train[x_labels]
y_train = train[targets]

x_train = [x_train[col] for col in x_labels]

In [34]:
x_train[0][0]

'What am I losing when using extension tubes instead of a macro lens?'

In [35]:
def pre_processing(df, cols: list):
    for col in cols:
        df[col] = df[col].apply(lambda x: str(x).replace("?","."))
        df[col] = df[col].apply(lambda x: str(x).replace("!","."))
        df[col] = df[col].apply(lambda x: str(x).replace("\n"," "))
        df[col] = df[col].apply(lambda x: re.sub('[0-9]+', '0', x))
    return df

train = pre_processing(train, x_labels)

#display(train.head())

test = pre_processing(test, x_labels)
#display(test.head())

In [36]:
type(x_train)

list

In [37]:
len(x_train)

3

In [38]:
x_train[0][0]

'What am I losing when using extension tubes instead of a macro lens?'

In [39]:
x_train[1][0]

'After playing around with macro photography on-the-cheap (read: reversed lens, rev. lens mounted on a straight lens, passive extension tubes), I would like to get further with this. The problems with the techniques I used is that focus is manual and aperture control is problematic at best. This limited my setup to still subjects (read: dead insects) Now, as spring is approaching, I want to be able to shoot live insects. I believe that for this, autofocus and settable aperture will be of great help.\n\nSo, one obvious but expensive option is a macro lens (say, EF 100mm Macro) However, I am not really interested in yet another prime lens. An alternative is the electrical extension tubes.\n\nExcept for maximum focusing distance, what am I losing when using tubes (coupled with a fine lens, say EF70-200/2.8) instead of a macro lens?\n'

In [40]:
x_train[2][0]

"I just got extension tubes, so here's the skinny.\n\n\n  ...what am I losing when using tubes...?\n\n\nA very considerable amount of light!  Increasing that distance from the end of the lens to the sensor can cut your light several stops.  Combined with the fact that you'll usually shoot stopped down - expect to need to increase your ISO considerably.\n\nThe fact the macro's are usually considered very very sharp, although I believe that 70-200mm 2.8 is supposed to be quite sharp.\n\nThe ultra low distortion typical of many macros.\n\nI wouldn't worry too much about the bokeh since the DOF will still be quite limited.\n\nCoupled on my 50mm, a full 60mm'ish extension tube results in a DOF of about a couple inches in front of the lens.  On my 70-300, its probably around 2-3 feet in front of the lens to about a foot in front of the lens.\n"

In [None]:
callback_list = [es, rlrop]

for i in range(N_FOLD):
    model = model_fn()
    history = model.fit(x_train, y_train, validation_split=0.2, shuffle=True, batch_size=BATCH_SIZE, callbacks=callback_list, epochs=EPOCHS, verbose=1)
    #model.save_weights(MODEL_PATH % i)





Train on 4863 samples, validate on 1216 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




Train on 4863 samples, validate on 1216 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 00020: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.




Train on 4863 samples, validate on 1216 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 00015: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 00019: ReduceLROnPlateau reducing learning rate to 2.700000040931627e-05.
Epoch 20/20




Train on 4863 samples, validate on 1216 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 00019: ReduceLROnPlateau reducing learning rate to 9.000000427477062e-05.
Epoch 20/20

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Test

In [None]:
x_labels = ['question_title', 'question_body', 'answer']
x_test = test[x_labels]
x_test = [x_test[col] for col in x_labels]
y_tests = np.zeros((N_FOLD, len(test), len(target_cols)))

for i in range(N_FOLD):
    model = model_fn()
    model.load_weights(MODEL_PATH % i)
    y_tests[i] = model.predict(x_test)

# Report

In [None]:
submission[target_cols] = np.average(y_tests, axis = 0)
submission.to_csv('submission.csv', index=False)
display(submission.head())