In [1]:
import re
import os
import keras.backend as K
import numpy as np
import pandas as pd
from keras import layers, models, utils
import json

Using TensorFlow backend.


In [2]:
def reset_everything():
    import tensorflow as tf
    %reset -f in out dhist
    tf.reset_default_graph()
    K.set_session(tf.InteractiveSession())

In [3]:
# Constants for our networks.  We keep these deliberately small to reduce training time.

VOCAB_SIZE = 250000
EMBEDDING_SIZE = 100
MAX_DOC_LEN = 128
MIN_DOC_LEN = 12

In [5]:
def extract_stackexchange(filename, limit=1000000):
    json_file = filename + 'limit=%s.json' % limit

    rows = []
    for i, line in enumerate(os.popen('7z x -so "%s" Posts.xml' % filename)):
        line = str(line)
        if not line.startswith('  <row'):
            continue
            
        if i % 1000 == 0:
            print('\r%05d/%05d' % (i, limit), end='', flush=True)

        parts = line[6:-5].split('"')
        record = {}
        for i in range(0, len(parts), 2):
            k = parts[i].replace('=', '').strip()
            v = parts[i+1].strip()
            record[k] = v
        rows.append(record)
        
        if len(rows) > limit:
            break
    
    with open(json_file, 'w') as fout:
        json.dump(rows, fout)
    
    return rows

data_path = '/home/ubuntu/.keras/datasets/travel.stackexchange.com.7zlimit=1000000.json'

if not os.path.exists(data_path):
    xml_7z = utils.get_file(
        fname='travel.stackexchange.com.7z',
        origin='https://ia800107.us.archive.org/27/items/stackexchange/travel.stackexchange.com.7z',
    )
    rows = extract_stackexchange(xml_7z)
else:
    print("Already downloaded. File size:", os.stat(data_path).st_size / 1e9, 'GB')
    with open(data_path, 'r') as fin:
        # Use load with a file (loads with json)
        rows = json.load(fin)


Already downloaded. File size: 0.117212775 GB


# Data Exploration

Now that we have extracted our data, let's clean it up and take a look at what we have to work with.

In [9]:
rows[1]

{'Id': '2',
 'PostTypeId': '1',
 'CreationDate': '2011-06-21T20:22:33.760',
 'Score': '36',
 'ViewCount': '1876',
 'Body': "&lt;p&gt;This was one of our definition questions, but also one that interests me personally: How can I find a guide that will take me safely through the Amazon jungle? I'd love to explore the Amazon but would not attempt it without a guide, at least not the first time. And I'd prefer a guide that wasn't going to ambush me or anything. :P&lt;/p&gt;&#xA;&#xA;&lt;p&gt;I don't want to go anywhere &quot;touristy&quot;.  Start and end points are open, but the trip should take me places where I am not likely to see other travelers/tourists and where I will definitely require a good guide in order to be safe.&lt;/p&gt;&#xA;",
 'OwnerUserId': '13',
 'LastEditorUserId': '51577',
 'LastEditDate': '2018-08-14T16:23:48.240',
 'LastActivityDate': '2018-08-26T00:04:13.520',
 'Title': 'How can I find a guide that will take me safely through the Amazon jungle?',
 'Tags': '&lt;gui

In [6]:
df = pd.DataFrame.from_records(rows)    
df = df.set_index('Id', drop=False)
df['Title'] = df['Title'].fillna('').astype('str')
df['Tags'] = df['Tags'].fillna('').astype('str')
df['Body'] = df['Body'].fillna('').astype('str')
df['Id'] = df['Id'].astype('int')
df['PostTypeId'] = df['PostTypeId'].astype('int')
df['ViewCount'] = df['ViewCount'].astype('float')

df.head()

Unnamed: 0_level_0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id,LastActivityDate,...,LastEditorDisplayName,LastEditorUserId,OwnerDisplayName,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,393.0,4.0,&lt;p&gt;My fiancée and I are looking for a go...,2013-02-25T23:52:47.953,4,,2011-06-21T20:19:34.730,,1,2012-05-24T14:52:14.760,...,,101.0,,9,,1,8,&lt;caribbean&gt;&lt;cruising&gt;&lt;vacations...,What are some Caribbean cruises for October?,443.0
2,,8.0,&lt;p&gt;This was one of our definition questi...,,4,,2011-06-21T20:22:33.760,5.0,2,2018-08-26T00:04:13.520,...,,51577.0,,13,,1,36,&lt;guides&gt;&lt;extreme-tourism&gt;&lt;amazo...,How can I find a guide that will take me safel...,1876.0
3,,,&lt;p&gt;One way would be to go through an Adv...,,2,,2011-06-21T20:24:28.080,,3,2011-06-21T20:24:28.080,...,,,,9,2.0,2,14,,,
4,,1.0,&lt;p&gt;Singapore Airlines has an all-busines...,,1,,2011-06-21T20:24:57.160,,4,2013-01-09T09:55:22.743,...,,693.0,,24,,1,8,&lt;loyalty-programs&gt;&lt;routes&gt;&lt;ewr&...,Does Singapore Airlines offer any reward seats...,249.0
5,770.0,5.0,&lt;p&gt;Another definition question that inte...,,0,,2011-06-21T20:25:56.787,2.0,5,2012-10-12T20:49:08.110,...,,101.0,,13,,1,13,&lt;romania&gt;&lt;transportation&gt;,What is the easiest transportation to use thro...,418.0


In [7]:
list(df[df['ViewCount'] > 250000]['Title'])

['Do I need a US visa to transit (or layover) through an American airport?',
 'How to get from Nice to Monaco by public transport?',
 'Should my first trip be to the country which issued my Schengen Visa?',
 'Can I use Google Maps traffic information to estimate driving time for a specific date/time?',
 'Are aerosol cans allowed and safe, in checked luggage?',
 'How to track my UK Visa Application Status?',
 "When applying for an Indian Passport, how do I know if I'm in the ECR or non-ECR category?",
 'Are battery packs allowed in hand luggage?']

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['Body'] + df['Title'])

In [11]:
tokenizer.document_count

87956

In [12]:
len(df)

87956

In [13]:
# Compute TF/IDF Values

total_count = sum(tokenizer.word_counts.values())
idf = { k: np.log(total_count/v) for (k,v) in tokenizer.word_counts.items() }

In [18]:
list(idf.keys())[10], list(idf.values())[10]

('a', 3.603566009125561)

In [26]:
sorted_idf = sorted(idf.items(), key = lambda x: x[1])
sorted_idf[0], sorted_idf[-1]

(('gt', 2.8184622225006084), ("'low'", 16.670909155734705))

In [27]:
# Download pre-trained word2vec embeddings

import gensim

glove_100d = utils.get_file(
    fname='glove.6B.100d.txt',
    origin='https://storage.googleapis.com/deep-learning-cookbook/glove.6B.100d.txt',
)

w2v_100d = glove_100d + '.w2v'
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_100d, w2v_100d)
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_100d)

w2v_weights = np.zeros((VOCAB_SIZE, w2v_model.syn0.shape[1]))
idf_weights = np.zeros((VOCAB_SIZE, 1))

for k, v in tokenizer.word_index.items():
    if v >= VOCAB_SIZE:
        continue
    
    if k in w2v_model:
        w2v_weights[v] = w2v_model[k]
    
    idf_weights[v] = idf[k]
    
del w2v_model

Downloading data from https://storage.googleapis.com/deep-learning-cookbook/glove.6B.100d.txt




In [28]:
df['title_tokens'] = tokenizer.texts_to_sequences(df['Title'])
df['body_tokens'] = tokenizer.texts_to_sequences(df['Body'])

In [29]:
df.head()

Unnamed: 0_level_0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id,LastActivityDate,...,OwnerDisplayName,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount,title_tokens,body_tokens
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,393.0,4.0,&lt;p&gt;My fiancée and I are looking for a go...,2013-02-25T23:52:47.953,4,,2011-06-21T20:19:34.730,,1,2012-05-24T14:52:14.760,...,,9,,1,8,&lt;caribbean&gt;&lt;cruising&gt;&lt;vacations...,What are some Caribbean cruises for October?,443.0,"[67, 20, 62, 2287, 2935, 15, 1209]","[2, 4, 1, 37, 9705, 9, 12, 20, 386, 15, 6, 168..."
2,,8.0,&lt;p&gt;This was one of our definition questi...,,4,,2011-06-21T20:22:33.760,5.0,2,2018-08-26T00:04:13.520,...,,13,,1,36,&lt;guides&gt;&lt;extreme-tourism&gt;&lt;amazo...,How can I find a guide that will take me safel...,1876.0,"[91, 33, 12, 120, 6, 698, 16, 36, 105, 96, 271...","[2, 4, 1, 32, 59, 55, 13, 346, 2593, 113, 34, ..."
3,,,&lt;p&gt;One way would be to go through an Adv...,,2,,2011-06-21T20:24:28.080,,3,2011-06-21T20:24:28.080,...,,9,2.0,2,14,,,,[],"[2, 4, 1, 55, 110, 54, 19, 7, 87, 111, 43, 313..."
4,,1.0,&lt;p&gt;Singapore Airlines has an all-busines...,,1,,2011-06-21T20:24:57.160,,4,2013-01-09T09:55:22.743,...,,24,,1,8,&lt;loyalty-programs&gt;&lt;routes&gt;&lt;ewr&...,Does Singapore Airlines offer any reward seats...,249.0,"[122, 739, 183, 514, 61, 4943, 609, 18, 97, 31...","[2, 4, 1, 739, 183, 79, 43, 66, 312, 269, 75, ..."
5,770.0,5.0,&lt;p&gt;Another definition question that inte...,,0,,2011-06-21T20:25:56.787,2.0,5,2012-10-12T20:49:08.110,...,,13,,1,13,&lt;romania&gt;&lt;transportation&gt;,What is the easiest transportation to use thro...,418.0,"[67, 14, 5, 2068, 840, 7, 103, 2166, 1471, 15,...","[2, 4, 1, 192, 2593, 176, 16, 1080, 96, 59, 67..."


In [51]:
df.index

Index(['1', '2', '3', '4', '5', '6', '8', '9', '10', '11',
       ...
       '121647', '121648', '121649', '121650', '121652', '121653', '121655',
       '121657', '121658', '121659'],
      dtype='object', name='Id', length=87956)

In [52]:
df.loc['393']

AcceptedAnswerId                                                       NaN
AnswerCount                                                            NaN
Body                     &lt;p&gt;This is less than an answer, but more...
ClosedDate                                                             NaN
CommentCount                                                             1
CommunityOwnedDate                                                     NaN
CreationDate                                       2011-06-24T05:12:01.133
FavoriteCount                                                          NaN
Id                                                                     393
LastActivityDate                                   2011-06-24T05:12:01.133
LastEditDate                                                           NaN
LastEditorDisplayName                                                  NaN
LastEditorUserId                                                       NaN
OwnerDisplayName         

In [39]:
tokenizer.word_counts['a'], tokenizer.word_counts['the']

(473233, 552992)

In [43]:
idf['a'], idf['the']

(3.603566009125561, 3.4478103418809867)

In [44]:
len(tokenizer.word_index)

177184

In [46]:
import random

# We can create a data generator that will randomly title and body tokens for questions.  We'll use random text
# from other questions as a negative example when necessary.
def data_generator(batch_size, negative_samples=1):
    questions = df[df['PostTypeId'] == 1]
    all_q_ids = list(questions.index)
        
    batch_x_a = []
    batch_x_b = []
    batch_y = []
    
    def _add(x_a, x_b, y):
        batch_x_a.append(x_a[:MAX_DOC_LEN])
        batch_x_b.append(x_b[:MAX_DOC_LEN])
        batch_y.append(y)
    
    while True:
        questions = questions.sample(frac=1.0)
        
        for i, q in questions.iterrows():
            _add(q['title_tokens'], q['body_tokens'], 1)
            
            negative_q = random.sample(all_q_ids, negative_samples)
            for nq_id in negative_q:
                _add(q['title_tokens'], df.at[nq_id, 'body_tokens'], 0)            
            
            if len(batch_y) >= batch_size:
                yield ({
                    'title': pad_sequences(batch_x_a, maxlen=None),
                    'body': pad_sequences(batch_x_b, maxlen=None),
                }, np.asarray(batch_y))
                
                batch_x_a = []
                batch_x_b = []
                batch_y = []

dg = data_generator(1, 2)
next(dg)
next(dg)

({'title': array([[  36,    6, 1403,  114,  277,  196,   10, 2615],
         [  36,    6, 1403,  114,  277,  196,   10, 2615],
         [  36,    6, 1403,  114,  277,  196,   10, 2615]], dtype=int32),
  'body': array([[     0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      2,
               4,      1,     23,     12,    230,  

In [48]:
tokenizer.sequences_to_texts([[36,    6, 1403,  114,  277,  196,   10, 2615]])

['will a vancouver day pass work in victoria']

In [49]:
tokenizer.sequences_to_texts([[     0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      2,
               4,      1,     23,     12,    230,      6,    114,    277,
              10,   1403,   2757,     18,      6,    685,      7,   2615,
               9,    546,     39,     33,     12,    103,      5,    114,
             277,      7,     56,    175,   2615,      2,      4,      1,
               3,      3,      2,      4,      1,   1688,   2265,      7,
          130123,   3522,     29,  25494,      2,      4,      1,      3]])

["lt p gt if i buy a day pass in vancouver hop on a ferry to victoria and arrive there can i use the day pass to get around victoria lt p gt xa xa lt p gt we're heading to butchart gardens from burnaby lt p gt xa"]

# Embedding Lookups

Let's define a helper class for looking up our embedding results.  We'll use it
to verify our models.

In [53]:
questions = df[df['PostTypeId'] == 1]['Title'].reset_index(drop=True)
question_tokens = pad_sequences(tokenizer.texts_to_sequences(questions))

class EmbeddingWrapper(object):
    def __init__(self, model):
        self._r = questions
        self._i = {i:s for (i, s) in enumerate(questions)}
        self._w = model.predict({'title': question_tokens}, verbose=1, batch_size=1024)
        self._model = model
        self._norm = np.sqrt(np.sum(self._w * self._w + 1e-5, axis=1))

    def nearest(self, sentence, n=10):
        x = tokenizer.texts_to_sequences([sentence])
        if len(x[0]) < MIN_DOC_LEN:
            x[0] += [0] * (MIN_DOC_LEN - len(x))
        e = self._model.predict(np.asarray(x))[0]
        norm_e = np.sqrt(np.dot(e, e))
        dist = np.dot(self._w, e) / (norm_e * self._norm)

        top_idx = np.argsort(dist)[-n:]
        return pd.DataFrame.from_records([
            {'question': self._r[i], 'dist': float(dist[i])}
            for i in top_idx
        ])

In [54]:
# Our first model will just sum up the embeddings of each token.
# The similarity between documents will be the dot product of the final embedding.

import tensorflow as tf

def sum_model(embedding_size, vocab_size, embedding_weights=None, idf_weights=None):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    def make_embedding(name):
        if embedding_weights is not None:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=w2v_weights.shape[1], 
                                         weights=[w2v_weights], trainable=False, 
                                         name='%s/embedding' % name)
        else:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=embedding_size,
                                        name='%s/embedding' % name)

        if idf_weights is not None:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1, 
                                   weights=[idf_weights], trainable=False,
                                   name='%s/idf' % name)
        else:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1,
                                   name='%s/idf' % name)
            
        return embedding, idf
    
    embedding_a, idf_a = make_embedding('a')
    embedding_b, idf_b = embedding_a, idf_a
#     embedding_b, idf_b = make_embedding('b')

    mask = layers.Masking(mask_value=0)
    def _combine_and_sum(args):
        [embedding, idf] = args
        return K.sum(embedding * K.abs(idf), axis=1)

    sum_layer = layers.Lambda(_combine_and_sum, name='combine_and_sum')

    sum_a = sum_layer([mask(embedding_a(title)), idf_a(title)])
    sum_b = sum_layer([mask(embedding_b(body)), idf_b(body)])

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    sim_model.summary()

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [55]:
# Try using our model with pretrained weights from word2vec

sum_model_precomputed, sum_embedding_precomputed = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE,
    embedding_weights=w2v_weights, idf_weights=idf_weights
)

x, y = next(data_generator(batch_size=4096))
sum_model_precomputed.evaluate(x, y)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
a/embedding (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 100)    0           a/embedding[0][0]                
          

[0.9652091963216662, 0.50927734375]

In [62]:
SAMPLE_QUESTIONS = [
    'Roundtrip ticket versus one way',
    'Shinkansen from Kyoto to Hiroshima',
    'Bus tour of Germany',
]

def evaluate_sample(lookup):
    pd.set_option('display.max_colwidth', 100)
    results = []
    for q in SAMPLE_QUESTIONS:
        print(q)
        q_res = lookup.nearest(q, n=4)
        q_res['title'] = q_res['question']
        q_res['body'] = q
        results.append(q_res)

    return pd.concat(results)

lookup = EmbeddingWrapper(model=sum_embedding_precomputed)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,title,body
0,0.811505,"Buy a roundtrip ticket for two people, but second person only travels on return - is that possible","Buy a roundtrip ticket for two people, but second person only travels on return - is that possible",Roundtrip ticket versus one way
1,0.813489,How to pick the (phony) return destination for a roundtrip ticket intended as a one-way?,How to pick the (phony) return destination for a roundtrip ticket intended as a one-way?,Roundtrip ticket versus one way
2,0.815237,What is cheapest way to fly around SE Asia in a circuit - hub with roundtrip tickets or sequence...,What is cheapest way to fly around SE Asia in a circuit - hub with roundtrip tickets or sequence...,Roundtrip ticket versus one way
3,0.826175,The penalty for changing an airline ticket is per leg or per ticket?,The penalty for changing an airline ticket is per leg or per ticket?,Roundtrip ticket versus one way
0,0.753482,Culture Day in Osaka/Kyoto,Culture Day in Osaka/Kyoto,Shinkansen from Kyoto to Hiroshima
1,0.756922,Where does the Tokaido Shinkansen stop in Tokyo?,Where does the Tokaido Shinkansen stop in Tokyo?,Shinkansen from Kyoto to Hiroshima
2,0.775683,Best connection Tokyo - Kyoto,Best connection Tokyo - Kyoto,Shinkansen from Kyoto to Hiroshima
3,0.812986,Travel from Tokyo to Sendai with Shinkansen,Travel from Tokyo to Sendai with Shinkansen,Shinkansen from Kyoto to Hiroshima
0,0.891427,Trip in the south of Germany,Trip in the south of Germany,Bus tour of Germany
1,0.89519,Travelling outside of Germany on a German Working Holiday visa (Australian),Travelling outside of Germany on a German Working Holiday visa (Australian),Bus tour of Germany


# Training our own network

The results are okay but not great... instead of using the word2vec embeddings, what happens if we train our network end-to-end?

In [57]:
sum_model_trained, sum_embedding_trained = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE, 
    embedding_weights=None,
    idf_weights=None
)
sum_model_trained.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=1000
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
a/embedding (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
masking_2 (Masking)             (None, None, 100)    0           a/embedding[0][0]                
          

<keras.callbacks.History at 0x7f8afcd9d240>

In [58]:
lookup = EmbeddingWrapper(model=sum_embedding_trained)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,result
0,0.769217,Roundtrip ticket versus one way,How to get return prices for a one way ticket?
1,0.770204,Roundtrip ticket versus one way,How do one way/round trip plane tickets work?
2,0.773594,Roundtrip ticket versus one way,Can I buy another set of round trip tickets if I already have one for a later date?
3,0.79357,Roundtrip ticket versus one way,"Buy a roundtrip ticket for two people, but second person only travels on return - is that possible"
0,0.959197,Shinkansen from Kyoto to Hiroshima,What does my Shinkansen ticket say?
1,0.959941,Shinkansen from Kyoto to Hiroshima,Hokkaido Shinkansen - Sendai with JR Pass
2,0.96179,Shinkansen from Kyoto to Hiroshima,Stopovers on Shinkansen services?
3,0.963612,Shinkansen from Kyoto to Hiroshima,What are my options for reserving JR Shinkansen tickets in advance over the new year period?
0,0.622985,Bus tour of Germany,About inter-city and inter-country bus services in Europe
1,0.625273,Bus tour of Germany,European bus tour companies for middle age people?


## CNN Model

Using a sum-of-embeddings model works well. What happens if we try to make a simple CNN model?

In [59]:
def cnn_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    embedding = layers.Embedding(
        mask_zero=False,
        input_dim=vocab_size,
        output_dim=embedding_size,
    )


    def _combine_sum(v):
        return K.sum(v, axis=1)

    cnn_1 = layers.Convolution1D(256, 3)
    cnn_2 = layers.Convolution1D(256, 3)
    cnn_3 = layers.Convolution1D(256, 3)
    
    global_pool = layers.GlobalMaxPooling1D()
    local_pool = layers.MaxPooling1D(strides=2, pool_size=3)

    def forward(input):
        embed = embedding(input)
        return global_pool(
            cnn_2(local_pool(cnn_1(embed))))

    sum_a = forward(title)
    sum_b = forward(body)

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=False)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [60]:
cnn, cnn_embedding = cnn_model(embedding_size=25, vocab_size=VOCAB_SIZE)
cnn.summary()
cnn.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=1000,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 25)     6250000     title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, None, 256)    19456       embedding_1[0][0]                
          

<keras.callbacks.History at 0x7f8b0437af28>

In [61]:
lookup = EmbeddingWrapper(model=cnn_embedding)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,result
0,0.945426,Roundtrip ticket versus one way,Dalhousie to Udawalawe?
1,0.945605,Roundtrip ticket versus one way,religious problem
2,0.946507,Roundtrip ticket versus one way,Mugging 'Etiquette'?
3,0.946684,Roundtrip ticket versus one way,On short connections
0,0.987494,Shinkansen from Kyoto to Hiroshima,Newbie Traveler
1,0.987831,Shinkansen from Kyoto to Hiroshima,Tegelbergbahn Tickets
2,0.987883,Shinkansen from Kyoto to Hiroshima,Mugging 'Etiquette'?
3,0.988238,Shinkansen from Kyoto to Hiroshima,Nagorno-Karabakh
0,0.926533,Bus tour of Germany,Schengeni Allamok
1,0.926621,Bus tour of Germany,Newbie Traveler


## LSTM Model

We can also make an LSTM model.  Warning, this will be very slow to train and evaluate unless you have a relatively fast GPU to run it on!

In [63]:
def lstm_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    embedding = layers.Embedding(
        mask_zero=True,
        input_dim=vocab_size,
        output_dim=embedding_size,
#         weights=[w2v_weights],
#         trainable=False
    )

    lstm_1 = layers.LSTM(units=512, return_sequences=True)
    lstm_2 = layers.LSTM(units=512, return_sequences=False)
    
    sum_a = lstm_2(lstm_1(embedding(title)))
    sum_b = lstm_2(lstm_1(embedding(body)))

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
#     sim = layers.Activation(activation='sigmoid')(sim)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

In [64]:
lstm, lstm_embedding = lstm_model(embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE)
lstm.summary()
lstm.fit_generator(
    data_generator(batch_size=128),
    epochs=10,
    steps_per_epoch=100,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, None, 512)    1255424     embedding_2[0][0]                
          

<keras.callbacks.History at 0x7f8af06f9a20>

In [65]:
lookup = EmbeddingWrapper(model=lstm_embedding)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,title,body
0,0.997782,Getting a BART clipper card,Getting a BART clipper card,Roundtrip ticket versus one way
1,0.997786,Strange ZIP codes in Alaska?,Strange ZIP codes in Alaska?,Roundtrip ticket versus one way
2,0.997941,Volunteering for an archaeological dig,Volunteering for an archaeological dig,Roundtrip ticket versus one way
3,0.998024,"Getting to Barrow, Alaska overland","Getting to Barrow, Alaska overland",Roundtrip ticket versus one way
0,0.998042,"Halifax long layover, recheck bags?","Halifax long layover, recheck bags?",Shinkansen from Kyoto to Hiroshima
1,0.998094,Renting camping equipment in Nambia,Renting camping equipment in Nambia,Shinkansen from Kyoto to Hiroshima
2,0.998123,"Luggage storage in Tulum, Mexico?","Luggage storage in Tulum, Mexico?",Shinkansen from Kyoto to Hiroshima
3,0.998173,Bandipur national park opening dates?,Bandipur national park opening dates?,Shinkansen from Kyoto to Hiroshima
0,0.994282,Other agencies for passport,Other agencies for passport,Bus tour of Germany
1,0.994292,Extension of Schengen Visa,Extension of Schengen Visa,Bus tour of Germany
