In [1]:
# 安装环境
!pip install keras==2.0.4

Collecting keras==2.0.4
  Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/c9/81/ac6e14d01b62aa533ae52c4340e7da2dfecb61389ce0cebe9e5d14c01504/Keras-2.0.4.tar.gz (199kB)
[K    100% |████████████████████████████████| 204kB 20.1MB/s ta 0:00:01
[?25hCollecting theano (from keras==2.0.4)
  Downloading http://repo.myhuaweicloud.com/repository/pypi/packages/7d/c4/6341148ad458b6cd8361b774d7ee6895c38eab88f05331f22304c484ed5d/Theano-1.0.4.tar.gz (2.8MB)
[K    100% |████████████████████████████████| 2.8MB 41.3MB/s ta 0:00:01
Building wheels for collected packages: keras, theano
  Running setup.py bdist_wheel for keras ... [?25ldone
[?25h  Stored in directory: /home/ma-user/.cache/pip/wheels/1e/7c/da/e45ba505ad18fa21c98b1e9188d161e26dd1390ff21bc5a7e6
  Running setup.py bdist_wheel for theano ... [?25ldone
[?25h  Stored in directory: /home/ma-user/.cache/pip/wheels/1e/61/26/39f48cae2aa138169e3bfeea2ab39706cf3770a322b049d792
Successfully built keras theano
Installing collect

# 问题匹配相似性判断
构建一个问题匹配的模型，用来判断提出的问题和已有知识库中的哪些问题相似度较高。
使用互联网档案馆网站中的旅行相关的数据集（https://ia800107.us.archive.org/27/items/stackexchange/travel.stackexchange.com.7z） ，已经上传到obs，使用预训练模型，再训练，再使用LSTM模型三步逐步改进模型，相似度可达到90%以上。

In [1]:
# 导包
import re
import os
import keras.backend as K
import numpy as np
import pandas as pd
from keras import layers, models, utils
import json

Using TensorFlow backend.


In [2]:
def reset_everything():
    import tensorflow as tf
    %reset -f in out dhist
    tf.reset_default_graph()
    K.set_session(tf.InteractiveSession())

In [3]:
VOCAB_SIZE = 250000
EMBEDDING_SIZE = 100
MAX_DOC_LEN = 128
MIN_DOC_LEN = 12

## 获取数据集
从obs获取数据集，得到的是utf-16编码的txt文件，将该文件转码，并处理成我们需要的json文件

In [4]:
from modelarts.session import Session
sess = Session()

if sess.region_name == 'cn-north-4':
    bucket_path="wabao-awe01/pps.txt"

sess.download_data(bucket_path=bucket_path, path="./pps.txt")

def extract_stackexchange(limit=1000000):
  json_file = 'limit=%s.json' % limit
  rows = []
  for i, line in enumerate(open("./pps.txt",'r', encoding="utf-16")):
    line = str(line)
    if not line.startswith('  <row'):
      continue
    if i % 1000 == 0:
      print('\r%05d/%05d' % (i, limit), end='', flush=True)
    parts = line[6:-5].split('"')
    record = {}
    for i in range(0, len(parts), 2):
      k = parts[i].replace('=', '').strip()
      v = parts[i+1].strip()
      record[k] = v
    rows.append(record)
    if len(rows) > limit:
      break
  with open(json_file, 'w') as fout:
    json.dump(rows, fout)
  return rows

# xml_7z = utils.get_file(
#     fname='travel.stackexchange.com.7z',
#     origin='https://ia800107.us.archive.org/27/items/stackexchange/travel.stackexchange.com.7z',
# )
# print()

rows = extract_stackexchange()

Successfully download file wabao-awe01/pps.txt from OBS to local ./pps.txt
102000/1000000

## 探索数据集
使用pandas装换数据，并快速查看数据中的热门问题

In [5]:
# print(rows)
df = pd.DataFrame.from_records(rows)    
df = df.set_index('Id', drop=False)
df['Title'] = df['Title'].fillna('').astype('str')
df['Tags'] = df['Tags'].fillna('').astype('str')
df['Body'] = df['Body'].fillna('').astype('str')
df['Id'] = df['Id'].astype('int')
df['PostTypeId'] = df['PostTypeId'].astype('int')
df['ViewCount'] = df['ViewCount'].astype('float')

df.head()

Unnamed: 0_level_0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id,LastActivityDate,...,LastEditorDisplayName,LastEditorUserId,OwnerDisplayName,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,393.0,4.0,&lt;p&gt;My fiancée and I are looking for a go...,2013-02-25T23:52:47.953,4,,2011-06-21T20:19:34.730,,1,2012-05-24T14:52:14.760,...,,101.0,,9,,1,8,&lt;caribbean&gt;&lt;cruising&gt;&lt;vacations...,What are some Caribbean cruises for October?,479.0
2,,8.0,&lt;p&gt;This was one of our definition questi...,,4,,2011-06-21T20:22:33.760,5.0,2,2018-08-26T00:04:13.520,...,,51577.0,,13,,1,38,&lt;guides&gt;&lt;extreme-tourism&gt;&lt;amazo...,How can I find a guide that will take me safel...,2364.0
3,,,&lt;p&gt;One way would be to go through an Adv...,,2,,2011-06-21T20:24:28.080,,3,2011-06-21T20:24:28.080,...,,,,9,2.0,2,15,,,
4,,1.0,&lt;p&gt;Singapore Airlines has an all-busines...,,1,,2011-06-21T20:24:57.160,,4,2013-01-09T09:55:22.743,...,,693.0,,24,,1,8,&lt;loyalty-programs&gt;&lt;routes&gt;&lt;ewr&...,Does Singapore Airlines offer any reward seats...,267.0
5,770.0,5.0,&lt;p&gt;Another definition question that inte...,,0,,2011-06-21T20:25:56.787,2.0,5,2012-10-12T20:49:08.110,...,,101.0,,13,,1,14,&lt;romania&gt;&lt;transportation&gt;,What is the easiest transportation to use thro...,440.0


In [6]:
list(df[df['ViewCount'] > 250000]['Title'])

['Do I need a US visa to transit (or layover) through an American airport?',
 'How much electronics and other valuables can I bring duty-free when going to India?',
 'How to get from Nice to Monaco by public transport?',
 'Should my first trip be to the country which issued my Schengen Visa?',
 'Can I cross the USA-Canada border with a birth certificate and a passport locator number?',
 "What's the difference between 'Redress Number' and 'Known Traveler Number'? Do I need both for TSA PreCheck?",
 'Can I use Google Maps traffic information to estimate driving time for a specific date/time?',
 'Is there a way to find out if I need a transit visa for a layover in the UK?',
 'Are aerosol cans allowed and safe, in checked luggage?',
 'How to track my UK Visa Application Status?',
 "When applying for an Indian Passport, how do I know if I'm in the ECR or non-ECR category?",
 'Are battery packs allowed in hand luggage?']

## 使用keras对文本进行特征化
将文本转换为特征向量

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['Body'] + df['Title'])

In [8]:
# Compute TF/IDF Values

total_count = sum(tokenizer.word_counts.values())
idf = { k: np.log(total_count/v) for (k,v) in tokenizer.word_counts.items() }

In [10]:
# Download pre-trained word2vec embeddings

import gensim

from modelarts.session import Session
sess = Session()

if sess.region_name == 'cn-north-4':
    bucket_path="wabao-awe01/glove.6B.100d.txt"

sess.download_data(bucket_path=bucket_path, path="./glove.6B.100d.txt")

# glove_100d = utils.get_file(
#     fname='glove.6B.100d.txt',
#     origin='https://storage.googleapis.com/deep-learning-cookbook/glove.6B.100d.txt',
# )
glove_100d = 'glove.6B.100d.txt'

w2v_100d = glove_100d + '.w2v'
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_100d, w2v_100d)
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_100d)

w2v_weights = np.zeros((VOCAB_SIZE, w2v_model.syn0.shape[1]))
idf_weights = np.zeros((VOCAB_SIZE, 1))

for k, v in tokenizer.word_index.items():
    if v >= VOCAB_SIZE:
        continue
    
    if k in w2v_model:
        w2v_weights[v] = w2v_model[k]
    
    idf_weights[v] = idf[k]
    
del w2v_model

Successfully download file wabao-awe01/glove.6B.100d.txt from OBS to local ./glove.6B.100d.txt




In [11]:
df['title_tokens'] = tokenizer.texts_to_sequences(df['Title'])
df['body_tokens'] = tokenizer.texts_to_sequences(df['Body'])

## 数据生成
利用pandas中的过滤器和采样构建数据生成器

In [12]:
import random

# We can create a data generator that will randomly title and body tokens for questions.  We'll use random text
# from other questions as a negative example when necessary.
def data_generator(batch_size, negative_samples=1):
    questions = df[df['PostTypeId'] == 1]
    all_q_ids = list(questions.index)
        
    batch_x_a = []
    batch_x_b = []
    batch_y = []
    
    def _add(x_a, x_b, y):
        batch_x_a.append(x_a[:MAX_DOC_LEN])
        batch_x_b.append(x_b[:MAX_DOC_LEN])
        batch_y.append(y)
    
    while True:
        questions = questions.sample(frac=1.0)
        
        for i, q in questions.iterrows():
            _add(q['title_tokens'], q['body_tokens'], 1)
            
            negative_q = random.sample(all_q_ids, negative_samples)
            for nq_id in negative_q:
                _add(q['title_tokens'], df.at[nq_id, 'body_tokens'], 0)            
            
            if len(batch_y) >= batch_size:
                yield ({
                    'title': pad_sequences(batch_x_a, maxlen=None),
                    'body': pad_sequences(batch_x_b, maxlen=None),
                }, np.asarray(batch_y))
                
                batch_x_a = []
                batch_x_b = []
                batch_y = []

## 构建嵌入模型
进来计算数据集中每个问题的表示

In [13]:
questions = df[df['PostTypeId'] == 1]['Title'].reset_index(drop=True)
question_tokens = pad_sequences(tokenizer.texts_to_sequences(questions))

class EmbeddingWrapper(object):
    def __init__(self, model):
        self._r = questions
        self._i = {i:s for (i, s) in enumerate(questions)}
        self._w = model.predict({'title': question_tokens}, verbose=1, batch_size=1024)
        self._model = model
        self._norm = np.sqrt(np.sum(self._w * self._w + 1e-5, axis=1))

    def nearest(self, sentence, n=10):
        x = tokenizer.texts_to_sequences([sentence])
        if len(x[0]) < MIN_DOC_LEN:
            x[0] += [0] * (MIN_DOC_LEN - len(x))
        e = self._model.predict(np.asarray(x))[0]
        norm_e = np.sqrt(np.dot(e, e))
        dist = np.dot(self._w, e) / (norm_e * self._norm)

        top_idx = np.argsort(dist)[-n:]
        return pd.DataFrame.from_records([
            {'question': self._r[i], 'dist': float(dist[i])}
            for i in top_idx
        ])

In [14]:
import tensorflow as tf

def sum_model(embedding_size, vocab_size, embedding_weights=None, idf_weights=None):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    def make_embedding(name):
        if embedding_weights is not None:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=w2v_weights.shape[1], 
                                         weights=[w2v_weights], trainable=False, 
                                         name='%s/embedding' % name)
        else:
            embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=embedding_size,
                                        name='%s/embedding' % name)

        if idf_weights is not None:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1, 
                                   weights=[idf_weights], trainable=False,
                                   name='%s/idf' % name)
        else:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1,
                                   name='%s/idf' % name)
            
        return embedding, idf
    
    embedding_a, idf_a = make_embedding('a')
    embedding_b, idf_b = embedding_a, idf_a
#     embedding_b, idf_b = make_embedding('b')

    mask = layers.Masking(mask_value=0)
    def _combine_and_sum(args):
        [embedding, idf] = args
        return K.sum(embedding * K.abs(idf), axis=1)

    sum_layer = layers.Lambda(_combine_and_sum, name='combine_and_sum')

    sum_a = sum_layer([mask(embedding_a(title)), idf_a(title)])
    sum_b = sum_layer([mask(embedding_b(body)), idf_b(body)])

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    sim_model.summary()

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

## 使用预训练的权重进行检查相似性
使用glove.6B.100d.txt中的权重

In [15]:
# Try using our model with pretrained weights from word2vec

sum_model_precomputed, sum_embedding_precomputed = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE,
    embedding_weights=w2v_weights, idf_weights=idf_weights
)

x, y = next(data_generator(batch_size=4096))
sum_model_precomputed.evaluate(x, y)

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
a/embedding (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 100)    0           a/embedding[0][0]                
          

[0.9663632987067103, 0.51025390625]

In [16]:
SAMPLE_QUESTIONS = [
    'Roundtrip ticket versus one way',
    'Shinkansen from Kyoto to Hiroshima',
    'Bus tour of Germany',
]

def evaluate_sample(lookup):
    pd.set_option('display.max_colwidth', 100)
    results = []
    for q in SAMPLE_QUESTIONS:
        print(q)
        q_res = lookup.nearest(q, n=4)
        q_res['result'] = q_res['question']
        q_res['question'] = q
        results.append(q_res)

    return pd.concat(results)

lookup = EmbeddingWrapper(model=sum_embedding_precomputed)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,result
0,0.811454,Roundtrip ticket versus one way,"Buy a roundtrip ticket for two people, but second person only travels on return - is that possible"
1,0.813222,Roundtrip ticket versus one way,How to pick the (phony) return destination for a roundtrip ticket intended as a one-way?
2,0.814976,Roundtrip ticket versus one way,What is cheapest way to fly around SE Asia in a circuit - hub with roundtrip tickets or sequence...
3,0.826162,Roundtrip ticket versus one way,The penalty for changing an airline ticket is per leg or per ticket?
0,0.752807,Shinkansen from Kyoto to Hiroshima,Culture Day in Osaka/Kyoto
1,0.756952,Shinkansen from Kyoto to Hiroshima,Where does the Tokaido Shinkansen stop in Tokyo?
2,0.775399,Shinkansen from Kyoto to Hiroshima,Best connection Tokyo - Kyoto
3,0.812965,Shinkansen from Kyoto to Hiroshima,Travel from Tokyo to Sendai with Shinkansen
0,0.890951,Bus tour of Germany,Trip in the south of Germany
1,0.894996,Bus tour of Germany,Travelling outside of Germany on a German Working Holiday visa (Australian)


## 自己训练权重，进行检查

In [17]:
sum_model_trained, sum_embedding_trained = sum_model(
    embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE, 
    embedding_weights=None,
    idf_weights=None
)
sum_model_trained.fit_generator(
    data_generator(batch_size=128),
    epochs=5,
    steps_per_epoch=1000
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
a/embedding (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
masking_2 (Masking)             (None, None, 100)    0           a/embedding[0][0]                
          

Instructions for updating:
Use tf.cast instead.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f1d063074e0>

In [18]:
lookup = EmbeddingWrapper(model=sum_embedding_trained)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,result
0,0.842127,Roundtrip ticket versus one way,Is it okay to board only the second flight of a return ticket? The return ticket is much cheaper...
1,0.842878,Roundtrip ticket versus one way,Why are one-way plane tickets more expensive than return tickets?
2,0.854418,Roundtrip ticket versus one way,Are airline ticket prices cheaper for some flights than others even before tickets are sold?
3,0.878517,Roundtrip ticket versus one way,"Buy a roundtrip ticket for two people, but second person only travels on return - is that possible"
0,0.965217,Shinkansen from Kyoto to Hiroshima,Options for sending a message while on a Shinkansen
1,0.969414,Shinkansen from Kyoto to Hiroshima,What are my options for reserving JR Shinkansen tickets in advance over the new year period?
2,0.970185,Shinkansen from Kyoto to Hiroshima,What does my Shinkansen ticket say?
3,0.972388,Shinkansen from Kyoto to Hiroshima,How early should I reserve Shinkansen tickets during April?
0,0.760687,Bus tour of Germany,About inter-city and inter-country bus services in Europe
1,0.76822,Bus tour of Germany,European bus tour companies for middle age people?


## 使用LSTM Model进行训练模型，并检查

In [19]:
def lstm_model(embedding_size, vocab_size):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    embedding = layers.Embedding(
        mask_zero=True,
        input_dim=vocab_size,
        output_dim=embedding_size,
#         weights=[w2v_weights],
#         trainable=False
    )

    lstm_1 = layers.LSTM(units=512, return_sequences=True)
    lstm_2 = layers.LSTM(units=512, return_sequences=False)
    
    sum_a = lstm_2(lstm_1(embedding(title)))
    sum_b = lstm_2(lstm_1(embedding(body)))

    sim = layers.dot([sum_a, sum_b], axes=1, normalize=True)
#     sim = layers.Activation(activation='sigmoid')(sim)
    sim_model = models.Model(
        inputs=[title, body],
        outputs=[sim],
    )
    sim_model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    embedding_model = models.Model(
        inputs=[title],
        outputs=[sum_a]
    )
    return sim_model, embedding_model

lstm, lstm_embedding = lstm_model(embedding_size=EMBEDDING_SIZE, vocab_size=VOCAB_SIZE)
lstm.summary()
lstm.fit_generator(
    data_generator(batch_size=128),
    epochs=2,
    steps_per_epoch=100,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, None, 512)    1255424     embedding_1[0][0]                
          

<keras.callbacks.History at 0x7f1cf5c5ee48>

In [20]:
lookup = EmbeddingWrapper(model=lstm_embedding)
evaluate_sample(lookup)

Roundtrip ticket versus one way
Shinkansen from Kyoto to Hiroshima
Bus tour of Germany


Unnamed: 0,dist,question,result
0,0.976807,Roundtrip ticket versus one way,Philadelphia to Newark Airport without AirTrain
1,0.976844,Roundtrip ticket versus one way,How to read this travel itinerary?
2,0.976883,Roundtrip ticket versus one way,Cheap hostel around London City Airport
3,0.977388,Roundtrip ticket versus one way,How to see the Norwegian fjords
0,0.974853,Shinkansen from Kyoto to Hiroshima,Commuting from Frankfurt to Bad Homburg
1,0.9752,Shinkansen from Kyoto to Hiroshima,From F-1 to B2 Status
2,0.975333,Shinkansen from Kyoto to Hiroshima,Transit question from Dubai to Chennai
3,0.975428,Shinkansen from Kyoto to Hiroshima,Walking from Santiago to Buenos Aires
0,0.969951,Bus tour of Germany,Transfer ownership of Oyster card
1,0.969973,Bus tour of Germany,Issue regarding turkish e-visa?


In [22]:
lookup.nearest('Where is HongKong')

Unnamed: 0,dist,question
0,0.909965,Where is this castle?
1,0.914093,Where is this building?
2,0.914417,"Where is Kechror, Turkey?"
3,0.914495,Where is Waze accurate?
4,0.914628,Where is this statue?
5,0.915962,Where is this city?
6,0.916129,Where is this geyser?
7,0.916339,Where is this gorge?
8,0.916458,Where is this lake?
9,0.917088,Where is this mural?
