In [5]:
#預處理
#模型
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import pandas as pd
import numpy as np
import string, os 
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

hotel_df = pd.read_csv('Seattle_Hotels.csv', encoding="latin-1")
all_descriptions = list(hotel_df.desc.values)
len(all_descriptions)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


152

In [6]:
corpus = [x for x in all_descriptions]

In [10]:
corpus[:1]

["Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. \nThe neighborhood is home to numerous major international companies including Amazon, Google and the Bill & Melinda Gates Foundation. A wealth of eclectic restaurants and bars make this area of Seattle one of the most sought out by locals and visitors. Our proximity to Lake Union allows visitors to take in some of the Pacific Northwest's majestic scenery and enjoy outdoor activities like kayaking and sailing. over 2,000 sq. ft. of versatile space and a complimentary business center. State-of-the-art A/V technology and our helpful staff will guarantee your conference, cocktail reception or wedding is a success. Refresh in the sparkling saltwater pool, or energize with the latest equipment in the 24-hour fitness center. Tastefully decorated and flooded with natural light, our guest rooms and suites offer everything you need to relax and stay productive

In [16]:
#將特殊符號等字都篩選出來排除,並且將英文全都轉為小寫,過去這些可能需要一行行的code
#但在這裡keras的預處理便可以設定一次做完這些事
t = Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', \
              lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

def get_sequence_of_tokens(corpus):
    t.fit_on_texts(corpus)
    total_words = len(t.word_index) + 1
    
    input_sequences = []
    for line in corpus:
        
        token_list = t.texts_to_sequences([line])[0]
        #觀察token_list的長相
        print(token_list,'\n')
        for i in range(1, len(token_list)):
            #這裡有一個n-gram的做法
            n_gram_sequence = token_list[:i+1]
            
            input_sequences.append(n_gram_sequence)
            
    return input_sequences, total_words
input_sequences, total_words = get_sequence_of_tokens(corpus)
input_sequences[:10]


[24, 21, 1, 1734, 1735, 4, 81, 111, 1, 330, 331, 37, 6, 20, 9, 11, 547, 24, 15, 35, 2, 215, 1, 112, 11, 56, 5, 1736, 306, 113, 548, 64, 201, 638, 2, 1, 766, 767, 639, 768, 3, 1737, 4, 446, 98, 2, 485, 70, 46, 71, 4, 6, 40, 4, 1, 84, 1738, 169, 41, 640, 2, 769, 8, 486, 5, 81, 111, 942, 769, 5, 118, 7, 176, 4, 1, 76, 1739, 770, 1204, 2, 33, 170, 447, 67, 943, 2, 1740, 266, 377, 332, 771, 772, 4, 944, 32, 2, 3, 47, 35, 18, 124, 4, 1, 72, 3, 1741, 773, 2, 8, 1205, 407, 65, 1206, 16, 549, 550, 487, 17, 774, 11, 3, 775, 776, 7, 1, 1207, 1742, 92, 17, 1208, 10, 1, 777, 945, 7, 1, 77, 114, 78, 18, 778, 779, 2, 1743, 10, 448, 142, 8, 57, 26, 2, 45, 51, 216, 14, 154, 5, 155, 2, 25, 780, 333, 7, 1, 103, 2, 33, 641, 334, 15, 42, 781, 2, 488, 7, 8, 119, 1, 77, 114, 1744, 1209, 1745, 3, 307, 4, 408, 551, 2, 1210] 

[24, 7, 1, 642, 267, 643, 1, 449, 281, 6, 409, 3, 782, 5, 1, 644, 1211, 2, 1212, 4, 1, 76, 82, 335, 169, 4, 8, 336, 450, 5, 120, 1746, 95, 2, 485, 180, 282, 121, 489, 337, 2, 283, 66, 52,

[[24, 21],
 [24, 21, 1],
 [24, 21, 1, 1734],
 [24, 21, 1, 1734, 1735],
 [24, 21, 1, 1734, 1735, 4],
 [24, 21, 1, 1734, 1735, 4, 81],
 [24, 21, 1, 1734, 1735, 4, 81, 111],
 [24, 21, 1, 1734, 1735, 4, 81, 111, 1],
 [24, 21, 1, 1734, 1735, 4, 81, 111, 1, 330],
 [24, 21, 1, 1734, 1735, 4, 81, 111, 1, 330, 331]]

In [52]:
# pad sequences 
def generate_padded_sequences(input_sequences):
    max_sequence_len =101
    #maxlen設置最長的序列長度，長於該長度會被截短，短於該長度會被補滿
    input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_sequence_len, padding = 'pre'))
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes = total_words)
    
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)

In [53]:
len(predictors[0])

100

In [54]:
def create_model(max_sequence_len, total_words):
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=max_sequence_len - 1))
    
    # Add Hidden Layer 1 - LSTM Layer
    #100個神經元
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    #激活函數選擇softmax
    model.add(Dense(total_words, activation='softmax'))
    #損失函數使用 cross entropy(神經網路常用) 並用adam作為優化加速
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()
#配適模型
#epochs代表此訓練會遞歸30次(RNN的精隨)
model.fit(predictors, label, epochs=30, verbose=5)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 10)           34210     
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 3421)              345521    
Total params: 424,131
Trainable params: 424,131
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
E

<keras.callbacks.History at 0x1ee1722b898>

In [55]:
def generate_text(seed_text, next_words, model, max_seq_len):
    for _ in range(next_words):
        token_list = t.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, \
                                   padding='pre')
        #預測下一個字
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ''
        """因為我們預測到的結果是一個數字(前面做過的轉換),因此在這裡我們需要
        回去用index去對照我們的item,並回傳這個字
        如 24  = located
        """
        for word,index in t.word_index.items():
            if index == predicted:
                output_word = word
                break
                
        seed_text = seed_text + " " + output_word
        
    return seed_text.title()

In [56]:
print(generate_text("hilton seattle downtown", 100, model, max_sequence_len))

Hilton Seattle Downtown Seattle Is Located In The Heart Of Downtown Seattle And The Retail Core The Inn Is Located In The City District To The Pike Place Market And The Space Needle And The Ferries Bound Across The Seattle Center Is Also Located Across The Street From The Seattle Center And The Museum Of Flight Afterwards The Very Best Inn Is A Stroll Away From The Seattle Center And The Museum Of Flight Afterwards Visitors To The Very Best District In The City District Near Seattle And The Museum Of Flight Afterwards Visitors And The Pacific Northwest Shuttle In The Heart Of
