In [1]:
import matchzoo as mz
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

  from pandas import Panel
Using TensorFlow backend.
  from pandas import Panel


In [2]:
# Load the training and validation splits of article/table pairs.
train_dataset = pd.read_csv('../../train_data/train_data_1_1', delimiter=',')
validation_dataset = pd.read_csv('../../train_data/validation_data_1_1', delimiter=',')

# Blank out missing values so later cells never see NaN. `fillna` is the
# direct call for this; a regex-mode `replace` is meant for string patterns,
# not for a NaN target. Note that NaN in numeric columns also becomes ' '.
train_dataset = train_dataset.fillna(' ')
validation_dataset = validation_dataset.fillna(' ')

In [3]:
# Keep only the first 1000 rows of each split so the experiment stays small.
train_dataset = train_dataset.iloc[:1000]
validation_dataset = validation_dataset.iloc[:1000]

In [4]:
# Build the frame MatchZoo expects (id_left/text_left/id_right/text_right/label)
# directly from the columns instead of looping over rows with iterrows():
# ids and page titles are cast to str, the label column is passed through as is.
df1 = pd.DataFrame({
    'id_left': train_dataset['article_id'].astype(str),
    'text_left': train_dataset['article_page_title'].astype(str),
    'id_right': train_dataset['table_id'].astype(str),
    'text_right': train_dataset['table_page_title'].astype(str),
    'label': train_dataset['label'],
}).reset_index(drop=True)  # fresh 0..n-1 index, as a frame built from a list of records has

train_pack = mz.pack(df1)

In [5]:
# Build the validation frame in the layout MatchZoo expects
# (id_left/text_left/id_right/text_right/label) directly from the columns
# instead of looping over rows with iterrows(): ids and page titles are cast
# to str, the label column is passed through as is.
df2 = pd.DataFrame({
    'id_left': validation_dataset['article_id'].astype(str),
    'text_left': validation_dataset['article_page_title'].astype(str),
    'id_right': validation_dataset['table_id'].astype(str),
    'text_right': validation_dataset['table_page_title'].astype(str),
    'label': validation_dataset['label'],
}).reset_index(drop=True)  # fresh 0..n-1 index, as a frame built from a list of records has

valid_pack = mz.pack(df2)

In [6]:
# Ranking task trained with a rank hinge loss and scored with
# mean average precision.
hinge_loss = mz.losses.RankHingeLoss()
ranking_task = mz.tasks.Ranking(loss=hinge_loss)
ranking_task.metrics = [mz.metrics.MeanAveragePrecision()]

In [7]:
# Load the 300-dimensional GloVe vectors through MatchZoo's dataset helper.
glove_dimension = 300
glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=glove_dimension)

In [8]:
# The preprocessor is fitted on the training pack only; the validation pack
# is transformed with that already-fitted state.
# NOTE(review): the left length (10) is repeated in the model's input_shapes
# further down — keep the two in sync.
preprocessor = mz.preprocessors.BasicPreprocessor(
    fixed_length_left=10,
    fixed_length_right=100,
    remove_stop_words=True,
)
train_pack_processed = preprocessor.fit_transform(train_pack)
valid_pack_processed = preprocessor.transform(valid_pack)

Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval => StopRemoval: 100%|██████████| 989/989 [00:00<00:00, 9729.35it/s]
Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval => StopRemoval: 100%|██████████| 996/996 [00:00<00:00, 12907.62it/s]
Processing text_right with append: 100%|██████████| 996/996 [00:00<00:00, 804143.75it/s]
Building FrequencyFilter from a datapack.: 100%|██████████| 996/996 [00:00<00:00, 552509.82it/s]
Processing text_right with transform: 100%|██████████| 996/996 [00:00<00:00, 531762.57it/s]
Processing text_left with extend: 100%|██████████| 989/989 [00:00<00:00, 943408.38it/s]
Processing text_right with extend: 100%|██████████| 996/996 [00:00<00:00, 1000844.94it/s]
Building Vocabulary from a datapack.: 100%|██████████| 7401/7401 [00:00<00:00, 3814456.12it/s]
Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval => StopRemoval: 100%|██████████| 989/989 [00:00<00:00, 10201.68it

In [9]:
# Display the preprocessor's context dict (filter/vocab units, vocab size,
# embedding input dim and input shapes) that the model parameters are seeded from.
preprocessor.context

{'filter_unit': <matchzoo.preprocessors.units.frequency_filter.FrequencyFilter at 0x7ff949b26850>,
 'vocab_unit': <matchzoo.preprocessors.units.vocabulary.Vocabulary at 0x7ff949e20ed0>,
 'vocab_size': 3859,
 'embedding_input_dim': 3859,
 'input_shapes': [(10,), (100,)]}

In [10]:
# Configure, build and compile the DRMM ranking model.
bin_size = 30  # number of bins in each matching histogram

# Read the left sequence length from the fitted preprocessor instead of
# repeating the literal, so it cannot drift from `fixed_length_left`.
left_length = preprocessor.context['input_shapes'][0][0]

model = mz.models.DRMM()
model.params.update(preprocessor.context)
# The context's input shapes describe (left text, right text); DRMM's second
# input is the match histogram, so the shapes are overridden here.
model.params['input_shapes'] = [[left_length], [left_length, bin_size]]
model.params['task'] = ranking_task
model.params['mask_value'] = 0
model.params['embedding_output_dim'] = glove_embedding.output_dim
model.params['mlp_num_layers'] = 1
model.params['mlp_num_units'] = 10
model.params['mlp_num_fan_out'] = 1
model.params['mlp_activation_func'] = 'tanh'
model.params['optimizer'] = 'adadelta'
model.build()
model.compile()
model.backend.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_left (InputLayer)          (None, 10)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 10, 300)      1157700     text_left[0][0]                  
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 10, 1)        300         embedding[0][0]                  
__________________________________________________________________________________________________
match_histogram (InputLayer)    (None, 10, 30)       0                                            
____________________________________________________________________________________________

In [11]:
# Build the embedding matrix for the fitted vocabulary (one row per term index).
embedding_matrix = glove_embedding.build_matrix(preprocessor.context['vocab_unit'].state['term_index'])
# Normalize every word vector to unit L2 length (for fast histogram generating).
l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
# An all-zero row has norm 0; dividing by it would turn the row into NaN and
# poison both the histograms and the embedding layer, so leave such rows as zeros.
safe_norm = np.where(l2_norm == 0, 1.0, l2_norm)
embedding_matrix = embedding_matrix / safe_norm[:, np.newaxis]
model.load_embedding_matrix(embedding_matrix)

In [12]:
# Histogram callback that adds the match-histogram input to each batch.
# Reuse `bin_size` from the model configuration cell so the histogram width
# always matches the model's match_histogram input shape.
hist_callback = mz.data_generator.callbacks.Histogram(embedding_matrix, bin_size=bin_size, hist_mode='LCH')

In [13]:
# Materialize the whole validation pack (point mode, with histograms) once,
# then evaluate all task metrics on it after every epoch in a single batch.
pred_generator = mz.DataGenerator(
    valid_pack_processed,
    mode='point',
    callbacks=[hist_callback],
)
pred_x, pred_y = pred_generator[:]
evaluate = mz.callbacks.EvaluateAllMetrics(
    model,
    x=pred_x,
    y=pred_y,
    once_every=1,
    batch_size=len(pred_y),
    model_save_path='DRMM_title',
)

In [14]:
# Pair-mode generator over the processed training pack; the histogram
# callback is attached so each batch carries the match-histogram input.
pair_sampling = dict(mode='pair', num_dup=5, num_neg=10, batch_size=20)
train_generator = mz.DataGenerator(
    train_pack_processed,
    callbacks=[hist_callback],
    **pair_sampling
)

In [15]:
# Train for five epochs; the evaluation callback reports validation metrics.
num_epochs = 5
history = model.fit_generator(
    train_generator,
    epochs=num_epochs,
    callbacks=[evaluate],
)

Epoch 1/5
Validation: mean_average_precision(0.0): 0.5101214574898786
Epoch 2/5
Validation: mean_average_precision(0.0): 0.5101214574898786
Epoch 3/5
Validation: mean_average_precision(0.0): 0.5101214574898786
Epoch 4/5
Validation: mean_average_precision(0.0): 0.5101214574898786
Epoch 5/5
Validation: mean_average_precision(0.0): 0.5101214574898786
