In [1]:
# Third-party dependencies: MatchZoo for the matching model, pandas/numpy for
# loading and cleaning the CSV data.
import matchzoo as mz
import pandas as pd
import numpy as np
import warnings
# Suppress every warning raised from this point on. The filter is installed
# after the imports above, so it cannot hide warnings emitted while they load.
warnings.filterwarnings("ignore")

  from pandas import Panel
Using TensorFlow backend.
  from pandas import Panel


In [2]:
def _read_pairs(csv_path):
    """Load one comma-separated pair file and blank out its missing cells.

    Every missing value, in every column, is replaced by a single space so
    that later ``str(...)`` conversions of the title columns never yield the
    literal text ``'nan'``.

    ``fillna`` is used instead of ``replace(np.nan, ' ', regex=True)``: the
    result is the same, but the intent is explicit and the regex flag (which
    has no meaning for a NaN pattern) is gone.
    """
    return pd.read_csv(csv_path, delimiter=',').fillna(' ')


train_dataset = _read_pairs('../../train_data/train_data_1_1')
validation_dataset = _read_pairs('../../train_data/validation_data_1_1')

In [3]:
# Keep only the leading rows of each split (presumably to keep this run small).
ROW_LIMIT = 1000
train_dataset = train_dataset.iloc[:ROW_LIMIT]
validation_dataset = validation_dataset.iloc[:ROW_LIMIT]

In [4]:
# Build the MatchZoo input frame for training: one row per (article, table)
# pair, with the page titles as the left/right texts and ids cast to strings.
# Whole-column operations replace the former `iterrows()` loop, which is slow
# and rebuilds every row as a Series (so per-column dtypes are not preserved).
df1 = pd.DataFrame({
    'id_left': train_dataset['article_id'].astype(str),
    'text_left': train_dataset['article_page_title'].astype(str),
    'id_right': train_dataset['table_id'].astype(str),
    'text_right': train_dataset['table_page_title'].astype(str),
    'label': train_dataset['label'],
}).reset_index(drop=True)  # fresh 0..n-1 index, as the list-built frame had

# Not read by any cell shown here; retained only so this cell still defines
# every name it used to.
list_data1 = df1.to_dict('records')

train_pack = mz.pack(df1)

In [5]:
# Build the MatchZoo input frame for validation: one row per (article, table)
# pair, with the page titles as the left/right texts and ids cast to strings.
# Whole-column operations replace the former `iterrows()` loop, which is slow
# and rebuilds every row as a Series (so per-column dtypes are not preserved).
df2 = pd.DataFrame({
    'id_left': validation_dataset['article_id'].astype(str),
    'text_left': validation_dataset['article_page_title'].astype(str),
    'id_right': validation_dataset['table_id'].astype(str),
    'text_right': validation_dataset['table_page_title'].astype(str),
    'label': validation_dataset['label'],
}).reset_index(drop=True)  # fresh 0..n-1 index, as the list-built frame had

# Not read by any cell shown here; retained only so this cell still defines
# every name it used to.
list_data2 = df2.to_dict('records')

valid_pack = mz.pack(df2)

In [6]:
# Ranking task trained with a rank hinge loss and scored by mean average
# precision.
hinge_loss = mz.losses.RankHingeLoss()
ranking_task = mz.tasks.Ranking(loss=hinge_loss)
ranking_task.metrics = [mz.metrics.MeanAveragePrecision()]

In [7]:
# Load the 300-dimensional GloVe vectors bundled with MatchZoo's datasets.
# NOTE(review): no cell shown in this notebook reads `glove_embedding`;
# confirm it is still needed before paying for the load.
GLOVE_DIMENSION = 300
glove_embedding = mz.datasets.embeddings.load_glove_embedding(
    dimension=GLOVE_DIMENSION
)

In [8]:
# Both sides (article title, table title) are fixed to the same length.
TITLE_LENGTH = 10
preprocessor = mz.preprocessors.CDSSMPreprocessor(
    fixed_length_left=TITLE_LENGTH,
    fixed_length_right=TITLE_LENGTH,
)

# Fit on the training pack only, then reuse the fitted preprocessor on the
# validation pack.
train_pack_processed = preprocessor.fit_transform(train_pack)
valid_pack_processed = preprocessor.transform(valid_pack)

Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval => StopRemoval => NgramLetter: 100%|██████████| 989/989 [00:00<00:00, 6564.89it/s]
Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval => StopRemoval => NgramLetter: 100%|██████████| 996/996 [00:00<00:00, 8657.87it/s]
Processing text_left with extend: 100%|██████████| 989/989 [00:00<00:00, 415898.00it/s]
Processing text_right with extend: 100%|██████████| 996/996 [00:00<00:00, 501564.03it/s]
Building Vocabulary from a datapack.: 100%|██████████| 53383/53383 [00:00<00:00, 2929652.22it/s]
Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval => StopRemoval: 100%|██████████| 989/989 [00:00<00:00, 7782.03it/s]
Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval => StopRemoval: 100%|██████████| 996/996 [00:00<00:00, 10127.44it/s]
Processing text_left with transform: 100%|██████████| 989/989 [00:00<00:00, 103275.57it/s]
Pr

In [9]:
# All CDSSM hyper-parameters in one place; they are applied to the model's
# parameter table in exactly this order.
cdssm_settings = {
    'input_shapes': preprocessor.context['input_shapes'],
    'task': ranking_task,
    'filters': 64,
    'kernel_size': 3,
    'strides': 1,
    'padding': 'same',
    'conv_activation_func': 'tanh',
    'w_initializer': 'glorot_normal',
    'b_initializer': 'zeros',
    'mlp_num_layers': 1,
    'mlp_num_units': 64,
    'mlp_num_fan_out': 64,
    'mlp_activation_func': 'tanh',
    'dropout_rate': 0.8,
    'optimizer': 'adadelta',
}

model = mz.models.CDSSM()
for param_name, param_value in cdssm_settings.items():
    model.params[param_name] = param_value

# Let MatchZoo fill whatever was not set above, then build and compile the
# underlying network and print its layer summary.
model.guess_and_fill_missing_params()
model.build()
model.compile()
model.backend.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_left (InputLayer)          (None, 10, 4166)     0                                            
__________________________________________________________________________________________________
text_right (InputLayer)         (None, 10, 4166)     0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 10, 64)       799936      text_left[0][0]                  
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 10, 64)       799936      text_right[0][0]                 
____________________________________________________________________________________________

In [10]:
# Pair-mode batches drawn from the preprocessed training pack.
pair_sampling = dict(mode='pair', num_dup=2, num_neg=1, batch_size=20)
train_generator = mz.DataGenerator(train_pack_processed, **pair_sampling)

In [11]:
# Take a full slice of the processed validation pack and split it into model
# inputs and labels.
validation_slice = valid_pack_processed[:]
pred_x, pred_y = validation_slice.unpack()

In [12]:
# Evaluate every metric on the validation data after each epoch and save the
# model under 'CDSSM_title'.
#
# Per the MatchZoo docs, `DataPack.unpack()` returns the inputs as a dict of
# arrays, so `len(pred_x)` is the number of input fields, not the number of
# validation pairs. The batch size is taken from the label array instead, so
# the whole validation set is scored in a single batch as intended.
evaluate = mz.callbacks.EvaluateAllMetrics(
    model,
    x=pred_x,
    y=pred_y,
    batch_size=len(pred_y),
    model_save_path='CDSSM_title',
    once_every=1,
)

In [13]:
# Train from the pair generator, running the validation callback during
# training.
NUM_EPOCHS = 5
history = model.fit_generator(
    train_generator,
    epochs=NUM_EPOCHS,
    callbacks=[evaluate],
)

Epoch 1/5
Validation: mean_average_precision(0.0): 0.5096153846153846
Epoch 2/5
Validation: mean_average_precision(0.0): 0.5096153846153846
Epoch 3/5
Validation: mean_average_precision(0.0): 0.5101214574898786
Epoch 4/5
Validation: mean_average_precision(0.0): 0.5101214574898786
Epoch 5/5
Validation: mean_average_precision(0.0): 0.5106275303643725
