In [None]:
import matchzoo as mz
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training and validation splits (comma-separated, which is the
# pandas default) and fill every missing value with a single space in the
# same step, so the cell yields the same frames however often it is re-run.
# fillna() states the intent directly: NaN is not a regex pattern, so a
# regex-based replace adds nothing here.
train_dataset = pd.read_csv('../../train_data/train_data_1_1').fillna(' ')
validation_dataset = pd.read_csv('../../train_data/validation_data_1_1').fillna(' ')

In [None]:
# Truncate each split to its first MAX_ROWS rows. Positional slicing past the
# end is safe, so a shorter frame passes through unchanged.
MAX_ROWS = 1000
train_dataset = train_dataset.iloc[:MAX_ROWS]
validation_dataset = validation_dataset.iloc[:MAX_ROWS]

In [None]:
# Reshape the training rows into the column layout handed to mz.pack:
# article fields become the "left" side, table fields the "right" side.
# The frame is built column-wise instead of row-by-row with iterrows():
# iterrows() creates one Series per row and coerces the whole row to a single
# dtype before str() is applied, whereas .map(str) converts each value using
# its own column's dtype.
df1 = pd.DataFrame({
    'id_left': train_dataset['article_id'].map(str),
    'text_left': train_dataset['article_page_title'].map(str),
    'id_right': train_dataset['table_id'].map(str),
    'text_right': train_dataset['table_page_title'].map(str),
    'label': train_dataset['label'],
}).reset_index(drop=True)  # positional 0..n-1 index, as a frame built from a list of records has

# One dict per row, kept under its existing name in case another cell reads it.
list_data1 = df1.to_dict('records')

train_pack = mz.pack(df1)

In [None]:
# Reshape the validation rows into the column layout handed to mz.pack:
# article fields become the "left" side, table fields the "right" side.
# The frame is built column-wise instead of row-by-row with iterrows():
# iterrows() creates one Series per row and coerces the whole row to a single
# dtype before str() is applied, whereas .map(str) converts each value using
# its own column's dtype.
df2 = pd.DataFrame({
    'id_left': validation_dataset['article_id'].map(str),
    'text_left': validation_dataset['article_page_title'].map(str),
    'id_right': validation_dataset['table_id'].map(str),
    'text_right': validation_dataset['table_page_title'].map(str),
    'label': validation_dataset['label'],
}).reset_index(drop=True)  # positional 0..n-1 index, as a frame built from a list of records has

# One dict per row, kept under its existing name in case another cell reads it.
list_data2 = df2.to_dict('records')

valid_pack = mz.pack(df2)

In [None]:
# Ranking task that uses RankHingeLoss as its loss and reports
# MeanAveragePrecision as its only metric.
hinge_loss = mz.losses.RankHingeLoss()
ranking_task = mz.tasks.Ranking(loss=hinge_loss)
ranking_task.metrics = [mz.metrics.MeanAveragePrecision()]

In [None]:
# Load MatchZoo's GloVe embedding at the requested vector dimension.
EMBEDDING_DIM = 300
glove_embedding = mz.datasets.embeddings.load_glove_embedding(
    dimension=EMBEDDING_DIM
)

In [None]:
# Fit the preprocessor on the training pack only, then apply the fitted
# preprocessor to the validation pack. Left text uses a fixed length of 10,
# right text a fixed length of 100, with stop-word removal enabled.
preprocessor_options = dict(
    fixed_length_left=10,
    fixed_length_right=100,
    remove_stop_words=True,
)
preprocessor = mz.preprocessors.BasicPreprocessor(**preprocessor_options)
train_pack_processed = preprocessor.fit_transform(train_pack)
valid_pack_processed = preprocessor.transform(valid_pack)

In [None]:
# Display the context exposed by the fitted preprocessor. The model cell
# copies it into the model parameters, and the embedding cell reads its
# 'vocab_unit' entry.
preprocessor.context

In [None]:
# Configure the ArcI model: seed its parameters from the preprocessor context,
# then apply the explicit settings below in the listed order.
model = mz.models.ArcI()
model.params.update(preprocessor.context)

arci_settings = (
    ('task', ranking_task),
    ('embedding_output_dim', glove_embedding.output_dim),
    # One block per side, with identical settings for left and right.
    ('num_blocks', 1),
    ('left_filters', [128]),
    ('left_kernel_sizes', [3]),
    ('left_pool_sizes', [4]),
    ('right_filters', [128]),
    ('right_kernel_sizes', [3]),
    ('right_pool_sizes', [4]),
    ('conv_activation_func', 'relu'),
    # MLP settings.
    ('mlp_num_layers', 1),
    ('mlp_num_units', 100),
    ('mlp_num_fan_out', 1),
    ('mlp_activation_func', 'relu'),
    # NOTE(review): 0.9 is unusually high if this is the fraction of units
    # dropped -- confirm it is intended.
    ('dropout_rate', 0.9),
    ('optimizer', 'adadelta'),
)
for param_name, param_value in arci_settings:
    model.params[param_name] = param_value

# Fill any parameters still unset, then build, compile and print the summary.
model.guess_and_fill_missing_params()
model.build()
model.compile()
model.backend.summary()

In [None]:
# Build the embedding matrix from the term index stored in the preprocessor's
# vocabulary unit, then load it into the model.
term_index = preprocessor.context['vocab_unit'].state['term_index']
embedding_matrix = glove_embedding.build_matrix(term_index)
model.load_embedding_matrix(embedding_matrix)

In [None]:
# Unpack the processed validation pack and set up the metric-evaluation
# callback. The batch size equals the number of validation labels, so the
# whole validation set is evaluated as one batch.
pred_x, pred_y = valid_pack_processed.unpack()
evaluate = mz.callbacks.EvaluateAllMetrics(
    model,
    x=pred_x,
    y=pred_y,
    batch_size=len(pred_y),
    model_save_path='ARCI_title',
    once_every=1,
)

In [None]:
# Training batches are generated from the processed training pack in 'pair'
# mode with the options below.
generator_options = {
    'mode': 'pair',
    'num_dup': 2,
    'num_neg': 1,
    'batch_size': 20,
}
train_generator = mz.DataGenerator(train_pack_processed, **generator_options)

In [None]:
# Train from the generator for NUM_EPOCHS epochs with the evaluation callback
# attached, keeping the returned history object.
NUM_EPOCHS = 5
history = model.fit_generator(
    train_generator,
    epochs=NUM_EPOCHS,
    callbacks=[evaluate],
)