Skip to content

Commit

Permalink
upload data generator
Browse files Browse the repository at this point in the history
  • Loading branch information
jacoxu committed Dec 28, 2016
1 parent 49ae195 commit 66b3656
Show file tree
Hide file tree
Showing 62 changed files with 5,753 additions and 0 deletions.
17 changes: 17 additions & 0 deletions datasets/data_generator/README.md
@@ -0,0 +1,17 @@
The data generator for HMN4QA

COLING2016 - Hierarchical Memory Networks for Answer Selection on Unknown Words

There are four datasets in total; their statistics are listed below:

Domain Train/Dev/Test Vocab (Total/Train/Dev/Test) UNK Answer (Dev/Test) Lang

Air-Ticket Booking 5,400/600/6,000 8,553/4,279/729/4,682 405 (67.5%)/3,993 (66.6%) CH

Hotel Reservation 5,400/600/6,000 7,586/3,755/613/4,115 367 (61.2%)/3,690 (61.5%) CH

Air-Ticket Booking 5,400/600/6,000 7,497/3,972/712/4,314 342 (57.0%)/3,477 (58.0%) EN

Hotel Reservation 5,400/600/6,000 7,134/3,593/577/3,930 357 (59.5%)/3,452 (57.5%) EN

Total 21,600/2,400/24,000 29,055/14,705/2,372/16,033 1,389 (57.9%)/13,846 (57.7%) --
@@ -0,0 +1,51 @@
# -*- coding: utf8 -*-
__author__ = 'shin'
import jieba

# Answer templates for the departure-city question.  Some short forms are
# listed several times on purpose: the trick is to increase the weight of the
# most commonly used sentence patterns.
departurelist_answer = [
    # Bare city name, with or without a filler word / full stop.
    '北京',
    '北京',
    '北京',
    '额,北京',
    '北京。',
    '北京。',
    '北京。',
    '啊,是北京。',

    # Full sentences ending with a full stop.
    '从北京起飞。',
    '从北京出发的机票',
    '机票的出发地是北京。',
    '从北京出发。',
    '自北京出发。',
    '由北京起飞的飞机。',
    '出发地是北京。',
    '帮我预订北京起飞的机票。',
    '出行地点是北京。',
    '订从北京走的机票。',

    # Colloquial variants without punctuation.
    '我从北京出发',
    '出发地是北京',
    '我的出发地是北京',
    '我准备从北京出发',
    '我打算从北京出发',
    '我计划从北京出发',
    '我可能从北京出发',
    '从北京出发',
    '从北京走',
    '打算从北京走',
    '打算从北京出发',
    '准备从北京走',
    '准备从北京出发',
]

# Segment every departure answer with jieba and turn it into a template line:
# each token is preceded by a single space, the line is terminated by '\n',
# and the placeholder city is replaced by the slot marker.
departurelist_answer_split = []

for ans in departurelist_answer:
    # Use jieba's public list-returning segmenter rather than the
    # underscore-prefixed jieba._lcut helper.
    sent = jieba.lcut(ans)
    w_sent = ''.join(' ' + word for word in sent) + '\n'
    # A u'' literal is valid on Python 2 and Python 3.3+; calling .decode()
    # on a plain string literal raises AttributeError on Python 3.
    w_sent = w_sent.replace(u'北京', '[slot_departure]')
    departurelist_answer_split.append(w_sent)
@@ -0,0 +1,88 @@
# -*- coding: utf8 -*-
__author__ = 'shin'
import jieba

# Departure-city questions that address the customer as a woman
# (using the titles 女士 / 小姐).
departurelist_question_female = [
    '请问女士您从哪里起飞?',
    '请问小姐您从哪里出行?',
    '请问女士您从哪个城市出行?',
    '小姐,请告诉我您的起飞城市。',
    '请女士告诉我,您从哪个城市出行?',
    '小姐,您从哪里走?',
    '女士,您从哪里出发呢?',
    '小姐,您从哪里起飞?',
    '说下小姐您的出发地?',
    '小姐,您旅行的起点是哪里?',
    '小姐,您从哪座城市出发?',
    '小姐,从哪走?',
    '小姐,在哪里起飞?',
    '您要买从哪里出发的机票,女士?',
    '好的,麻烦女士说一下起点是哪里?',
]

# Departure-city questions that address the customer as a man
# (using the title 先生).
departurelist_question_male = [
    '请问先生您从哪里起飞?',
    '请问先生您从哪里出行?',
    '请问先生您从哪个城市出行?',
    '先生,请告诉我您的起飞城市。',
    '请先生告诉我,您从哪个城市出行?',
    '先生,您从哪里走?',
    '先生,您从哪里出发呢?',
    '先生,您从哪里起飞?',
    '说下先生您的出发地?',
    '先生,您旅行的起点是哪里?',
    '先生,您从哪座城市出发?',
    '先生,从哪走?',
    '先生,在哪里起飞?',
    '您要买从哪里出发的机票,先生?',
    '好的,麻烦先生说一下起点是哪里?',
]

# Departure-city questions that carry no gendered form of address.
departurelist_question_unisex = [
    '请问您从哪里起飞?',
    '请问您从哪里出行?',
    '请问您从哪个城市出行?',
    '请告诉我您的起飞城市。',
    '请您告诉我,您从哪个城市出行?',
    '您从哪里走?',
    '您从哪里出发呢?',
    '您从哪里起飞?',
    '说下您的出发地?',
    '您旅行的起点是哪里?',
    '您从哪座城市出发?',
    '从哪走?',
    '在哪里起飞?',
    '您要买从哪里出发的机票?',
    '好的,麻烦说一下起点是哪里?',
]

def _segment_departure_question(sentence):
    # Segment one sentence with jieba and return it as a single line in which
    # every token is preceded by one space and the line ends with '\n'.
    # jieba.lcut is the public list-returning segmenter; it replaces the
    # underscore-prefixed jieba._lcut helper used previously.
    return ''.join(' ' + word for word in jieba.lcut(sentence)) + '\n'


# Segmented versions of the three departure question lists.
departurelist_question_female_split = [
    _segment_departure_question(ans) for ans in departurelist_question_female
]

departurelist_question_male_split = [
    _segment_departure_question(ans) for ans in departurelist_question_male
]

departurelist_question_unisex_split = [
    _segment_departure_question(ans) for ans in departurelist_question_unisex
]

# The unisex questions are appended to both gendered lists.
departurelist_question_female_split += departurelist_question_unisex_split
departurelist_question_male_split += departurelist_question_unisex_split
@@ -0,0 +1,50 @@
# -*- coding: utf8 -*-
__author__ = 'shin'
import jieba

# Answer templates for the destination-city question.  Several entries are
# listed more than once, so they occur that many times in the list.
destinationlist_answer = [
    # Bare city name, with or without a full stop.
    '北京',
    '北京',
    '北京',
    '北京。',
    '北京。',
    '北京。',

    # Full sentences ending with a full stop.
    '去北京。',
    '目的地是北京。',
    '飞往北京。',
    '是北京。',
    '去北京。',
    '往北京。',
    '到北京去。',
    '北京是我的目的地。',
    '目的地是北京。',
    '我要到北京去。',
    '飞往北京的飞机。',
    '买去北京的机票。',
    '我要订去北京的飞机。',

    # Colloquial variants without punctuation.
    '我要去北京',
    '到北京',
    '我到北京',
    '我去北京',
    '到北京的机票',
    '飞北京的机票',
    '到北京的票',
    '飞北京的票',
    '去北京的机票',
    '去北京的票',
]

# Segment every destination answer with jieba and turn it into a template
# line: each token is preceded by a single space, the line is terminated by
# '\n', and the placeholder city is replaced by the slot marker.
destinationlist_answer_split = []

for ans in destinationlist_answer:
    # Use jieba's public list-returning segmenter rather than the
    # underscore-prefixed jieba._lcut helper.
    sent = jieba.lcut(ans)
    w_sent = ''.join(' ' + word for word in sent) + '\n'
    # A u'' literal is valid on Python 2 and Python 3.3+; calling .decode()
    # on a plain string literal raises AttributeError on Python 3.
    w_sent = w_sent.replace(u'北京', '[slot_destination]')
    destinationlist_answer_split.append(w_sent)
@@ -0,0 +1,103 @@
# -*- coding: utf8 -*-
__author__ = 'shin'
import jieba

# Destination-city questions that address the customer as a woman
# (using the titles 女士 / 小姐).
destinationlist_question_female = [
    '请问女士您的目的地是哪里?',
    '小姐,请问您要预定的目的地是?',
    '请问小姐您要预定飞往哪个城市的机票?',
    '请问女士您此次出行的目的地是?',
    '请问小姐您此次前往哪个城市?',
    '女士,您要去什么地方?',
    '小姐,您要飞往什么地方?',
    '小姐,您要飞往哪座城市?',
    '小姐,您想订去哪里的飞机票?',
    '去哪?女士',
    '去哪儿?小姐',
    '往哪里去?女士',
    '去到哪里?小姐',
    '小姐,您机票的目的地是哪里?',
    '女士,您的目的地?',
    '小姐,您要订飞往那里的票?',
    '请告诉我您旅行的目的地,女士。',
    '女士,您告诉我您的目的地。',
    '没问题,麻烦说下目的地,女士。',
]

# Destination-city questions that address the customer as a man
# (using the title 先生).
destinationlist_question_male = [
    '请问先生您的目的地是哪里?',
    '先生,请问您要预定的目的地是?',
    '请问先生您要预定飞往哪个城市的机票?',
    '请问先生您此次出行的目的地是?',
    '请问先生您此次前往哪个城市?',
    '先生,您要去什么地方?',
    '先生,您要飞往什么地方?',
    '先生,您要飞往哪座城市?',
    '先生,您想订去哪里的飞机票?',
    '去哪?先生',
    '去哪儿?先生',
    '往哪里去?先生',
    '去到哪里?先生',
    '先生,您机票的目的地是哪里?',
    '先生,您的目的地?',
    '先生,您要订飞往那里的票?',
    '请告诉我您旅行的目的地,先生。',
    '先生,您告诉我您的目的地。',
    '没问题,麻烦说下目的地,先生。',
]

# Destination-city questions that carry no gendered form of address.
destinationlist_question_unisex = [
    '请问您的目的地是哪里?',
    '请问您要预定的目的地是?',
    '请问您要预定飞往哪个城市的机票?',
    '请问您此次出行的目的地是?',
    '请问您此次前往哪个城市?',
    '您要去什么地方?',
    '您要飞往什么地方?',
    '您要飞往哪座城市?',
    '您想订去哪里的飞机票?',
    '去哪?',
    '去哪儿?',
    '往哪里去?',
    '去到哪里?',
    '您机票的目的地是哪里?',
    '目的地?',
    '终点是?',
    '您的目的地?',
    '您要订飞往那里的票?',
    '去哪里的票?',
    '请告诉我您旅行的目的地。',
    '您告诉我您的目的地。',
    '没问题,麻烦说下目的地。',
]

def _segment_destination_question(sentence):
    # Segment one sentence with jieba and return it as a single line in which
    # every token is preceded by one space and the line ends with '\n'.
    # jieba.lcut is the public list-returning segmenter; it replaces the
    # underscore-prefixed jieba._lcut helper used previously.
    return ''.join(' ' + word for word in jieba.lcut(sentence)) + '\n'


# Segmented versions of the three destination question lists.
destinationlist_question_female_split = [
    _segment_destination_question(ans)
    for ans in destinationlist_question_female
]

destinationlist_question_male_split = [
    _segment_destination_question(ans)
    for ans in destinationlist_question_male
]

destinationlist_question_unisex_split = [
    _segment_destination_question(ans)
    for ans in destinationlist_question_unisex
]

# The unisex questions are appended to both gendered lists.
destinationlist_question_female_split += destinationlist_question_unisex_split
destinationlist_question_male_split += destinationlist_question_unisex_split

0 comments on commit 66b3656

Please sign in to comment.