Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
62 changed files
with
5,753 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
The data generator for HMN4QA | ||
|
||
COLING2016 - Hierarchical Memory Networks for Answer Selection on Unknown Words | ||
|
||
There are four datasets in total; their statistics are listed below: | ||
|
||
Domain Train/Dev/Test Vocab (Total/Train/Dev/Test) UNK Answer (Dev/Test) Lang | ||
|
||
Air-Ticket Booking 5,400/600/6,000 8,553/4,279/729/4,682 405 (67.5%)/3,993 (66.6%) CH | ||
|
||
Hotel Reservation 5,400/600/6,000 7,586/3,755/613/4,115 367 (61.2%)/3,690 (61.5%) CH | ||
|
||
Air-Ticket Booking 5,400/600/6,000 7,497/3,972/712/4,314 342 (57.0%)/3,477 (58.0%) EN | ||
|
||
Hotel Reservation 5,400/600/6,000 7,134/3,593/577/3,930 357 (59.5%)/3,452 (57.5%) EN | ||
|
||
Total 21,600/2,400/24,000 29,055/14,705/2,372/16,033 1,389 (57.9%)/13,846 (57.7%) -- |
51 changes: 51 additions & 0 deletions
51
datasets/data_generator/air_ticket_booking_CH/departurelist_answer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# -*- coding: utf8 -*- | ||
__author__ = 'shin' | ||
import jieba | ||
|
||
# Answer templates for the departure-city question; every template mentions
# the city '北京'.
# Trick: frequently used phrasings are listed several times to increase
# their weight.
departurelist_answer = [
    '北京',
    '北京',
    '北京',
    '额,北京',
    '北京。',
    '北京。',
    '北京。',
    '啊,是北京。',

    '从北京起飞。',
    '从北京出发的机票',
    '机票的出发地是北京。',
    '从北京出发。',
    '自北京出发。',
    '由北京起飞的飞机。',
    '出发地是北京。',
    '帮我预订北京起飞的机票。',
    '出行地点是北京。',
    '订从北京走的机票。',

    '我从北京出发',
    '出发地是北京',
    '我的出发地是北京',
    '我准备从北京出发',
    '我打算从北京出发',
    '我计划从北京出发',
    '我可能从北京出发',
    '从北京出发',
    '从北京走',
    '打算从北京走',
    '打算从北京出发',
    '准备从北京走',
    '准备从北京出发',
]
|
||
# Tokenised answer templates: each entry is one newline-terminated line in
# which every jieba token is preceded by a single space and the city name is
# replaced by the '[slot_departure]' tag.
departurelist_answer_split = []

for answer in departurelist_answer:
    # Uses the documented jieba.lcut entry point rather than the
    # underscore-private jieba._lcut.
    tokens = jieba.lcut(answer)
    line = ''.join(' ' + token for token in tokens) + '\n'
    # A unicode literal gives the same text as '北京'.decode('utf8') on
    # Python 2 and, unlike str.decode, also works on Python 3.
    line = line.replace(u'北京', '[slot_departure]')
    departurelist_answer_split.append(line)
88 changes: 88 additions & 0 deletions
88
datasets/data_generator/air_ticket_booking_CH/departurelist_question.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# -*- coding: utf8 -*- | ||
__author__ = 'shin' | ||
import jieba | ||
|
||
# Departure-city questions that address the customer as 女士 / 小姐.
departurelist_question_female = [
    '请问女士您从哪里起飞?',
    '请问小姐您从哪里出行?',
    '请问女士您从哪个城市出行?',
    '小姐,请告诉我您的起飞城市。',
    '请女士告诉我,您从哪个城市出行?',
    '小姐,您从哪里走?',
    '女士,您从哪里出发呢?',
    '小姐,您从哪里起飞?',
    '说下小姐您的出发地?',
    '小姐,您旅行的起点是哪里?',
    '小姐,您从哪座城市出发?',
    '小姐,从哪走?',
    '小姐,在哪里起飞?',
    '您要买从哪里出发的机票,女士?',
    '好的,麻烦女士说一下起点是哪里?',
]
|
||
# Departure-city questions that address the customer as 先生.
departurelist_question_male = [
    '请问先生您从哪里起飞?',
    '请问先生您从哪里出行?',
    '请问先生您从哪个城市出行?',
    '先生,请告诉我您的起飞城市。',
    '请先生告诉我,您从哪个城市出行?',
    '先生,您从哪里走?',
    '先生,您从哪里出发呢?',
    '先生,您从哪里起飞?',
    '说下先生您的出发地?',
    '先生,您旅行的起点是哪里?',
    '先生,您从哪座城市出发?',
    '先生,从哪走?',
    '先生,在哪里起飞?',
    '您要买从哪里出发的机票,先生?',
    '好的,麻烦先生说一下起点是哪里?',
]
|
||
# Departure-city questions without any gendered form of address.
departurelist_question_unisex = [
    '请问您从哪里起飞?',
    '请问您从哪里出行?',
    '请问您从哪个城市出行?',
    '请告诉我您的起飞城市。',
    '请您告诉我,您从哪个城市出行?',
    '您从哪里走?',
    '您从哪里出发呢?',
    '您从哪里起飞?',
    '说下您的出发地?',
    '您旅行的起点是哪里?',
    '您从哪座城市出发?',
    '从哪走?',
    '在哪里起飞?',
    '您要买从哪里出发的机票?',
    '好的,麻烦说一下起点是哪里?',
]
|
||
# Tokenised female-addressed departure questions: one newline-terminated line
# per question, with every jieba token preceded by a single space.
# Uses the documented jieba.lcut entry point rather than the
# underscore-private jieba._lcut.
departurelist_question_female_split = [
    ''.join(' ' + token for token in jieba.lcut(question)) + '\n'
    for question in departurelist_question_female
]
|
||
# Tokenised male-addressed departure questions: one newline-terminated line
# per question, with every jieba token preceded by a single space.
# Uses the documented jieba.lcut entry point rather than the
# underscore-private jieba._lcut.
departurelist_question_male_split = [
    ''.join(' ' + token for token in jieba.lcut(question)) + '\n'
    for question in departurelist_question_male
]
|
||
# Tokenised unisex departure questions: one newline-terminated line per
# question, with every jieba token preceded by a single space.
# Uses the documented jieba.lcut entry point rather than the
# underscore-private jieba._lcut.
departurelist_question_unisex_split = [
    ''.join(' ' + token for token in jieba.lcut(question)) + '\n'
    for question in departurelist_question_unisex
]
|
||
# Append the tokenised unisex questions to both the female and the male pools.
departurelist_question_female_split.extend(departurelist_question_unisex_split)
departurelist_question_male_split.extend(departurelist_question_unisex_split)
50 changes: 50 additions & 0 deletions
50
datasets/data_generator/air_ticket_booking_CH/destinationlist_answer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# -*- coding: utf8 -*- | ||
__author__ = 'shin' | ||
import jieba | ||
|
||
# Answer templates for the destination-city question; every template mentions
# the city '北京'. Some phrasings appear more than once.
destinationlist_answer = [
    '北京',
    '北京',
    '北京',
    '北京。',
    '北京。',
    '北京。',

    '去北京。',
    '目的地是北京。',
    '飞往北京。',
    '是北京。',
    '去北京。',
    '往北京。',
    '到北京去。',
    '北京是我的目的地。',
    '目的地是北京。',
    '我要到北京去。',
    '飞往北京的飞机。',
    '买去北京的机票。',
    '我要订去北京的飞机。',

    '我要去北京',
    '到北京',
    '我到北京',
    '我去北京',
    '到北京的机票',
    '飞北京的机票',
    '到北京的票',
    '飞北京的票',
    '去北京的机票',
    '去北京的票',
]
|
||
# Tokenised answer templates: each entry is one newline-terminated line in
# which every jieba token is preceded by a single space and the city name is
# replaced by the '[slot_destination]' tag.
destinationlist_answer_split = []

for answer in destinationlist_answer:
    # Uses the documented jieba.lcut entry point rather than the
    # underscore-private jieba._lcut.
    tokens = jieba.lcut(answer)
    line = ''.join(' ' + token for token in tokens) + '\n'
    # A unicode literal gives the same text as '北京'.decode('utf8') on
    # Python 2 and, unlike str.decode, also works on Python 3.
    line = line.replace(u'北京', '[slot_destination]')
    destinationlist_answer_split.append(line)
103 changes: 103 additions & 0 deletions
103
datasets/data_generator/air_ticket_booking_CH/destinationlist_question.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
# -*- coding: utf8 -*- | ||
__author__ = 'shin' | ||
import jieba | ||
|
||
# Destination-city questions that address the customer as 女士 / 小姐.
destinationlist_question_female = [
    '请问女士您的目的地是哪里?',
    '小姐,请问您要预定的目的地是?',
    '请问小姐您要预定飞往哪个城市的机票?',
    '请问女士您此次出行的目的地是?',
    '请问小姐您此次前往哪个城市?',
    '女士,您要去什么地方?',
    '小姐,您要飞往什么地方?',
    '小姐,您要飞往哪座城市?',
    '小姐,您想订去哪里的飞机票?',
    '去哪?女士',
    '去哪儿?小姐',
    '往哪里去?女士',
    '去到哪里?小姐',
    '小姐,您机票的目的地是哪里?',
    '女士,您的目的地?',
    '小姐,您要订飞往那里的票?',
    '请告诉我您旅行的目的地,女士。',
    '女士,您告诉我您的目的地。',
    '没问题,麻烦说下目的地,女士。',
]
|
||
# Destination-city questions that address the customer as 先生.
destinationlist_question_male = [
    '请问先生您的目的地是哪里?',
    '先生,请问您要预定的目的地是?',
    '请问先生您要预定飞往哪个城市的机票?',
    '请问先生您此次出行的目的地是?',
    '请问先生您此次前往哪个城市?',
    '先生,您要去什么地方?',
    '先生,您要飞往什么地方?',
    '先生,您要飞往哪座城市?',
    '先生,您想订去哪里的飞机票?',
    '去哪?先生',
    '去哪儿?先生',
    '往哪里去?先生',
    '去到哪里?先生',
    '先生,您机票的目的地是哪里?',
    '先生,您的目的地?',
    '先生,您要订飞往那里的票?',
    '请告诉我您旅行的目的地,先生。',
    '先生,您告诉我您的目的地。',
    '没问题,麻烦说下目的地,先生。',
]
|
||
# Destination-city questions without any gendered form of address.
destinationlist_question_unisex = [
    '请问您的目的地是哪里?',
    '请问您要预定的目的地是?',
    '请问您要预定飞往哪个城市的机票?',
    '请问您此次出行的目的地是?',
    '请问您此次前往哪个城市?',
    '您要去什么地方?',
    '您要飞往什么地方?',
    '您要飞往哪座城市?',
    '您想订去哪里的飞机票?',
    '去哪?',
    '去哪儿?',
    '往哪里去?',
    '去到哪里?',
    '您机票的目的地是哪里?',
    '目的地?',
    '终点是?',
    '您的目的地?',
    '您要订飞往那里的票?',
    '去哪里的票?',
    '请告诉我您旅行的目的地。',
    '您告诉我您的目的地。',
    '没问题,麻烦说下目的地。',
]
|
||
# Tokenised female-addressed destination questions: one newline-terminated
# line per question, with every jieba token preceded by a single space.
# Uses the documented jieba.lcut entry point rather than the
# underscore-private jieba._lcut.
destinationlist_question_female_split = [
    ''.join(' ' + token for token in jieba.lcut(question)) + '\n'
    for question in destinationlist_question_female
]
|
||
# Tokenised male-addressed destination questions: one newline-terminated line
# per question, with every jieba token preceded by a single space.
# Uses the documented jieba.lcut entry point rather than the
# underscore-private jieba._lcut.
destinationlist_question_male_split = [
    ''.join(' ' + token for token in jieba.lcut(question)) + '\n'
    for question in destinationlist_question_male
]
|
||
# Tokenised unisex destination questions: one newline-terminated line per
# question, with every jieba token preceded by a single space.
# Uses the documented jieba.lcut entry point rather than the
# underscore-private jieba._lcut.
destinationlist_question_unisex_split = [
    ''.join(' ' + token for token in jieba.lcut(question)) + '\n'
    for question in destinationlist_question_unisex
]
|
||
# Append the tokenised unisex questions to both the female and the male pools.
destinationlist_question_female_split.extend(destinationlist_question_unisex_split)
destinationlist_question_male_split.extend(destinationlist_question_unisex_split)
Oops, something went wrong.