## 加载相关数据集：dureader and SQuAD2.0

In [171]:
import json
import re
import jieba
from collections import Counter
from nltk.corpus import stopwords
import nltk

### dureader数据集路径

In [5]:
dureader_train_file = "/Users/jiaoyasen/nlp/week1/dataset/dureader/train.json"
dureader_test_file1 = "/Users/jiaoyasen/nlp/week1/dataset/dureader/test1.json"
dureader_test_file2 = "/Users/jiaoyasen/nlp/week1/dataset/dureader/test2.json"
dureader_dev_file2 = "/Users/jiaoyasen/nlp/week1/dataset/dureader/dev.json"

### SQuAD 2.0数据集路径

In [108]:
squad_train_file = "/Users/jiaoyasen/nlp/week1/dataset/squad/train-v2.0.json"
squad_dev_file = "/Users/jiaoyasen/nlp/week1/dataset/squad/dev-v2.0.json"

### 中文停词表

In [63]:
stopword_file = "/Users/jiaoyasen/stopwords.dat.txt" 

### dureader词频结果文件

In [98]:
dureader_result_file = "/Users/jiaoyasen/nlp/week1/dataset/dureader/result.txt"

### SQuAD词频结果文件

In [176]:
squad_result_file = "/Users/jiaoyasen/nlp/week1/dataset/squad/result.txt"

### 读取json文件

In [32]:
def load_json_file(filename):
    with open(filename,"r") as file:
        json_content = file.read()
    return json_content

### 读取文本文件

In [75]:
def read_file(file_name):
    with open(file_name,'r') as file:
        content = file.readlines()
        for i in range(len(content)):
            content[i] = content[i].rstrip("\n")
    return content

### 解析dureader json文件

In [33]:
def load_dureader_file(json_content):
    dureader_content = json.loads(json_content)
    dureader_data = dureader_content["data"]
    return dureader_data

In [42]:
dureader_train_data = load_dureader_file(load_json_file(dureader_train_file))
dureader_test_data1 = load_dureader_file(load_json_file(dureader_test_file1))
dureader_test_data2 = load_dureader_file(load_json_file(dureader_test_file2))
dureader_dev_data = load_dureader_file(load_json_file(dureader_dev_file2))

In [46]:
def load_dureader_paragraphs(dureader_data):
    data_list = list()
    for data_item in dureader_data:
        data_item_title = data_item['title']
        data_item_paragraphs = data_item['paragraphs']
        data_item_paragraphs_list = list()
        for paragraphs_item in data_item_paragraphs:
            paragraphs_item_dict = dict()
            paragraphs_item_dict['context'] = paragraphs_item['context']
            paragraphs_item_qas_list = list()
            for paragraphs_item_qas_item in paragraphs_item['qas']:
                qas_item_dict = dict()
                qas_item_dict['question'] = paragraphs_item_qas_item['question']
                qas_item_answers_list = list()
                if "answers" in paragraphs_item_qas_item.keys():
                    for answer_item in paragraphs_item_qas_item['answers']:
                        qas_item_answers_list.append(answer_item['text'])
                    qas_item_dict['answer'] = qas_item_answers_list
                else:
                    qas_item_dict['answer'] = qas_item_answers_list
                paragraphs_item_qas_list.append(qas_item_dict)
            paragraphs_item_dict['qas'] = paragraphs_item_qas_list
            data_item_paragraphs_list.append(paragraphs_item_dict)
        data_list.append(data_item_paragraphs_list)
    return data_list

### 解析dureader context and qas，具体格式为“context”“qas”的list

In [47]:
dureader_train_json_list = load_dureader_paragraphs(dureader_train_data)
dureader_test_json_list1 = load_dureader_paragraphs(dureader_test_data1)
dureader_test_json_list2 = load_dureader_paragraphs(dureader_test_data2)
dureader_dev_json_list = load_dureader_paragraphs(dureader_dev_data)

In [51]:
dureader_train_json_list

[[{'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。',
   'qas': [{'question': '仙剑奇侠传3第几集上天界', 'answer': ['第35集']}]},
  {'context': '选择燃气热水器时，一定要关注这几个问题：1、出水稳定性要好，不能出现忽热忽冷的现象2、快速到达设定的需求水温3、操作要智能、方便4、安全性要好，要装有安全报警装置 市场上燃气热水器品牌众多，购买时还需多加对比和仔细鉴别。方太今年主打的磁化恒温热水器在使用体验方面做了全面升级：9秒速热，可快速进入洗浴模式；水温持久稳定，不会出现忽热忽冷的现象，并通过水量伺服技术将出水温度精确控制在±0.5℃，可满足家里宝贝敏感肌肤洗护需求；配备CO和CH4双气体报警装置更安全（市场上一般多为CO单气体报警）。另外，这款热水器还有智能WIFI互联功能，只需下载个手机APP即可用手机远程操作热水器，实现精准调节水温，满足家人多样化的洗浴需求。当然方太的磁化恒温系列主要的是增加磁化功能，可以有效吸附水中的铁锈、铁屑等微小杂质，防止细菌滋生，使沐浴水质更洁净，长期使用磁化水沐浴更利于身体健康。',
   'qas': [{'question': '燃气热水器哪个牌子好', 'answer': ['方太']}]},
  {'context': '迈克尔.乔丹在NBA打了15个赛季。他在84年进入nba，期间在1993年10月6日第一次退役改打棒球，95年3月18日重新回归，在99年1月13日第二次退役，后于2001年10月31日复出，在03年最终退役。迈

### dureader文字合并

In [52]:
dureader_file_list = [dureader_train_json_list,dureader_test_json_list1,dureader_test_json_list2,dureader_dev_json_list]

#### dureader合并问题以及答案

In [115]:
def dureader_merge_file_content(filename_content):
    merged_content = list()
    for data_item in filename_content:
        for paragraph_item in data_item:
            if "context" in paragraph_item.keys():
                merged_content.append(paragraph_item['context'])
            if "qas" in paragraph_item.keys():
                for qas_item in paragraph_item['qas']:
                    merged_content.append(qas_item['question'])
                    if "answer" in qas_item.keys():
                        for answer_item in qas_item['answer']:
                            merged_content.append(answer_item)
    return merged_content

#### dureader合并多个文件内容

In [116]:
def dureader_merge_content(dureader_file_list):
    merged_content = list()
    for filename_content in dureader_file_list:
        merged_content = merged_content+dureader_merge_file_content(filename_content)
    return merged_content

In [87]:
dureader_merge_content = dureader_merge_content(dureader_file_list)

### dureader正则表达式处理数字、空格

In [67]:
def regex_process(line):
    #剔除所有数字
    decimal_regex = re.compile(r"[^a-zA-Z]\d+")
    #剔除空格
    space_regex = re.compile(r"\s+")
    line = decimal_regex.sub(r"", line)
    line = space_regex.sub(r"", line)
    return line

In [88]:
for i in range(len(dureader_merge_content)):
    dureader_merge_content[i] = regex_process(dureader_merge_content[i])

In [89]:
dureader_merge_content

['集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。',
 '仙剑奇侠第几集上天界',
 '集',
 '选择燃气热水器时，一定要关注这几个问题、出水稳定性要好，不能出现忽热忽冷的现、快速到达设定的需求水、操作要智能、方、安全性要好，要装有安全报警装置市场上燃气热水器品牌众多，购买时还需多加对比和仔细鉴别。方太今年主打的磁化恒温热水器在使用体验方面做了全面升级秒速热，可快速进入洗浴模式；水温持久稳定，不会出现忽热忽冷的现象，并通过水量伺服技术将出水温度精确控制在℃，可满足家里宝贝敏感肌肤洗护需求；配备CO和CH4双气体报警装置更安全（市场上一般多为CO单气体报警）。另外，这款热水器还有智能WIFI互联功能，只需下载个手机APP即可用手机远程操作热水器，实现精准调节水温，满足家人多样化的洗浴需求。当然方太的磁化恒温系列主要的是增加磁化功能，可以有效吸附水中的铁锈、铁屑等微小杂质，防止细菌滋生，使沐浴水质更洁净，长期使用磁化水沐浴更利于身体健康。',
 '燃气热水器哪个牌子好',
 '方太',
 '迈克尔.乔丹在NBA打个赛季。他年进入nba，期间日第一次退役改打棒球日重新回归，日第二次退役，后日复出，年最终退役。迈克尔·乔丹（MichaelJordan）日生于纽约布鲁克林，美国著名篮球运动员，司职得分后卫，历史上最伟大的篮球运动员年的NBA选秀大会，乔丹在首轮顺位被芝加哥公牛队选中。赛季，乔丹场均得分，首次获得分王称号赛季，乔丹连夺常规赛MVP和总决赛MVP称号，率领芝加哥公牛首次夺得NBA总冠军。赛季，乔丹获得个人职业生涯个得分王，并率领公牛队第六次夺得总冠军日，乔丹正

### dureader删除停用词：结巴分词+中文停词表

In [90]:
#剔除停用词
def delete_stopwords(lines):
    stopwords = read_file(stopword_file)
    all_words = []
    for line in lines:
        all_words += [word for word in jieba.cut(line) if word not in stopwords]
    return all_words

In [91]:
dureader_words_list = delete_stopwords(dureader_merge_content)

### dureader统计词频并倒排

In [92]:
dureader_bow_words = dict(Counter(dureader_words_list))
dureader_sorted_words = sorted(dureader_bow_words.items(), key=lambda d:d[1], reverse=True)

In [93]:
dureader_sorted_words 

[('中', 29488),
 ('年', 23153),
 ('时间', 22738),
 ('时', 16625),
 ('月', 15656),
 ('吃', 14091),
 ('价格', 13844),
 ('中国', 13811),
 ('元', 13795),
 ('选择', 12626),
 ('高', 12386),
 ('做', 12028),
 ('说', 11679),
 ('日', 11122),
 ('治疗', 10570),
 ('情况', 10457),
 ('钱', 9487),
 ('医院', 9371),
 ('公司', 9211),
 ('建议', 8651),
 ('小时', 8643),
 ('品牌', 7567),
 ('食物', 7560),
 ('单位', 7475),
 ('一种', 7122),
 ('费用', 7024),
 ('公里', 6926),
 ('手机', 6879),
 ('尺寸', 6770),
 ('标准', 6761),
 ('买', 6696),
 ('国家', 6652),
 ('症状', 6578),
 ('里', 6359),
 ('效果', 6267),
 ('米', 6239),
 ('系统', 6127),
 ('专业', 6110),
 ('考试', 6071),
 ('新', 6030),
 ('方法', 6006),
 ('服务', 5892),
 ('游戏', 5761),
 ('功能', 5747),
 ('适合', 5746),
 ('影响', 5727),
 ('包括', 5703),
 ('号', 5633),
 ('工作', 5581),
 ('前', 5518),
 ('企业', 5489),
 ('之间', 5429),
 ('手术', 5376),
 ('喜欢', 5269),
 ('分', 5252),
 ('皮肤', 5222),
 ('不错', 5194),
 ('软件', 5188),
 ('应', 5158),
 ('指', 5127),
 ('想', 5083),
 ('等于', 5067),
 ('地方', 5047),
 ('世界', 5043),
 ('患者', 5025),
 ('厘米', 4953),
 ('北京', 4948),


In [95]:
dureader_sorted_words_dict = dict(dureader_sorted_words)

### dureader将词频存入txt文件

In [103]:
with open(dureader_result_file, "w+") as output_file:
    for (word,freq) in dureader_sorted_words:
        output_file.write(word+" "+str(freq)+"\n")

### 解析SQuAD json文件

In [160]:
squad_train_json_list = load_dureader_file(load_json_file(squad_train_file))
squad_dev_json_list = load_dureader_file(load_json_file(squad_dev_file))

In [161]:
squad_train_json_list = load_dureader_paragraphs(squad_train_json_list)
squad_dev_json_list = load_dureader_paragraphs(squad_dev_json_list)

In [162]:
squad_file_list = [squad_train_json_list,squad_dev_json_list]

In [163]:
squad_merge_content = dureader_merge_content(squad_file_list)

### dureader正则表达式处理数字、空格

In [164]:
def regex_process_en(line):
    #剔除所有非字母空格'的
    pat_letter = re.compile(r'[^a-zA-Z \']+')
    pat_is = re.compile("(it|he|she|that|this|there|here)(\'s)", re.I)
    pat_s = re.compile("(?<=[a-zA-Z])\'s") # 找出字母后面的字母
    pat_s2 = re.compile("(?<=s)\'s?")
    pat_not = re.compile("(?<=[a-zA-Z])n\'t") # not的缩写
    pat_would = re.compile("(?<=[a-zA-Z])\'d") # would的缩写
    pat_will = re.compile("(?<=[a-zA-Z])\'ll") # will的缩写
    pat_am = re.compile("(?<=[I|i])\'m") # am的缩写
    pat_are = re.compile("(?<=[a-zA-Z])\'re") # are的缩写
    pat_ve = re.compile("(?<=[a-zA-Z])\'ve") # have的缩写
    line = pat_letter.sub('', line).strip().lower()#转为小写
    line = pat_is.sub(r"\1 is", line)
    line = pat_s.sub("", line)
    line = pat_s2.sub("", line)
    line = pat_not.sub(" not", line)
    line = pat_would.sub(" would", line)
    line = pat_will.sub(" will", line)
    line = pat_am.sub(" am", line)
    line = pat_are.sub(" are", line)
    line = pat_ve.sub(" have", line)
    line = line.replace('\'', '')
    return line

In [165]:
for i in range(len(squad_merge_content)):
    squad_merge_content[i] = regex_process_en(squad_merge_content[i])
    if squad_merge_content[i] !="":
        squad_merge_content[i] = " ".join(squad_merge_content[i].split())

In [166]:
squad_merge_content

['beyonc giselle knowlescarter bijnse beeyonsay born september is an american singer songwriter record producer and actress born and raised in houston texas she performed in various singing and dancing competitions as a child and rose to fame in the late s as lead singer of rb girlgroup destiny child managed by her father mathew knowles the group became one of the world bestselling girl groups of all time their hiatus saw the release of beyonc debut album dangerously in love which established her as a solo artist worldwide earned five grammy awards and featured the billboard hot numberone singles crazy in love and baby boy',
 'when did beyonce start becoming popular',
 'in the late s',
 'what areas did beyonce compete in when she was growing up',
 'singing and dancing',
 'when did beyonce leave destiny child and become a solo singer',
 '',
 'in what city and state did beyonce grow up',
 'houston texas',
 'in which decade did beyonce become famous',
 'late s',
 'in what rb group was she

### SQuAD删除停用词：nltk分词+停词表

In [172]:
#剔除停用词
def delete_stopwords_en(lines):
    stopwords_en = stopwords.words('english')
    all_words = []
    for line in lines:
        all_words += [word for word in nltk.word_tokenize(line) if word not in stopwords_en]
    return all_words

In [173]:
squad_words_list = delete_stopwords_en(squad_merge_content)

### squad 统计词频

In [174]:
squad_bow_words = dict(Counter(squad_words_list))
squad_sorted_words = sorted(squad_bow_words.items(), key=lambda d:d[1], reverse=True)

In [175]:
squad_sorted_words

[('many', 12300),
 ('first', 8892),
 ('one', 7638),
 ('year', 7606),
 ('new', 7552),
 ('city', 7097),
 ('also', 7055),
 ('used', 6742),
 ('two', 5809),
 ('th', 5654),
 ('people', 5086),
 ('world', 5064),
 ('century', 5057),
 ('name', 5049),
 ('state', 4910),
 ('time', 4848),
 ('states', 4761),
 ('war', 4756),
 ('use', 4277),
 ('united', 4128),
 ('us', 4115),
 ('would', 4036),
 ('system', 4002),
 ('may', 3994),
 ('government', 3823),
 ('type', 3821),
 ('years', 3818),
 ('known', 3754),
 ('called', 3748),
 ('much', 3366),
 ('part', 3332),
 ('made', 3285),
 ('university', 3277),
 ('population', 3244),
 ('country', 3169),
 ('early', 3009),
 ('american', 2931),
 ('north', 2886),
 ('british', 2873),
 ('law', 2862),
 ('million', 2847),
 ('area', 2845),
 ('three', 2838),
 ('group', 2834),
 ('national', 2784),
 ('power', 2751),
 ('number', 2750),
 ('school', 2708),
 ('language', 2685),
 ('period', 2611),
 ('since', 2598),
 ('french', 2564),
 ('often', 2543),
 ('music', 2541),
 ('south', 2541),


### 将SQuAD词频统计存入txt文件

In [177]:
with open(squad_result_file, "w+") as output_file:
    for (word,freq) in squad_sorted_words:
        output_file.write(word+" "+str(freq)+"\n")