## Part1 图谱抽取
> [1]【必做】根据实验一中提供的电影ID列表，匹配获得Freebase中对应的实体（共578个可匹配实体）。
> 
> [2]【必做】以578个可匹配实体为起点，通过三元组关联，提取一跳可达的全部实体，以形成新的起点集合。重复若干次该步骤，并将所获得的全部实体及对应三元组合并为用于下一阶段实验的知识图谱子图。
>
> [3]【选做】根据实验二提供的电影Tag信息，在图谱中添加一类新实体（Tag类），并建立其与电影实体的三元组，以充实电影的语义信息。

#### 获得初始电影实体

In [69]:
# 读取 id 列表
import csv
movie_ids = []
with open('../data/Movie_id.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        movie_ids.append(row[0])
print(len(movie_ids), movie_ids[:5])

# 读取映射表，txt 文件
entity_movie_map = {}
with open('../data/douban2fb.txt', 'r') as f:
    for line in f:
        # 格式不统一，有的 \t 有的空格
        # 将 \t 替换成空格
        line = line.replace('\t', ' ')
        line = line.strip().split(' ')
        entity_movie_map[line[0]] = line[1]
print(len(entity_movie_map), list(entity_movie_map.items())[:5])

# 得到实体 ID
entity_ids = set()
for movie_id in movie_ids:
    # 只保留有映射的
    if movie_id in entity_movie_map:
        entity_ids.add(entity_movie_map[movie_id])
print(len(entity_ids), list(entity_ids)[:5])
# 保存到 pkl 文件
import pickle
with open('../result/entity_ids0.pkl', 'wb') as f:
    pickle.dump(entity_ids, f)

1200 ['1292052', '1295644', '1292720', '3541415', '3742360']
578 [('1291544', 'm.03177r'), ('1291545', 'm.027pfg'), ('1291546', 'm.01d1_s'), ('1291550', 'm.053xlz'), ('1291552', 'm.017jd9')]
578 ['m.09_33n', 'm.0680y4', 'm.027m67', 'm.025cm9', 'm.02pxxg7']


#### 定义生成子图的函数

In [70]:
from tqdm.notebook import tqdm as tqdm
import gzip

# 以 entity_ids 为起点生成一跳子图，保存到 des.gz 文件中
def Get1StepSubGraph(entity_ids, des='graph1step'):
    with gzip.open('../result/' + des + '.gz', 'wb') as ans:
        with gzip.open('../data/freebase_douban.gz', 'rb') as f:
            for line in tqdm(f, total=395577070):
                line = line.strip()
                triplet = line.decode().split('\t')[:3]
                # 排除前缀不是 http://rdf.freebase.com/ns/ 的实体
                if (triplet[0][:28] != '<http://rdf.freebase.com/ns/') \
                    or (triplet[1][:28] != '<http://rdf.freebase.com/ns/') \
                    or (triplet[2][:28] != '<http://rdf.freebase.com/ns/'):
                    continue
                # 保存起点在 entity_ids 中的三元组
                if triplet[0][28:-1] in entity_ids:
                    ans.write(line + b'\n')

#### 获得一跳子图

In [71]:
import os

# 获得起始实体 ID
entity_ids0 = set()
with open('../result/entity_ids0.pkl', 'rb') as f:
    entity_ids0 = pickle.load(f)
# 如果已经提取过，就不用再提取了
if not os.path.exists('../result/graph1step.gz'):
    Get1StepSubGraph(entity_ids0)
# 大约耗时 10 分钟

#### 定义筛选子图的函数

In [72]:
# 从 triplets 中筛选出度大于 limit 的实体
def Select(entity_ids0, triplets, entity_limit=20, relation_limit=50):
    entity_count = {}
    relation_count = {}
    # 计数
    for triplet in triplets:
        if triplet[0] not in entity_count:
            entity_count[triplet[0]] = 0
        entity_count[triplet[0]] += 1
        if triplet[1] not in relation_count:
            relation_count[triplet[1]] = 0
        relation_count[triplet[1]] += 1
        if triplet[2] not in entity_count:
            entity_count[triplet[2]] = 0
        entity_count[triplet[2]] += 1
    # 筛选
    ans = []
    for triplet in triplets:
        # 要不就是在 entity_ids0 中，要不就是度大于 limit
        if (triplet[0] in entity_ids0 or entity_count[triplet[0]] > entity_limit) \
        and (relation_count[triplet[1]] > relation_limit) \
        and (triplet[2] in entity_ids0 or entity_count[triplet[2]] > entity_limit):
            ans.append(triplet)
    return ans

#### 筛选一跳子图

In [73]:
entity_ids0 = set()
with open('../result/entity_ids0.pkl', 'rb') as f:
    entity_ids0 = pickle.load(f)
# 读取一跳子图
triplets = []
with gzip.open('../result/graph1step.gz', 'rb') as f:
    for line in f:
        line = line.strip()
        triplet = line.decode().split('\t')[:3]
        triplet = [triplet[0][28:-1], triplet[1][28:-1], triplet[2][28:-1]]
        triplets.append(triplet)
print(len(triplets), triplets[:5])

# 筛选至收敛
triplets_selected = Select(entity_ids0, triplets)
while len(triplets_selected) < len(triplets):
    triplets = triplets_selected
    triplets_selected = Select(entity_ids0, triplets)
    print(len(triplets_selected), len(triplets))

# 保存 ID
entity_ids = set()
for triplet in triplets_selected:
    entity_ids.add(triplet[0])
    entity_ids.add(triplet[2])
print(len(entity_ids), list(entity_ids)[:5])
# 保存到 pkl 文件
with open('../result/entity_ids1.pkl', 'wb') as f:
    pickle.dump(entity_ids, f)

# 验证是否包含 entity_ids0
entity_ids0 = set()
with open('../result/entity_ids0.pkl', 'rb') as f:
    entity_ids0 = pickle.load(f)
print(len(entity_ids0), list(entity_ids0)[:5])
entity_ids1 = set()
with open('../result/entity_ids1.pkl', 'rb') as f:
    entity_ids1 = pickle.load(f)
print(len(entity_ids1), list(entity_ids1)[:5])
for entity_id in entity_ids0:
    if entity_id not in entity_ids1:
        print(entity_id, 'not in entity_ids1')

118773 [['m.012x63', 'type.object.type', 'base.type_ontology.inanimate'], ['m.012x63', 'film.film.genre', 'm.03k9fj'], ['m.012x63', 'film.film.dubbing_performances', 'm.0p7zw8x'], ['m.012x63', 'film.film.distributors', 'm.0zcrwbm'], ['m.012x63', 'film.film.dubbing_performances', 'm.0p7zxhh']]
18204 18396
18204 18204
750 ['m.09_33n', 'm.0680y4', 'm.03h64', 'm.0cq22n0', 'm.024qqx']
578 ['m.09_33n', 'm.0680y4', 'm.027m67', 'm.025cm9', 'm.02pxxg7']
750 ['m.09_33n', 'm.0680y4', 'm.03h64', 'm.0cq22n0', 'm.024qqx']


#### 获得两跳子图

In [74]:
# 获取起始 ID
entity_ids1 = set()
with open('../result/entity_ids1.pkl', 'rb') as f:
    entity_ids1 = pickle.load(f)
if not os.path.exists('../result/graph2step.gz'):
    Get1StepSubGraph(entity_ids1, des='graph2step')
# 大约耗时 15 分钟

#### 筛选两跳子图

In [75]:
# 获取起始 ID
entity_ids0 = set()
with open('../result/entity_ids0.pkl', 'rb') as f:
    entity_ids0 = pickle.load(f)
if not os.path.exists('../result/graph2step_selected.gz'):
    # 读取两跳子图
    with gzip.open('../result/graph2step.gz', 'rb') as f:
        entity_count = {}
        relation_count = {}
        # 计数
        for line in tqdm(f, total=104698451):
            line = line.strip()
            triplet = line.decode().split('\t')[:3]
            triplet = [triplet[0][28:-1], triplet[1][28:-1], triplet[2][28:-1]]
            if triplet[0] not in entity_count:
                entity_count[triplet[0]] = 0
            entity_count[triplet[0]] += 1
            if triplet[1] not in relation_count:
                relation_count[triplet[1]] = 0
            relation_count[triplet[1]] += 1
            if triplet[2] not in entity_count:
                entity_count[triplet[2]] = 0
            entity_count[triplet[2]] += 1
        
        f.seek(0)
        remove_entity = set()
        remove_relation = set()
        # ！只能边读边筛选，不然内存会爆掉
        for line in tqdm(f, total=104698451):
            line = line.strip()
            triplet = line.decode().split('\t')[:3]
            triplet = [triplet[0][28:-1], triplet[1][28:-1], triplet[2][28:-1]]
            # 过滤出现大于 20000 的实体
            if entity_count[triplet[0]] > 20000:
                remove_entity.add(triplet[0])
            if entity_count[triplet[2]] > 20000:
                remove_entity.add(triplet[2])
            # 过滤出现小于 50 的关系
            if relation_count[triplet[1]] <= 50:
                remove_relation.add(triplet[1])
        print(len(remove_entity), len(remove_relation))
        
        f.seek(0)
        for line in tqdm(f, total=104698451):
            line = line.strip()
            triplet = line.decode().split('\t')[:3]
            triplet = [triplet[0][28:-1], triplet[1][28:-1], triplet[2][28:-1]]
            if triplet[0] in remove_entity or triplet[1] in remove_relation or triplet[2] in remove_entity:
                continue
            triplets.append(triplet)
    print(len(triplets), triplets[:5])

    # 筛选至收敛
    triplets_selected = Select(entity_ids0, triplets, entity_limit=16, relation_limit=50)
    while len(triplets_selected) < len(triplets):
        triplets = triplets_selected
        triplets_selected = Select(entity_ids0, triplets, entity_limit=16, relation_limit=50)
        print(len(triplets_selected), len(triplets))

    # 保存元组
    entity_ids = set()
    with gzip.open('../result/graph2step_selected.gz', 'wb') as f:
        for triplet in triplets_selected:
            # 保存 ID
            entity_ids.add(triplet[0])
            entity_ids.add(triplet[2])
            # 加上前缀
            triplet = ['<http://rdf.freebase.com/ns/' + triplet[0] + '>',
                    '<http://rdf.freebase.com/ns/' + triplet[1] + '>',
                    '<http://rdf.freebase.com/ns/' + triplet[2] + '>']
            f.write(('\t'.join(triplet) + '\n').encode())

    print(len(entity_ids), list(entity_ids)[:5])
    # 保存到 pkl 文件
    with open('../result/entity_ids2.pkl', 'wb') as f:
        pickle.dump(entity_ids, f)
# 大约耗时 10 分钟

In [76]:
# 检验 578 个实体是否都在
entity_ids0 = set()
with open('../result/entity_ids0.pkl', 'rb') as f:
    entity_ids0 = pickle.load(f)
entity_ids2 = set()
with open('../result/entity_ids2.pkl', 'rb') as f:
    entity_ids2 = pickle.load(f)
print(len(entity_ids0), len(entity_ids2))
for entity_id in entity_ids0:
    if entity_id not in entity_ids2:
        print(entity_id)
print('ok')

578 1479
ok


#### 添加 Tag 实体

In [77]:
# 建立实体 ID 到电影 ID 的映射
entity_movie_map = {}
with open('../data/douban2fb.txt', 'r') as f:
    for line in f:
        # 格式不统一，有的 \t 有的空格
        # 将 \t 替换成空格
        line = line.replace('\t', ' ')
        line = line.strip().split(' ')
        entity_movie_map[line[1]] = line[0]
print(len(entity_movie_map), list(entity_movie_map.items())[:5])

# 建立电影 ID 到 Tag 的映射，编码为 utf-8
movie_tag_map = {}
with open('../data/Movie_tag.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        movie_tag_map[row[0]] = row[1]
print(len(movie_tag_map), list(movie_tag_map.items())[:5])

# 建立实体 ID 到 Tag 的映射
entity_tag_map = {}
for entity_id in entity_movie_map:
    movie_id = entity_movie_map[entity_id]
    if movie_id in movie_tag_map:
        entity_tag_map[entity_id] = movie_tag_map[movie_id]
print(len(entity_tag_map), list(entity_tag_map.items())[:5])

# 在三元组中加入 Tag
triplets = []
with gzip.open('../result/graph2step_selected.gz', 'rb') as f:
    triplets = []
    for line in f:
        line = line.strip()
        triplet = line.decode().split('\t')[:3]
        triplet = [triplet[0][28:-1], triplet[1][28:-1], triplet[2][28:-1]]
        triplets.append(triplet)

new_triplets = []
for entity_id in entity_tag_map:
    tags = entity_tag_map[entity_id].split(',')
    for tag in tags:
        new_triplets.append([entity_id, 'has_tag', tag])
print(len(new_triplets), new_triplets[:5])
triplets += new_triplets

# 保存到文件
with gzip.open('../result/FinalGraph.gz', 'wb') as f:
    for triplet in triplets:
        # 不要前缀了
        f.write(('\t'.join(triplet) + '\n').encode())

578 [('m.03177r', '1291544'), ('m.027pfg', '1291545'), ('m.01d1_s', '1291546'), ('m.053xlz', '1291550'), ('m.017jd9', '1291552')]
51798 [('id', 'tag'), ('1291543', '科幻,喜剧,人性,爱情,青春,大陆,犯罪,动作,经典,香港'), ('1291544', '惊悚,科幻,人性,青春,经典,动作,动画,美国,悬疑'), ('1291545', '科幻,喜剧,人性,文艺,悬疑,爱情,青春,经典,香港,美国,动画'), ('1291546', '人性,文艺,爱情,青春,大陆,经典,香港,美国,惊悚')]
578 [('m.03177r', '惊悚,科幻,人性,青春,经典,动作,动画,美国,悬疑'), ('m.027pfg', '科幻,喜剧,人性,文艺,悬疑,爱情,青春,经典,香港,美国,动画'), ('m.01d1_s', '人性,文艺,爱情,青春,大陆,经典,香港,美国,惊悚'), ('m.053xlz', '喜剧,人性,文艺,爱情,青春,大陆,经典,动作,犯罪,香港,悬疑'), ('m.017jd9', '惊悚,科幻,喜剧,人性,文艺,爱情,青春,经典,动作,香港,美国,动画')]
5161 [['m.03177r', 'has_tag', '惊悚'], ['m.03177r', 'has_tag', '科幻'], ['m.03177r', 'has_tag', '人性'], ['m.03177r', 'has_tag', '青春'], ['m.03177r', 'has_tag', '经典']]
