In [3]:
# 挂载google云硬盘
import os
from google.colab import drive
drive.mount('/content/gdrive')

# 更改工程的运行文件夹
os.chdir("/content/gdrive/My Drive/Colab Notebooks/")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pickle

with open("best_params.txt", "rb") as f:
    d = pickle.load(f)

print(d)

{'thresh': 2.0423832188246194, 'eps': 0.11885847364839468, 'min_samples': 6, 'weight': 0.6858433677155684, 'best_f1': 0.7001693410111576}


In [4]:
import json
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

# 加载训练集
train_row_data_path = './Task2/train/train_author.json'
train_pub_data_path = './Task2/train/train_pub.json'

train_pub_data = json.load(open(train_pub_data_path, 'r', encoding='utf-8'))
train_data = json.load(open(train_row_data_path, 'r', encoding='utf-8'))

m_train_data = {}    # m 代表 Modify

## 将训练集进行转化，生成类似验证集的作者词典
for author in tqdm(list(train_data.keys())):
  m_train_data[author] = []
  clusters = train_data[author]
  for papers in clusters.values():
    m_train_data[author].extend(papers)


# 加载测试集
valid_row_data_path = './Task2/sna_data/sna_valid_author_raw.json'
valid_pub_data_path = './Task2/sna_data/sna_valid_pub.json'

valid_pub_data = json.load(open(valid_pub_data_path, 'r', encoding='utf-8'))
valid_data = json.load(open(valid_row_data_path, 'r', encoding='utf-8'))

100%|██████████| 221/221 [00:00<00:00, 27907.31it/s]


In [0]:
# 数据预处理
import re

# 预处理名字
def precessName(name):
  name = name.strip()
  name = re.sub(r'\s', ' ', name)
  name = name.lower().replace(' ', '_')
  name = name.replace('.', '_')
  name = name.replace('-', '')
  name = re.sub(r"_{2,}", "_", name)    # 两个及其以上的下划线用一个下划线代替
  return name

# 预处理机构,简写替换，
def preprocessOrg(org):
  if org != "":
    org = org.replace('Sch.', 'School')
    org = org.replace('Dept.', 'Department')
    org = org.replace('Coll.', 'College')
    org = org.replace('Inst.', 'Institute')
    org = org.replace('Univ.', 'University')
    org = org.replace('Lab ', 'Laboratory ')
    org = org.replace('Lab.', 'Laboratory')
    org = org.replace('Natl.', 'National')
    org = org.replace('Comp.', 'Computer')
    org = org.replace('Sci.', 'Science')
    org = org.replace('Tech.', 'Technology')
    org = org.replace('Technol.', 'Technology')
    org = org.replace('Elec.', 'Electronic')
    org = org.replace('Engr.', 'Engineering')
    org = org.replace('Aca.', 'Academy')
    org = org.replace('Syst.', 'Systems')
    org = org.replace('Eng.', 'Engineering')
    org = org.replace('Res.', 'Research')
    org = org.replace('Appl.', 'Applied')
    org = org.replace('Chem.', 'Chemistry')
    org = org.replace('Prep.', 'Petrochemical')
    org = org.replace('Phys.', 'Physics')
    org = org.replace('Phys.', 'Physics')
    org = org.replace('Mech.', 'Mechanics')
    org = org.replace('Mat.', 'Material')
    org = org.replace('Cent.', 'Center')
    org = org.replace('Ctr.', 'Center')
    org = org.replace('Behav.', 'Behavior')
    org = org.replace('Atom.', 'Atomic')
    org = org.split(';')[0]  # 多个机构只取第一个
  return org

#正则去标点
def etl(content):
  content = content.lower().strip()        # 统一大小写
  content = re.sub('[:-]+', ' ', content)    # 替换爬虫的连续空格标记
  content = re.sub("[\s+\.\!\/,;$%^*(+\"\')]+|[+——()?【】“”！，。？、~@#￥%……&*（）]+", " ", content)
  content = re.sub(r" {2,}", " ", content)
  return content

# 将train_pub_data或valid_pub_data转换格式，提出author和org
def formatTransfer(pub_data):
  for p_id in pub_data:
    authors = pub_data[p_id]['authors']
    rep_dict = {}
    for author in authors:
      name = author['name']
      org = author['org'] if 'org' in author else ''
      rep_dict[name] = org
    pub_data[p_id]['authors'] = rep_dict
  

In [6]:
# 转换pub_data格式
formatTransfer(train_pub_data)

for item_dict in train_pub_data.items():
  print(item_dict[0])
  for item in item_dict[1].items():
    print(item)
  break


P9a1gcvg
('authors', {'Fenghe Qiu': 'Institute of Pharmacology and Toxicology', 'Li Liu': 'Institute of Pharmacology and Toxicology', 'Li Guo': 'Institute of Pharmacology and Toxicology'})
('title', 'Rapid determination of central nervous drugs in plasma by solid-phase extraction and GC-FID and GC-MS')
('abstract', 'Objective: To establish a simultaneous determination method of central nervous drugs including barbitals, benzodiazepines, phenothiozines and tricyclic antidepressants in human plasma. Methods: Drugs in plasma were extracted and purified by using X-5 resin solid phase extraction columns, followed by identification and quantitation using capillary GC-FID and GC-MS. Results: More than 20 drugs were simultaneously extracted from human plasma, and effectively separated in GC and TIC spectra. The correlation coefficient of standard curves was larger than 0.99, and relative standard differences (RSD) were less than 10% for most of the drugs. Under neutral extraction conditions, t

In [0]:
# load wordMatrix, 加载预训练的词向量
import io

def load_vectors(fname):
  fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
  n, d = map(int, fin.readline().split())
  print('Total numbers:', n, 'Dimension:', d)
  data = {}
  for i, line in enumerate(fin):
    tokens = line.rstrip().split(' ')
    data[tokens[0]] = list(map(float, tokens[1:]))    
    # 根据FastText文档，序列化储存的字符以实际出现的频次排列，高频1W词大致够用
    if i == int(2.5e5):
      break
  return data

# word2vector, 转化函数
def word2vector(word, vocabulary, wordMatrix, lost_words):
  r_arr = np.zeros(300)
  if word in vocabulary:
    try:
      r_arr = np.array(wordMatrix[word])
    except KeyError:    # 找不到对应词向量
      lost_words.append(word)
      return r_arr
  return r_arr

# 加载预训练词向量
pretrained_fastText_path = './Task2/wiki-news-300d-1M.vec'
wordMatrix = load_vectors(pretrained_fastText_path)
print('Actual Loaded words:', len(wordMatrix))

Total numbers: 999994 Dimension: 300
Actual Loaded words: 250001


In [0]:
print(len(train_pub_data))

203184


#### 根据训练集的label构建训练对，训练二分类器，从而提取特征

In [0]:
from functools import reduce
from tqdm import tqdm_notebook
import math
import random

# 1).构造训练集pairs
def get_label(paper_author_dict, id_i, id_j):
    if paper_author_dict[id_i] == paper_author_dict[id_j]:
        return 1
    return 0

def gen_train_data(train_data):
  pairs = []
  for i, author in tqdm_notebook(enumerate(train_data), total=len(train_data)):
    papers = [p_id for author_id in train_data[author] for p_id in train_data[author][author_id]]    # 从train训练集中直接提取出所有paper ids
    # 空文本集
    if (len(papers) == 0):
      continue
    
    paper_author_dict = {p_id: author_id for author_id in train_data[author] for p_id in train_data[author][author_id]}    # 建立从p_id到author_id的映射

    sasn = []    # 正样本集
    dasn = []    # 负样本集

    # 产生索引pair, 偏移1个单位（不包含对角线）
    for i, j in zip(*np.triu_indices(len(papers), k=1)): 
        p_i = papers[i]
        p_j = papers[j]

        # 标签
        label = get_label(paper_author_dict, p_i, p_j)
        if label == 0:
            dasn.append((p_i, p_j, 0))
        else:
            sasn.append((p_i, p_j, 1))

    # 进行降采样，解决正负样本不平衡
    category_size = len(sasn)
    all_pairs = map(lambda x: int(math.ceil(category_size / float(len(x)))) * x, [sasn, dasn])
    pairs.extend(reduce(lambda x, y: x + random.sample(y, int(category_size)), all_pairs, []))

  return pairs

pairs = gen_train_data(train_data)


HBox(children=(IntProgress(value=0, max=221), HTML(value='')))

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 2).提取特征训练相似度模型
corpus = []
for p_id in tqdm_notebook(train_pub_data):
  paper = train_pub_data[p_id]
  title = paper['title'] if 'title' in paper else ''
  # abstract = paper['abstract'] if 'abstract' in paper else ''
  keywords = ' '.join(paper['keywords']) if 'keywords' in paper else ''
  # semantic_feature = title + ' ' + keywords + ' ' + abstract
  semantic_feature = title + ' ' + keywords
  semantic_feature = etl(semantic_feature)    # 去除句中标点，并进行分词
  corpus.append(semantic_feature)

stop_words = ['based','for','and','using','with','the', 'are','can']
tfidf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w\w\w+\b", stop_words=stop_words).fit(corpus)
vocabulary = list(tfidf_model.vocabulary_.keys())

import torch

# 3).预测验证集相似度矩阵



# 4).聚类消歧

HBox(children=(IntProgress(value=0, max=203184), HTML(value='')))

In [0]:
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

res_dict = {}    # 储存分类结果
for author in tqdm_notebook(m_train_data):
  p_ids = m_train_data[author]
  corpus = []         # 储存在同一个作者下的所有语料
  coauther_orgs = []  # 储存所有机构和共同作者信息

  if(len(p_ids) == 0):
    res_dict[author] = []
    continue

  for p_id in p_ids:
    paper = train_pub_data[p_id]
    title = paper['title'] if 'title' in paper else ''
    # abstract = paper['abstract'] if 'abstract' in paper else ''
    keywords = ' '.join(paper['keywords']) if 'keywords' in paper else ''
    # semantic_feature = title + ' ' + keywords + ' ' + abstract
    semantic_feature = title + ' ' + keywords
    semantic_feature = etl(semantic_feature)    # 去除句中标点，并进行分词
    corpus.append(semantic_feature)
    # 作者和机构信息
    authors = paper['authors'].keys()
    names = [precessName(paper_author) for paper_author in authors]
    orgs = [preprocessOrg(org) for org in paper['authors'].values() if org != ''] 
    abstract = paper["abstract"] if ('abstract' in paper and paper["abstract"] is not None) else ''
    try:
      coauther_orgs.append(etl(' '.join(names + orgs) + ' '+ abstract))   
    except TypeError:
      print(p_id, names, orgs, abstract)      
  co_vec = TfidfVectorizer().fit_transform(coauther_orgs).todense()

  # 删除两个字母及以下的词（多为介词），加入stopwords列表
  stop_words = ['based','for','and','using','with','the', 'are','can']
  tfidf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w\w\w+\b", stop_words=stop_words).fit(corpus)
  vocabulary = list(tfidf_model.vocabulary_.keys())
  
  corpus_vec = []
  lost_words = []
  # Tokenize, 将语义文本转化为语义向量
  unvalid_sample = 0
  for i, sample in enumerate(corpus):
    text_vector = np.zeros(300)
    valid_word_count = 0
    words = set(sample.split())    # 去重

    for word in words:
      tmp_arr = word2vector(word, vocabulary, wordMatrix, lost_words)
      if(np.sum(np.abs(tmp_arr))!=0):
        valid_word_count += 1
      text_vector = text_vector + tmp_arr

    if(np.sum(np.abs(text_vector)) == 0):
      unvalid_sample += 1
    
    # 求平均，防止devide by 0, TFIDF加权则可以不用
    # text_vector = text_vector/valid_word_count if (valid_word_count!=0) else text_vector
    corpus_vec.append(text_vector)

  corpus_vec = np.array(corpus_vec)
  
  # 拼接一下
  corpus_vec = np.concatenate((corpus_vec, co_vec), axis=1)
  clf = DBSCAN(eps=0.07, min_samples=4, metric='cosine')
  s = clf.fit_predict(corpus_vec)

  paper_dict = {}
  for label, p_id in zip(clf.labels_, p_ids):
      if str(label) not in paper_dict:
          paper_dict[str(label)] = [p_id]
      else:
          paper_dict[str(label)].append(p_id)
  res_dict[author] = list(paper_dict.values())

## Output Info

  print('Name:', author, end=', ')
  print('Unvalid:', unvalid_sample, end='; ')
  print('Lost words:', len(lost_words))
  print('num of clusters:', len(paper_dict))


HBox(children=(IntProgress(value=0, max=221), HTML(value='')))

Name: li_guo, Unvalid: 1; Lost words: 1977
num of clusters: 14
Name: bo_shen, Unvalid: 3; Lost words: 2876
num of clusters: 31
Name: di_wang, Unvalid: 1; Lost words: 2293
num of clusters: 22
Name: long_wang, Unvalid: 0; Lost words: 560
num of clusters: 10
Name: qiang_xu, Unvalid: 1; Lost words: 2143
num of clusters: 25
Name: xiang_wang, Unvalid: 0; Lost words: 2344
num of clusters: 19
Name: changming_liu, Unvalid: 0; Lost words: 177
num of clusters: 3
Name: kenji_kaneko, Unvalid: 0; Lost words: 136
num of clusters: 4
Name: guohua_chen, Unvalid: 0; Lost words: 1431
num of clusters: 19
Name: hai_jin, Unvalid: 0; Lost words: 75
num of clusters: 2
Name: jia_li, Unvalid: 0; Lost words: 721
num of clusters: 8
Name: guoliang_li, Unvalid: 0; Lost words: 1016
num of clusters: 14
Name: lan_wang, Unvalid: 0; Lost words: 1699
num of clusters: 18
Name: alessandro_giuliani, Unvalid: 0; Lost words: 317
num of clusters: 9
Name: jiang_he, Unvalid: 1; Lost words: 463
num of clusters: 10
Name: xiang_gao,

In [0]:
tt = np.concatenate((corpus_vec, co_vec), axis=1)
print(co_vec.shape)
print(tt.shape)

(1464, 18695)
(1464, 18995)


In [31]:
# 这里主要试一下直接抽取TF-IDF特征在数据集上的表现
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm_notebook

def disambiguate_by_cluster(input_data, pub_data):
  res_dict = {}
  for author in tqdm_notebook(input_data):
    coauther_orgs = []
    p_ids = input_data[author]
    if len(p_ids) == 0:
      res_dict[author] = []
      continue

    paper_dict = {}
    for p_id in p_ids:
      paper = pub_data[p_id]
      authors = paper['authors'].keys()
      names = [precessName(paper_author) for paper_author in authors]
      orgs = [preprocessOrg(org) for org in paper['authors'].values() if org != ''] 
      # abstract = paper["abstract"] if ('abstract' in paper and paper["abstract"] is not None) else ''
      title = paper['title'] if 'title' in paper and paper['title'] is not None else ''
      keywords = paper['keywords'] if 'keywords' in paper else []
      year = str(paper['year']) if 'year' in paper and paper['year'] is not None else ''
      venue = paper['venue'] if 'venue' in paper and paper['venue'] is not None else '' 
      
      try:
        coauther_orgs.append(etl(' '.join(names + orgs) + ' '.join(keywords) + ' '+ title+ ' '+ str(year) + ' ' + venue))   
      except TypeError:
        print(p_id, names, orgs, abstract)      
    tfidf = TfidfVectorizer().fit_transform(coauther_orgs)
    # sim_mertric = pairwise_distances(tfidf, metric='cosine')

    clf = DBSCAN(metric='cosine', min_samples=3)
    s = clf.fit_predict(tfidf)
    #每个样本所属的簇
    for label, p_id in zip(clf.labels_, p_ids):
      paper = pub_data[p_id]
      if str(label) not in paper_dict:
        paper_dict[str(label)] = [paper['id']]
      else:
        paper_dict[str(label)].append(paper['id'])
    res_dict[author] = list(paper_dict.values())

  return res_dict

res_dict = disambiguate_by_cluster(m_train_data, train_pub_data)

HBox(children=(IntProgress(value=0, max=221), HTML(value='')))




In [32]:
avg_num_clusters = np.sum([len(res_dict[author]) for author in res_dict])/len(res_dict)
print('Average cluster number of per author:', avg_num_clusters)

Average cluster number of per author: 17.846153846153847


### Pair-wise F1 评估函数

In [33]:
from tqdm import tqdm_notebook

# 评估论文对
def evaluate_fun(correct_labels, pred_labels):
  TP = 0.0  # Pairs Correctly Predicted To SameAuthor
  TP_FP = 0.0  # Total Pairs Predicted To SameAuthor
  TP_FN = 0.0  # Total Pairs To SameAuthor

  for i in range(len(correct_labels)):
    for j in range(i + 1, len(correct_labels)):
      if correct_labels[i] == correct_labels[j]:
        TP_FN += 1
      if pred_labels[i] == pred_labels[j]:
        TP_FP += 1
      if (correct_labels[i] == correct_labels[j]) and (pred_labels[i] == pred_labels[j]):
        TP += 1

  if TP == 0:
    pairwise_precision = 0
    pairwise_recall = 0
    pairwise_f1 = 0
  else:
    pairwise_precision = TP / TP_FP
    pairwise_recall = TP / TP_FN
    pairwise_f1 = (2 * pairwise_precision * pairwise_recall) / (pairwise_precision + pairwise_recall)

  return pairwise_precision, pairwise_recall, pairwise_f1


# 评估作者
def evaluate_author(correct_clusters, pred_clusters):
  dict_array = {}
  for i, author_id in enumerate(correct_clusters):
    for p_id in correct_clusters[author_id]:
      dict_array[p_id] = i
  
  correct_labels = list(dict_array.values())

  ## 重值键值，聚类方法不同，未必论文集能全回收
  for p_id in dict_array:
    dict_array[p_id] = -1

  for i, cluster in enumerate(pred_clusters):
    for p_id in cluster:
      dict_array[p_id] = i
  
  pred_labels = list(dict_array.values())

  author_precision, author_recall, author_f1 = evaluate_fun(correct_labels, pred_labels)
  return author_f1

# 进行测试

f1_results = []

for author in tqdm_notebook(train_data):
  clusters = train_data[author]
  pred_clusters = res_dict[author]
  f1_score = evaluate_author(clusters, pred_clusters)
  f1_results.append(f1_score)

f1_score = np.mean(f1_results)
print('Final results: ', f1_score)

HBox(children=(IntProgress(value=0, max=221), HTML(value='')))


Final results:  0.3295689008444801


In [19]:
# 对离散论文集出力，规则匹配+近似
import difflib

name_list = ['z_z_chen','b_shen','ruiquan_zhang','x_y_zhang','kongzhang_yang','hongzhuan_chen','pinhsun_chen','l_zang']

difflib.get_close_matches('shen_bo',name_list, n=1)

['b_shen']

#### 从FastText官网下载预训练的词向量

In [0]:
!wget "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip"
!unzip wiki-news-300d-1M.vec.zip -d ./Task2/ 


--2020-02-20 09:55:55--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:16a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2020-02-20 09:56:08 (50.8 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]



In [0]:
iu1 = np.triu_indices(4)

In [0]:
for i, j in zip(*np.triu_indices(4, k=1)):
  print(i, j)


0 1
0 2
0 3
1 2
1 3
2 3
