Skip to content

Commit

Permalink
fix(PubScholar.js): 完善专利条目的信息收集
Browse files Browse the repository at this point in the history
  • Loading branch information
l0o0 committed Nov 6, 2023
1 parent c97b923 commit c5eeee3
Showing 1 changed file with 174 additions and 164 deletions.
338 changes: 174 additions & 164 deletions PubScholar.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,178 +2,184 @@
"translatorID": "58df4473-a324-4fb5-8a8f-25d1e1897c73",
"label": "PubScholar",
"creator": "l0o0",
"target": "https?://pubscholar.cn/",
"target": "https?://pubscholar.cn/(patents|books|articles)",
"minVersion": "5.0",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcsibv",
"lastUpdated": "2023-11-03 09:13:36"
"lastUpdated": "2023-11-06 14:25:21"
}

/*
***** BEGIN LICENSE BLOCK *****
Copyright © 2022 YOUR_NAME <- TODO
This file is part of Zotero.
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK *****
*/

// Zotero item type for each PubScholar URL section, i.e. the path segment
// that getIDFromUrl() captures (/patents/, /articles/ or /books/).
const ItemTypes = {
	"patents": "patent",
	"articles": "journalArticle",
	"books": "book",
};

// XPath expression for every piece of metadata read from a detail page.
// scrape() walks these keys with for...in; `author`, `tags` and `metadata`
// get dedicated parsing there, every other key is written to the item as-is.
const FieldMatch = (() => {
	// The <span> that immediately follows the info label with the given text.
	const afterLabel = label => `//span[text()='${label}' and @class='ArticleInfo__label']/following-sibling::span[1]`;

	return {
		// Books | journals; role suffixes (等主译, 主编) and digits are filtered out by the author parser.
		author: "//span[@class='AuthorInfo__nameText'] | //div[@class='AuthorInfo__content']",
		date: afterLabel('出版日期:'),
		publisher: afterLabel('出版社:'), // books
		publicationTitle: "//span[@class='ArticleInfo__metaSource']", // journals
		ISBN: afterLabel('ISBN:'),
		// Subject classification
		学科分类: afterLabel('学科分类:'),
		abstractNote: "//div[contains(@class, 'FullAbstracts')] | //div[@class='ArticleInfo__abstracts']", // Click to get fulltext
		// Impact factor
		影响因子: "//div[@class='JournalContent__meta']",
		tags: "//div[@class='ArticleInfo__keywords']/span[@class='ArticleInfo__keyword']",
		// Year / issue / volume line of a journal article
		metadata: "//div[@class='ArticleInfo__source']/span[@class='ArticleInfo__sourceTitle']/span[contains(text(), '年') or contains(text(), '期') or contains(text(), '卷')]",
		// Patent fields
		filingDate: afterLabel('申请日:'),
		applicationNumber: afterLabel('申请号:'),
		issueDate: afterLabel('公开日:'),
		patentNumber: afterLabel('公开号:'),
	};
})();

/**
 * Turn one comma-separated group of names into Zotero creators.
 *
 * The creator type is taken from a role marker on the whole group:
 * a trailing "等主译" gives translators, a trailing "主编" gives editors and
 * a leading "发明人: " gives inventors; without a marker the names are authors.
 * The marker itself and a trailing number after a name are removed.
 *
 * @param {string} s - Text of one name group.
 * @returns {{lastName: string, creatorType: string}[]} One creator per
 *   non-empty name; an empty array when no name is left.
 */
function parseAuthors(s) {
	let type = 'author';
	if (/等主译$/.test(s)) type = 'translator';
	if (/主编$/.test(s)) type = 'editor';
	if (/^发明人: /.test(s)) type = 'inventor';

	return s
		.replace(/主编$|等主译$|^发明人: /g, "")
		// Accept both the ASCII comma and the fullwidth comma (U+FF0C).
		.split(/[,\uFF0C]/)
		// Trim first so that "a, b" does not leave a leading space on "b".
		.map(name => name.trim().replace(/\s?\d+$/, '').trim())
		// "name1,2" splits into "name1" and "2"; the second piece is only a
		// number and must not become a creator with an empty name.
		.filter(name => name !== '')
		.map(name => ({ lastName: name, creatorType: type }));
}

/**
 * Parse a full creator string made of semicolon-separated name groups.
 *
 * Each non-empty group is handed to parseAuthors() and the resulting
 * creators are concatenated in order.
 *
 * @param {string} s - Creator text as read from the page.
 * @returns {Object[]} Flat list of creators.
 */
function parseAuthorStr(s) {
	return s
		// Accept both the ASCII semicolon and the fullwidth one (U+FF1B).
		.split(/[;\uFF1B]/)
		.map(part => part.trim())
		// A trailing or doubled separator leaves an empty group; skip it
		// instead of asking parseAuthors() to parse nothing.
		.filter(part => part !== '')
		.reduce((creators, part) => creators.concat(parseAuthors(part)), []);
}

/**
 * Copy year, issue, page count and volume out of a metadata string.
 *
 * Only the fields whose pattern matches are written; `newItem` is modified
 * in place and also returned.
 *
 * @param {string} metadata - Text such as "2023 年 第 5 期".
 * @param {Object} newItem - Item that receives the extracted values.
 * @returns {Object} The same `newItem`.
 */
function parseMetadata(metadata, newItem) {
	// Target field -> pattern whose first capture group holds the value.
	const patterns = [
		['date', /(\d{4}) 年/],
		['issue', /第 (\d+) 期/],
		['pages', /共 (\d+) 页/],
		['volume', /第 (\d+) 卷/],
	];
	for (const [field, pattern] of patterns) {
		const hit = metadata.match(pattern);
		if (hit) newItem[field] = hit[1];
	}
	return newItem;
}

/**
 * Convert keyword nodes into Zotero tag objects.
 *
 * @param {Array} nodeList - Array of nodes; each node's trimmed
 *   `textContent` becomes one tag.
 * @returns {{tag: string}[]} Tags in the order of the nodes.
 */
function parseTags(nodeList) {
	return nodeList.map(node => ({ tag: node.textContent.trim() }));
}

/**
 * Read the section (books, articles or patents) and the record id from a URL.
 *
 * @param {string} url - Page URL.
 * @returns {{type: string, id: string}|false} The Zotero item type mapped
 *   from the section via ItemTypes plus the id, or false when the URL does
 *   not contain such a path.
 */
function getIDFromUrl(url) {
	const found = url.match(/\/(books|articles|patents)\/([\d\w]*)/);
	if (!found || found.length != 3) return false;
	const [, section, id] = found;
	return { type: ItemTypes[section], id: id };
}

/**
 * Report the item type of the current page.
 *
 * @param {Document} doc - Page document (not inspected).
 * @param {string} url - Page URL.
 * @returns {string|false} The type found by getIDFromUrl(), or false when
 *   the URL is not recognised.
 */
function detectWeb(doc, url) {
	const parsed = getIDFromUrl(url);
	return parsed ? parsed.type : false;
}

// TODO: Hard to parse search page
/**
 * Collect the entries of a result list as a url -> title map.
 *
 * The rows are `div.List__item` elements. A <div> has no `href` of its own,
 * so when the row does not expose one the first descendant link is used.
 *
 * @param {Document} doc - Page document.
 * @param {boolean} checkOnly - When true, return true as soon as one usable
 *   row is found instead of building the map.
 * @returns {Object|boolean} Map of href to title, true in check-only mode,
 *   or false when no row yields both an href and a title.
 */
function getSearchResults(doc, checkOnly) {
	const items = {};
	let found = false;
	const rows = doc.querySelectorAll("div.List div.List__item");
	for (const row of rows) {
		let href = row.href;
		if (!href) {
			// Rows are containers; take the target of the link inside them.
			const link = row.querySelector("a[href]");
			href = link ? link.href : null;
		}
		const title = ZU.trimInternal(row.textContent);
		if (!href || !title) continue;
		if (checkOnly) return true;
		found = true;
		items[href] = title;
	}
	return found ? items : false;
}


/**
 * Translator entry point: save the current page, or let the user pick
 * entries when detectWeb() reports 'multiple'.
 *
 * @param {Document} doc - Page document.
 * @param {string} url - Page URL.
 * @returns {Promise<void>}
 */
async function doWeb(doc, url) {
	if (detectWeb(doc, url) !== 'multiple') {
		await scrape(doc, url);
		return;
	}

	const selected = await Zotero.selectItems(getSearchResults(doc, false));
	if (!selected) return;
	// Fetch and scrape the chosen pages one after another.
	for (const itemUrl of Object.keys(selected)) {
		await scrape(await requestDocument(itemUrl));
	}
}

/**
 * Build a Zotero item from a detail page and complete it.
 *
 * The item type comes from the URL, the title from `doc.title`. Every entry
 * of FieldMatch is evaluated against the document; the text of the first
 * matching node is stored under the entry's key, except for `author`,
 * `metadata` and `tags`, which go through their own parsers.
 *
 * @param {Document} doc - Page document.
 * @param {string} [url] - Page URL; defaults to `doc.location.href`.
 * @returns {Promise<void>}
 */
async function scrape(doc, url = doc.location.href) {
	const parsed = getIDFromUrl(url);
	let item = new Zotero.Item(parsed.type);
	item.title = doc.title;
	item.url = url;

	// Press the "read more" button, if present, before the fields are read.
	const moreButtons = ZU.xpath(doc, "//button[contains(@class, 'RichContent__more')]");
	if (moreButtons.length > 0) moreButtons[0].click();

	for (const field in FieldMatch) {
		const nodes = ZU.xpath(doc, FieldMatch[field]);
		if (nodes.length == 0) continue;
		const text = nodes[0].textContent.trim();

		switch (field) {
			case 'author':
				item.creators = parseAuthorStr(text);
				break;
			case 'metadata':
				item = parseMetadata(text, item);
				break;
			case 'tags':
				// Tags use every matched node, not only the first one.
				item.tags = parseTags(nodes);
				break;
			default:
				item[field] = text;
		}
	}

	// Clean-up: strip the book-title marks around the journal name, the
	// trailing "收起" (collapse) label of an expanded abstract, and the
	// "影响因子:" (impact factor) prefix.
	if (item.publicationTitle) item.publicationTitle = item.publicationTitle.replace(/[《》]/g, "");
	if (item.abstractNote) item.abstractNote = item.abstractNote.replace(/收起\s?$/, '').trim();
	if (item.影响因子) item.影响因子 = item.影响因子.replace("影响因子:", '').trim();
	item.complete();
}

/*
***** BEGIN LICENSE BLOCK *****
Copyright © 2022 YOUR_NAME <- TODO
This file is part of Zotero.
Zotero is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Zotero is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with Zotero. If not, see <http://www.gnu.org/licenses/>.
***** END LICENSE BLOCK *****
*/

// Lookup from the URL section name matched by getIDFromUrl() to the Zotero
// item type created for pages in that section.
const ItemTypes = {
	'patents': 'patent',
	'articles': 'journalArticle',
	'books': 'book'
};

// XPath used to locate each scraped value on a detail page. scrape()
// iterates the keys with for...in and stores the first match's text under
// the same key, except `author`, `tags` and `metadata`, which it parses.
const FieldMatch = {
	// Books | journals; role suffixes (等主译, 主编) and digits are filtered out by the author parser.
	"author": "//span[@class='AuthorInfo__nameText'] | //div[@class='AuthorInfo__content']",
	"date": "//span[text()='出版日期:' and @class='ArticleInfo__label']/following-sibling::span[1]",
	// Books only
	"publisher": "//span[text()='出版社:' and @class='ArticleInfo__label']/following-sibling::span[1]",
	// Journals only
	"publicationTitle": "//span[@class='ArticleInfo__metaSource']",
	"ISBN": "//span[text()='ISBN:' and @class='ArticleInfo__label']/following-sibling::span[1]",
	// Subject classification
	"学科分类": "//span[text()='学科分类:' and @class='ArticleInfo__label']/following-sibling::span[1]",
	// Note: an over-long abstract is collapsed on the page
	"abstractNote": "//div[contains(@class, 'FullAbstracts')] | //div[@class='ArticleInfo__abstracts']",
	// Impact factor
	"影响因子": "//div[@class='JournalContent__meta']",
	"tags": "//div[@class='ArticleInfo__keywords']/span[@class='ArticleInfo__keyword']",
	// Year / issue / volume line of a journal article
	"metadata": "//div[@class='ArticleInfo__source']/span[@class='ArticleInfo__sourceTitle']/span[contains(text(), '年') or contains(text(), '期') or contains(text(), '卷')]"
};

/**
 * Parse one comma-separated group of names into Zotero creators.
 *
 * A role marker on the group decides the creator type: trailing "等主译"
 * -> translator, trailing "主编" -> editor, leading "发明人: " -> inventor,
 * otherwise author. The marker and a trailing number after each name are
 * stripped.
 *
 * @param {string} s - Text of one name group.
 * @returns {{lastName: string, creatorType: string}[]} Creators for the
 *   non-empty names, possibly an empty array.
 */
function parseAuthors(s) {
	let type = 'author';
	if (s.match(/等主译$/)) type = 'translator';
	if (s.match(/主编$/)) type = 'editor';
	if (s.match(/^发明人: /)) type = 'inventor';

	const creators = [];
	const names = s.replace(/主编$|等主译$|^发明人: /g, "");
	// Split on the ASCII comma as well as the fullwidth comma (U+FF0C).
	for (const raw of names.split(/[,\uFF0C]/)) {
		// Trim before and after removing the number so that neither the
		// space following a comma nor the one before the number survives.
		const lastName = raw.trim().replace(/\s?\d+$/, '').trim();
		// Pieces that were only a number (e.g. the "2" of "name1,2") or
		// empty must not turn into nameless creators.
		if (lastName === '') continue;
		creators.push({ lastName: lastName, creatorType: type });
	}
	return creators;
}

/**
 * Parse a creator string consisting of semicolon-separated name groups.
 *
 * @param {string} s - Creator text as read from the page.
 * @returns {Object[]} Creators of all non-empty groups, in order, as
 *   produced by parseAuthors().
 */
function parseAuthorStr(s) {
	const creators = [];
	// Split on the ASCII semicolon as well as the fullwidth one (U+FF1B).
	for (const part of s.split(/[;\uFF1B]/)) {
		const group = part.trim();
		// Skip the empty group left by a trailing or doubled separator.
		if (group === '') continue;
		creators.push(...parseAuthors(group));
	}
	return creators;
}

/**
 * Extract year, issue, page count and volume from a metadata string and
 * store them on the item.
 *
 * Fields whose pattern does not match are left untouched. `newItem` is
 * modified in place and returned.
 *
 * @param {string} metadata - Text such as "2023 年 第 5 期".
 * @param {Object} newItem - Item that receives the values.
 * @returns {Object} The same `newItem`.
 */
function parseMetadata(metadata, newItem) {
	const year = metadata.match(/(\d{4}) 年/);
	if (year) newItem.date = year[1];

	const issue = metadata.match(/第 (\d+) 期/);
	if (issue) newItem.issue = issue[1];

	const pageCount = metadata.match(/共 (\d+) 页/);
	if (pageCount) newItem.pages = pageCount[1];

	const volume = metadata.match(/第 (\d+) 卷/);
	if (volume) newItem.volume = volume[1];

	return newItem;
}

/**
 * Build Zotero tag objects from keyword nodes.
 *
 * @param {Array} nodeList - Array of nodes whose `textContent` holds a keyword.
 * @returns {{tag: string}[]} One `{tag}` object per node, text trimmed.
 */
function parseTags(nodeList) {
	const toTag = (node) => {
		const tag = node.textContent.trim();
		return { tag };
	};
	return nodeList.map(toTag);
}

/**
 * Extract the section and record id from a books/articles/patents URL.
 *
 * @param {string} url - Page URL.
 * @returns {{type: string, id: string}|false} Item type (looked up in
 *   ItemTypes) and id, or false when the URL has no such path.
 */
function getIDFromUrl(url) {
	const match = url.match(/\/(books|articles|patents)\/([\d\w]*)/);
	const usable = match && match.length == 3;
	if (!usable) return false;

	const type = ItemTypes[match[1]];
	const id = match[2];
	return { type, id };
}

/**
 * Tell Zotero which item type the current page holds.
 *
 * @param {Document} doc - Page document (not inspected).
 * @param {string} url - Page URL.
 * @returns {string|false} Type reported by getIDFromUrl(), or false when
 *   the URL is not recognised.
 */
function detectWeb(doc, url) {
	const info = getIDFromUrl(url);
	if (!info) return false;
	return info.type;
}

// TODO: Hard to parse search page
/**
 * Gather result-list entries as a url -> title map.
 *
 * Rows are `div.List__item` elements, which carry no `href` themselves;
 * when a row has none, the first link inside it supplies the URL.
 *
 * @param {Document} doc - Page document.
 * @param {boolean} checkOnly - Return true on the first usable row instead
 *   of collecting all rows.
 * @returns {Object|boolean} href -> title map, true in check-only mode, or
 *   false when no row has both an href and a title.
 */
function getSearchResults(doc, checkOnly) {
	// URL of the row itself, or of the first link it contains.
	const hrefOf = (row) => {
		if (row.href) return row.href;
		const link = row.querySelector("a[href]");
		return link ? link.href : null;
	};

	const items = {};
	let found = false;
	for (const row of doc.querySelectorAll("div.List div.List__item")) {
		const href = hrefOf(row);
		const title = ZU.trimInternal(row.textContent);
		if (!href || !title) continue;
		if (checkOnly) return true;
		found = true;
		items[href] = title;
	}
	return found ? items : false;
}



/**
 * Translator entry point.
 *
 * When detectWeb() reports 'multiple', the user chooses among the search
 * results and each chosen page is fetched and scraped in turn; otherwise
 * the current page is scraped directly.
 *
 * @param {Document} doc - Page document.
 * @param {string} url - Page URL.
 * @returns {Promise<void>}
 */
async function doWeb(doc, url) {
	const isMultiple = detectWeb(doc, url) == 'multiple';
	if (isMultiple) {
		const chosen = await Zotero.selectItems(getSearchResults(doc, false));
		if (!chosen) return;
		const links = Object.keys(chosen);
		for (let i = 0; i < links.length; i++) {
			const page = await requestDocument(links[i]);
			await scrape(page);
		}
	}
	else {
		await scrape(doc, url);
	}
}

/**
 * Create a Zotero item for a detail page, fill it and complete it.
 *
 * Type comes from the URL and title from `doc.title`. Each FieldMatch
 * XPath is evaluated; the trimmed text of its first match is stored under
 * the entry's key unless the key has a dedicated handler (`author`,
 * `metadata`, `tags`).
 *
 * @param {Document} doc - Page document.
 * @param {string} [url] - Page URL; defaults to `doc.location.href`.
 * @returns {Promise<void>}
 */
async function scrape(doc, url = doc.location.href) {
	let item = new Zotero.Item(getIDFromUrl(url).type);
	item.title = doc.title;
	item.url = url;

	// Click the "read more" button when the page has one.
	const more = ZU.xpath(doc, "//button[contains(@class, 'RichContent__more')]");
	if (more.length > 0) more[0].click();

	// Keys that are parsed instead of being copied verbatim.
	const handlers = {
		author: (text) => { item.creators = parseAuthorStr(text); },
		metadata: (text) => { item = parseMetadata(text, item); },
		// Tags are built from all matched nodes rather than the first one.
		tags: (text, matched) => { item.tags = parseTags(matched); }
	};

	for (const field in FieldMatch) {
		const matched = ZU.xpath(doc, FieldMatch[field]);
		if (matched.length == 0) continue;
		const text = matched[0].textContent.trim();

		if (Object.prototype.hasOwnProperty.call(handlers, field)) {
			handlers[field](text, matched);
		}
		else {
			item[field] = text;
		}
	}

	// Post-processing: remove the book-title marks around the journal name,
	// the trailing "收起" (collapse) label of the abstract and the
	// "影响因子:" (impact factor) prefix.
	if (item.publicationTitle) {
		item.publicationTitle = item.publicationTitle.replace(/[《》]/g, "");
	}
	if (item.abstractNote) {
		item.abstractNote = item.abstractNote.replace(/收起\s?$/, '').trim();
	}
	if (item.影响因子) {
		item.影响因子 = item.影响因子.replace("影响因子:", '').trim();
	}
	item.complete();
}

/** BEGIN TEST CASES **/
var testCases = [
{
Expand Down Expand Up @@ -283,7 +289,11 @@ var testCases = [
"creatorType": "inventor"
}
],
"issueDate": "2023-08-22",
"abstractNote": "本公开提供一种基因编辑构建体及其应用,所述基因编辑构建体用于将外源基因定点整合入基因组的核糖体DNA(rDNA)区,并能高效地表达其携带的外源基因。",
"applicationNumber": "CN202310645338.1",
"filingDate": "2023-06-01",
"patentNumber": "CN116622777A",
"url": "https://pubscholar.cn/patents/d1067ea442b3b43a3a301abc252eb139a0fbe21d5ec4e8bb250fde14e9c6a173880526c9b4434ff87754e650b78fecac/0",
"attachments": [],
"tags": [],
Expand Down

0 comments on commit c5eeee3

Please sign in to comment.