In [1]:
"""
The objectives of this program are as follows
1. to extract articles published in pubmed for each research group
2. to determine  which diseases the extracted papers are related to.
3. to output research achievements by disease

このプログラムの目的は以下です。
１．研究班毎のpubmedに掲載された論文を抽出する
２．抽出した論文がどの病気の論文か判定する。
３．疾患別研究業績を出力する
"""

'\nThe objectives of this program are as follows\n1. to extract articles published in pubmed for each research group\n2. to determine  which diseases the extracted papers are related to.\n3. to output research achievements by disease\n\nこのプログラムの目的は以下です。\n１．研究班毎のpubmedに掲載された論文を抽出する\n２．抽出した論文がどの病気の論文か判定する。\n３．疾患別研究業績を出力する\n'

In [2]:
from selenium import webdriver
import time
import pandas as pd 
import pytest
import time
import json
import os
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

In [3]:
#path for download dir
Download_path = r"C:/Users/hs691/Downloads/"

#criteria of year
"""
The program searches for papers with the following criteria: year1 > Publication Year >= year2.
"""
year1 = 2022
year2 = 2020

In [4]:
"""
preprocessing
The csv file downloaded from pubmed is saved as "csv-" + name_10 + "-set.csv". 
Here, the variable, name_10 means the first 10 characters of the full name.
In this preprocessing phase, create this name_10 variable in advance

pubmedからダウンロードしたcsvファイルは、"csv-" + name_10 + "-set.csv "のように保存されます。
ここで、変数name_10は、フルネームの最初の10文字を意味します。
この前処理の過程では、name_10変数をあらかじめ作成しておきます。
"""
name_list = pd.read_csv("name_list_repo.csv", sep=",")
name_list["name_seq"] = name_list["name_list"].str.replace(' ', '')
name_list["name_10"] = name_list["name_seq"].str[:10]
name_list.head()

Unnamed: 0,group_list,name_list,name_seq,name_10
0,harigai,masayoshi harigai,masayoshiharigai,masayoshih
1,harigai,suguru honda,suguruhonda,suguruhond
2,ikari,katsunori ikari,katsunoriikari,katsunorii
3,ikari,suguru honda,suguruhonda,suguruhond


In [5]:
#reading template
template = pd.read_csv("template.csv", sep=",")
#template.rename(columns={'Publication Year': 'Publication_Year'}, inplace=True)


#leader_list
leader = 'Harigai M|Ikari K'

#co_author_list
"""
To solve the same name problem, a list of co-authors is prepared in advance. 
First, extract papers whose authors include members of the co-authors list or whose titles contain disease keywords
0 is assigned to papers whose authors include members of the coauthors list and whose title contains the disease keyword    
A paper that includes a member of the co-author list as an author of the paper but does not include the disease keyword in the title is assigned a 1
A paper whose authors do not include members of the list of co-authors, but whose title contains the disease keyword, is assigned a 2.

同姓同名問題を解決するため、予め共著者リストを作成しておきます。
最初に論文の著者に共著者リストのメンバーが含まれている、または、タイトルに疾患キーワードが含まれている論文を抽出
論文の著者に共著者リストのメンバーが含まれている、かつ、タイトルに疾患キーワードが含まれている論文は０が付与される
論文の著者に共著者リストのメンバーが含まれているが、タイトルに疾患キーワードが含まれていない論文は1が付与される
論文の著者に共著者リストのメンバーが含まれていないが、タイトルに疾患キーワードが含まれている論文は2が付与される
付与した数値はグループごとの業績を出力したcsvファイルの最初の列（check_col）になります。
"""
co_author = 'Yamanaka H|Harigai M|Devoe S|Seto N|Demoruelle MK|Mori M|Yokota S|Nishioka K|Rider LG|Targoff IN|Hashimoto M|Gono T|Sato S|Kuwana M|Terao C|Okada Y|Yano K|Okazaki K'

#Keywords for disease classification
RA = 'rheumatoid arthritis|Rheumatoid Arthritis|Rheumatoid arthritis'
SLE = 'lupus|Lupus'
SS = 'systemic sclerosis|Systemic sclerosis|Systemic Sclerosis'
VAS = 'vasculitis|Vasculitis|arteritis|Arteritis|ANCA|anti-neutrophil cytoplasmic antibody|anti neutrophil cytoplasmic antibody|microscopic polyangiitis|Microscopic polyangiitis|granulomatosis with polyangiitis|Granulomatosis with polyangiitis'
MYO = 'myositis|Myositis'
PED = 'child|pediatric|paediatric|juvenile|Child|Pediatric|Juvenile|Mediterranean'
Other = 'Still|connective tissue disease|arthroplasty'
ALL_dis = RA+'|'+SLE+'|'+SS+'|'+VAS+'|'+MYO+'|'+PED+'|'+Other

In [6]:
def disease_class(list_group):
    """
    Determine that an article is related to the disease 
    when the title of the extracted article contains the keywords for the disease defined in the cell above.

    抽出した論文のtitleに、上のセルで定義した病気のキーワードが含まれていた場合にその病気に関連する論文と判定する
    """
    for i in range(len(list_group)):
        if len(re.findall(RA, list_group.iloc[i,3]))>=1:
            list_group.iloc[i,1]="RA"
    for i in range(len(list_group)):
        if len(re.findall(SLE, list_group.iloc[i,3]))>=1:
            list_group.iloc[i,1]="SLE"
    for i in range(len(list_group)):
        if len(re.findall(SS, list_group.iloc[i,3]))>=1:
            list_group.iloc[i,1]="SS"
    for i in range(len(list_group)):
        if len(re.findall(VAS, list_group.iloc[i,3]))>=1:
            list_group.iloc[i,1]="VAS"
    for i in range(len(list_group)):
        if len(re.findall(MYO, list_group.iloc[i,3]))>=1:
            list_group.iloc[i,1]="MYO"
    for i in range(len(list_group)):
        if len(re.findall(PED, list_group.iloc[i,3]))>=1:
            list_group.iloc[i,1]="PED"

    return list_group

In [7]:
#site where chromedriver.exe is
driver_path = "driver/chromedriver.exe"

# make wevdriver
driver = webdriver.Chrome(executable_path=driver_path)

# wait 5 sec
time.sleep(1)

k=0
for i in range (len(name_list["group_list"].unique())):
    leader_name = name_list["group_list"].unique()[i]
    template = pd.read_csv("template.csv", sep=",")
    print("create " + leader_name + " group achievements" )
    for j in range (name_list["group_list"].value_counts()[leader_name]):
        driver.get("https://pubmed.ncbi.nlm.nih.gov/")
        driver.set_window_size(850, 850)
        
        name_k = name_list["name_10"][k]
        
        driver.find_element(By.ID, "id_term").click()
        driver.find_element(By.ID, "id_term").send_keys(name_list["name_list"][k])
        time.sleep(1)
        driver.find_element(By.ID, "id_term").send_keys(Keys.ENTER)
        time.sleep(1)
        try:
            driver.find_element(By.ID, "save-results-panel-trigger").click()
            time.sleep(1)
            driver.find_element(By.ID, "save-action-selection").click()
            time.sleep(1)
            dropdown = driver.find_element(By.ID, "save-action-selection")
            dropdown.find_element(By.XPATH, "//option[. = 'All results']").click()
            driver.find_element(By.ID, "save-action-format").click()
            dropdown = driver.find_element(By.ID, "save-action-format")
            dropdown.find_element(By.XPATH, "//option[. = 'CSV']").click()
            driver.find_element(By.CSS_SELECTOR, "#save-action-panel-form .action-panel-submit").click()
            time.sleep(1)
            df = pd.read_csv(Download_path + "csv-" + name_k + "-set.csv", sep=",")
            df.insert(loc = 0, column = 'check_column', value = 0)

            df = df[df["Publication Year"]>=year2]
            df = df[df["Publication Year"]<year1]
            
            #Delete papers with no title.
            df = df.dropna(subset=['Title'])
            
            df_leader = df[df["Authors"].str.contains(leader)]
            
            #First, extract papers whose authors include members of the co-authors list or whose titles contain disease keywords
            #最初に論文の著者に共著者リストのメンバーが含まれている、または、タイトルに疾患キーワードが含まれている論文を抽出
            df_lead_coau_or_alldis = df_leader[(df_leader["Authors"].str.contains(co_author))|(df_leader["Title"].str.contains(ALL_dis))]

            #0 is assigned to papers whose authors include members of the coauthors list and whose title contains the disease keyword    
            #論文の著者に共著者リストのメンバーが含まれている、かつ、タイトルに疾患キーワードが含まれている論文は０が付与される
            df_lead_coau_and_alldis = df_leader[(df_leader["Authors"].str.contains(co_author))&(df_leader["Title"].str.contains(ALL_dis))]

            df_lead_select = df_lead_coau_or_alldis[~((df_lead_coau_or_alldis["Authors"].str.contains(co_author))&(df_lead_coau_or_alldis["Title"].str.contains(ALL_dis)))]
     
            #A paper that includes a member of the co-author list as an author of the paper but does not include the disease keyword in the title is assigned a 1
            #論文の著者に共著者リストのメンバーが含まれているが、タイトルに疾患キーワードが含まれていない論文は1が付与される
            df_lead_add1 = df_lead_select[(df_lead_select["Authors"].str.contains(co_author))]
            df_lead_add1["check_column"]=1
     
            #A paper whose authors do not include members of the list of co-authors, but whose title contains the disease keyword, is assigned a 2.
            #論文の著者に共著者リストのメンバーが含まれていないが、タイトルに疾患キーワードが含まれている論文は2が付与される
            df_lead_add2 = df_lead_select[(df_lead_select["Title"].str.contains(ALL_dis))]
            df_lead_add2["check_column"]=2
            df_nonleader = df[~df["Authors"].str.contains(leader)]

            df_coau_or_alldis = df_nonleader[(df_nonleader["Authors"].str.contains(co_author))|(df_nonleader["Title"].str.contains(ALL_dis))]

            df_coau_and_alldis = df_nonleader[(df_nonleader["Authors"].str.contains(co_author))&(df_nonleader["Title"].str.contains(ALL_dis))]

            df_select = df_coau_or_alldis[~((df_coau_or_alldis["Authors"].str.contains(co_author))&(df_coau_or_alldis["Title"].str.contains(ALL_dis)))]
            df_add1 = df_select[(df_select["Authors"].str.contains(co_author))]
            df_add1["check_column"]=1
            df_add2 = df_select[(df_select["Title"].str.contains(ALL_dis))]
            df_add2["check_column"]=2
            
            template = pd.concat([template, df_lead_coau_and_alldis, df_lead_add1, df_lead_add2, df_coau_and_alldis, df_add1, df_add2])

            os.remove(Download_path + "csv-" + name_k + "-set.csv")
            print(name_list["name_list"][k], "True")
            k += 1
        except:
            print(name_list["name_list"][k], "False")
            k += 1
            pass

    template = template.drop_duplicates(subset='PMID')
    template.insert(loc = 1, column = 'disease_class', value = 'other')
    template = template.reset_index(drop=True)

    template = template.dropna(subset=['Title'])
    
    template = disease_class(template)

    template.to_csv("list_" + leader_name + "_group_c3.csv", sep=",", index=False)

create harigai group achievements
masayoshi harigai True
suguru honda True
create ikari group achievements


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


katsunori ikari True
suguru honda True


In [8]:
leader_name = name_list["group_list"].unique()
print(leader_name)

#各グループのcsvファイルを読み込み
csv_files = []
for i, name in enumerate(leader_name):
    csv_files.append(i)
    csv_files[i] = pd.read_csv("list_" + name + "_group_c3.csv", sep=",")

#読み込んだグループファイルをすべて一度連結して、重複をPMIDで削除して、病気別にソートする
for i in range(len(leader_name)):
    if i == 0:
        pass
    else:
        csv_files[0] = pd.concat([csv_files[0], csv_files[i]])
csv_files[0].drop_duplicates(subset='PMID', inplace=True)
csv_files[0].sort_values('disease_class', inplace = True)
csv_files[0]

#病気別ごとのDataFrameに作り変える
disease_class = csv_files[0]["disease_class"].unique()
print(disease_class)
disease_file = []
for i in range(len(disease_class)):
    disease_file.append(i)
    disease_file[i] = csv_files[0][csv_files[0]["disease_class"]==disease_class[i]]
    disease_file[i].reset_index(inplace=True, drop=True)
    disease_file[i].reset_index(inplace=True)
    disease_file[i]["index"] = disease_file[i]["index"]+1
    
#病気ごとのDataFrameを業績フォーマットに変換する
for i in range(len(disease_file)):
    disease_file[i] = disease_file[i]["index"].astype(str) + ". " + disease_file[i]["Authors"] + " " + disease_file[i]["Title"] + " " + disease_file[i]["Citation"] + "doi: " + disease_file[i]["DOI"] + " PMID" + disease_file[i]["PMID"].astype(str) + "\n" 
    disease_file[i].to_csv(disease_class[i] + "_achieve_list.txt", index=False)

['harigai' 'ikari']
['MYO' 'PED' 'RA' 'SLE' 'SS' 'VAS' 'other']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
