In [1]:
import os
import json

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

import torch
from torch_geometric.data import Data
from torch_geometric.nn import Node2Vec

from tqdm import tqdm

In [2]:
'cuda' if torch.cuda.is_available() else 'cpu'

'cpu'

In [32]:
# Read the Chemical Disease Database
chem_dis = pd.read_csv("./CTD/CTD_chemicals_diseases.csv", comment='#', low_memory=False)
chem_dis["DiseaseID"] = chem_dis["DiseaseID"].str.extract(r'(D\d+)')
chem_dis.head()

Unnamed: 0,ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs
0,06-Paris-LA-66 protocol,C046983,,Precursor Cell Lymphoblastic Leukemia-Lymphoma,D054198,therapeutic,,,,4519131
1,10074-G5,C534883,,Adenocarcinoma,D000230,,MYC,4.07,,26432044
2,10074-G5,C534883,,Adenocarcinoma of Lung,D000077192,,MYC,4.3,,26656844|27602772
3,10074-G5,C534883,,Alopecia,D000505,,AR,4.49,,15902657
4,10074-G5,C534883,,Androgen-Insensitivity Syndrome,D013734,,AR,6.86,300068|312300,1303262|8281139


In [33]:
%%time
disease_l = chem_dis.DiseaseName.unique().tolist()
chem_l_s = []
for i in tqdm(disease_l):
    chem_l_s.append(chem_dis[chem_dis['DiseaseName'] == i]['ChemicalName'].tolist())

100%|██████████| 7277/7277 [21:40<00:00,  5.60it/s]

CPU times: user 21min 25s, sys: 6.12 s, total: 21min 32s
Wall time: 21min 40s





In [34]:
%%time
chemical_l = chem_dis.ChemicalName.unique().tolist()
disease_seq = []
for i in tqdm(chemical_l):
    disease_seq.append(chem_dis[chem_dis['ChemicalName'] == i]['DiseaseName'].tolist())

 53%|█████▎    | 9248/17337 [26:38<23:18,  5.79it/s]  


KeyboardInterrupt: 

In [35]:
model = Word2Vec(vector_size=50, window=5, min_count=1, workers=4)
model.build_vocab(chem_l_s)
model.train(chem_l_s, total_examples=len(chem_l_s), epochs=10)

KeyboardInterrupt: 

In [None]:
plt.hist([np.log10(len(x)) for x in disease_seq])

In [5]:
pubmed_lists = os.listdir("./PubMed/pubmed_portions_rev")
json_data = []
for i in pubmed_lists:
    with open(os.path.join("./PubMed/pubmed_portions_rev", i), 'r', encoding='utf') as f:
        jsons = json.load(f)
        json_data.extend([x for x in jsons if x['abstract_text'] and x['year']])

pmids = []
years = []
abstract_texts = []

for js in json_data:
    pmids.append(str(js['pmid']))
    years.append(js['year'])
    abstract_texts.append(js['abstract_text'])

pmid_df = pd.DataFrame({
    'pmid': pmids,
    'year': years,
    'abstract_text': abstract_texts
})
pmid_df['pmid'] = pmid_df['pmid'].astype(int)
#pmid_df.to_csv("./CTD_PMid.csv", index=False)

PMid_set = set(pmid_df.pmid.unique())
len(PMid_set)

26662

affiliations = {}
for i in PMid_set:
    affiliations[i] = []

for idx, row in df_demo.iterrows():
    if row['PMID'] in PMid_set:
        affiliations[row['PMID']].append(row["AffiliationOrder"])

affiliation_l = []
for i in pmid_df['pmid']:
    affiliation_l.append(affiliations[i])

pmid_df['affiliation'] = affiliation_l


In [16]:
auth_dict = {}
age_dict = {}
order_dict = {}
for i in PMid_set:
    auth_dict[i] = []
    age_dict[i] = []
    order_dict[i] = []

chunks = pd.read_csv("./OA01_Author_List.csv", chunksize=100000, low_memory=False)
m = 0
for chunk in chunks:
    m += 1
    n = 0
    for idx, row in chunk.iterrows():
        if row['PMID'] in PMid_set:
            n += 1
            auth_dict[row['PMID']].append(row['AND_ID'])
            age_dict[row['PMID']].append(row['PubYear'] - row['BeginYear'])
            order_dict[row['PMID']].append(row['AuOrder'])
    if n:
        print('Chunk', m, n)



Chunk 1 373
Chunk 2 296
Chunk 3 273
Chunk 4 245
Chunk 5 266
Chunk 6 266
Chunk 7 223
Chunk 8 230
Chunk 9 228
Chunk 10 247
Chunk 11 285
Chunk 12 263
Chunk 13 213
Chunk 14 178
Chunk 15 231
Chunk 16 239
Chunk 17 261
Chunk 18 256
Chunk 19 234
Chunk 20 318
Chunk 21 317
Chunk 22 204
Chunk 23 213
Chunk 24 235
Chunk 25 259
Chunk 26 118
Chunk 27 212
Chunk 28 281
Chunk 29 210
Chunk 30 146
Chunk 31 280
Chunk 32 378
Chunk 33 437
Chunk 34 429
Chunk 35 356
Chunk 36 429
Chunk 37 435
Chunk 38 380
Chunk 39 339
Chunk 40 384
Chunk 41 442
Chunk 42 408
Chunk 43 463
Chunk 44 465
Chunk 45 569
Chunk 46 330
Chunk 47 555
Chunk 48 448
Chunk 49 306
Chunk 50 412
Chunk 51 376
Chunk 52 470
Chunk 53 378
Chunk 54 384
Chunk 55 552
Chunk 56 441
Chunk 57 452
Chunk 58 357
Chunk 59 392
Chunk 60 391
Chunk 61 379
Chunk 62 345
Chunk 63 484
Chunk 64 384
Chunk 65 365
Chunk 66 477
Chunk 67 454
Chunk 68 412
Chunk 69 437
Chunk 70 512
Chunk 71 435
Chunk 72 407
Chunk 73 449
Chunk 74 412
Chunk 75 442
Chunk 76 347
Chunk 77 298
Chunk 78

In [2]:
pmid_df = pd.read_csv("./CTD_PMid_f.csv", sep='$')
pmid_df

Unnamed: 0,pmid,year,abstract_text,affiliation,authors,pub_ages,pub_orders,entities
0,706,1976,Thirteen out of 18 young out-patients with sim...,,60807885580950,20,12,"260079301,260079301,0,256846301,257196101,3082..."
1,1803,1976,Mice were given a drug per os and 2 h later we...,,4785193,15,1,"254194001,254194001,304309303,4784103,4853703,..."
2,1959,1976,"In a double-blind trial lasting 2 weeks, a new...",,534573462895918234202756055,21272824,1234,"260202601,107368601,0,7600703,7600703,7600703,..."
3,2004,1976,Eight patients had cardiac manifestations that...,,34343612020640276965424605231048829,297181524,12345,"256349801,106988801,258380301,255367101,255458..."
4,2127,1976,Beta-Adrenergic stimulation with isoproterenol...,,871426834667288417281,71426,123,"4206703,6293303,6293303,6293303,4133203,629330..."
...,...,...,...,...,...,...,...,...
26657,4938432,1971,One hundred and three children with proved typ...,,32027321393129,1911,12,"106936401,106936401,106936401,4253703,99516003..."
26658,4945727,1972,1. Injection of 0.5-2.0 units of vasopressin o...,,72976301687865,014,12,"260959401,260959401,260959401,260959401,260959..."
26659,4951148,1986,Severe intrahepatic cholestasis occurred 13 mo...,,125950587653613327169,5019,123,"107286901,107286901,107420801,106984201,255128..."
26660,4973764,1969,The results of treatment of 25 patients admitt...,,4459668,5,1,"106984001,106984001,106908201,106985901,106985..."


In [17]:
affiliations = {}
for i in PMid_set:
    affiliations[i] = []

chunks = pd.read_csv("./OA04_Affiliations.csv", chunksize=100000, low_memory=False)
m = 0
for chunk in chunks:
    m += 1
    n = 0
    for idx, row in chunk.iterrows():
        if row['PMID'] in PMid_set:
            n += 1
            affiliations[row['PMID']].append(row["Affiliation"])
    if n:
        print('Chunk', m, n)

affiliation_l = []
for i in pmid_df['pmid']:
    affiliation_l.append(affiliations[i])

pmid_df['affiliation'] = affiliation_l
pmid_df["affiliation"] = pmid_df["affiliation"].apply(lambda x: ','.join([str(i) for i in x]))

Chunk 1 393
Chunk 2 377
Chunk 3 414
Chunk 4 443
Chunk 5 357
Chunk 6 436
Chunk 7 375
Chunk 8 359
Chunk 9 418
Chunk 10 436
Chunk 11 364
Chunk 12 443
Chunk 13 467
Chunk 14 388
Chunk 15 427
Chunk 16 434
Chunk 17 390
Chunk 18 379
Chunk 19 468
Chunk 20 416
Chunk 21 403
Chunk 22 425
Chunk 23 404
Chunk 24 393
Chunk 25 408
Chunk 26 409
Chunk 27 407
Chunk 28 426
Chunk 29 428
Chunk 30 415
Chunk 31 446
Chunk 32 403
Chunk 33 440
Chunk 34 469
Chunk 35 456
Chunk 36 322
Chunk 37 383
Chunk 38 435
Chunk 39 425
Chunk 40 434
Chunk 41 467
Chunk 42 469
Chunk 43 20
Chunk 596 1
Chunk 601 1
Chunk 608 1
Chunk 616 3
Chunk 618 1


In [18]:
authors = []
ages = []
orders = []
for i in pmid_df['pmid']:
    authors.append(auth_dict[i])
    ages.append(age_dict[i])
    orders.append(order_dict[i])
pmid_df['authors'] = authors
pmid_df['pub_ages'] = ages
pmid_df['pub_orders'] = orders

In [40]:
len([x for x in pmid_df.affiliation if len(x)]) / len(pmid_df)

0.655427199759958

In [13]:
pmid_df.to_csv("./CTD_PMid.csv", sep='$', index=False)

In [17]:
pmid_df = pd.read_csv("./CTD_PMid.csv", sep='$')
pmid_df.head()

Unnamed: 0,pmid,year,abstract_text,affiliation,authors,pub_ages,pub_orders
0,706,1976,Thirteen out of 18 young out-patients with sim...,[],"[6080788, 5580950]","[2, 0]","[1, 2]"
1,1803,1976,Mice were given a drug per os and 2 h later we...,[],[4785193],[15],[1]
2,1959,1976,"In a double-blind trial lasting 2 weeks, a new...",[],"[5345734, 6289591, 8234202, 756055]","[21, 27, 28, 24]","[1, 2, 3, 4]"
3,2004,1976,Eight patients had cardiac manifestations that...,[],"[3434361, 2020640, 2769654, 2460523, 1048829]","[29, 7, 18, 15, 24]","[1, 2, 3, 4, 5]"
4,2127,1976,Beta-Adrenergic stimulation with isoproterenol...,[],"[8714268, 3466728, 8417281]","[7, 14, 26]","[1, 2, 3]"


In [15]:
df_demo = pd.read_csv("./OA02_Bio_entities_Main.csv", nrows=10000)
df_demo

Unnamed: 0,id,PMID,Start,End,Mention,EntityID,Type
0,1,30941,0,7,Chloride,4167203,drug
1,2,30941,25,28,Bohr,0,drug
2,3,30941,124,127,Bohr,0,drug
3,4,30941,333,335,CO2,292621503,drug
4,5,30941,344,365,"2,3 diphosphoglycerate",271302003,drug
...,...,...,...,...,...,...,...
9995,9996,29125,528,538,noncatechol,0,drug
9996,9997,29125,550,577,2-aminotetrahydronaphthalene,272301403,drug
9997,9998,29125,580,582,ATN,0,drug
9998,9999,29125,767,770,[3H],318065903,drug


In [12]:
entity_code = {}

chunks = pd.read_csv("./OA02_Bio_entities_Main.csv", chunksize=100000, low_memory=False)
m = 0
for chunk in chunks:
    m += 1
    print(m, len(entity_code))
    for idx, row in chunk.iterrows():
        if row["EntityID"] not in entity_code.keys():
            entity_code[row["EntityID"]] = row['Mention']

with open('./entity_code.json', 'w') as f:
    json.dump(entity_code, f)

1 0
2 7944
3 11179
4 14397
5 16664
6 18677
7 20667
8 22466
9 24249
10 25524
11 26539
12 27546
13 28484
14 29263
15 29994
16 30857
17 31494
18 32487
19 33387
20 34117
21 34875
22 35615
23 36420
24 37177
25 38067
26 38894
27 39738
28 40534
29 41424
30 42198
31 43001
32 43701
33 44320
34 44915
35 45530
36 46137
37 46702
38 47140
39 47497
40 47973
41 48396
42 48881
43 49385
44 49914
45 50395
46 50908
47 51366
48 51797
49 52294
50 52658
51 52975
52 53340
53 53720
54 54143
55 54523
56 54939
57 55297
58 56427
59 56805
60 57736
61 58652
62 59375
63 60063
64 60794
65 61428
66 61962
67 62417
68 63302
69 63950
70 64495
71 64979
72 65725
73 66306
74 66804
75 67295
76 67877
77 68424
78 68866
79 69321
80 69884
81 70330
82 70788
83 71261
84 71725
85 72187
86 72592
87 73004
88 73380
89 73728
90 74136
91 74505
92 74832
93 75160
94 75566
95 75932
96 76255
97 76587
98 76935
99 77277
100 77557
101 77851
102 78110
103 78397
104 78632
105 78841
106 79039
107 79332
108 79617
109 79897
110 80093
111 80283
112

KeyboardInterrupt: 

In [19]:
terms = {}
for i in PMid_set:
    terms[i] = []

chunks = pd.read_csv("./OA02_Bio_entities_Main.csv", chunksize=100000, low_memory=False)
m = 0
for chunk in chunks:
    m += 1
    n = 0
    for idx, row in chunk.iterrows():
        if row['PMID'] in PMid_set:
            n += 1
            terms[row['PMID']].append(row["EntityID"])
    if n:
        print('Chunk', m, n)

entities = []
for i in pmid_df['pmid']:
    entities.append(','.join([str(i) for i in terms[i]]))

pmid_df['entities'] = entities

Chunk 1 725
Chunk 2 636
Chunk 3 679
Chunk 4 1189
Chunk 5 554
Chunk 6 611
Chunk 7 664
Chunk 8 728
Chunk 9 542
Chunk 10 342
Chunk 11 460
Chunk 12 545
Chunk 13 254
Chunk 14 537
Chunk 15 636
Chunk 16 402
Chunk 17 676
Chunk 18 758
Chunk 19 516
Chunk 20 609
Chunk 21 491
Chunk 22 652
Chunk 23 639
Chunk 24 755
Chunk 25 653
Chunk 26 754
Chunk 27 784
Chunk 28 581
Chunk 29 834
Chunk 30 581
Chunk 31 805
Chunk 32 897
Chunk 33 823
Chunk 34 586
Chunk 35 808
Chunk 36 772
Chunk 37 1018
Chunk 38 713
Chunk 39 516
Chunk 40 525
Chunk 41 974
Chunk 42 700
Chunk 43 721
Chunk 44 795
Chunk 45 698
Chunk 46 644
Chunk 47 513
Chunk 48 919
Chunk 49 610
Chunk 50 568
Chunk 51 691
Chunk 52 651
Chunk 53 714
Chunk 54 793
Chunk 55 536
Chunk 56 750
Chunk 57 553
Chunk 58 794
Chunk 59 603
Chunk 60 511
Chunk 61 675
Chunk 62 486
Chunk 63 511
Chunk 64 669
Chunk 65 715
Chunk 66 647
Chunk 67 297
Chunk 68 1049
Chunk 69 512
Chunk 70 493
Chunk 71 447
Chunk 72 742
Chunk 73 333
Chunk 74 572
Chunk 75 587
Chunk 76 711
Chunk 77 778
Chunk

In [20]:
pmid_df['pmid']

0            706
1           1803
2           1959
3           2004
4           2127
          ...   
26657    4938432
26658    4945727
26659    4951148
26660    4973764
26661    4981054
Name: pmid, Length: 26662, dtype: int64

In [None]:
entities = []
for i in pmid_df['pmid']:
    entities.append(','.join([str(i) for i in terms[i]]))

pmid_df['entities'] = entities

In [9]:
pmid_df["affiliation"].unique()

array(['', 'Illinois State Psychiatric Institute, Chicago 60612.',
       'Industrial Toxicology Research Centre, Lucknow, India.', ...,
       'Department of Pharmacology and Experimental Therapeutics, Dr. Sampurnanand Medical College, Jodhpur, Rajasthan, India.',
       'Department of Neurology, University of Iowa Hospitals and Clinics, Iowa City.',
       'Departamento de Farmacología y Terapéutica, Facultad de Medicina, Oviedo.'],
      dtype=object)

In [20]:
pmid_df["affiliation"] = pmid_df["affiliation"].apply(lambda x: ','.join([str(i) for i in x]))
pmid_df["authors"] = pmid_df["authors"].apply(lambda x: ','.join([str(i) for i in x]))
pmid_df["pub_ages"] = pmid_df["pub_ages"].apply(lambda x: ','.join([str(i) for i in x]))
pmid_df["pub_orders"] = pmid_df["pub_orders"].apply(lambda x: ','.join([str(i) for i in x]))
pmid_df

Unnamed: 0,pmid,year,abstract_text,affiliation,authors,pub_ages,pub_orders,entities
0,706,1976,Thirteen out of 18 young out-patients with sim...,,60807885580950,20,12,"260079301,260079301,0,256846301,257196101,3082..."
1,1803,1976,Mice were given a drug per os and 2 h later we...,,4785193,15,1,"254194001,254194001,304309303,4784103,4853703,..."
2,1959,1976,"In a double-blind trial lasting 2 weeks, a new...",,534573462895918234202756055,21272824,1234,"260202601,107368601,0,7600703,7600703,7600703,..."
3,2004,1976,Eight patients had cardiac manifestations that...,,34343612020640276965424605231048829,297181524,12345,"256349801,106988801,258380301,255367101,255458..."
4,2127,1976,Beta-Adrenergic stimulation with isoproterenol...,,871426834667288417281,71426,123,"4206703,6293303,6293303,6293303,4133203,629330..."
...,...,...,...,...,...,...,...,...
26657,4938432,1971,One hundred and three children with proved typ...,,32027321393129,1911,12,"106936401,106936401,106936401,4253703,99516003..."
26658,4945727,1972,1. Injection of 0.5-2.0 units of vasopressin o...,,72976301687865,014,12,"260959401,260959401,260959401,260959401,260959..."
26659,4951148,1986,Severe intrahepatic cholestasis occurred 13 mo...,,125950587653613327169,5019,123,"107286901,107286901,107420801,106984201,255128..."
26660,4973764,1969,The results of treatment of 25 patients admitt...,,4459668,5,1,"106984001,106984001,106908201,106985901,106985..."


In [30]:
','.join([])

''

In [40]:
pmid_df.to_csv("./CTD_PMid_f.csv", sep='$', index=False)

In [21]:
pmid_df

Unnamed: 0,pmid,year,abstract_text,affiliation,authors,pub_ages,pub_orders,entities
0,706,1976,Thirteen out of 18 young out-patients with sim...,,60807885580950,20,12,"260079301,260079301,0,256846301,257196101,3082..."
1,1803,1976,Mice were given a drug per os and 2 h later we...,,4785193,15,1,"254194001,254194001,304309303,4784103,4853703,..."
2,1959,1976,"In a double-blind trial lasting 2 weeks, a new...",,534573462895918234202756055,21272824,1234,"260202601,107368601,0,7600703,7600703,7600703,..."
3,2004,1976,Eight patients had cardiac manifestations that...,,34343612020640276965424605231048829,297181524,12345,"256349801,106988801,258380301,255367101,255458..."
4,2127,1976,Beta-Adrenergic stimulation with isoproterenol...,,871426834667288417281,71426,123,"4206703,6293303,6293303,6293303,4133203,629330..."
...,...,...,...,...,...,...,...,...
26657,4938432,1971,One hundred and three children with proved typ...,,32027321393129,1911,12,"106936401,106936401,106936401,4253703,99516003..."
26658,4945727,1972,1. Injection of 0.5-2.0 units of vasopressin o...,,72976301687865,014,12,"260959401,260959401,260959401,260959401,260959..."
26659,4951148,1986,Severe intrahepatic cholestasis occurred 13 mo...,,125950587653613327169,5019,123,"107286901,107286901,107420801,106984201,255128..."
26660,4973764,1969,The results of treatment of 25 patients admitt...,,4459668,5,1,"106984001,106984001,106908201,106985901,106985..."


In [36]:
affiliations_s = []
for k in affiliation_l:
    if k:
        affiliations_s.append(k)
    else:
        affiliations_s.append("")

In [39]:
pmid_df

Unnamed: 0,pmid,year,abstract_text,affiliation,authors,pub_ages,pub_orders,entities
0,706,1976,Thirteen out of 18 young out-patients with sim...,,60807885580950,20,12,"260079301,260079301,0,256846301,257196101,3082..."
1,1803,1976,Mice were given a drug per os and 2 h later we...,,4785193,15,1,"254194001,254194001,304309303,4784103,4853703,..."
2,1959,1976,"In a double-blind trial lasting 2 weeks, a new...",,534573462895918234202756055,21272824,1234,"260202601,107368601,0,7600703,7600703,7600703,..."
3,2004,1976,Eight patients had cardiac manifestations that...,,34343612020640276965424605231048829,297181524,12345,"256349801,106988801,258380301,255367101,255458..."
4,2127,1976,Beta-Adrenergic stimulation with isoproterenol...,,871426834667288417281,71426,123,"4206703,6293303,6293303,6293303,4133203,629330..."
...,...,...,...,...,...,...,...,...
26657,4938432,1971,One hundred and three children with proved typ...,,32027321393129,1911,12,"106936401,106936401,106936401,4253703,99516003..."
26658,4945727,1972,1. Injection of 0.5-2.0 units of vasopressin o...,,72976301687865,014,12,"260959401,260959401,260959401,260959401,260959..."
26659,4951148,1986,Severe intrahepatic cholestasis occurred 13 mo...,,125950587653613327169,5019,123,"107286901,107286901,107420801,106984201,255128..."
26660,4973764,1969,The results of treatment of 25 patients admitt...,,4459668,5,1,"106984001,106984001,106908201,106985901,106985..."


In [37]:
pmid_df['affiliation'] = affiliations_s

In [42]:
pmid_a = pmid_df[pmid_df["affiliation"] != '']
import collections

value_counts = pmid_a["affiliation"].value_counts()

# 按数量排序
sorted_counts = value_counts.sort_values(ascending=False)

# 打印结果
print("Element Counts:")
print(sorted_counts)

Element Counts:
affiliation
[First Department of Pathology, Nagoya City University Medical School, Japan.]                             24
[First Department of Pathology, Nagoya City University Medical School.]                                    18
[Institute of Pharmacology, Polish Academy of Sciences, Kraków.]                                           12
[Department of Anaesthesia, Helsinki University Central Hospital, Finland.]                                10
[Second Department of Internal Medicine, Faculty of Medicine, Kyushu University, Fukuoka, Japan.]          10
                                                                                                           ..
[Department of Gastroenterology and Hepatology, University Hospital, Leiden, The Netherlands.]              1
[Department of Medicine, University of Vermont College of Medicine, Burlington 05405.]                      1
[Department of Psychiatry, Royal Perth Hospital, WA.]                                       