## This module create CORD-19 corpus dataset as <user, item, rating, item_name, year> and <user_id, author_name>. The items should be defined by user

### 1. import libraries

In [1]:
import pandas as pd
import sys
import json
import numpy as np
import time
from datetime import date, datetime

In [None]:
### 2. import user-defined-functions

In [2]:
from utils import *
from utils2json import *
from functions import *
from utils2ontologies import *
from utils2pubmed import *

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option("max_rows", None)

### get authors name from .csv file

In [5]:
def get_authors_metadata(data, label, ident):
    
    authors = data[data.label == ident].authors.values[0].split(';')
    for a in authors:
        a = a.split(',')
        first = unidecode.unidecode( ''.join(m for m in a[1] if m.isalpha()))
        last = unidecode.unidecode( ''.join(m for m in a[0] if m.isalpha()))
    return first + ', '+  last

### entities defined by user

In [6]:
# if entities is defined by user then saved it in a list
active_lexicons = []
is_chebi, is_do, is_go, is_hp = True, True, False, False

active_lexicons.append('chebi')   
active_lexicons.append('do')      


#### Path from original' json and entities' json folder, blacklist and metadata

In [7]:
original_json_folder = '../data/comm_use_subset/'
entities_json_folder = '../data/comm_use_subset_entities/'
path_to_blacklist = '../data/blacklist/blacklist_articles.txt'
path_to_metadata = '../data/metadata_new.csv'

In [8]:
entities_list_of_json_files = list_files_in_directory(path=entities_json_folder)
print(entities_list_of_json_files)

['81c92759c804f01629de5c8499535a5b300b5f16_entities.json', 'PMC6275129_entities.json', '06ced00a5fc04215949aa72528f2eeaae1d58927_entities.json', '.ipynb_checkpoints', 'PMC446188_entities.json', 'cdb29ec7a9029d22f6fbf7ee04543819591acdc2_entities.json']


##### Get all articles id that cannot be considered in use case 

In [9]:
articles_blacklist = get_blacklist(file=path_to_blacklist)
print(articles_blacklist)

metadata = pd.read_csv(path_to_metadata)

['cdb29ec7a9029d22f6fbf7ee04543819591acdc2\n', '.ipynb_checkpoints\n', 'cdb29ec7a9029d22f6fbf7ee04543819591acdc2\n', '.ipynb_checkpoints\n', '81c92759c804f01629de5c8499535a5b300b5f16']


In [10]:
user_item_rating_all = []
count = 0
#entities_list_of_json_files=['PMC7176216_entities.json']
for file in entities_list_of_json_files:
    # ## Exception
    if file.replace('_entities.json','') in articles_blacklist:
        continue
    # if file.startswith('PMC'):
    #     continue    
    print(count, "-", len(entities_list_of_json_files))            
    print(file)
        
    # check valid json file, i.e. contains values
    try:
        j_file_entities = open_json_file_pd(path=entities_json_folder, file=file)           
    except Exception as e:
        print(f'Json file does not contain values. Error message {e}')
        set_blacklist(file=path_to_blacklist, line=file.replace('_entities.json',''))
        continue
    
    df_entities = get_entities_id(get_entities(j_file_entities))
    if df_entities.empty:
        print(f'Json file does not contain values.')
        set_blacklist(file=path_to_blacklist, line=file.replace('_entities.json',''))
        continue    
    
    print(df_entities)
    article_id = get_article_id(j_file_entities)
    
    # check valid json file, i.e. contains values
    try:
        j_file_original = open_json_file(path=original_json_folder, file=article_id)
        #print(j_file_original)
    except Exception as e:
        print(f'Original json file does not exist. Error message {e}')
        set_blacklist(file=path_to_blacklist, line=file.replace('_entities.json',''))
        continue    
    
    # check if json file contains authors, otherwise try to find them in metadata.csv
    # if value remains null them put this article in the blacklist file
    list_of_authors = get_authors_names(data=j_file_original)
    
###  ---- Metadata ----
    # Some datas return an array, then we convert to a list and get first element
    publish_date=''      
    if article_id.startswith('PMC'):
        try:
            publish_date = metadata[metadata.pmcid == article_id].\
                publish_time.map(lambda v: v.split('-')[0]).tolist()[0]
        except Exception as e:    
            try:
                print(f'Find publish date, now with metapub. Error message {e}')            
                publish_date = get_year_by_metapub(pmcid=article_id) 
            except Exception as e: 
                try:                    
                    print(f'Find publish date, now with Bio. Error message {e}') 
                    pmid = get_pmid(pmcid=article_id) 
                    publish_date = get_year_by_bio(pmid) 
                except Exception as e:
                    print(f'Find publish date, now with metapub. Error message {e}')    
                
        finally:
            pass                                         
    else:
        try:
            publish_date = metadata[metadata.sha == article_id].\
                publish_time.map(lambda v: v.split('-')[0]).tolist()[0]
        finally:
            pass  

    print(publish_date)
    if publish_date==None:
        set_blacklist(file=path_to_blacklist, line=file.replace('_entities.json',''))
        continue
    
    # if authors is empty we will find in metadata.csv file
    if len(list_of_authors)==0:
        ##if string is NaN
        try:
            if article_id.startswith('PMC'):
                try:
                    authors = get_authors_metadata(data=metadata, label=pmcid, ident= article_id)
                    list_of_authors.append(authors) 
                except Exception as e:
                    print(f'Get authors name in pubmed. Error: {e}')    
                    pmid = get_pmid(pmcid=article_id)
                    authors = get_authors_by_bio(pmid)
                    for a in authors:                
                        first = unidecode.unidecode(a['first'])
                        last = unidecode.unidecode(a['last']) 
                    list_of_authors.append(first + ', '+  last)                     
            else:
                authors = get_authors_metadata(data=metadata, label=sha, ident= article_id)
                list_of_authors.append(authors) 
        except Exception as e:
            print(f'Empty values {e}')   
            set_blacklist(file=path_to_blacklist, line=file.replace('_entities.json',''))                
        continue
    count+=1
    # if count > 0:
    #     break

###  ---- End of Metadata ----            
    
    user_item_rating = get_user_item_rating(lst=list_of_authors, df=df_entities)
    #print(user_item_rating)
    
    ## add publish_date in array in index column = 3
    user_item_rating = np.insert(user_item_rating, 3, publish_date, axis=1)
    # print(user_item_rating)
    user_item_rating_all.append(user_item_rating)
    

0 - 6
PMC6275129_entities.json
                                       entities  count   entities_id
0    http://purl.obolibrary.org/obo/CHEBI_33731   1181   CHEBI_33731
1    http://purl.obolibrary.org/obo/CHEBI_35640    663   CHEBI_35640
2    http://purl.obolibrary.org/obo/CHEBI_34935    138   CHEBI_34935
3    http://purl.obolibrary.org/obo/CHEBI_84123    115   CHEBI_84123
4      http://purl.obolibrary.org/obo/DOID_2841    115     DOID_2841
5         http://purl.obolibrary.org/obo/DOID_4     92        DOID_4
6    http://purl.obolibrary.org/obo/CHEBI_33281     46   CHEBI_33281
7    http://purl.obolibrary.org/obo/CHEBI_18111     46   CHEBI_18111
8    http://purl.obolibrary.org/obo/CHEBI_25016     46   CHEBI_25016
9    http://purl.obolibrary.org/obo/CHEBI_33341     46   CHEBI_33341
10     http://purl.obolibrary.org/obo/DOID_4483     46     DOID_4483
11   http://purl.obolibrary.org/obo/CHEBI_28515     28   CHEBI_28515
12   http://purl.obolibrary.org/obo/CHEBI_16044     23   CHEBI_16044
13 

In [11]:
flat_list = []
for sublist in user_item_rating_all:
    for item in sublist:
        flat_list.append(item)

In [12]:
array = np.array(flat_list)

final_data = pd.DataFrame(array,  columns=['user', 'item', 'rating', 'year'])

sum_df = final_data.groupby(['user', 'item', 'year']).size().reset_index().rename(columns={0: 'rating'})
    
df_with_user_id = id_to_index(sum_df)

In [13]:
# swap columns: user and index_user, and after rename to author_name
#df_with_user_id['index_user'], df_with_user_id['user'] = df_with_user_id['user'], df_with_user_id['index_user']
df_with_user_id.rename(columns={'user': 'author_name', 'index_user': 'user'}, inplace = True)

### get entities labels

In [15]:
list_of_entities = df_with_user_id.item.unique()
print(list_of_entities) 

['CHEBI_16044' 'CHEBI_18111' 'CHEBI_24433' 'CHEBI_25016' 'CHEBI_28515'
 'CHEBI_30216' 'CHEBI_33281' 'CHEBI_33341' 'CHEBI_33697' 'CHEBI_33731'
 'CHEBI_34935' 'CHEBI_35640' 'CHEBI_36976' 'CHEBI_50906' 'CHEBI_84123'
 'DOID_0080815' 'DOID_10754' 'DOID_1485' 'DOID_2275' 'DOID_2723'
 'DOID_2841' 'DOID_3083' 'DOID_4' 'DOID_4481' 'DOID_4483' 'DOID_850'
 'DOID_934' 'DOID_9415' 'CHEBI_17089' 'CHEBI_29852' 'CHEBI_33704'
 'CHEBI_36080' 'CHEBI_37527' 'CHEBI_40799' 'CHEBI_46882' 'CHEBI_59132'
 'DOID_0080327' 'DOID_2237' 'DOID_225' 'DOID_2326' 'DOID_2945' 'DOID_6132'
 'DOID_8469' 'CHEBI_132554' 'CHEBI_145810' 'CHEBI_147370' 'CHEBI_15356'
 'CHEBI_16178' 'CHEBI_16240' 'CHEBI_16247' 'CHEBI_16412' 'CHEBI_16480'
 'CHEBI_165825' 'CHEBI_16646' 'CHEBI_16670' 'CHEBI_16749' 'CHEBI_16991'
 'CHEBI_17234' 'CHEBI_17268' 'CHEBI_17306' 'CHEBI_17895' 'CHEBI_18059'
 'CHEBI_18095' 'CHEBI_18154' 'CHEBI_18276' 'CHEBI_18368' 'CHEBI_22907'
 'CHEBI_22984' 'CHEBI_23888' 'CHEBI_24261' 'CHEBI_24400' 'CHEBI_24870'
 'CHEBI_25367

### loading ontologies

In [16]:
chebi, do, go, hp = loading_items(is_chebi, is_do, is_go, is_hp)

Downloading ... chebi.owl
i am here
Downloading ... doid.owl
Loading ... chebi
Loading ... doid


In [33]:
# entities_label = get_entities_labels(list_of_entities, chebi, hp, go, do)
entities_label = get_entities_labels(lst=list_of_entities, prefix_chebi=chebi, prefix_do=do, prefix_go=go, prefix_hp=hp)
print(entities_label)

[rdflib.term.Literal('L-methionine residue', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.Literal('ribosomal RNA', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.Literal('group', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.Literal('lead atom', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.Literal('acetoacetamide', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.Literal('alpha-particle', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.Literal('antimicrobial agent', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.Literal('titanium atom', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdflib.term.Literal('ribonucleic acid', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')), rdf

In [31]:
df_entities = pd.DataFrame(list_of_entities, columns=["item_id"])
df_entities["entity_name"] = np.array(entities_label)
print(df_entities)

          item_id                                        entity_name
0     CHEBI_16044                               L-methionine residue
1     CHEBI_18111                                      ribosomal RNA
2     CHEBI_24433                                              group
3     CHEBI_25016                                          lead atom
4     CHEBI_28515                                     acetoacetamide
5     CHEBI_30216                                     alpha-particle
6     CHEBI_33281                                antimicrobial agent
7     CHEBI_33341                                      titanium atom
8     CHEBI_33697                                   ribonucleic acid
9     CHEBI_33731                                            cluster
10    CHEBI_34935                                        propyzamide
11    CHEBI_35640                        adrenergic uptake inhibitor
12    CHEBI_36976                                         nucleotide
13    CHEBI_50906                 

In [34]:
print('mapping labels')
df_with_user_id["item_name"] = df_with_user_id["item"].map(df_entities.set_index('item_id')["entity_name"]).fillna(0)

mapping labels


In [35]:
print(df_with_user_id)

           author_name          item         year  rating  user  \
0          Binbin,  Xu   CHEBI_16044         2018       1     0   
1          Binbin,  Xu   CHEBI_18111         2018       1     0   
2          Binbin,  Xu   CHEBI_24433         2018       1     0   
3          Binbin,  Xu   CHEBI_25016         2018       1     0   
4          Binbin,  Xu   CHEBI_28515         2018       1     0   
5          Binbin,  Xu   CHEBI_30216         2018       1     0   
6          Binbin,  Xu   CHEBI_33281         2018       1     0   
7          Binbin,  Xu   CHEBI_33341         2018       1     0   
8          Binbin,  Xu   CHEBI_33697         2018       1     0   
9          Binbin,  Xu   CHEBI_33731         2018       1     0   
10         Binbin,  Xu   CHEBI_34935         2018       1     0   
11         Binbin,  Xu   CHEBI_35640         2018       1     0   
12         Binbin,  Xu   CHEBI_36976         2018       1     0   
13         Binbin,  Xu   CHEBI_50906         2018       1     

In [36]:
# swap columns: rating 2 item_name
#df_with_user_id['rating'], df_with_user_id['item_name'] = df_with_user_id['item_name'], df_with_user_id['rating']
#df_with_user_id.rename(columns={'rating': 'item_name', 'item_name': 'rating'}, inplace = True)
path_to_cord_ds = '../data/results/comm_subset_cord-19_dataset.csv'
path_to_cord_userid = '../data/results/comm_subset_cord-19_dataset_userid.csv'
print('saving data')
save_final_data(data=df_with_user_id[['user', 'item', 'rating', 'item_name', 'year']], \
        path=path_to_cord_ds)
save_final_data(data=df_with_user_id[['user', 'author_name']], \
        path=path_to_cord_userid)
#save_final_data(data=df_with_user_id[['user', 'author_name', 'item', 'rating', 'item_name', 'year']], \
#        path=path_to_cord_userid)  

saving data
