## DBLP  

#### Download DBLP xml file
Version: `dblp.xml.gz 2018-12-19 00:37`

In [None]:
%%sh 
# Abort on the first failing command so gunzip never runs when the download failed.
set -e
# Fetch the compressed DBLP dump, then decompress it; -k keeps dblp.xml.gz next to dblp.xml.
wget https://dblp.uni-trier.de/xml/dblp.xml.gz
gunzip -k dblp.xml.gz

#### Transform DBLP xml to JSON
Uses the DBLP parser from the ArangoDB example datasets: https://github.com/arangodb/example-datasets/tree/master/DBLP

In [None]:
%%sh
# Run the converter on dblp.xml; its stdout is redirected into dblp.json.
python ../tools/dblp2json.py dblp.xml > dblp.json

#### Import DBLP to Mongo

In [1]:
from pymongo import MongoClient
import json
import os

# Connect to MongoDB on port 27017; the host comes from the MONGO_HOST
# environment variable (raises KeyError if it is not set).
client = MongoClient(os.environ['MONGO_HOST'], 27017)

In [None]:
# Load dblp.json (one JSON document per line) into the `dblp-works` collection.
# Documents are sent in batches with insert_many instead of one round trip per
# document, matching how the MAG import cells work.
DBLP_BATCH_SIZE = 10000

with open('dblp.json') as file:
    docs = []
    for line in file:
        # strip() instead of line[:-1]: a final line without a trailing newline
        # would otherwise lose its closing character and fail to parse.
        line = line.strip()
        if not line:
            # Tolerate blank lines rather than crashing in json.loads.
            continue

        docs.append(json.loads(line))

        if len(docs) == DBLP_BATCH_SIZE:
            client['w-steam']['dblp-works'].insert_many(docs)
            docs = []

    # Flush the final, partially filled batch (insert_many rejects an empty list).
    if len(docs) > 0:
        client['w-steam']['dblp-works'].insert_many(docs)

#### Stats

In [25]:
# Report document count and data size of `dblp-works` as returned by collStats.
database = client['w-steam']
stats = database.command('collStats', 'dblp-works')

doc_count = stats['count']
print('Docs:', doc_count)

# The reported size is divided by 1024**3 and rounded to two decimals.
size_in_gb = round(stats['size'] / (1024**3), 2)
print('Size:', size_in_gb, "GB's")

Docs: 6625690
Size: 2.54 GB's


#### Delete unnecessary files

In [None]:
%%bash
# Remove the DBLP intermediates: the downloaded archive, the decompressed XML
# and the converted JSON.
rm dblp.xml.gz
rm dblp.xml
rm dblp.json

## ORCID

#### Download ORCID public dataset (profiles only)
Version: `22.10.2018, 06:17`

In [None]:
%%sh 
# The URL's basename (ORCIDAPI2.0_xml_10_2018.tar.gz) differs from the file name
# the conversion step reads, so pin the output name explicitly with -O.
wget -O ORCID-API-2.0_xml_10_2018.tar.gz https://s3-eu-west-1.amazonaws.com/pstorage-orcid-9853294819483122/13320035/ORCIDAPI2.0_xml_10_2018.tar.gz


#### Transform ORCID xml to JSON
Uses orcid-conversion-lib: https://github.com/ORCID/orcid-conversion-lib

In [None]:
%%sh
# Run orcid-conversion-lib in --tarball mode on the downloaded XML archive.
# Presumably -i is the input archive, -o the output archive and -v the ORCID
# schema version; confirm against the orcid-conversion-lib README.
java -jar ../tools/orcid-conversion-lib-0.0.2-full.jar --tarball \
     -i ORCID-API-2.0_xml_10_2018.tar.gz \
     -v v2_0 \
     -o ORCID-API-2.0_json_10_2018.tar.gz

In [None]:
%%sh
# Unpack the converted JSON archive into the current directory.
# NOTE(review): the import step reads from ./summaries, so the archive is
# presumably rooted at that directory; confirm.
tar -xzf ORCID-API-2.0_json_10_2018.tar.gz

#### Import ORCID to Mongo

In [1]:
import os

# Collect (file name, full path) pairs for every entry found one level below
# ./summaries; the last expression displays how many were found.
summaries_root = './summaries'

files = []
for folder in os.listdir(summaries_root):
    folder_path = os.path.join(summaries_root, folder)
    files.extend(
        (entry, os.path.join(folder_path, entry))
        for entry in os.listdir(folder_path)
    )

len(files)

5380984

In [6]:
from pymongo import MongoClient
from multiprocessing.dummy import Pool as ThreadPool 
import json
import pymongo


# Size of the thread pool, and the number of chunks the file list is split into.
NUM_THREADS=5
# Batch size consulted by mongo_insert when sending documents with insert_many.
BATCH_SIZE =1000

# Connect to MongoDB on port 27017; the host comes from the MONGO_HOST
# environment variable (raises KeyError if it is not set).
client = MongoClient(os.environ['MONGO_HOST'], 27017)

def mongo_insert(files):
    """Load ORCID summary JSON files and insert them into `orcid-summaries`.

    Args:
        files: sequence of (file_name, file_path) tuples. `file_name` is
            expected to end in ".json"; its stem becomes the document `_id`.

    Documents are sent with insert_many in batches of exactly BATCH_SIZE, plus
    one final smaller batch for the remainder. An empty `files` does nothing.
    """
    batch = []
    for file_name, file_path in files:
        # Close the file before doing any network I/O.
        with open(file_path) as f:
            doc = json.load(f)

        # Remove extension .json
        doc['_id'] = file_name[:-5]

        # Ignore publications (some files are bigger than 16mb)
        if 'activities-summary' in doc:
            doc['activities-summary']['works'] = {}

        batch.append(doc)

        # Flush on batch length; an index test (i % BATCH_SIZE == 0) would
        # fire at i == 0 and send a first batch holding a single document.
        if len(batch) >= BATCH_SIZE:
            client['w-steam']['orcid-summaries'].insert_many(batch)
            batch = []

    # Send the remainder; insert_many rejects an empty list, hence the guard.
    if batch:
        client['w-steam']['orcid-summaries'].insert_many(batch)

            
def split_list(a, n):
    """Cut `a` into `n` consecutive slices whose lengths differ by at most one.

    The first `len(a) % n` slices receive one extra element. Returns a list of
    `n` slices (some may be empty when `n` exceeds `len(a)`).
    """
    base_size, remainder = divmod(len(a), n)

    chunks = []
    start = 0
    for index in range(n):
        # Earlier chunks absorb the remainder, one element each.
        stop = start + base_size + (1 if index < remainder else 0)
        chunks.append(a[start:stop])
        start = stop
    return chunks

            
# Import the file list with NUM_THREADS worker threads, one chunk per thread.
pool = ThreadPool(NUM_THREADS)
try:
    # map blocks until every chunk is processed and re-raises a worker's exception.
    pool.map(mongo_insert, split_list(files, NUM_THREADS))
finally:
    # Always shut the workers down, even when an insert failed.
    pool.close()
    pool.join()

#### Stats

In [27]:
# Report document count and data size of `orcid-summaries` as returned by collStats.
database = client['w-steam']
stats = database.command('collStats', 'orcid-summaries')

doc_count = stats['count']
print('Docs:', doc_count)

# The reported size is divided by 1024**3 and rounded to two decimals.
size_in_gb = round(stats['size'] / (1024**3), 2)
print('Size:', size_in_gb, "GB's")

Docs: 5380984
Size: 15.42 GB's


#### Delete unnecessary files

In [9]:
%%sh
# Remove the ORCID intermediates: the extracted summaries directory and both
# the XML and JSON archives.
rm -r summaries
rm ORCID-API-2.0_xml_10_2018.tar.gz
rm ORCID-API-2.0_json_10_2018.tar.gz

## MAG 
Version: `2018-10-26` (via Microsoft Academic API Team)

Files:
* MAG/Affiliations.txt
* MAG/Authors.txt
* MAG/Journals.txt
* MAG/PaperAuthorAffiliations.txt
* MAG/Papers.txt


#### Import MAG to Mongo

In [None]:
# Load MAG/Papers.txt (tab-separated, one paper per line) into `mag-papers`.
# Column names in file order; every value is stored as the raw string.
PAPER_FIELDS = [
    'PaperId',
    'Rank',
    'Doi',
    'DocType',
    'PaperTitle',
    'OriginalTitle',
    'BookTitle',
    'Year',
    'Date',
    'Publisher',
    'JournalId',
    'ConferenceSeriesId',
    'ConferenceInstanceId',
    'Volume',
    'Issue',
    'FirstPage',
    'LastPage',
    'ReferenceCount',
    'CitationCount',
    'EstimatedCitation',
    'OriginalVenue',
    'CreatedDate',
]

skipped = 0
with open("MAG/Papers.txt") as file:
    docs = []
    for line in file:
        # Strip only the newline. line[:-1] would cut the last character of a
        # final line that has no trailing newline, and strip() would drop
        # trailing tabs and so change the column count.
        a = line.rstrip('\n').split('\t')
        if len(a) != len(PAPER_FIELDS):
            # Malformed row: skip it, but keep count so the loss is visible.
            skipped += 1
            continue

        docs.append(dict(zip(PAPER_FIELDS, a)))

        if len(docs) == 10000:
            client['w-steam']['mag-papers'].insert_many(docs)
            docs = []

    # Flush the final, partially filled batch (insert_many rejects an empty list).
    if len(docs) > 0:
        client['w-steam']['mag-papers'].insert_many(docs)

if skipped:
    print('Skipped malformed lines:', skipped)
        

In [None]:
# Load MAG/PaperAuthorAffiliations.txt (tab-separated) into
# `mag-papers-author-affiliations`.
# Column names in file order; every value is stored as the raw string.
PAPER_AUTHOR_AFFILIATION_FIELDS = [
    'PaperId',
    'AuthorId',
    'AffiliationId',
    'AuthorSequenceNumber',
    'OriginalAffiliation',
]

skipped = 0
with open("MAG/PaperAuthorAffiliations.txt") as file:
    docs = []
    for line in file:
        # Strip only the newline. line[:-1] would cut the last character of a
        # final line that has no trailing newline, and strip() would drop
        # trailing tabs and so change the column count.
        a = line.rstrip('\n').split('\t')
        if len(a) != len(PAPER_AUTHOR_AFFILIATION_FIELDS):
            # Malformed row: skip it, but keep count so the loss is visible.
            skipped += 1
            continue

        docs.append(dict(zip(PAPER_AUTHOR_AFFILIATION_FIELDS, a)))

        if len(docs) == 100000:
            client['w-steam']['mag-papers-author-affiliations'].insert_many(docs)
            docs = []

    # Flush the final, partially filled batch (insert_many rejects an empty list).
    if len(docs) > 0:
        client['w-steam']['mag-papers-author-affiliations'].insert_many(docs)

if skipped:
    print('Skipped malformed lines:', skipped)


In [None]:
# Load MAG/Authors.txt (tab-separated, one author per line) into `mag-authors`.
# Column names in file order; every value is stored as the raw string.
AUTHOR_FIELDS = [
    'AuthorId',
    'Rank',
    'NormalizedName',
    'DisplayName',
    'LastKnownAffiliationId',
    'PaperCount',
    'CitationCount',
    'CreatedDate',
]

skipped = 0
with open("MAG/Authors.txt") as file:
    docs = []
    for line in file:
        # Strip only the newline. line[:-1] would cut the last character of a
        # final line that has no trailing newline, and strip() would drop
        # trailing tabs and so change the column count.
        a = line.rstrip('\n').split('\t')
        if len(a) != len(AUTHOR_FIELDS):
            # Malformed row: skip it, but keep count so the loss is visible.
            skipped += 1
            continue

        docs.append(dict(zip(AUTHOR_FIELDS, a)))

        if len(docs) == 100000:
            client['w-steam']['mag-authors'].insert_many(docs)
            docs = []

    # Flush the final, partially filled batch (insert_many rejects an empty list).
    if len(docs) > 0:
        client['w-steam']['mag-authors'].insert_many(docs)

if skipped:
    print('Skipped malformed lines:', skipped)
        

In [None]:
# Load MAG/Affiliations.txt (tab-separated) into `mag-affiliations`.
# Column names in file order; every value is stored as the raw string.
AFFILIATION_FIELDS = [
    'AffiliationId',
    'Rank',
    'NormalizedName',
    'DisplayName',
    'GridId',
    'OfficialPage',
    'WikiPage',
    'PaperCount',
    'CitationCount',
    'CreatedDate',
]

skipped = 0
with open("MAG/Affiliations.txt") as file:
    docs = []
    for line in file:
        # Strip only the newline. line[:-1] would cut the last character of a
        # final line that has no trailing newline, and strip() would drop
        # trailing tabs and so change the column count.
        a = line.rstrip('\n').split('\t')
        if len(a) != len(AFFILIATION_FIELDS):
            # Malformed row: skip it, but keep count so the loss is visible.
            skipped += 1
            continue

        docs.append(dict(zip(AFFILIATION_FIELDS, a)))

        if len(docs) == 100000:
            client['w-steam']['mag-affiliations'].insert_many(docs)
            docs = []

    # Flush the final, partially filled batch (insert_many rejects an empty list).
    if len(docs) > 0:
        client['w-steam']['mag-affiliations'].insert_many(docs)

if skipped:
    print('Skipped malformed lines:', skipped)
        

In [None]:
# Load MAG/Journals.txt (tab-separated, one journal per line) into `mag-journals`.
# Column names in file order; every value is stored as the raw string.
JOURNAL_FIELDS = [
    'JournalId',
    'Rank',
    'NormalizedName',
    'DisplayName',
    'Issn',
    'Publisher',
    'Webpage',
    'PaperCount',
    'CitationCount',
    'CreatedDate',
]

skipped = 0
with open("MAG/Journals.txt") as file:
    docs = []
    for line in file:
        # Strip only the newline. line[:-1] would cut the last character of a
        # final line that has no trailing newline, and strip() would drop
        # trailing tabs and so change the column count.
        a = line.rstrip('\n').split('\t')
        if len(a) != len(JOURNAL_FIELDS):
            # Malformed row: skip it, but keep count so the loss is visible.
            skipped += 1
            continue

        docs.append(dict(zip(JOURNAL_FIELDS, a)))

        if len(docs) == 100000:
            client['w-steam']['mag-journals'].insert_many(docs)
            docs = []

    # Flush the final, partially filled batch (insert_many rejects an empty list).
    if len(docs) > 0:
        client['w-steam']['mag-journals'].insert_many(docs)

if skipped:
    print('Skipped malformed lines:', skipped)
        

#### Stats

In [33]:
# Affiliations
stats = client['w-steam'].command('collStats','mag-affiliations')
print('Affiliations')
print('\tDocs:', stats['count'])
print('\tSize:',     round(stats['size'] / (1024**3), 2), "GB's" )
print()

# Authors
stats = client['w-steam'].command('collStats','mag-authors')
print('Authors')
print('\tDocs:', stats['count'])
print('\tSize:',     round(stats['size'] / (1024**3), 2), "GB's" )
print()

# Journals
stats = client['w-steam'].command('collStats','mag-journals')
print('Journals')
print('\tDocs:', stats['count'])
print('\tSize:',     round(stats['size'] / (1024**3), 2), "GB's" )
print()

# Paper-Author-Affiliations
stats = client['w-steam'].command('collStats','mag-papers-author-affiliations')
print('Papers-author-affiliations')
print('\tDocs:', stats['count'])
print('\tSize:',     round(stats['size'] / (1024**3), 2), "GB's" )
print()

# Papers
stats = client['w-steam'].command('collStats','mag-papers')
print('Papers')
print('\tDocs:', stats['count'])
print('\tSize:',     round(stats['size'] / (1024**3), 2), "GB's" )


Affiliations
	Docs: 25414
	Size: 0.01 GB's

Authors
	Docs: 253363081
	Size: 53.39 GB's

Journals
	Docs: 48024
	Size: 0.01 GB's

Papers-author-affiliations
	Docs: 552327638
	Size: 89.95 GB's

Papers
	Docs: 209449323
	Size: 126.14 GB's
