# This is a post-initial collection of data from Web of Science. That is to say, any additional data fields not collected in the original assembly can be integrated into the primary dataframe through this notebook.

## In this case, fields such as publication date, journal name fields, and topic fields could be extracted individually or in groups.

## Citation count can be updated with this notebook as well.

## This notebook can also be used on the dataframe split to generate the three minimalist dataframes to avoid redundant data and excessively large pickle files.

In [None]:
# This code is the MongoConnection class from the Amaral lab LabTools folder.

from __future__ import print_function, unicode_literals
import sys
from pymongo import MongoClient


class MongoConnection(object):
    '''
    Small wrapper around a `MongoClient` bound to one database and one
    collection.

    The connection is described by a settings dictionary with the keys:
        host        (required) hostname of the server
        port        (optional) port, defaults to 27017
        db          (required) name of the database
        collection  (optional) name of the collection, defaults to 'test'
        user,
        password    (optional) only used when both are present
        indexes     (optional) iterable of (keys, options) pairs, see
                    `ensure_index`

    After construction the instance exposes `connection`, `db` and
    `collection`.
    '''

    def __init__(self, cxnSettings, **kwargs):
        '''
        Build the URI from `cxnSettings`, connect, and create any requested
        indexes. Extra keyword arguments are forwarded to `MongoClient`.
        '''
        self.settings = cxnSettings
        self.mongoURI = self._constructURI()
        self.connect(**kwargs)
        self.ensure_index()

    def _constructURI(self):
        '''
        Construct the mongo URI

        Returns a string of the form `mongodb://[user:password@]host:port`.
        Exits the interpreter (via `sys.exit`) when no host is configured.
        '''
        # Imported here so the class stays self-contained; the fallback keeps
        # the Python 2 compatibility implied by the __future__ imports.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus

        mongoURI = 'mongodb://'
        #User/password handling
        if 'user' in self.settings and 'password' in self.settings:
            # Reserved characters such as '@', ':' or '/' in the credentials
            # would otherwise be read as URI delimiters, so escape them.
            mongoURI += quote_plus(self.settings['user'])
            mongoURI += ':' + quote_plus(self.settings['password'])
            mongoURI += '@'
        elif 'user' in self.settings:
            print('Missing password for given user, proceeding without either')
        elif 'password' in self.settings:
            print('Missing user for given password, proceeding without either')
        #Host and port
        try:
            mongoURI += self.settings['host'] + ':'
        except KeyError:
            print('Missing the hostname. Cannot connect without host')
            sys.exit()
        try:
            mongoURI += str(self.settings['port'])
        except KeyError:
            print('Missing the port. Substituting default port of 27017')
            mongoURI += str('27017')
        return mongoURI

    def connect(self, **kwargs):
        '''
        Establish the connection, database, and collection

        Keyword arguments are passed straight to `MongoClient`. Exits the
        interpreter when the settings have no 'db' key; falls back to the
        'test' collection when they have no 'collection' key.
        '''
        self.connection = MongoClient(self.mongoURI, **kwargs)
        #########
        try:
            self.db = self.connection[self.settings['db']]
        except KeyError:
            print("Must specify a database as a 'db' key in the settings file")
            sys.exit()
        #########
        try:
            self.collection = self.db[self.settings['collection']]
        except KeyError:
            # The three fragments are printed on one line, so each fragment
            # carries its own separating space.
            print('Should have a collection. ', end='')
            print('Starting a collection in database ', end='')
            print('for current connection as test.')
            self.collection = self.db['test']

    def tearDown(self):
        '''
        Closes the connection
        '''
        self.connection.close()

    def ensure_index(self):
        '''
        Ensures the collection has all given indexes.
        indexes: iterable of (`keys`, `options`) pairs, where `keys` is passed
            as the first argument of `create_index` and `options` is a dict of
            keyword arguments for it.
            See docs.mongodb.org/manual/core/indexes/ for index types.

        Does nothing when the settings have no 'indexes' key.
        '''
        if 'indexes' in self.settings:
            for index in self.settings['indexes']:
                # `Collection.ensure_index` was deprecated and later removed
                # from PyMongo; `create_index` is the documented replacement.
                self.collection.create_index(index[0], **index[1])

In [None]:
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import copy
import datetime
import pickle
import gzip
import os,glob
import time
import numpy as np
import pandas as pd
#sys.path

## Set up MongoConnection settings

In [None]:
# Connection details shared by all three collections; each settings dict
# below differs only in its "collection" entry.
# NOTE(review): the credentials are stored in plain text in this notebook.
_wos_aux_common = {
    "host": "chicago.chem-eng.northwestern.edu",
    "port": "27017",
    "db": "web_of_science_aux",
    "user": "mongoreader",
    "password": "emptycoffeecup",
}

merged_papers_settings = dict(_wos_aux_common, collection="merged_papers")

issues_settings = dict(_wos_aux_common, collection="issues")

journal_settings = dict(_wos_aux_common, collection="journals")

In [None]:
# Open one connection per collection, in the order papers, issues, journals.
papers_con, issue_con, journal_con = [
    MongoConnection(collection_settings)
    for collection_settings in (
        merged_papers_settings,
        issues_settings,
        journal_settings,
    )
]

## Load up current dataframes (if split dfs are available)

In [None]:
def _load_pickle(path):
    '''Return the object unpickled from `path`, closing the file afterwards.'''
    # NOTE(review): pickle executes arbitrary code on load; only use this on
    # files produced by this workflow.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# The three split dataframes written by an earlier run of this workflow.
ref_df = _load_pickle('../ref_dataframe_min_full.pkl')
plos_df = _load_pickle('../plos_paper_dataframe_full.pkl')
cite_df = _load_pickle('../citation_dataframe_full.pkl')

## Join dataframes together

In [None]:
# Attach the per-reference columns first (cite_df is matched on its index
# against ref_df's 'reference_UT' column), then the per-paper columns
# (plos_df matched on its index against 'paper_UT'). Both are left joins,
# which is the default for DataFrame.join.
result = ref_df.join(other=cite_df, on='reference_UT', how='left')
ref_df_full = result.join(other=plos_df, on='paper_UT', how='left')

## If there are no split dataframes yet and only a combined dataframe is available, load that instead. The split dataframes will be created at the end of this notebook.

In [None]:
#ref_df_full = pickle.load(open('..', 'rb'))

## Because extraction takes a long time, storing the results between sessions is sometimes necessary. Any data here is stored in a pickled dictionary

## If no data has been collected yet, initialize suppl_dict as an empty dictionary. Be sure to comment that line out again immediately afterwards, so that rerunning the cell does not overwrite data that has already been collected.

In [None]:
#suppl_dict = {}
#suppl_dict= pickle.load(open('../suppl_dict.pkl', 'rb'))

## Collect all UTs for both PLOS paper and reference paper for data collection

In [None]:
# Distinct UTs of the citing (PLOS) papers and of the referenced papers.
plos_uts, ref_uts = [
    ref_df_full[ut_column].unique()
    for ut_column in ('paper_UT', 'reference_UT')
]

# Union of both groups; its size is the number of UTs to look up.
all_uts = set(np.append(plos_uts, ref_uts))
print(len(all_uts))

In [None]:
# If suppl_dict is empty, populate it with empty subdicts

#if suppl_dict == {}:
#    for ut in all_uts:
#        suppl_dict[ut] = {}

In [None]:
# Progress check: count how many UTs already have a 'J9' entry in suppl_dict,
# for comparison with the total number of UTs printed above.
c = sum(1 for collected in suppl_dict.values() if 'J9' in collected)
print(c)

In [None]:
# This cell extracts supplementary information for each PLOS paper and
# reference paper in WoS: citation count, journal name fields, publication
# date and subject field.

# (tag inside doc['issue'], key it is stored under in suppl_dict[ut])
_issue_fields = (
    ('J1', 'J1'),
    ('J2', 'J2'),
    ('J9', 'J9'),
    ('JI', 'JI'),
    ('PD', 'pub_date'),
    ('SC', 'field'),
)

# A UT counts as already collected when any of these keys is present. The
# guard used to test only for 'SC', but the SC value is stored under 'field',
# so the guard never fired and every UT was queried again on each run.
# 'SC' is kept in the list so the original check still applies.
_collected_markers = ('SC', 'field')

count = 0
for ut in list(suppl_dict.keys()): # in all_uts
    count += 1
    if count % 5000 == 0:
        print(count)

    entry = suppl_dict[ut]
    if any(marker in entry for marker in _collected_markers):
        # Already looked up in an earlier session; skip the database query.
        continue

    doc = papers_con.collection.find_one({"UT": ut})
    if doc is None:
        print('Error in ut: ' + str(ut))
        continue

    # A document without an 'issue' sub-document simply contributes no
    # issue-level fields instead of raising KeyError.
    issue = doc.get('issue') or {}

    #if 'PY' in doc['issue']:
    #    suppl_dict[ut]['year'] = doc['issue']['PY']

    #if 'DT' in doc:
    #    suppl_dict[ut]['paper_type'] = doc['DT']

    if 'citations' in doc:
        entry['cite_count'] = len(doc['citations'])

    for wos_tag, suppl_key in _issue_fields:
        if wos_tag in issue:
            entry[suppl_key] = issue[wos_tag]

In [None]:
# Checkpoint: write suppl_dict to disk so that collection can be resumed in a
# later session. Pickle protocol 2 is used for the output file.
with open('../suppl_dict.pkl', 'wb') as checkpoint_file:
    pickle.dump(suppl_dict, checkpoint_file, protocol=2)

## Now that we have a populated suppl_dict, need to fill in the main dataframe. This process could probably be simplified by editing only the small dataframes, but it shouldn't be a big issue since it shouldn't need to be done often

In [None]:
# Create the new journal-name columns, filled with the placeholder -1.
# The order below is the order in which the columns are added.
for new_column in (
    'plos_j1', 'plos_j2', 'plos_j9', 'plos_ji',
    'ref_j1', 'ref_j2', 'ref_j9', 'ref_ji',
):
    ref_df_full[new_column] = -1

In [None]:
'''ref_df_full['plos_pub_date'] = float('nan')
ref_df_full['plos_pub_year'] = float('nan')
ref_df_full['plos_article_type'] = -1
ref_df_full['plos_field'] = -1
ref_df_full['ref_pub_date'] = float('nan')
ref_df_full['ref_pub_year'] = float('nan')
ref_df_full['ref_article_type'] = -1
ref_df_full['ref_field'] = -1'''

In [None]:
# Fill in all the new supplementary data in the ref_df. First build, for each
# new column, a list of values in the same row order as ref_df_full...
#
# Each value comes from suppl_dict[<UT>][<key>]; when the UT or the key is
# missing, -1 is recorded instead. Lookups use dict.get rather than a bare
# `except:`, so only "missing" is treated as -1 and an interrupt or a real
# error is no longer silently swallowed.
#
# The publication date / year / article type / field columns are currently
# not collected here. To re-enable them, add lists for suppl_dict keys
# 'pub_date' and 'year' (default float('nan')) and 'paper_type' and 'field'
# (default -1) in the same way as the lists below.

# Stand-in for UTs that have no entry in suppl_dict at all.
_no_entry = {}

# Read the two UT columns once; indexing ref_df_full.iloc[i] inside the loop
# builds a full row object per access and is far slower on large frames.
paper_uts = ref_df_full['paper_UT'].tolist()
reference_uts = ref_df_full['reference_UT'].tolist()

plos_j1s = []
plos_j2s = []
plos_j9s = []
plos_jis = []
plos_cite_counts = []

ref_j1s = []
ref_j2s = []
ref_j9s = []
ref_jis = []
ref_cite_counts = []

count = 0
for plos_ut, ref_ut in zip(paper_uts, reference_uts):
    plos_entry = suppl_dict.get(plos_ut, _no_entry)
    ref_entry = suppl_dict.get(ref_ut, _no_entry)

    plos_j1s.append(plos_entry.get('J1', -1))
    plos_j2s.append(plos_entry.get('J2', -1))
    plos_j9s.append(plos_entry.get('J9', -1))
    plos_jis.append(plos_entry.get('JI', -1))
    plos_cite_counts.append(plos_entry.get('cite_count', -1))

    ref_j1s.append(ref_entry.get('J1', -1))
    ref_j2s.append(ref_entry.get('J2', -1))
    ref_j9s.append(ref_entry.get('J9', -1))
    ref_jis.append(ref_entry.get('JI', -1))
    ref_cite_counts.append(ref_entry.get('cite_count', -1))

    # Progress indicator for long runs.
    count += 1
    if count % 50000 == 0:
        print(count)

In [None]:
# Then write each collected list into its dataframe column.
# NOTE(review): plos_cite_counts and ref_cite_counts are built alongside these
# lists but are not assigned to any column here - confirm whether the
# citation-count columns were meant to be updated as well.
for target_column, collected_values in (
    ('plos_j1', plos_j1s),
    ('plos_j2', plos_j2s),
    ('plos_j9', plos_j9s),
    ('plos_ji', plos_jis),
    ('ref_j1', ref_j1s),
    ('ref_j2', ref_j2s),
    ('ref_j9', ref_j9s),
    ('ref_ji', ref_jis),
):
    ref_df_full[target_column] = collected_values

In [None]:
'''ref_df_full['plos_pub_date'] = plos_pub_dates
ref_df_full['plos_pub_year'] = plos_pub_years
ref_df_full['plos_article_type'] = plos_article_types
ref_df_full['plos_field'] = plos_fields

ref_df_full['ref_pub_date'] = ref_pub_dates
ref_df_full['ref_pub_year'] = ref_pub_years
ref_df_full['ref_article_type'] = ref_article_types
ref_df_full['ref_field'] = ref_fields'''

In [None]:
# Build the two smaller dataframes that carry the new data by selecting the
# associated columns from the full dataframe.
# (The ref_dataframe_min df doesn't include any additional info gathered this
# way, so only these two are rebuilt here.)

# Columns describing the citing (PLOS) paper.
plos_columns = [
    'paper_UT', 'paper_cite_count', 'total_refs',
    'paper_char_total', 'paper_word_total',
    'plos_pub_date', 'plos_pub_year', 'plos_article_type', 'plos_field',
    'plos_j1', 'plos_j2', 'plos_j9', 'plos_ji',
]
# Columns describing the referenced paper.
citation_columns = [
    'reference_UT', 'cite_count',
    'ref_pub_date', 'ref_pub_year', 'ref_article_type', 'ref_field',
    'ref_j1', 'ref_j2', 'ref_j9', 'ref_ji',
]

plos_df = ref_df_full[plos_columns]
citation_df = ref_df_full[citation_columns]

# Keep one row per UT. Note that rows whose UT is -1 all share that value,
# so they are collapsed into a single row as well.
plos_df_unique = plos_df.drop_duplicates('paper_UT')
citation_df_unique = citation_df.drop_duplicates('reference_UT')

In [None]:
# Make the respective UT column the index of each small dataframe. Rows that
# carry no information (indicated by a UT of -1, since nothing can be matched
# to them in the primary dataframe) may have to be dropped manually using the
# commented-out lines.

plos_df_small = plos_df_unique.set_index(keys='paper_UT')
#plos_df_small = plos_df_small.drop(-1,0)
citation_df_small = citation_df_unique.set_index(keys='reference_UT')
#citation_df_small = citation_df_small.drop(-1,0)

In [None]:
# Lastly, pickle the two small dataframes. They are written under "_new"
# names, not the names loaded at the top of the notebook.
with open('../plos_paper_dataframe_full_new.pkl', 'wb') as handle:
    pickle.dump(plos_df_small, handle, protocol=2)
# The file name used to be misspelled 'citation_dataframe_ful_newl.pkl'; it
# now follows the same "<name>_full_new.pkl" pattern as the file above.
with open('../citation_dataframe_full_new.pkl', 'wb') as handle:
    pickle.dump(citation_df_small, handle, protocol=2)

In [None]:
# For completeness, or if this is the first time splitting files: drop every
# non-UT column that is already held by the plos_df and citation_df
# dataframes, leaving the minimal reference dataframe.
columns_to_drop = [
    # per-paper columns
    'paper_cite_count', 'total_refs', 'paper_char_total', 'paper_word_total',
    'plos_pub_date', 'plos_pub_year', 'plos_article_type', 'plos_field',
    'plos_j1', 'plos_j2', 'plos_j9', 'plos_ji',
    # per-reference columns
    'cite_count', 'ref_pub_date', 'ref_pub_year', 'ref_article_type',
    'ref_field', 'ref_j1', 'ref_j2', 'ref_j9', 'ref_ji',
]
# `axis` is passed by keyword: newer pandas releases no longer accept it
# positionally. 'plos_field' used to be listed twice; once is enough.
ref_df_min = ref_df_full.drop(columns_to_drop, axis=1)

In [None]:
# Write the minimal reference dataframe to disk (pickle protocol 2).
# NOTE(review): this is an absolute, user-specific path, unlike the relative
# '../' paths used elsewhere in this notebook - confirm the destination.
with open('/Users/Nathan/dataframes_2/ref_dataframe_min_full.pkl', 'wb') as out_file:
    pickle.dump(ref_df_min, out_file, protocol=2)