In [1]:
import sys

import matplotlib.pyplot as plt
%matplotlib inline
import copy
import datetime
import pickle
import gzip
import os,glob
import time
import numpy as np
import pandas as pd
import operator

#sys.path
from tqdm import tqdm
from tqdm import tnrange, tqdm_notebook

import regex as re
import datetime
import math
import time

# `IPython.core.display` is a deprecated import location for these helpers;
# `IPython.display` is the supported public module.
from IPython.display import display, HTML

# Make the notebook use the entire width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))


In [2]:
# This code is the MongoConnection class from the Amaral lab LabTools folder.

from __future__ import print_function, unicode_literals
import sys
from pymongo import MongoClient


class MongoConnection(object):
    '''
    Small wrapper that opens a pymongo client and binds one database/collection.

    cxnSettings is a dict read with the following keys:
        host        (required) hostname of the server
        port        (optional) defaults to 27017
        db          (required) database name
        collection  (optional) defaults to the collection named 'test'
        user, password (optional, used only when BOTH are given)
        indexes     (optional) list of (keys, options_dict) pairs
    Extra keyword arguments are forwarded to MongoClient.

    After construction the instance exposes `connection`, `db` and `collection`.
    '''
    def __init__(self, cxnSettings, **kwargs):
        self.settings = cxnSettings
        self.mongoURI = self._constructURI()
        self.connect(**kwargs)
        self.ensure_index()

    def _constructURI(self):
        '''
        Construct the mongo URI ('mongodb://[user:password@]host:port').

        User name and password are percent-escaped, because characters such
        as '@', ':' or '/' would otherwise corrupt the URI.
        Exits (SystemExit) when no host is configured.
        '''
        # Local import so the class stays self-contained on Python 2 and 3.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus

        mongoURI = 'mongodb://'
        #User/password handling
        if 'user' in self.settings and 'password' in self.settings:
            mongoURI += quote_plus(self.settings['user']) + ':'
            mongoURI += quote_plus(self.settings['password'])
            mongoURI += '@'
        elif 'user' in self.settings:
            print('Missing password for given user, proceeding without either')
        elif 'password' in self.settings:
            print('Missing user for given password, proceeding without either')
        #Host and port
        try:
            mongoURI += self.settings['host'] + ':'
        except KeyError:
            print('Missing the hostname. Cannot connect without host')
            sys.exit()
        try:
            mongoURI += str(self.settings['port'])
        except KeyError:
            print('Missing the port. Substituting default port of 27017')
            mongoURI += str('27017')
        return mongoURI

    def connect(self, **kwargs):
        '''
        Establish the connection, database, and collection.

        Exits (SystemExit) when the settings have no 'db' key; falls back to
        the 'test' collection when no 'collection' key is given.
        '''
        self.connection = MongoClient(self.mongoURI, **kwargs)
        #########
        try:
            self.db = self.connection[self.settings['db']]
        except KeyError:
            print("Must specify a database as a 'db' key in the settings file")
            sys.exit()
        #########
        try:
            self.collection = self.db[self.settings['collection']]
        except KeyError:
            print('Should have a collection.', end='')
            print('Starting a collection in database', end='')
            print(' for current connection as test.')
            self.collection = self.db['test']

    def tearDown(self):
        '''
        Closes the connection
        '''
        self.connection.close()

    def ensure_index(self):
        '''
        Ensures the collection has all indexes listed in the settings.

        settings['indexes']: list of (`keys`, `options`) pairs, where `keys`
            is passed as the index specification and `options` is a dict of
            keyword arguments for the index creation call.
            See docs.mongodb.org/manual/core/indexes/ for index options.
        '''
        if 'indexes' in self.settings:
            for index in self.settings['indexes']:
                # Collection.ensure_index was deprecated in PyMongo 3 and
                # removed in PyMongo 4; create_index is the supported call.
                self.collection.create_index(index[0], **index[1])

In [3]:
# Settings shared by every collection this notebook reads from.
# NOTE(security): credentials should not live in a notebook. Each value can be
# supplied through the environment; the literal fallbacks are kept only so that
# existing runs keep working and should be removed once the variables are set.
_wos_base_settings = {
    "host": os.environ.get("WOS_MONGO_HOST", "chicago.chem-eng.northwestern.edu"),
    "port": os.environ.get("WOS_MONGO_PORT", "27017"),
    "db": "web_of_science_aux",
    "user": os.environ.get("WOS_MONGO_USER", "mongoreader"),
    "password": os.environ.get("WOS_MONGO_PASSWORD", "emptycoffeecup"),
}

# One settings dict per collection; they differ only in the collection name.
merged_papers_settings = dict(_wos_base_settings, collection="merged_papers")
issues_settings = dict(_wos_base_settings, collection="issues")
journal_settings = dict(_wos_base_settings, collection="journals")

papers_con = MongoConnection(merged_papers_settings)
issue_con = MongoConnection(issues_settings)
journal_con = MongoConnection(journal_settings)

In [None]:
# List_papers=['271721400012', '268637800016', '275809700007', '266872900016', '273779000031', '276311600005', '265514200012', '269229800016', '269415800006', '279140800001', '275257300012',\
#              '280557200015', '269229800011', '265837000004', '266214700013', '274590500027', '271685800011', '280371800009', '275620600001', '273338500021', '272828400001', '266320000010',\
#              '280243300011', '278599600024', '275063400012', '283216400020', '283573800004', '276454000014', '266490000014', '265510800008', '268637700007', '269220900003', '265490800004',\
#              '266415000008', '278601000002', 273414100007, 266716900003, 273896300006, 276952500022, 270594300016, 265505300002, 277776500010, 277079500001, 273554600021, 279259900008, 277773700011, 268739100017, 275620900007, 270160900010, 273896500011, 271721900009, 284087800017, 268035600004, 266107500006, 280520400008, 267081300004, 282869800004, 268773300004, 282748100005, 265513800007, 285578000036, 285575200067, 285041800017, 265505700011, 274231500013, 274442400021, 276454000008, 265514400020, 265482400006, 275328800002, 285579200021, 284231800014, 265513800010, 271936700019, 276418200050, 274442800021, 274139100002, 282676700018, 267806300015, 277239500014, 276418200009, 280243800032, 275894500009, 281687100001, 269267400001, 278125500010, 266221100026]


# new_list= [ '000'+str(item) for item in List_papers]

# new_list

In [None]:

# for UT in new_list:
#     doc = papers_con.collection.find_one({"UT":UT})  # i look at a given ref_ut paper
#         #if 'citations' in doc:  # if the paper ut has recieved any citations 
        
#     print  (UT,doc['TI'] )
    

In [4]:
def datetime_filler_traj(traj_dict):
    """Convert raw citation records into datetime objects.

    Parameters
    ----------
    traj_dict : dict
        Nested dict {ref_ut: {cite_ut: [date, year, plos_flag]}}. `date` is
        parsed only when it is a string (letters are looked up as a month or
        season abbreviation such as 'MAR' or 'SPR', digits as the day);
        any other value is treated as "no date".

    Returns
    -------
    dict
        {ref_ut: {cite_ut: [dt, plos_flag]}} where `dt` is a
        datetime.datetime, or -1 when the year is missing or unusable.
        Missing pieces default to mid-month (day 15) and mid-year (June 15).
    """
    numeric = re.compile('[0-9]+')
    alpha = re.compile('[A-Za-z]+')
    month_dict = {'JAN':1, 'FEB': 2, 'MAR':3, 'APR':4, 'MAY':5, 'JUN': 6,
                  'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT':10, 'NOV': 11, 'DEC': 12,
                  'SPR': 3, 'SUM': 6, 'FAL': 9, 'WIN': 12}
    default_month = 6
    default_day = 15
    formatted_traj_dict = {}

    for ref_ut in traj_dict:
        formatted_traj_dict[ref_ut] = {}
        for cite_ut in traj_dict[ref_ut]:
            record = traj_dict[ref_ut][cite_ut]
            date, year, plos_flag = record[0], record[1], record[2]

            # Alphabetic tokens first, then numeric ones (e.g. 'MAR 12' -> ['MAR', '12']).
            date_field = []
            if isinstance(date, str):
                date_field = alpha.findall(date) + numeric.findall(date)

                if len(date_field) > 2: # No weirdness here, only 2 values max for date field
                    print(date_field)

                # There's a single date that is just '0'. Likely a database error/nan
                if date_field and len(date_field[0]) == 1:
                    print(date_field)

            # A date string without any token (e.g. '') simply yields no month.
            month = month_dict.get(date_field[0], -1) if date_field else -1
            day = -1
            if month != -1 and len(date_field) == 2:
                day_token = date_field[1]
                if len(day_token) < 3 and day_token.isdigit():
                    day = int(day_token)

            # NaN, None and non-numeric years all mean "year unknown".
            try:
                year_int = int(year)
            except (TypeError, ValueError, OverflowError):
                year_int = None

            if year_int is None:
                dt = -1
            else:
                use_month = month if month != -1 else default_month
                use_day = day if (month != -1 and day != -1) else default_day
                try:
                    dt = datetime.datetime(year_int, use_month, use_day)
                except ValueError:
                    # Impossible day (e.g. FEB 30 or day 0): fall back to mid-month.
                    try:
                        dt = datetime.datetime(year_int, use_month, default_day)
                    except ValueError:
                        # Year outside the range datetime supports.
                        dt = -1

            formatted_traj_dict[ref_ut][cite_ut] = [dt, plos_flag]

    return formatted_traj_dict

In [5]:
def filter_df_zero_citations(df_pap_,df_ref_,df_pap_ref_):
    """Drop references with no citations and everything that only pointed at them.

    df_ref_ is indexed by reference UT and has a 'cite_count' column;
    df_pap_ref_ has 'paper_UT' and 'reference_UT' columns; df_pap_ is indexed
    by paper UT. Returns the three filtered frames in the order
    (papers, references, paper-reference occurrences).
    """
    # Reference UTs that received at least one citation.
    has_citations = df_ref_['cite_count'] > 0
    cited_ref_uts = list(df_ref_[has_citations].index.values)

    refs_kept = df_ref_[df_ref_.index.isin(cited_ref_uts)]

    occurrence_mask = df_pap_ref_['reference_UT'].isin(cited_ref_uts)
    occurrences_kept = df_pap_ref_[occurrence_mask]

    # Papers that still cite at least one surviving reference.
    surviving_paper_uts = occurrences_kept['paper_UT'].drop_duplicates().tolist()
    papers_kept = df_pap_[df_pap_.index.isin(surviving_paper_uts)]

    return papers_kept, refs_kept, occurrences_kept

def filter_df_plos(df_pap_,df_ref_,df_pap_ref_, y1_,y2_, list_pap_art_type_, list_pap_field_ = None):
    '''filter our three dataframes by properties of the citing (plos) papers
        - publication-year-interval: y1_ <= plos_pub_year <= y2_
        - article-type: plos_article_type in list_pap_art_type_
        - optional field: str(plos_field) matches any entry of list_pap_field_
          (entries are joined with '|' and used as a regular expression)

    df_pap_ is indexed by paper UT, df_ref_ by reference UT, and df_pap_ref_
    has 'paper_UT' and 'reference_UT' columns.

    Returns (df_pap_sel_, df_ref_sel_, df_pap_ref_sel_). The input frames are
    not modified.

    TODO:
        - journal (plosone,plosbio, ....)
    '''
    ## identify paper-uts that satisfy the constraints
    keep = (
        (df_pap_['plos_pub_year'] >= y1_) & (df_pap_['plos_pub_year'] <= y2_)
        & df_pap_['plos_article_type'].isin(list_pap_art_type_)
    )
    if list_pap_field_ is not None: ## constraint on field
        ## match against the string form of the field entry, without adding
        ## a helper column to the caller's dataframe
        field_sel_str_ = '|'.join(list_pap_field_)
        field_as_str = df_pap_['plos_field'].map(str)
        keep = keep & field_as_str.str.contains(field_sel_str_)

    list_pap_ut_ = list(df_pap_[keep].index.values)

    ## cut the df_pap dataframe
    df_pap_sel_ = df_pap_[df_pap_.index.isin(list_pap_ut_)]
    ## a leftover helper column from an earlier run is removed if present;
    ## errors='ignore' because a missing column raises KeyError, not ValueError
    df_pap_sel_ = df_pap_sel_.drop('plos_field_str', axis=1, errors='ignore')

    ## cut the df_pap_ref dataframe
    df_pap_ref_sel_ = df_pap_ref_[df_pap_ref_['paper_UT'].isin(list_pap_ut_)]

    ## identify the remaining ref-uts and cut the df_ref dataframe
    list_ref_ut_ = df_pap_ref_sel_['reference_UT'].drop_duplicates().tolist()
    df_ref_sel_ = df_ref_[df_ref_.index.isin(list_ref_ut_)]
    return df_pap_sel_,df_ref_sel_,df_pap_ref_sel_

def filter_df_ref(df_pap_,df_ref_,df_pap_ref_, n_occ_min_,n_occ_max_, y1_,y2_, list_ref_art_type_, list_ref_field_ = None):
    '''filter our three dataframes by properties of the referenced papers
        - number of distinct plos-papers citing the reference:
          n_occ_min_ <= occurrences_unique <= n_occ_max_
        - publication-year-interval: y1_ <= ref_pub_year <= y2_
        - article-type: ref_article_type in list_ref_art_type_
        - optional field: str(ref_field) matches any entry of list_ref_field_
          (entries are joined with '|' and used as a regular expression)

    df_pap_ is indexed by paper UT, df_ref_ by reference UT, and df_pap_ref_
    has 'paper_UT' and 'reference_UT' columns.

    Returns (df_pap_sel_, df_ref_sel_, df_pap_ref_sel_); df_ref_sel_ carries
    the extra 'occurrences_unique' column. The input frames are not modified.
    '''
    ## number of distinct papers in which each reference occurs
    ## (multi-occurrence of the same ref in the same paper counts once)
    unique_pairs = df_pap_ref_[['paper_UT', 'reference_UT']].drop_duplicates()
    occurrences = unique_pairs['reference_UT'].value_counts()

    ## work on a new frame so the caller's df_ref_ is left untouched;
    ## references that never occur get NaN and fail the range test below
    df_ref_ = df_ref_.assign(occurrences_unique=occurrences)

    ## filter ref-uts
    keep = (
        (df_ref_['occurrences_unique'] >= n_occ_min_) & (df_ref_['occurrences_unique'] <= n_occ_max_)
        & (df_ref_['ref_pub_year'] >= y1_) & (df_ref_['ref_pub_year'] <= y2_)
        & df_ref_['ref_article_type'].isin(list_ref_art_type_)
    )
    if list_ref_field_ is not None: ## constraint on field
        field_sel_str_ = '|'.join(list_ref_field_)
        field_as_str = df_ref_['ref_field'].map(str)
        keep = keep & field_as_str.str.contains(field_sel_str_)

    list_ref_ut_ = list(df_ref_[keep].index.values)

    ## cut the df_ref dataframe
    df_ref_sel_ = df_ref_[df_ref_.index.isin(list_ref_ut_)]
    ## a leftover helper column from an earlier run is removed if present;
    ## errors='ignore' because a missing column raises KeyError, not ValueError
    df_ref_sel_ = df_ref_sel_.drop('ref_field_str', axis=1, errors='ignore')

    ## cut the df_pap_ref dataframe
    df_pap_ref_sel_ = df_pap_ref_[df_pap_ref_['reference_UT'].isin(list_ref_ut_)]

    ## identify the remaining pap-uts and cut the df_pap dataframe
    list_pap_ut_ = df_pap_ref_sel_['paper_UT'].drop_duplicates().tolist()
    df_pap_sel_ = df_pap_[df_pap_.index.isin(list_pap_ut_)]

    ## in case we want to get rid again of the 'occurrences_unique'-column
    # df_ref_sel_ = df_ref_sel_.drop('occurrences_unique', axis=1)
    return df_pap_sel_,df_ref_sel_,df_pap_ref_sel_

In [None]:

###############################
###### load the pickled dataframes
# Each file is opened in a `with` block so the handle is closed right after reading.
# NOTE: pickle.load can execute code embedded in the file; only load pickles you produced yourself.
with open('../data/ref_dataframe_min.pkl', 'rb') as handle:
    ref_df_min = pickle.load(handle)
with open('../data/plos_paper_dataframe.pkl', 'rb') as handle:
    plos_df = pickle.load(handle)
with open('../data/citation_dataframe.pkl', 'rb') as handle:
    cite_df = pickle.load(handle)
#ref_df = pd.concat([ref_df_p1, ref_df_p2])
#suppl_dict= pickle.load(open('../suppl_dict.txt', 'rb'))
print ("done loading pickles")


# Attach the columns of cite_df (matched on reference_UT) and of plos_df
# (matched on paper_UT) to every row of ref_df_min.
result = ref_df_min.join(cite_df, on='reference_UT')
ref_df = result.join(plos_df, on='paper_UT')
print ("done joining dfs")


# Remove null reference columns (8907763 rows vs 10848620 rows)
ref_df = ref_df.loc[(ref_df['reference_UT']!='-1')]
print ("done dropping observations")


del ref_df_min


###############################
###### filtering plos-articles

pap_pub_year_min = 2000
pap_pub_year_max = 2016
pap_art_type = ['@ Article','L Letter']   # ['@ Article','L Letter','K Article','N Note']  # we dont include the Review papers for now (it can be argued their behavior is different)


## the most common  types:
# @ Article               156824
# R Review                  1469
# E Editorial Material       746
# C Correction                10
# I Biographical-Item          4
# L Letter                     4

list_pap_field = None ## if you want to keep all fields


####################################
###### filtering reference-article

n_thresh = 10   # min and max number of occurences of any given ut_ref paper on plos papers
n_max = 100000

ref_pub_year_min = 1900
ref_pub_year_max = 2000
list_ref_art_type = ['@ Article','L Letter','N Note']  # ['@ Article','L Letter','K Article','N Note']    # we dont include the Review papers for now (it can be argued their behavior is different)

### the most common types:
# @ Article                       2268122
# R Review                         260165
# E Editorial Material              36944
# N Note                            18231
# L Letter                          17953
# M Meeting Abstract                 3376
# 5 News Item                        1967
# C Correction                        116


list_ref_field = None ## if you want to keep all fields


# First restrict by properties of the citing plos papers ...
df_pap, df_ref, df_pap_ref = filter_df_plos(plos_df, cite_df, ref_df,
                                            pap_pub_year_min, pap_pub_year_max,
                                            pap_art_type,
                                            list_pap_field_ = list_pap_field
                                            )


# ... then by properties of the referenced papers.
df_pap, df_ref, df_pap_ref = filter_df_ref(df_pap, df_ref, df_pap_ref,
                                           n_thresh, n_max,
                                           ref_pub_year_min, ref_pub_year_max,
                                           list_ref_art_type,
                                           list_ref_field_ = list_ref_field
                                           )


# Summary before (ref_df) and after (df_pap_ref) filtering.
print("tot. numb. ref. occurences:",ref_df.shape, "   # unique ref UTs", len(ref_df['reference_UT'].unique() ), "   mean # citations:",ref_df['cite_count'].mean() )
print("tot. numb. ref. occurences:",df_pap_ref.shape, "   # unique ref UTs", len(df_pap_ref['reference_UT'].unique() ), "   # unique plos UTs", len(df_pap_ref['paper_UT'].unique() ), "   mean # citations:",df_pap_ref['cite_count'].mean() )


# Persist the selected UT lists so later cells can reload them without redoing the filtering.
lista_ref_uts=list(df_pap_ref.reference_UT.unique())
lista_plos_uts=list(df_pap_ref.paper_UT.unique())

with open('../data/lista_selected_ref_uts.pkl', 'wb') as handle:
     pickle.dump(lista_ref_uts, handle, protocol = 2)

with open('../data/lista_selected_plos_uts.pkl', 'wb') as handle:
     pickle.dump(lista_plos_uts, handle, protocol = 2)


# Free the large dataframes; only the two UT lists and df_pap_ref stay in memory.
del df_pap
del df_ref
del plos_df
del cite_df
del ref_df






In [None]:
################################# OR  (for selected UTs and selected plos)
# Reload the selected UT lists from disk. A missing or unreadable file is
# reported and skipped; anything else (e.g. KeyboardInterrupt) is allowed to
# propagate instead of being swallowed by a bare `except:`.
try:
    with open('../data/lista_selected_ref_uts.pkl', 'rb') as handle:
        lista_ref_uts = pickle.load(handle)
    print ("read lista_ref_uts", len(lista_ref_uts))
except (IOError, OSError, EOFError, pickle.UnpicklingError) as err:
    print ("lista_ref_uts not found", err)


try:
    with open('../data/lista_selected_plos_uts.pkl', 'rb') as handle:
        lista_plos_uts = pickle.load(handle)
    print ("read lista_plos_uts", len(lista_plos_uts))
except (IOError, OSError, EOFError, pickle.UnpicklingError) as err:
    print ("list_plos_uts not found", err)
    
    
    
    

In [6]:
#################  OR (for ALL trajectories for all UTs)
# Reload the lists of ALL reference / plos UTs from disk. A missing or
# unreadable file is reported and skipped; anything else (e.g.
# KeyboardInterrupt) is allowed to propagate instead of being swallowed by a
# bare `except:`.
try:
    with open('../data/lista_all_reference_UTs.pkl', 'rb') as handle:
        lista_all_reference_UTs = pickle.load(handle)
    print ("read lista_ref_uts", len(lista_all_reference_UTs))
except (IOError, OSError, EOFError, pickle.UnpicklingError) as err:
    print ("lista_ref_uts not found", err)


try:
    with open('../data/lista_all_plos_UTs.pkl', 'rb') as handle:
        lista_all_plos_UTs = pickle.load(handle)
    print ("read lista_plos_uts", len(lista_all_plos_UTs))
except (IOError, OSError, EOFError, pickle.UnpicklingError) as err:
    print ("list_plos_uts not found", err)
    
    
    



read lista_ref_uts 2607457
read lista_plos_uts 158813


In [None]:

# df_merged = pickle.load(open('../data/df_reference_cite_plos_merged_simplified_cols.pkl', 'rb'))
# print ("done loading pickle", df_merged.shape)



# # df_merged.columns   # ['occurence', 'paper_UT', 'reference_UT', 'reference_rank',  'regex_sect_index', 'cite_count', 'ref_pub_year', 'paper_cite_count',  'total_refs', 'plos_pub_year']

# lista_all_reference_UTs=list(df_merged.reference_UT.unique())
# with open('../data/lista_all_reference_UTs.pkl', 'wb') as handle:
#      pickle.dump(lista_all_reference_UTs, handle, protocol = 2)

# len(lista_all_reference_UTs)



# lista_all_plos_UTs=list(df_merged.paper_UT.unique())
# with open('../data/lista_all_plos_UTs.pkl', 'wb') as handle:
#      pickle.dump(lista_all_plos_UTs, handle, protocol = 2)

# len(lista_all_plos_UTs)

In [None]:
# def retrieve_trajectory(ut, plos_uts):
#     # Retrieve a citation trajectory for a single reference UT. This version collects -all- WoS citations for a given UT,
#     # PLOS or not, so this one takes a lot more time, but shows the underlaying shape of the citation profile.
#     # Requires inputting the list of all plos UTs ('paper_UT' column in the plos dataframe)
#     count = 0
#     cite_dates = {}
    
#     doc = papers_con.collection.find_one({"UT":ut})
#     if 'citations' in doc:
#         for ref_ut in doc['citations']:
#             count+=1
#             ref = papers_con.collection.find_one({"UT":ref_ut})
#             year = float('nan')
#             date = float('nan')
#             plos_flag = False
#             if ref:
#                 if 'issue' in ref:
#                     if 'PY' in ref['issue']:
#                         year = ref['issue']['PY']
#                     if 'PD' in ref['issue']:
#                         date = ref['issue']['PD']
#                     if ref_ut in plos_uts:
#                         plos_flag = True
                    
# #             if count%1000==0:
# #                 print(count)   
#             cite_dates[ref_ut] = [date, year, plos_flag]

#     return cite_dates



# ###############################3
# def retrieve_trajectory_new_old(ut, plos_uts, list_citing_papers, dict_citing_ut_year):
    
#      # Retrieve a citation trajectory for a single reference UT. This version collects -all- WoS citations for a given UT,
#     # PLOS or not, so this one takes a lot more time, but shows the underlaying shape of the citation profile.
#     # Requires inputting the list of all plos UTs ('paper_UT' column in the plos dataframe) to flag those papers that are Plos
#     #count = 0
#     cite_dates = {}
    
#     doc = papers_con.collection.find_one({"UT":ut})  # i look at a given ref_ut paper
#     if 'citations' in doc:
#         for ref_ut in doc['citations']:   # i iterate over all papers that have CITED this ref_ut paper
            
#             list_citing_papers.append(ref_ut)
            
#             try:
#                 dict_citing_ut_year[ref_ut]  # if i have encounter this citing paper before, i dont need to go query it again for its publication year
#                 cite_dates[ref_ut]=dict_citing_ut_year[ref_ut]
                
#             except KeyError:
                                    
#                 #count+=1
#                 ref = papers_con.collection.find_one({"UT":ref_ut})
#                 year = float('nan')
#                 date = float('nan')
#                 plos_flag = False
#                 if ref:
#                     if 'issue' in ref:
#                         if 'PY' in ref['issue']:
#                             year = ref['issue']['PY']
#                         if 'PD' in ref['issue']:
#                             date = ref['issue']['PD']
#                         if ref_ut in plos_uts:
#                             plos_flag = True

#                 cite_dates[ref_ut] = [date, year, plos_flag]
#                 dict_citing_ut_year[ref_ut] = [date, year, plos_flag]
            
            
#     return cite_dates
    

###########################


def retrieve_trajectory_new(ref_ut, plos_uts, list_citing_papers, dict_citing_ut_year):
    """Retrieve the citation trajectory of a single reference UT.

    Looks up the document with UT == ref_ut through the module-level
    `papers_con` connection and, for every UT listed in its 'citations'
    field that is found in the collection, records [date, year, plos_flag].

    Parameters
    ----------
    ref_ut : the UT of the referenced paper.
    plos_uts : collection of plos paper UTs, used to set plos_flag.
    list_citing_papers : list, extended IN PLACE with all citing UTs of ref_ut.
    dict_citing_ut_year : dict used as a cache {citing UT: [date, year, plos_flag]};
        updated IN PLACE with every citing paper seen for the first time.

    Returns
    -------
    dict {citing UT: [date, year, plos_flag]}; empty when the reference is not
    in the collection or has no 'citations' field. `date` / `year` are NaN when
    the citing document has no issue.PD / issue.PY.
    """
    cite_dates = {}

    doc = papers_con.collection.find_one({"UT":ref_ut})  # i look at a given ref_ut paper
    # find_one returns None for an unknown UT; papers without citations have no 'citations' field
    if not doc or 'citations' not in doc:
        return cite_dates

    list_citations = doc['citations']
    print (len(list_citations))

    list_citing_papers += list_citations

    # Set membership instead of scanning a (possibly very long) list for every citing paper.
    if isinstance(plos_uts, (set, frozenset)):
        plos_lookup = plos_uts
    else:
        plos_lookup = set(plos_uts)

    # The second dict selects the fields that are returned. The cursor can be iterated only once.
    query = papers_con.collection.find({"UT":{"$in":list_citations}},{"UT":1,"issue.PY":1,"issue.PD":1}, no_cursor_timeout=True)
    try:
        for item in query:  # every item is a dict
            UT = item["UT"]

            if UT in dict_citing_ut_year:
                # already seen this citing paper: reuse the cached record
                cite_dates[UT] = dict_citing_ut_year[UT]
                continue

            # 'issue', 'PY' and 'PD' can each be absent from a document
            issue = item.get('issue') or {}
            year = issue.get('PY', float('nan'))
            date = issue.get('PD', float('nan'))
            plos_flag = UT in plos_lookup

            cite_dates[UT] = [date, year, plos_flag]
            dict_citing_ut_year[UT] = [date, year, plos_flag]
    finally:
        # with no_cursor_timeout=True the cursor must be closed explicitly,
        # also when the loop fails, or it keeps holding server resources
        query.close()

    return cite_dates
    
#######################################




 
############################3


# def retrieve_multiple_trajectories(ut_list, plos_uts):
#     # Retrieve multiple trajectories for all reference UTs in ut_list. Returns a nested dictionary
#     # {Reference_UT_1: {Citation_UT_1: [Month, Year], ...}}
#     trajectories = {}
#     for ut in ut_list:
#         print(ut)
#         #cite_dates = retrieve_trajectory_new(ut, plos_uts)  # with some changes i added
#         cite_dates = retrieve_trajectory(ut, plos_uts)
#         trajectories[ut] = cite_dates
#     return trajectories



#######################

# def retrieve_multiple_trajectories_new(ut_list, plos_uts,dict_ref_UT_dict_traj):
    
#     for ut in ut_list:
#         print(ut)
#         #cite_dates = retrieve_trajectory_new(ut, plos_uts)  # with some changes i added
#         cite_dates = retrieve_trajectory(ut, plos_uts)
#         dict_ref_UT_dict_traj[ut] = cite_dates
    



#########################


# def retrieve_trajectory_plos_only(ut, plos_uts):
#     # Retrieve a citation trajectory for a single reference UT. This version collects only PLOS citations, so it's much faster
#     # Requires inputting the list of all plos UTs ('paper_UT' column in the plos dataframe)

#     count = 0
#     cite_dates = {}
#     if ut in plos_uts:
#         doc = papers_con.collection.find_one({"UT":ut})
#         if 'citations' in doc:
#             for ref_ut in doc['citations']:
#                 count+=1
#                 ref = papers_con.collection.find_one({"UT":ref_ut})
#                 year = float('nan')
#                 date = float('nan')
#                 plos_flag = False
#                 if ref:
#                     if 'issue' in ref:
#                         if 'PY' in ref['issue']:
#                             year = ref['issue']['PY']
#                         if 'PD' in ref['issue']:
#                             date = ref['issue']['PD']
#                         if ref_ut in plos_uts:
#                             plos_flag = True
#                 if count%1000==0:
#                     print(count)   
#                 cite_dates[ref_ut] = [date, year, plos_flag]

#     return cite_dates

# #######################

# def retrieve_multiple_trajectories_plos_only(ut_list, plos_uts):
#     # Retrieve multiple trajectories for all reference UTs in ut_list. Returns a nested dictionary
#     # {Reference_UT_1: {Citation_UT_1: [Month, Year], ...}}

#     trajectories = {}
#     for ut in ut_list:
#         print(ut)
#         cite_dates = retrieve_trajectory_plos_only(ut, plos_uts)
#         trajectories[ut] = cite_dates
#     return trajectories

In [None]:
# #rm ../data/test_trajectories.pkl


# try:
#     master_dict_ref_UT_dict_traj = pickle.load(open('../data/test_trajectories.pkl', 'rb'))
#     print ("read master dict")
# except:
#     master_dict_ref_UT_dict_traj={}

    
# try:
#     list_citing_papers = pickle.load(open('../data/list_citing_papers.pkl', 'rb'))
#     print ("read list citing papers")
# except:
#     list_citing_papers=[]
    
    
# try:
#     dict_citing_ut_year = pickle.load(open('../data/dict_citing_ut_year.pkl', 'rb'))
#     print ("read dict citing ut-year")
# except:
#     dict_citing_ut_year={}

   






# for ref_ut in lista_ref_uts:
#     try:
#         master_dict_ref_UT_dict_traj[ref_ut]
#         print (ref_ut, "already in dict. of trajectories")
#     except KeyError:
   
#         cite_dates = retrieve_trajectory_new(ref_ut, lista_plos_uts, list_citing_papers,dict_citing_ut_year)
#         master_dict_ref_UT_dict_traj[ref_ut] = cite_dates


#         ### dump in a pickle after i process every ref_ut
#         with open('../data/test_trajectories.pkl', 'wb') as handle:
#             pickle.dump(master_dict_ref_UT_dict_traj, handle, protocol = 2)   
#             print (ref_ut, len(master_dict_ref_UT_dict_traj), len(dict_citing_ut_year), len(list_citing_papers))


#         with open('../data/list_citing_papers.pkl', 'wb') as handle:
#              pickle.dump(list_citing_papers, handle, protocol = 2)

#         with open('../data/dict_citing_ut_year.pkl', 'wb') as handle:
#              pickle.dump(dict_citing_ut_year, handle, protocol = 2)

        
        
# print ("done")



In [12]:

 ################################ new, faster way:
    

def get_all_citing_UTs_at_once_and_dict_ref_UT_citing_UTs(lista_ref_uts, list_all_citing_UTs, dict_ref_UT_list_its_citing_papers):
    """Fetch the citing UTs of many reference UTs with a single query.

    Uses the module-level `papers_con` connection.

    Parameters
    ----------
    lista_ref_uts : iterable of reference UTs to look up.
    list_all_citing_UTs : list, extended IN PLACE with every citing UT found
        (duplicates included).
    dict_ref_UT_list_its_citing_papers : dict, updated IN PLACE with
        {reference UT: list of its citing UTs}; a reference whose document has
        no 'citations' field is stored with an empty list.

    Returns
    -------
    (list of the distinct citing UTs, dict_ref_UT_list_its_citing_papers)
    """
    # The second dict selects the fields that are returned. The cursor can be iterated only once.
    cursor = papers_con.collection.find({"UT":{"$in":list(lista_ref_uts)}},{"UT":1,'citations':1}, no_cursor_timeout=True)
    try:
        for item in cursor:  # every item is a dict
            UT = item["UT"]

            # papers that were never cited have no 'citations' field
            list_citing_papers = item.get('citations') or []

            dict_ref_UT_list_its_citing_papers[UT] = list_citing_papers
            list_all_citing_UTs += list_citing_papers
    finally:
        # with no_cursor_timeout=True the cursor must be closed explicitly,
        # also when the loop fails, or it keeps holding server resources
        cursor.close()

    return list(set(list_all_citing_UTs)), dict_ref_UT_list_its_citing_papers

#####################################

  
    

def get_publ_years_for_all_citing_UTs(list_citing_UTs, lista_plos_uts, dict_citing_UT_publ_year):
    """Look up the publication date/year of a batch of citing UTs.

    Parameters
    ----------
    list_citing_UTs : list
        UTs to look up; sent to Mongo as a single ``{"UT": {"$in": ...}}`` filter.
    lista_plos_uts : iterable
        UTs to flag; an entry's flag is True when its UT is a member.
    dict_citing_UT_publ_year : dict
        Accumulator updated IN PLACE with
        ``dict_citing_UT_publ_year[UT] = [date, year, plos_flag]`` where
        ``date`` is ``issue['PD']`` (NaN when the field is absent) and
        ``year`` is ``issue['PY']``.

    Returns
    -------
    dict
        The same ``dict_citing_UT_publ_year`` object.

    Raises
    ------
    KeyError
        If a returned document has no 'issue' sub-document or no 'PY' field.

    Notes
    -----
    Relies on the notebook-level ``papers_con`` connection object. UTs with no
    matching document are silently absent from the result.
    """
    print (len(dict_citing_UT_publ_year), len(list_citing_UTs))

    # Membership is tested once per returned document; a set makes that O(1)
    # instead of a linear scan over a (potentially long) list.
    if isinstance(lista_plos_uts, (set, frozenset)):
        plos_uts = lista_plos_uts
    else:
        plos_uts = set(lista_plos_uts)

    # The second dict is the projection: only the fields that are needed come back.
    cursor = papers_con.collection.find(
        {"UT": {"$in": list_citing_UTs}},
        {"UT": 1, "issue.PY": 1, "issue.PD": 1},
        no_cursor_timeout=True,
    ).batch_size(2000)
    tot = len(list_citing_UTs)

    try:
        # The cursor is a one-shot iterator; every item is a dict.
        for cont, item in enumerate(cursor):
            UT = item["UT"]
            print (cont, tot)

            issue = item['issue']
            year = issue['PY']
            date = issue.get('PD', float('nan'))  # PD is optional

            dict_citing_UT_publ_year[UT] = [date, year, UT in plos_uts]
    finally:
        # With no_cursor_timeout=True the server never reaps the cursor, so it
        # must be closed explicitly -- also when the loop above raises.
        cursor.close()

    print (len(dict_citing_UT_publ_year))
    return dict_citing_UT_publ_year

 

###########################






In [None]:
# ######  ORIGINAL query to get trajectories of a SELECTION of ref UTs
# ########################################



# print ("length first list:", len(lista_ref_uts))

# try:   # this part is fast
#     list_all_citing_UTs = pickle.load(open('../data/list_all_citing_UTs.pkl', 'rb'))
#     print ("read list_all_citing_UTs", len(list_all_citing_UTs))    
    
#     dict_ref_UT_list_its_citing_papers = pickle.load(open('../data/dict_ref_UT_list_its_citing_papers.pkl', 'rb'))
#     print ("read dict_ref_UT_list_its_citing_papers", len(dict_ref_UT_list_its_citing_papers))    
    
# except:     
#     print ("1st query: getting list all citing UTs that i will need.........")

#     list_all_citing_UTs, dict_ref_UT_list_its_citing_papers  = get_all_citing_UTs_at_once_and_dict_ref_UT_citing_UTs(lista_ref_uts)

#     with open('../data/list_all_citing_UTs.pkl', 'wb') as handle:
#         pickle.dump(list_all_citing_UTs, handle, protocol = 2)   
#         print ( len(list_all_citing_UTs), '../data/list_all_citing_UTs.pkl')

#     with open('../data/dict_ref_UT_list_its_citing_papers.pkl', 'wb') as handle:
#         pickle.dump(dict_ref_UT_list_its_citing_papers, handle, protocol = 2)   
#         print ( len(dict_ref_UT_list_its_citing_papers), '../data/dict_ref_UT_list_its_citing_papers.pkl')        
    
# print ("done.")









# print ("\n\nlength second list:", len(list_all_citing_UTs))
# print ("2nd query: getting dict UT publication year ...................")   # too large for Mongo to do all at once!


# try:
#     dict_citing_UT_publ_year = pickle.load(open('../data/partial_dict_citing_UT_publ_year_.pkl', 'rb'))
#     print ("read partial master dict, length:", len(dict_citing_UT_publ_year))
# except:
#     dict_citing_UT_publ_year = {}

    
# stop= len(list_all_citing_UTs)
# initial=len(dict_citing_UT_publ_year)
# delta=int(stop/1000.)
# final=initial + delta
# while final-delta <=  stop:    
#     print (" ",initial, final, "....")

#     partial_list=list_all_citing_UTs[initial:final]  # slicing beyond the length of the list is NOT a problem

#     get_publ_years_for_all_citing_UTs(partial_list,lista_plos_uts,dict_citing_UT_publ_year)    #where  dict_citing_UT_publ_year[UT] = [date, year, plos_flag]

#     initial += delta
#     final = initial +delta

#     with open('../data/partial_dict_citing_UT_publ_year.pkl', 'wb') as handle:
#         pickle.dump(dict_citing_UT_publ_year, handle, protocol = 2)   
#         print ("# papers in master dict:", len(dict_citing_UT_publ_year), '../data/partial_dict_citing_UT_publ_year.pkl')

    
# #     print ("   done")
# print("done.")



# print ("putting together master dict ref_UT trajectories.............")
# list_missing =[]
# master_dict_ref_UT_dict_traj={}
# for ref_UT in dict_ref_UT_list_its_citing_papers:  # the 9K ref_UT papers i focus on
#     cite_dates={}
#     dict_aux = dict_ref_UT_list_its_citing_papers[ref_UT]
#     print (ref_UT, len(dict_ref_UT_list_its_citing_papers[ref_UT]))
#     for citing_UT in dict_aux:
#         try:
#             cite_dates[citing_UT]=dict_citing_UT_publ_year[citing_UT]
#         except KeyError:
#             list_missing.append(citing_UT)
    
#     master_dict_ref_UT_dict_traj[ref_UT]=cite_dates
# print ("missing UTs:", len(set(list_missing)))
# print ("done.")    
    
    
    
# # final dict:   master_dict_ref_UT_dict_traj[ref_ut] = cite_dates   where is a dict: cite_dates[citing_UT] = [date, year, plos_flag]
    
    

# ### dump in a pickle 
# with open('../data/new_master_dict_ref_UT_trajectories.pkl', 'wb') as handle:
#     pickle.dump(master_dict_ref_UT_dict_traj, handle, protocol = 2)   
#     print ("# papers in master dict:", len(master_dict_ref_UT_dict_traj), '../data/new_master_dict_ref_UT_trajectories.pkl')




In [None]:
# Sanity check of the input sizes (only the last expression of a cell is
# displayed, so these are printed explicitly).
print (len(lista_all_reference_UTs))
print (len(lista_all_plos_UTs))

# Load the previously saved {citing UT: ...} dict. The file is a local pickle
# produced by this project; never unpickle files from an untrusted source.
with open('../data/dict_citing_UT_publ_year_.pkl', 'rb') as handle:
    dict_citing_UT_publ_year = pickle.load(handle)
print ("read partial master dict, length:", len(dict_citing_UT_publ_year))


In [13]:
print ("1st query: getting list all citing UTs that i will need.........")


# Resume from the checkpoints written by a previous (interrupted) run, if any.
# Only "file missing / unreadable" errors fall back to an empty start; anything
# else (e.g. a typo in this cell) must surface instead of silently discarding
# the checkpoint.
try:
    with open('../data/list_all_citing_UTs_of_ref_papers.pkl', 'rb') as handle:
        list_all_citing_UTs = pickle.load(handle)
    print ("read list_all_citing_UTs_of_ref_papers", len(list_all_citing_UTs))

    with open('../data/dict_ref_UT_list_its_citing_papers.pkl', 'rb') as handle:
        dict_ref_UT_list_its_citing_papers = pickle.load(handle)
    print ("read dict_ref_UT_list_its_citing_papers", len(dict_ref_UT_list_its_citing_papers))
except (IOError, OSError, EOFError, pickle.UnpicklingError):
    list_all_citing_UTs=[]
    dict_ref_UT_list_its_citing_papers ={}


# NOTE(review): the resume offset is the number of UTs already in the dict.
# That equals the position in lista_all_reference_UTs only if every queried UT
# was found in the db and the list has no duplicates -- confirm.
stop = len(lista_all_reference_UTs)
initial = len(dict_ref_UT_list_its_citing_papers)
delta = max(1, int(stop/100.))   # at least 1, or the loop below would never advance
final = initial + delta
while initial < stop:
    print (" ",initial, final, "....")

    partial_list = lista_all_reference_UTs[initial:final]  # slicing beyond the length of the list is NOT a problem

    # Both accumulators are updated in place; the de-duplicated return value is not used here.
    get_all_citing_UTs_at_once_and_dict_ref_UT_citing_UTs(partial_list, list_all_citing_UTs, dict_ref_UT_list_its_citing_papers)

    # Checkpoint after every chunk so an interrupted run can be resumed.
    with open('../data/dict_ref_UT_list_its_citing_papers.pkl', 'wb') as handle:
        pickle.dump(dict_ref_UT_list_its_citing_papers, handle, protocol = 2)
    print ( len(dict_ref_UT_list_its_citing_papers), '../data/dict_ref_UT_list_its_citing_papers.pkl')

    with open('../data/list_all_citing_UTs_of_ref_papers.pkl', 'wb') as handle:
        pickle.dump(list_all_citing_UTs, handle, protocol = 2)
    print ( len(list_all_citing_UTs), '../data/list_all_citing_UTs_of_ref_papers.pkl')

    initial += delta
    final = initial + delta

print ("done.")
print (len(list_all_citing_UTs), len(lista_all_reference_UTs))






1st query: getting list all citing UTs that i will need.........
  0 26074 ....
26074 ../data/dict_ref_UT_list_its_citing_papers.pkl
8376382 ../data/list_all_citing_UTs_of_ref_papers.pkl
  26074 52148 ....
52148 ../data/dict_ref_UT_list_its_citing_papers.pkl
14572178 ../data/list_all_citing_UTs_of_ref_papers.pkl
  52148 78222 ....
78222 ../data/dict_ref_UT_list_its_citing_papers.pkl
20044755 ../data/list_all_citing_UTs_of_ref_papers.pkl
  78222 104296 ....
104296 ../data/dict_ref_UT_list_its_citing_papers.pkl
24783508 ../data/list_all_citing_UTs_of_ref_papers.pkl
  104296 130370 ....
130370 ../data/dict_ref_UT_list_its_citing_papers.pkl
29635101 ../data/list_all_citing_UTs_of_ref_papers.pkl
  130370 156444 ....
156444 ../data/dict_ref_UT_list_its_citing_papers.pkl
33793291 ../data/list_all_citing_UTs_of_ref_papers.pkl
  156444 182518 ....
182518 ../data/dict_ref_UT_list_its_citing_papers.pkl
37980007 ../data/list_all_citing_UTs_of_ref_papers.pkl
  182518 208592 ....
208592 ../data/dict

KeyboardInterrupt: 

In [None]:
### ############# for ALL trajectories for all UTs, i expand the queries for trajectories to ALL UT papers referenced in the plos corpus




# try:  
#     list_all_citing_UTs = pickle.load(open('../data/lista_all_reference_UTs.pkl', 'rb'))
#     print ("read list_all_citing_UTs", len(lista_all_reference_UTs))    
    
#     dict_ref_UT_list_its_citing_papers = pickle.load(open('../data/dict_ref_UT_list_its_citing_papers_.pkl', 'rb'))
#     print ("read dict_ref_UT_list_its_citing_papers", len(dict_ref_UT_list_its_citing_papers))    
    
# except:     
print ("1st query: getting list all citing UTs that i will need.........")

# The helper takes the two accumulators explicitly; start from empty ones.
list_all_citing_UTs, dict_ref_UT_list_its_citing_papers  = get_all_citing_UTs_at_once_and_dict_ref_UT_citing_UTs(lista_all_reference_UTs, [], {})

#     with open('../data/list_all_citing_UTs.pkl', 'wb') as handle:
#         pickle.dump(list_all_citing_UTs, handle, protocol = 2)   
#         print ( len(list_all_citing_UTs), '../data/list_all_citing_UTs_.pkl')

with open('../data/dict_ref_UT_list_its_citing_papers.pkl', 'wb') as handle:
    pickle.dump(dict_ref_UT_list_its_citing_papers, handle, protocol = 2)
# Report the path that was actually written.
print ( len(dict_ref_UT_list_its_citing_papers), '../data/dict_ref_UT_list_its_citing_papers.pkl')


print ("done.")
print (len(list_all_citing_UTs), len(lista_all_reference_UTs))




# Resume support: UTs whose publication data is already in the loaded dict are
# excluded below, so only the still-missing ones are queried.


print ("\n\nlength second list:", len(list_all_citing_UTs))
print ("2nd query: getting dict UT publication year ...................")   # too large for Mongo to do all at once!


# NOTE(review): this reads '...publ_year_.pkl' (trailing underscore) while the
# loop below writes '...publ_year.pkl' -- confirm which file is the intended
# checkpoint.
try:
    with open('../data/partial_dict_citing_UT_publ_year_.pkl', 'rb') as handle:
        dict_citing_UT_publ_year = pickle.load(handle)
    print ("read partial master dict, length:", len(dict_citing_UT_publ_year))
except (IOError, OSError, EOFError, pickle.UnpicklingError):
    dict_citing_UT_publ_year = {}


# list_all_citing_UTs comes out of a set, so its order is not a reliable resume
# offset, and UTs absent from the db never enter the dict. Select the pending
# UTs by membership instead. (UTs that are not in the db stay pending and are
# simply queried again on a later resume.)
pending_citing_UTs = [ut for ut in list_all_citing_UTs if ut not in dict_citing_UT_publ_year]

stop = len(pending_citing_UTs)
delta = max(1, int(len(list_all_citing_UTs)/1000.))   # at least 1, or the loop would never advance
initial = 0
final = initial + delta
while initial < stop:
    print (" ",initial, final, "....")

    partial_list = pending_citing_UTs[initial:final]  # slicing beyond the length of the list is NOT a problem

    get_publ_years_for_all_citing_UTs(partial_list,lista_plos_uts,dict_citing_UT_publ_year)    #where  dict_citing_UT_publ_year[UT] = [date, year, plos_flag]

    initial += delta
    final = initial + delta

    # Checkpoint after every chunk so an interrupted run can be resumed.
    with open('../data/partial_dict_citing_UT_publ_year.pkl', 'wb') as handle:
        pickle.dump(dict_citing_UT_publ_year, handle, protocol = 2)
    print ("# papers in master dict:", len(dict_citing_UT_publ_year), '../data/partial_dict_citing_UT_publ_year.pkl')


print("done.")



print ("putting together master dict ref_UT trajectories.............")

# Final structure:
#   master_dict_ref_UT_dict_traj[ref_UT] = cite_dates
#   cite_dates[citing_UT] = dict_citing_UT_publ_year[citing_UT]   (i.e. [date, year, plos_flag])
# Citing UTs with no entry in dict_citing_UT_publ_year are collected in list_missing.
list_missing = []
master_dict_ref_UT_dict_traj = {}
for ref_UT, citing_UTs in dict_ref_UT_list_its_citing_papers.items():
    print (ref_UT, len(citing_UTs))

    cite_dates = {}
    for citing_UT in citing_UTs:
        if citing_UT in dict_citing_UT_publ_year:
            cite_dates[citing_UT] = dict_citing_UT_publ_year[citing_UT]
        else:
            list_missing.append(citing_UT)

    master_dict_ref_UT_dict_traj[ref_UT] = cite_dates

print ("missing UTs:", len(set(list_missing)))
print ("done.")


# Persist the assembled trajectories.
with open('../data/new_master_dict_ref_UT_trajectories.pkl', 'wb') as out_file:
    pickle.dump(master_dict_ref_UT_dict_traj, out_file, protocol = 2)
print ("# papers in master dict:", len(master_dict_ref_UT_dict_traj), '../data/new_master_dict_ref_UT_trajectories.pkl')




In [None]:
# Size check: citing UTs requested vs. citing UTs resolved plus missing.
# NOTE(review): `missing_citing_UTs` is only assigned in the NEXT cell, so on a
# fresh kernel this cell raises NameError unless that cell is run first.
# Only the last expression of a cell is displayed; the first len() shows nothing.
len(list_all_citing_UTs)  # value recorded by the author: 6901130

print (len(dict_citing_UT_publ_year)+ len(missing_citing_UTs) )  # author recorded len(dict_citing_UT_publ_year) as 6898862

len(dict_citing_UT_publ_year)

In [None]:
# Citing UTs for which no publication data was retrieved
# (per the author: these UTs turned out NOT to be in the WoS db).
missing_citing_UTs = [ut for ut in list_all_citing_UTs if ut not in dict_citing_UT_publ_year]

print ("missing UTs:", len(missing_citing_UTs))


print (len(dict_citing_UT_publ_year))

# dict_citing_UT_publ_year=get_publ_years_for_all_citing_UTs(missing_citing_UTs,lista_plos_uts,dict_citing_UT_publ_year)    #where  dict_citing_UT_publ_year[UT] = [date, year, plos_flag]
# print (len(dict_citing_UT_publ_year))

# with open('../data/partial_dict_citing_UT_publ_year.pkl', 'wb') as handle:
#     pickle.dump(dict_citing_UT_publ_year, handle, protocol = 2)   
#     print ("# papers in master dict:", len(dict_citing_UT_publ_year), '../data/partial_dict_citing_UT_publ_year.pkl')


In [None]:
# Consistency check: how many "missing" UTs are nevertheless keys of the dict.
# (Despite the names, list_a / list_b / list_c are sets.)
list_a = set(dict_citing_UT_publ_year)      # iterating a dict yields its keys
list_b = set(list_all_citing_UTs)
list_c = set(missing_citing_UTs)
interseccion = list_c & list_a
len(interseccion)

In [None]:
# Run the notebook's datetime_filler_traj helper over the master trajectory
# dict and pickle whatever it returns.
traj_datetimes_path = '../data/test_trajectories_datetimes.pkl'

dict_traject = datetime_filler_traj(master_dict_ref_UT_dict_traj)
print ("done converting datetimes")

with open(traj_datetimes_path, 'wb') as out_file:
    pickle.dump(dict_traject, out_file, protocol = 2)
print ("written: " + traj_datetimes_path)

In [None]:
# NOTE(review): displays the whole dict; consider showing only a few entries if it is large.
dict_traject

In [None]:
# Simplified copy of dict_traject that keeps only element 0 of every entry
# (the author notes that sorting by value is problematic while the value is a
# (date, flag) tuple).
master_dict_traject = {
    ref_ut: {citing_ut: entry[0] for citing_ut, entry in citing.items()}
    for ref_ut, citing in dict_traject.items()
}


# new_dict[ut][date] = running citation count (1-based) after sorting that
# paper's citing entries by date. When several citations share a date, the
# later assignment wins, so the stored value is the count reached at the LAST
# citation on that date.
new_dict = {}
for ut, dict_citating_paper_date in master_dict_traject.items():
    by_date = sorted(dict_citating_paper_date.items(), key=operator.itemgetter(1))

    cumulative = {}
    for cont_cit, (_, date) in enumerate(by_date, 1):
        cumulative[date] = cont_cit
    new_dict[ut] = cumulative

print ("done")

with open('../data/dict_ut_dict_dates_citations_trajectories_datetimes.pkl', 'wb') as out_file:
    pickle.dump(new_dict, out_file, protocol = 2)
print ("written: ../data/dict_ut_dict_dates_citations_trajectories_datetimes.pkl")

In [None]:
# NOTE(review): displays the whole dict; consider showing only a few entries if it is large.
new_dict 

In [None]:
# ## dont need for now

# # Non-time-filtered regex-based exploration
# import numpy as np
# i=0
# #k = np.log2(i)
# bin_points = []
# while i<16:
#     k = 2**i
#     i+=1
#     bin_points.append(k)
# bin_points


# fig = plt.figure(figsize=(20,20))
# fig.suptitle('Citation distribution across regex sections (normalized, log2 bins)', size=30)
# bin_num = 8
# subplot_num = 1
# hist_list_raw = []
# #hist_list = np.array([])
# hist_list = []

# for i in range(len(bin_points)-1):
#     print(subplot_num)
#     #print(i)
#     if i == len(bin_points)-2:
#         upper_bound = int(max(ref_df['cite_count'])+1)
#         print(upper_bound)
#     else:
#         upper_bound = bin_points[i+1]
#         #print('asdfasdf')
        

#     cite_count_log_list = ref_df.loc[ref_df['cite_count'].isin(range(bin_points[i], upper_bound))]
#     title_text = str(bin_points[i]) + '-' + str(upper_bound-1) + ' citations'
#     unique_refs = len(cite_count_log_list['reference_UT'].unique())
#     unique_papers = len(cite_count_log_list['paper_UT'].unique())
#     total_occs = len(cite_count_log_list)
    
#     ax = fig.add_subplot(4,4,subplot_num)
    
    
#     hist1, bins1 = np.histogram(cite_count_log_list['sect_index'], bins= bin_num, range=[0,8])
#     hist_list_raw.append(hist1)
#     widths1 = np.diff(bins1)
#     hist1 = hist1/float(len(cite_count_log_list))
#     hist_list.append(hist1)
#     #hist_list = np.append(hist_list, hist1, axis=1)
#     ax.bar(bins1[:-1], hist1, widths1)
    
#     ax.title.set_text(title_text)
    
#     subtext1 = 'unique refs:  ' + str(unique_refs)
#     subtext2 = 'total occs:   ' + str(total_occs)
#     subtext3 = 'total papers:' + str(unique_papers)


#     ax.text(7.3, 0.83, subtext1 + '\n' + subtext2 + '\n' + subtext3, style='italic', horizontalalignment='right',
#         bbox={'facecolor':'white', 'alpha':0.6, 'pad':10})

#     ax.grid()
#     ax.set_ylim([0,1])
#     ax.set_xlim([0,7])
#     ax.set_xticks([0,1,2,3,4,5,6,7,8])
#     subplot_num +=1
    

# # Plotting the distribution for all papers    
# cite_count_all = ref_df.loc[(ref_df['cite_count']>=0)]
# title_text = 'All citations'
# unique_refs = len(cite_count_all['reference_UT'].unique())
# unique_papers = len(cite_count_all['paper_UT'].unique())
# total_occs = len(cite_count_all)

# ax = fig.add_subplot(4,4,subplot_num)
# hist1, bins1 = np.histogram(cite_count_all['sect_index'], bins= bin_num, range=[0,8])
# hist_all_raw = hist1
# widths1 = np.diff(bins1)
# hist1 = hist1/float(len(cite_count_all))
# hist_all = hist1
# ax.bar(bins1[:-1], hist1, widths1)
# ax.title.set_text(title_text)
# subtext1 = 'unique refs:   ' + str(unique_refs)
# subtext2 = 'total occs:   ' + str(total_occs)
# subtext3 = 'total papers:   ' + str(unique_papers)

# ax.text(7.3, 0.83, subtext1 + '\n' + subtext2 + '\n' + subtext3, style='italic', horizontalalignment='right',
#     bbox={'facecolor':'white', 'alpha':0.6, 'pad':10})
# ax.grid()
# ax.set_ylim([0,1])
# ax.set_xlim([0,7])
# ax.set_xticks([0, 1, 2, 3, 4, 5, 6, 7, 8],['i','m','r','d','rd','c','mx','na',''])


# fig.text(0.5, 0.08, 'Section Label', ha='center', size = 30)
# fig.text(0.07, 0.55, 'Fraction of Total Occurences', va='center', rotation='vertical', size = 30)

In [None]:
# Rows of ref_df whose 'cite_count' is one of the integers 100..109.
# NOTE(review): ref_df is not defined in the visible part of the notebook.
mid_cites = ref_df.loc[ref_df['cite_count'].isin(range(100, 110))]

In [None]:
# Number of distinct 'reference_UT' values among the mid_cites rows.
len(mid_cites['reference_UT'].unique())

In [None]:
# Rows of ref_df with 'cite_count' >= 16384 (= 2**14).
high_cites = ref_df[ref_df['cite_count']>=16384]

In [None]:
# All ref_df rows belonging to one specific paper UT.
a = ref_df[ref_df['paper_UT'] == '000271022300001']

In [None]:
# Number of distinct 'paper_UT' values among the high_cites rows.
len(high_cites['paper_UT'].unique())

In [None]:
# Number of distinct 'reference_UT' values among the high_cites rows.
len(high_cites['reference_UT'].unique())

In [None]:
# Number of distinct 'paper_UT' values in ref_df.
len(ref_df['paper_UT'].unique())

In [None]:
# Number of distinct 'reference_UT' values in ref_df.
len(ref_df['reference_UT'].unique())

In [None]:
# Rows of ref_df with 'cite_count' of at least 1.
any_cites = ref_df[ref_df['cite_count']>=1]

In [None]:
# Number of distinct 'reference_UT' values among the any_cites rows.
len(any_cites['reference_UT'].unique())

In [None]:
# First distinct 'paper_UT' value among the any_cites rows (raises IndexError if there are none).
any_cites['paper_UT'].unique()[0]

In [None]:
# Fetch and display the full Mongo document of a single UT (None if there is no match).
doc = papers_con.collection.find_one({"UT":'000324515600133'})
doc

In [None]:

# Trajectory lookup for a single hard-coded UT.
# NOTE(review): retrieve_trajectory is defined outside the visible part of the notebook.
test_traj = retrieve_trajectory('000089825700038')

In [None]:
# Display the result of the single-UT trajectory lookup.
test_traj

In [None]:
# Fetch and display the full Mongo document of a single UT (None if there is no match).
# Rebinds `doc` from the earlier lookup cell.
doc = papers_con.collection.find_one({"UT":'000181970800023'})
doc

In [None]:
# Trajectories for the first 10 distinct reference UTs of df_pap_ref.
# NOTE(review): `all_plos` is assigned in a LATER cell and `df_pap_ref` /
# retrieve_multiple_trajectories are not defined in the visible part of the
# notebook -- this cell depends on out-of-order execution.
filtered_ref_uts = list(df_pap_ref['reference_UT'].unique()[0:10])
filtered_trajs = retrieve_multiple_trajectories(filtered_ref_uts, all_plos)

In [None]:
# Run the notebook's datetime_filler_traj helper on the 10-paper sample.
test_traj_dict = datetime_filler_traj(filtered_trajs)

In [None]:
# Display the converted sample trajectories.
test_traj_dict

In [None]:
# Display the trajectory of one specific reference UT (KeyError if it is not in the sample).
test_traj_dict['000187449400012']

In [None]:
# Argsort: the indices of `dates` ordered by the date they point to.
# NOTE(review): `dates` is only assigned in the plotting cell BELOW, so this
# cell fails on a fresh kernel unless that cell is run first.
sorted(range(len(dates)), key=lambda k: dates[k])

In [None]:
# Plot the cumulative citation curve of the FIRST paper in test_traj_dict
# (blue line) and mark in red the citations whose flag (element 1 of the
# entry) is True.
for paper in test_traj_dict:
    # Keep (date, flag) for the entries whose date is not a bare int.
    # NOTE(review): an int date presumably marks a citation without a usable
    # date -- confirm against datetime_filler_traj.
    dated_cites = [
        (entry[0], entry[1] == True)
        for entry in test_traj_dict[paper].values()
        if type(entry[0]) != int
    ]
    # Stable sort: equal dates keep their original relative order.
    dated_cites.sort(key=lambda pair: pair[0])

    # Positions are taken in the FILTERED, sorted list. Using the enumerate
    # index of the unfiltered dict (as before) misaligns -- or overruns --
    # `dates` as soon as one int-dated entry is skipped.
    dates = [date for date, _ in dated_cites]
    count = list(range(len(dates)))
    plos_inds = [rank for rank, (_, is_plos) in enumerate(dated_cites) if is_plos]
    plos_dates = [dates[rank] for rank in plos_inds]

    plt.plot(dates, count, color = 'b')

    plt.scatter(plos_dates, plos_inds, color='r')
    break   # only the first paper is plotted


In [None]:
# `dates` is a plain list, so dates[0,5,10] (a tuple index) raises TypeError;
# pick the three positions explicitly instead.
[dates[i] for i in (0, 5, 10)]

In [None]:
# Trajectories for the first 10 distinct reference UTs of the mid_cites rows;
# all_plos holds the distinct 'paper_UT' values of ref_df.
# NOTE(review): retrieve_multiple_trajectories is defined outside the visible part of the notebook.
all_plos = ref_df['paper_UT'].unique()
mid_cite_ref_uts = mid_cites['reference_UT'].unique()[0:10]
mid_cites_trajs = retrieve_multiple_trajectories(mid_cite_ref_uts, all_plos)

In [None]:
# Display the trajectories retrieved for the mid-cited sample.
mid_cites_trajs

In [None]:
# Overwrites the 'paper_UT' column of ref_df in place.
# NOTE(review): `ref_` is not defined anywhere in the visible notebook, so this
# raises NameError as written; presumably `ref_df.index` was intended -- confirm
# before running, since the assignment replaces the existing column.
ref_df.loc[:,'paper_UT'] = ref_.index

In [None]:
# def section_regex_parser(ref_df):
# # Using regular expressions to sort sections. Reach a consensus on section identification by scanning both section_title and section_title_alt
#     import regex as re

#     sect_index_dict = {'intro': 0, 'methods': 1, 'results': 2, 'disc': 3, 'res_disc':4, 'concl':5, 'mixed':6, 'na':7}

#     intro_re = re.compile(r'(intro)')
#     method_re = re.compile(r'(method)')
#     results_re = re.compile(r'(results)')
#     disc_re = re.compile(r'(disc)')
#     concl_re = re.compile(r'(conclu)')
#     backgr_re = re.compile(r'(backgr)')
#     mater_re = re.compile(r'(mater)')
#     count = 0
#     judgement = []

#     for i in range(len(ref_df)):
#         count += 1
#         if count%100000 == 0:
#             print(count)
#         sect = ref_df.iloc[i]['section_title']
#         sect_alt = ref_df.iloc[i]['section_title_alt']
#         sect_tag = -1
#         sect_alt_tag = -1
#         sect_final = -1

#         if sect == None:
#             sect_tag = 'na'
#         else:
#             if re.search(intro_re, sect.lower()) or re.search(backgr_re, sect.lower()):
#                 sect_tag = 'intro'

#             elif re.search(method_re,sect.lower()) or re.search(mater_re, sect.lower()):
#                 if re.search(results_re, sect.lower()):
#                     sect_tag = 'mixed'

#                 elif re.search(disc_re, sect.lower()):
#                     sect_tag = 'mixed'

#                 else:
#                     sect_tag = 'methods'

#             elif re.search(results_re, sect.lower()):
#                 if re.search(disc_re, sect.lower()):
#                     sect_tag = 'res_disc'
#                 else:
#                     sect_tag = 'results'

#             elif re.search(disc_re, sect.lower()):
#                 sect_tag = 'disc'

#             elif re.search(concl_re, sect.lower()):
#                 sect_tag = 'concl'

#             else:
#                 sect_tag = 'na'



#         if sect_alt == None:
#             sect_alt_tag = 'na'
#         else:
#             if re.search(intro_re, sect_alt.lower()) or re.search(backgr_re, sect_alt.lower()):
#                 sect_alt_tag = 'intro'

#             elif re.search(method_re, sect_alt.lower()) or re.search(mater_re, sect_alt.lower()):
#                 if re.search(results_re, sect_alt.lower()):
#                     sect_alt_tag = 'mixed'

#                 elif re.search(disc_re, sect_alt.lower()):
#                     sect_alt_tag = 'mixed'

#                 else:
#                     sect_alt_tag = 'methods'

#             elif re.search(results_re, sect_alt.lower()):
#                 if re.search(disc_re, sect_alt.lower()):
#                     sect_alt_tag = 'res_disc'
#                 else:
#                     sect_alt_tag = 'results'

#             elif re.search(disc_re, sect_alt.lower()):
#                 sect_alt_tag = 'disc'

#             elif re.search(concl_re, sect_alt.lower()):
#                 sect_alt_tag = 'concl'

#             else:
#                 sect_alt_tag = 'na'



#         if sect_tag == sect_alt_tag: # Confident on label
#             sect_final = sect_tag
#             #ref_df.iloc[i]['regex_sect_index'] = sect_index_dict[sect_tag]

#         else:
#             if sect_tag == 'na':
#                 sect_final = sect_alt_tag
#                 #ref_df.iloc[i]['regex_sect_index'] = sect_index_dict[sect_alt_tag]
#             elif sect_alt_tag == 'na':
#                 sect_final = sect_tag

#             elif sect_tag == 'mixed':
#                 sect_final = sect_alt_tag

#             elif sect_alt_tag == 'mixed':
#                 sect_final = sect_tag

#             elif sect_alt_tag == 'concl': # Conclusion is a plos-based standard that includes discussion/results and discussion
#                 sect_final = sect_tag
#             else:
#                 sect_final = sect_alt_tag

#         judgement.append(sect_final)
#     return judgement
#         #ref_df.iloc[i]['regex_sect_index'] = sect_index_dict[sect_final]

In [None]:
# judgement = section_regex_parser(ref_df)
# judgement_2 = []
# sect_index_dict = {'intro': 0, 'methods': 1, 'results': 2, 'disc': 3, 'res_disc':4, 'concl':5, 'mixed':6, 'na':7}
# for i in judgement:
#     judgement_2.append(sect_index_dict[i])
# ref_df['regex_sect_index'] = judgement_2