## Building a set of new variables for the analysis of in-text-citations

In this code I take a file with some basic information about citation patterns among academic papers, and I add some new, more sophisticated variables, derived from the basic ones. Some of these new variables are calculated directly by combining the basic ones, while in other cases, I query a Mongo DataBase to obtain extra information that I need. Once I'm done, I dump the new dataframe in a pickle file.

See notebook "For_UPTAKE_get_basic_stats_plots_and_null_model", where I actually use some of these variables for plotting and analysis.

In [1]:
import sys
import pickle
import gzip
import os,glob
import time
import numpy as np
import pandas as pd
import operator
import random
from  scipy import stats
import datetime
import math
import time
import itertools


from pymongo import MongoClient



from IPython.core.display import display,HTML
display(HTML("<style>.container { width:100% !important; }</style>"))  # to make the notebook use the entire width of the browser


#### Upload main datafiles

These are the basic files with information about citations among academic papers that I will build upon. These files are pickled pandas dataframes, and were created elsewhere.


In [None]:

    
%time df_merged = pickle.load(open('../data/df_reference_cite_plos_merged_with_team_expertise_and_recycled_ref.pkl', 'rb'))
print ("done loading pickle", df_merged.shape)



%time plos_df = pickle.load(open('../data/plos_paper_dataframe_ONLY_ARTICLES_num_ref_sect_young_old.pkl', 'rb'))
print ("done loading pickle", plos_df.shape)


#### I modify the unique ID of the papers to make it standard across entries, and add it as new column of the main dataframe, dropping the old one

In [None]:
def fix_paper_UT(row):
    """
    Return the standardized paper ID for one dataframe row: the string form
    of ``row.paper_UT`` prefixed with '000'.
    """
    return '000%s' % (row.paper_UT,)

##################3

# Standardize the paper IDs: every paper_UT becomes '000' + str(original value).
# This is done on the whole column at once instead of calling a Python function
# on every row. pop() removes the old column and the assignment appends the new
# one, so 'paper_UT' ends up as the last column, exactly as with the previous
# add / drop / rename sequence.
# NOTE: re-running this cell prepends another '000' to the IDs.
df_merged['paper_UT'] = '000' + df_merged.pop('paper_UT').astype(str)

df_merged[['paper_UT']]

#### I create a number of simple, derived variables and add them to the dataframe as new columns

In [None]:

def get_log_num_cit_of_paper(row):
    """
    This function takes one row from the dataframe and returns the corresponding value for the new variable (logarithm of the 
    number of citations of the paper) to be added in a column

    Parameters
    ----------
    row : a row from a dataframe
        must expose ``paper_cite_count``; ``plos_pub_year`` is printed when the count cannot be used

    Returns
    -------
    float
        log10(paper_cite_count), or 0. when the count is missing (None) or not numeric.
        Under NumPy's default error settings a count of 0 gives -inf (with a warning) and NaN gives NaN.
    """
  
    try:
        log_num_cit = np.log10(float(row.paper_cite_count))
    except (TypeError, ValueError):
        # float() raises TypeError for None and ValueError for non-numeric text;
        # anything else (e.g. a missing column) is a real error and must not be hidden
        print (row.plos_pub_year)
        log_num_cit = 0.
    return log_num_cit

###################


def get_diff_publ_plos_publ_ref(row):
    """
    This function takes one row from the dataframe and returns the corresponding value for the new variable (age difference 
    between the reference and the citing paper) to be added in a column

    Parameters
    ----------
    row : a row from a dataframe
        must expose ``plos_pub_year`` and ``ref_pub_year``

    Returns
    -------
    float
        plos_pub_year - ref_pub_year, or NaN when either year is missing (None) or not numeric
    """
  
    try:
        delta_publ_year = float(row.plos_pub_year) - float(row.ref_pub_year)
    except (TypeError, ValueError):
        # only unusable year values map to NaN; other errors are allowed to surface
        delta_publ_year = np.nan
    return delta_publ_year

###################

def get_relative_position_within_section(row):
    """
    This function takes one row from the dataframe and returns the corresponding value for the new variable (relative position
    of the reference in the paper section, in units of characters) to be added in a column

    Parameters
    ----------
    row : a row from a dataframe
        must expose ``sect_char_pos`` and ``sect_char_total``

    Returns
    -------
    float
        sect_char_pos / sect_char_total, or NaN when either value is missing (None),
        not numeric, or the section length is zero
    """
  
    try:
        rel_pos = float(row.sect_char_pos) / float(row.sect_char_total)
    except (TypeError, ValueError, ZeroDivisionError):
        # unusable values or a zero-length section -> no relative position
        rel_pos = np.nan
    return rel_pos

###################

def get_section(row):
    """
    Map the section index stored in ``row.regex_sect_index`` to a readable
    section label, to be added as a new column.

    Returns
    -------
    str
        "<index>:<section name>" for indexes 0 to 6, and "NA" for anything else
    """

    # position in this tuple == section index it stands for
    labels = (
        "0:Intro",
        "1:Methods",
        "2:Results",
        "3:Discussion",
        "4:Results/Discussion",
        "5:Conclusion",
        "6:Mx",
    )

    code = row.regex_sect_index
    for index, label in enumerate(labels):
        if code == index:
            return label

    return "NA"

###################



# Each derived column is computed by applying one of the row-wise helper functions
# to every row of df_merged (axis=1) and storing the result under a new column name.
# NOTE(review): get_log_num_cit_of_ref is not defined anywhere in the visible notebook,
# so the first line raises NameError unless the function is defined elsewhere -- confirm
# and add the missing definition.
%time df_merged['log10_num_cit_ref'] = df_merged.apply (lambda row: get_log_num_cit_of_ref(row),axis=1)
%time df_merged['log10_num_cit_paper'] = df_merged.apply (lambda row: get_log_num_cit_of_paper(row),axis=1)
%time df_merged['diff_year_plos_ref'] = df_merged.apply (lambda row: get_diff_publ_plos_publ_ref(row),axis=1)
%time df_merged['rel_loc_in_sect'] = df_merged.apply (lambda row: get_relative_position_within_section(row),axis=1)
%time df_merged['section'] = df_merged.apply (lambda row: get_section(row),axis=1)


####  In this next function, I assign a label to each usage of a reference (that is, to each pair reference-paper), to mark it as 'isolated' or 'group' reference, depending on how it is used in a given paper in my records. I'll use this later in the analysis to select subsets of the data.

In [None]:
def isolated_or_group_citation(row, window=4, citations_df=None):
    """
    This function takes one row from the dataframe and returns a flag (1 or 0) to mark the usage of isolated or group references,
    to be added in a column

    Parameters
    ----------
    row : a row from a dataframe
        must expose ``paper_UT``, ``regex_sect_index`` and ``sect_char_pos``

    window : int, optional
        max distance (in units of characters) to determine if a reference is clustered with others or not

    citations_df : dataframe, optional
        dataframe holding all the reference records; when omitted, the notebook-level ``df_merged`` is used

    Returns
    -------
    int
        1 when the same paper and section hold at least one other record and the nearest
        of them is more than ``window`` characters away (isolated reference); 0 otherwise

    Notes
    -----
    The smallest distance is discarded on the assumption that it is the row's own
    (zero) distance to itself. A record that is the only one in its section gets 0.
    Every call scans the whole dataframe, so applying this to all rows is quadratic.
    """

    if citations_df is None:
        citations_df = df_merged

    same_section = citations_df[
        (citations_df['paper_UT'] == row.paper_UT)
        & (citations_df['regex_sect_index'] == row.regex_sect_index)
    ]

    # distances from this record to every record of the section, minus the smallest one
    gaps_to_others = sorted(abs(position - row.sect_char_pos) for position in same_section.sect_char_pos)[1:]

    if gaps_to_others and min(gaps_to_others) > window:
        return 1   # isolated reference
    return 0
    


###################


%time df_merged['isolated_citation'] = df_merged.apply (lambda row: isolated_or_group_citation(row),axis=1)


#### After adding a number of new columns, I write the dataframe to a pickle file


In [None]:


# Save the dataframe, including the columns added so far, as a pickle file.
print ("writing pickle.....")


# compression='infer' picks the compression from the file extension;
# protocol=2 is the pickle protocol passed through to the writer.
path = '../data/df_reference_cite_plos_merged_simplified_added_more_columns.pkl'
%time df_merged.to_pickle(path, compression='infer', protocol=2)

print ("written:",path)  


#### Next, I label each record of references used in a given paper as self-citations or not self-citations

In order to do this, I need to check who are the authors of a given paper, as well as who are the authors of all the references this paper cites, and find any possible matches.



First of all, I need a list of the unique IDs of both the PLoS papers and all references that appear in my citation data. These lists are created elsewhere, and uploaded here:

In [None]:
# Load the lists of unique IDs (UTs) of the PLoS papers and of the references.
# The files are opened in 'with' blocks so the handles are closed after loading.
# NOTE: unpickling executes arbitrary code -- only load files you created yourself.
with open('../data/lista_all_plos_UTs.pkl', 'rb') as handle:
    lista_all_plos_UTs = pickle.load(handle)
new_lista_all_plos_UTs = ['000'+str(item) for item in lista_all_plos_UTs]   # i need to add the 000 so it matches the db UTs

with open('../data/lista_all_reference_UTs.pkl', 'rb') as handle:
    lista_all_reference_UTs = pickle.load(handle)

# union of both lists, without duplicates
lista_UTs_plos_and_ref = new_lista_all_plos_UTs + lista_all_reference_UTs
lista_UTs_plos_and_ref = list(set(lista_UTs_plos_and_ref))


Regarding the author IDs, I use an author-disambiguated dataset (that I compiled elsewhere, and that it is based on proprietary Web of Science disambiguated data). Web of Science assigns unique IDs to papers (called UT), as well as unique IDs to authors (called DAIS).
The author-disambiguated data is structured as follows: one row per paper, and in each row: UT, list_of_DAIS. 

The data is stored in multiple (320) gzip files, and it includes many more records than the ones I need: my dataset is 156K PLoS papers, while the entire Web of Science includes around 60M papers. Hence, as I read the files, I only keep the info about the papers I care about. I will concatenate the files, and create a new pandas dataframe with the author-disambiguated info.

In [None]:
###I create a list with all the names of the zip files to iterate over

path_disamb = '/home/workspace/scibio/resources/rbusa/rbusa_main_v1_1/disambiguation/wos_dais/'

# One gzipped csv per batch; the batches are numbered 1 to 320 (both included).
lista_files = [
    path_disamb + 'wos_dais_all_batch' + str(batch_number) + '.csv.gz'
    for batch_number in range(1, 321)
]
##############





cont_tot = 0        # rows read over all the files
cont_filtered = 0   # rows kept after filtering
########### concatenate a list of (gzip) files into one single pandas dataframe:
list_ = []
for file_ in lista_files:
    print (file_)
    # dtype=object keeps every column exactly as read (no numeric conversion of the IDs)
    df = pd.read_csv(file_, compression = 'gzip', dtype=object, index_col=None, header=0)
    cont_tot += len(df)
    df = df[df['WOS'].isin(lista_UTs_plos_and_ref)]  # i directly filter out what i dont need, to save space
    cont_filtered += len(df)
    list_.append(df)

df_disamb = pd.concat(list_)
print (cont_tot, cont_filtered)    # 208,042,832         14,382,344


# the paper ID column is called 'WOS' in the files and 'UT' everywhere else in this notebook
df_disamb.rename(columns={'WOS': 'UT'}, inplace=True)
print (df_disamb.shape)     # (14382344, 3)





# Save the filtered author-disambiguation dataframe as a pickle file.
path = '../data/df_disambig_filtered.pkl'
%time df_disamb.to_pickle(path, compression='infer', protocol=2)

print ("written:",path)  
# show the first rows as a quick visual check
df_disamb.head()





Then, from the dataframe I have just created, I build a dictionary that gives me the list of disambiguated authors for a given paper ID. I'll need this for the function that labels citations as self or not self citations:

In [None]:


# Dictionary: paper UT -> list of disambiguated author IDs (DAIS) of that paper.
# The grouping key is the plain column name 'UT', not the list ['UT']: with a
# one-element list, pandas >= 2.0 yields 1-tuples such as ('000123',) as group
# keys, and every later lookup by the UT string would then miss.
dict_UT_list_authors = {
    UT: list(DAIS_of_paper)
    for UT, DAIS_of_paper in df_disamb.groupby('UT')['DAIS']
}


print ("done. now writing pickle.....")

with open('../data/dict_UT_list_authors.pkl', 'wb') as handle:
    pickle.dump(dict_UT_list_authors, handle, protocol = 2)
print ("written:",'../data/dict_UT_list_authors.pkl')   





Now I apply a function to each row of my dataframe (each row corresponds to the occurrence of one reference in one particular paper), 
to figure out whether a citation is self-citation or not

In [None]:


def get_self_citation(row, dict_UT_list_authors, cont_missing_records):
    
    """
    This function labels a given record of paper-citation in the dataframe as self-citation or not self-citation
    

    Parameters
    ----------
    row : a row from a dataframe
        must expose ``paper_UT`` and ``reference_UT``
    
    dict_UT_list_authors : dictionary where the key is the UT (ID) of papers
                            and the value is the list of disambiguated authors in that paper
                            
    cont_missing_records : int
        kept so existing calls keep working, but NOT used: an int is passed by value,
        so incrementing it in here could never reach the caller. Missing UTs have to
        be counted by the caller.

    Returns
    -------
    int
        a flag to identify self-citation (1) or not self-citation (0)

    
    """

    # a UT without an entry in the dictionary is treated as a paper with no known
    # authors, so the record can not be flagged as self-citation
    lista_DAIS_paper = dict_UT_list_authors.get(row.paper_UT, ())
    lista_DAIS_ref = dict_UT_list_authors.get(row.reference_UT, ())

    # self-citation: the citing paper and the reference paper share at least one author
    if set(lista_DAIS_paper).isdisjoint(lista_DAIS_ref):
        return 0
    return 1

###################

cont_missing_records=0
%time df_merged['self_citation'] = df_merged.apply (lambda row: get_self_citation(row, dict_UT_list_authors, cont_missing_records), axis=1)

print ("done. missing records (plos and/or ref):", cont_missing_records, "\nwriting pickle.....")





### Finally, I want to extract the number of early citations of young references. Because this info is not among the basic variables in the original dataframe, I'll have to query the Mongo database

#### Establish MongoDB connection 

In [None]:

class MongoConnection(object):
    '''
    Small wrapper around a pymongo MongoClient, configured from a settings dictionary.

    Settings keys read by this class:
        host       : required; the process exits when it is missing
        port       : optional; '27017' is used when missing
        user, password : optional; used only when BOTH are given
        db         : required; the process exits when it is missing
        collection : optional; the collection 'test' is used when missing
        indexes    : optional; list of (`key`, `options dict`) pairs

    Creating an instance builds the URI, connects, and creates the indexes.
    '''

    def __init__(self, cxnSettings, **kwargs):
        # extra keyword arguments are handed to MongoClient as they are
        self.settings = cxnSettings
        self.mongoURI = self._constructURI()
        self.connect(**kwargs)
        self.ensure_index()

    def _constructURI(self):
        '''
        Construct the mongo URI: mongodb://[user:password@]host:port

        User and password are inserted exactly as given.
        NOTE(review): MongoDB URIs need percent-escaped credentials; values with
        characters such as '@' or ':' presumably have to be escaped by the caller.
        '''
        mongoURI = 'mongodb://'
        #User/password handling
        if 'user' in self.settings and 'password' in self.settings:
            mongoURI += self.settings['user'] + ':' + self.settings['password']
            mongoURI += '@'
        elif 'user' in self.settings:
            print('Missing password for given user, proceeding without either')
        elif 'password' in self.settings:
            print('Missing user for given passord, proceeding without either')
        #Host and port
        try:
            mongoURI += self.settings['host'] + ':'
        except KeyError:
            print('Missing the hostname. Cannot connect without host')
            sys.exit()
        try:
            mongoURI += str(self.settings['port'])
        except KeyError:
            print('Missing the port. Substituting default port of 27017')
            mongoURI += str('27017')
        return mongoURI

    def connect(self, **kwargs):
        '''
        Establish the connection, database, and collection
        '''
        self.connection = MongoClient(self.mongoURI, **kwargs)
        #########
        try:
            self.db = self.connection[self.settings['db']]
        except KeyError:
            print("Must specify a database as a 'db' key in the settings file")
            sys.exit()
        #########
        try:
            self.collection = self.db[self.settings['collection']]
        except KeyError:
            print('Should have a collection.', end='')
            print('Starting a collection in database', end='')
            print(' for current connection as test.')
            self.collection = self.db['test']

    def tearDown(self):
        '''
        Closes the connection
        '''
        self.connection.close()

    def ensure_index(self):
        '''
        Ensures the connection has all given indexes.
        indexes: list of (`key`, `options`) pairs, `options` being a dictionary
            of keyword arguments for the index creation.
            See docs.mongodb.org/manual/core/indexes/ for possible values.

        The indexes are created with Collection.create_index:
        Collection.ensure_index was deprecated in PyMongo 3.0 and removed in 4.0.
        '''
        if 'indexes' in self.settings:
            for index in self.settings['indexes']:
                # 'cache_for' was an option of the old ensure_index only;
                # create_index does not accept it, so it is left out
                options = {key: value for key, value in index[1].items() if key != 'cache_for'}
                self.collection.create_index(index[0], **options)



#############################  (not the real user name nor password)

# Settings dictionary for MongoConnection: where to connect (host, port),
# which database and collection to use, and the credentials.
# NOTE(review): credentials are written directly in the notebook; if real ones are
# ever used here, consider reading them from the environment or a config file instead.
merged_papers_settings = {
    "host": "chicago.chem-eng.northwestern.edu",
    "port": "27017",
    "db": "web_of_science_aux",
    "collection": "merged_papers",
    "user": "lalalalala",  
    "password": "lalalalalalylala"
}

# Creating the object connects to the database right away (see MongoConnection.__init__)
papers_con = MongoConnection(merged_papers_settings)






To obtain the number of early citations of a reference, I proceed as follows: First, I pick a focus year, and select all the PLoS papers published that year, as well as all the references that those papers cite. Then, I filter the references to select only the ones that were young at the time of the publication of the citing paper.

In [None]:
focus_plos_year = 2009  # available years: 2005 to 2016
def_young = 1  # young references are <= 1 year old
   

# Keep the records whose reference was published in (focus_plos_year - def_young) or later.
# NOTE(review): this filter does not look at the publication year of the citing paper,
# and it has no upper limit on ref_pub_year, so references published after the focus year
# are kept too. The text above describes selecting only the papers published in the focus
# year -- confirm which selection is intended.
df_selection_general_young_ref = df_merged[ (df_merged['ref_pub_year'] >= (focus_plos_year - def_young) )]  

# unique IDs of the selected references (each one is queried once in the database)
list_UT_young_ref_by_focus_year = df_selection_general_young_ref.reference_UT.unique()


print ("\n\nTotal # records of young ref. by", focus_plos_year, ": ",df_selection_general_young_ref.shape, "  # unique ref_UTs:", len(list_UT_young_ref_by_focus_year))   


Note that in order to plot the results later, all references need to have a value of early citations assigned (even those that do not qualify as young references). So, first, I initialize all values to NaN.

In [None]:

# i need to assign a value to all references  (those that were older, get a NaN, those that are young enough
# (one year old or younger), get whatever number of citations they had by focus_year).
# First i initialize all ref to nan  (faster than evaluating whether the reference is in the list of selected ones or not).
#
# The keys are taken from the dataframe itself and converted to str, because that is
# exactly how get_new_field looks them up later (str(row.reference_UT)); this way every
# row is guaranteed to find its key. (The name used here before, list_all_ref_UT, is not
# defined anywhere in this notebook.)
dict_UT_young_ref_num_cit_by_focus_year = {
    str(ref_UT): np.nan for ref_UT in df_merged.reference_UT.unique()
}


Now I iterate over all young references in the focus year, and I query the database to obtain the number of citations those papers received in or before the focus year.

In [None]:

cont = 0    
for ref_UT in list_UT_young_ref_by_focus_year:  

    ref_UT = str(ref_UT)  # i make sure it is a string

    # find_one returns a DICTIONARY, or None when no paper has that UT.
    # citations:   cites received by the paper (currently updated value!)
    item = papers_con.collection.find_one({"UT" : ref_UT}, {'citations':1})

    # no record, or no citations in web of science for that reference paper -> nothing to count
    list_citing_papers = item.get("citations") if item else None

    if list_citing_papers:
        ### QUERY TO GET ONLY CITATIONS RECEIVED BY THE REFERENCE PAPER IN THE FOCUS_YEAR OR EARLIER:
        # count_documents needs PyMongo >= 3.7; it replaces cursor.count(), removed in PyMongo 4.
        # Database errors are NOT caught here: a failed query must stop the loop
        # instead of being saved as "0 citations".
        num_early_cit = papers_con.collection.count_documents(
            {"UT": {"$in": list_citing_papers}, "issue.PY": {'$lte': focus_plos_year}}
        )
    else:
        num_early_cit = 0

    #### i save the value for number of early citations in a dictionary
    dict_UT_young_ref_num_cit_by_focus_year[ref_UT] = num_early_cit

    print(cont)   # progress: number of references done before this one

    cont +=1  




  


I dump the dictionary into a pickle file

In [None]:

# The name of the output file carries the focus year twice; build it once and reuse it.
dict_path = '../data/dict_UT_young_ref_in'+str(focus_plos_year)+'_num_cit_by_'+str(focus_plos_year)+'.pkl'

print ("done.", len(dict_UT_young_ref_num_cit_by_focus_year),"dumping dict ref.......")
with open(dict_path, 'wb') as handle:
    pickle.dump(dict_UT_young_ref_num_cit_by_focus_year, handle, protocol = 2)
print ("written:", dict_path)



Next, I add a new column to my dataframe with the new info about early citations accrued by young references (for a given focus year)

In [None]:
def get_new_field(row, dict_UT_young_ref_in_focus_year_num_cit):
    """
    Look up the number of early citations of the reference of one dataframe row,
    to be added as new column in the existing dataframe.

    Parameters
    ----------
    row : a row from a dataframe
        its ``reference_UT`` is converted to str before the lookup

    dict_UT_young_ref_in_focus_year_num_cit : dictionary where the key is the UT (ID) of the reference
                            and the value is the number of early citations the reference has accrued by the focus year

    Returns
    -------
    the value stored in the dictionary for that reference

    Raises
    ------
    KeyError
        when the reference UT is not a key of the dictionary
    """

    return dict_UT_young_ref_in_focus_year_num_cit[str(row.reference_UT)]
    
    ###################
    
    
    
focus_year = 2009
dict_UT_young_ref_in_focus_year_num_cit = pickle.load(open('../data/dict_UT_young_ref_in'+str(focus_year)+'_num_cit_by_'+str(focus_year)+'.pkl', 'rb'))

column_name = 'num_cit_young_ref_by'+str(focus_year)
%time df_merged[column_name] = df_merged.apply (lambda row: get_new_field(row, dict_UT_young_ref_in_focus_year_num_cit),axis=1)
   
    

I dump the dataframe with the new column to a pickle file one more time

In [None]:


# Save the dataframe again, now with the early-citations column included.
# The path is the same one used for the earlier dump in this notebook, so that file is overwritten.
print ("writing pickle.....")


path = '../data/df_reference_cite_plos_merged_simplified_added_more_columns.pkl'
%time df_merged.to_pickle(path, compression='infer', protocol=2)

print ("written:",path)  
