In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [12]:
import pandas as pd
import numpy as np

import time
import os

from bibmatch.authorclass import author
import bibmatch.fast_match_utilities as fmu

import bibmatch.clean_data as clean_data
import bibmatch.parse_web  as parse_web
import bibmatch.load_data as load_data

from bibmatch.parse_wos import parse_wos_authors, load_wos_data


In [3]:
path2rawdata='/Users/ajgates/DropBox/WOSsample/'

In [4]:
st = time.time()
author_df = load_wos_data(name = 'authorship', year_list = None, path2rawdata=path2rawdata,
                          columns = ['ArticleID', 'AuthorDAIS', 'FullName', 'AuthorOrder'], 
                          dropna = ['AuthorDAIS'], 
                          duplicate_subset = ['ArticleID', 'AuthorDAIS'],
                          isindict = None, 
                             verbose = 100)
                              
print(author_df.shape)
print(author_df['AuthorDAIS'].nunique(), " authors")
author_articles = np.sort(author_df['ArticleID'].unique())

print("Completed in %f" % (time.time() - st))

Final DF Shape (515655, 4)
(515655, 4)
108780  authors
Completed in 3.823384


In [5]:
st = time.time()
article_df = load_wos_data(name = 'article', year_list = None, path2rawdata=path2rawdata,
                          columns = ['ArticleID', 'Title','PubYear'], 
                          dropna = ['ArticleID', 'Title'], 
                          duplicate_subset = ['ArticleID'],
                          isindict = {'ArticleID':author_articles}, 
                             verbose = 100)
                              
print(article_df.shape)
#del article_df['Doctypes']
print("Completed in %f" % (time.time() - st))

Final DF Shape (322895, 3)
(322895, 3)
Completed in 12.553882


In [6]:
st = time.time()
address_df = load_wos_data(name = 'address', year_list = None, path2rawdata=path2rawdata,
                          columns = ['ArticleID', 'AuthorOrder', 'Organization'], 
                          dropna = ['ArticleID'], 
                          duplicate_subset = ['ArticleID'],
                          isindict = None, 
                             verbose = 100)
print(address_df.shape)                              
print("Completed in %f" % (time.time() - st))

Final DF Shape (308, 3)
(308, 3)
Completed in 0.268918


In [7]:
full_df = author_df.merge(article_df, how='left', on='ArticleID')
full_df = full_df.merge(address_df, how='left', on=['ArticleID', 'AuthorOrder'])

print(full_df.shape)

(515655, 7)


In [8]:
# run this once to process the author co-author lists
# you can save and re-use the resulting dataframes

def join_article_authors(namelist):
    return " | ".join(set(namelist))
article_authors = author_df.dropna(subset=['FullName']).groupby('ArticleID')['FullName'].apply(join_article_authors).to_frame()
article_authors.reset_index(inplace=True)
article_authors['CoAuthors'] = article_authors['FullName']
del article_authors['FullName']

In [9]:
full_df = full_df.merge(article_authors, on='ArticleID', how='left')
# remove the author from the co-author list
full_df['CoAuthors'] = full_df.apply(lambda x: x['CoAuthors'].replace(x['FullName'], ''), axis=1)
print(full_df.shape)
print(list(full_df))

(515655, 8)
['ArticleID', 'AuthorDAIS', 'FullName', 'AuthorOrder', 'Title', 'PubYear', 'Organization', 'CoAuthors']


In [13]:
alist = parse_wos_authors(full_df.iloc[:2000])

In [14]:
len(alist)

1904

In [15]:
alist[10].print_author(list_articles=True)


Halpern, J
All Names:  {'Halpern, J'}
Institutions: set()
Co-authors: ['Crane, HR', 'Oleson, NL']
Articles


In [None]:
print(alist[10].full_last_names)
print(alist[10].full_first_names)
print(alist[10].first_initials)