# List authors and articles in APSR from 2016 onward
- Code written to help a friend resolve a Twitter debate about authorship trends in the American Political Science Review.

## setup

In [1]:
from pyspark.sql import SQLContext
import pandas as pd
from collections import Counter
from pyspark.sql.functions import *
import pyspark.sql

In [2]:
# Build the SQL entry point used by every parquet read below.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably the
# PySpark kernel/shell injects the SparkContext under that name — confirm.
sqlC = SQLContext(sparkContext=sc)

## load paper data

In [3]:
# Read the times-cited table and list its columns.
# NOTE(review): `ts` is not referenced again in the visible notebook —
# confirm whether this load is still needed.
ts = sqlC.read.format("parquet").load("wos_times_cited.parquet")
ts.columns

['UID', 'times_cited', 'n_cites']

In [4]:
# Read the cleaned core paper records and list the available columns;
# `journal`, `pubyear`, `item_title` and `full_abstract` are used further down.
cites = sqlC.read.format("parquet").load("wos_core_clean.parquet")
cites.columns

['UID',
 'keywords',
 'references',
 'full_abstract',
 'all_lang',
 'pubyear',
 'has_abstract',
 'pubtype',
 'subjects',
 'subheadings',
 'headings',
 'item_title',
 'journal',
 'issn',
 'isbn',
 'eissn',
 'eisbn',
 'doi',
 'bare_text']

In [5]:
# Keep a single row per UID so that each paper appears once downstream.
# Spark does not specify which of the duplicate rows is retained.
cites = cites.dropDuplicates(['UID'])
# cites.count()  # optional sanity check, intentionally left disabled

## get APSR papers from 2016 and beyond

In [6]:
# Case-insensitive substring match on the journal name, so any journal whose
# name merely contains this phrase is kept as well.
papers = cites.where(
    lower(cites['journal']).contains('american political science review')
)
papers.count()

26348

In [7]:
# Inclusive lower bound: papers published in 2016 itself are kept.
papers = papers.where(papers['pubyear'] >= 2016)
papers.count()

186

In [8]:
# Narrow the paper table to the join key plus the fields wanted in the export.
papers = papers.select(['UID', 'pubyear', 'item_title', 'full_abstract'])

## load author data

In [32]:
# Read the author-name table and list its columns.
authors = sqlC.read.format("parquet").load("WoS_names.parquet")
authors.columns

['UID',
 '_addr_no',
 'wos_standard',
 'email_addr',
 '_dais_id',
 '_orcid_id_tr',
 '_r_id',
 '_reprint',
 '_role',
 '_seq_no',
 'display_name',
 'first_name',
 'full_name',
 'last_name',
 'suffix',
 'row_id',
 '_addr_no_2',
 '_dais_id_2',
 '_orcid_id_tr_2',
 '_r_id_2',
 '_reprint_2',
 '_role_2',
 '_seq_no_2',
 'display_name_2',
 'first_name_2',
 'full_name_2',
 'last_name_2',
 'suffix_2',
 'row_id2',
 'email_addr_2',
 'city',
 'country',
 'full_address',
 'organizations',
 'state',
 'street',
 'suborganizations',
 'zipcode',
 'zip_location',
 'pub_year']

In [33]:
# Only the join key and the name fields are needed from the author table.
authors = authors.select(['UID', 'display_name', 'first_name', 'last_name'])
# Full row count before de-duplication (scans the whole table).
authors.count()

328611134

In [34]:
# Collapse to one row per (paper, display name).
# NOTE(review): the repeats presumably come from one row per author address
# in the source table — confirm. Which duplicate row survives is unspecified.
authors = authors.dropDuplicates(['UID', 'display_name'])
authors.count()

251554595

## merge authors with papers

In [35]:
# Left join on UID: every selected paper is kept, one row per matching author;
# a paper with no author match still appears, with null name columns.
output = papers.join(authors, on=['UID'], how='left')
output.printSchema()

root
 |-- UID: string (nullable = true)
 |-- pubyear: long (nullable = true)
 |-- item_title: string (nullable = true)
 |-- full_abstract: string (nullable = true)
 |-- display_name: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)



In [36]:
# Number of rows in the joined paper/author table (this triggers the join).
output.count()

329

In [37]:
# Preview the joined rows; these are the defaults (20 rows, long cells truncated).
output.show(n=20, truncate=True)

+-------------------+-------+--------------------+--------------------+--------------------+-----------+----------+
|                UID|pubyear|          item_title|       full_abstract|        display_name| first_name| last_name|
+-------------------+-------+--------------------+--------------------+--------------------+-----------+----------+
|WOS:000382561900009|   2016|Adam Smith on Wha...|This article expl...|Rasmussen, Dennis C.|  Dennis C.| Rasmussen|
|WOS:000446999100009|   2018|Leadership with T...|Group members val...|        Dewan, Torun|      Torun|     Dewan|
|WOS:000446999100009|   2018|Leadership with T...|Group members val...|Squintani, Francesco|  Francesco| Squintani|
|WOS:000439554800011|   2018|Reading Between t...|This article prov...|     Mueller, Hannes|     Hannes|   Mueller|
|WOS:000439554800011|   2018|Reading Between t...|This article prov...|   Rauh, Christopher|Christopher|      Rauh|
|WOS:000446999100025|   2018|Ethnoracial Homog...|How does ethnorac...| 

## output is small, so convert to pandas and save

In [38]:
# Collect the joined result to the driver as a pandas DataFrame, then preview
# the first five rows.
df = output.toPandas()
df.head(5)

Unnamed: 0,UID,pubyear,item_title,full_abstract,display_name,first_name,last_name
0,WOS:000382561900009,2016,Adam Smith on What Is Wrong with Economic Ineq...,This article explores Adam Smith's attitude to...,"Rasmussen, Dennis C.",Dennis C.,Rasmussen
1,WOS:000446999100009,2018,Leadership with Trustworthy Associates,Group members value informed decisions and hol...,"Dewan, Torun",Torun,Dewan
2,WOS:000446999100009,2018,Leadership with Trustworthy Associates,Group members value informed decisions and hol...,"Squintani, Francesco",Francesco,Squintani
3,WOS:000439554800011,2018,Reading Between the Lines: Prediction of Polit...,This article provides a new methodology to pre...,"Mueller, Hannes",Hannes,Mueller
4,WOS:000439554800011,2018,Reading Between the Lines: Prediction of Polit...,This article provides a new methodology to pre...,"Rauh, Christopher",Christopher,Rauh


In [39]:
# Write the table as a tab-separated file without the pandas index column.
df.to_csv('for_mike.tsv', index=False, sep='\t')

In [40]:
# (rows, columns) of the exported table; the row count should equal output.count() above.
df.shape

(329, 7)