I realized that using 

```python
authors_to_study = author[(
    author.type == 'R') & (
    author['affiliation.2'].notnull())]
```
 
is wrong because in 10 research papers, some authors have affiliation info whereas others do not. So, if I use the above codes, in these 10 papers, authors with affiliation info will be included. 

The correct way is:

```python
dois_to_exclude = list(set(
		author[(author.type =='R') & (author['affiliation.2'].isnull())].doi))
dois_to_study = [x for x in all_r if x not in dois_to_exclude]
papers_to_study = paper[paper.doi.isin(dois_to_study)]
authors_to_study = author[author.doi.isin(dois_to_study)]
```

In [1]:
import pandas as pd
import numpy as np

In [2]:
PAPER_CLASSIFICATION_HONGTAO = '../../data/interim/paper_classification_task/paper_classification_hongtao.csv'
PAPER_CLASSIFICATION_JEFF = '../../data/interim/paper_classification_task/paper_classification_jeff.csv'
PAPER_CLASSIFICATION_KRISTEN = '../../data/interim/paper_classification_task/paper_classification_kristen.csv'
ICA_PAPER_DF = '../../data/interim/ica_paper_df.csv'
AUTHOR_WITH_PRED = '../../data/processed/author_with_pred.csv'

In [3]:
hongtao = pd.read_csv(PAPER_CLASSIFICATION_HONGTAO)

In [4]:
hongtao[hongtao.doi == '10.1111/j.1460-2466.1987.tb00988.x']

Unnamed: 0,journal,issueURL,doi,url,sci_hub,year,category,type,title,abstract
1485,Journal of Communication,https://academic.oup.com/joc/issue/37/2?browse...,10.1111/j.1460-2466.1987.tb00988.x,https://academic.oup.com/joc/article/37/2/191/...,https://sci-hub.se/10.1111/j.1460-2466.1987.tb...,1987,Briefly noted,0,Briefly noted,


In [5]:
hongtao = pd.read_csv(PAPER_CLASSIFICATION_HONGTAO)
jeff = pd.read_csv(PAPER_CLASSIFICATION_JEFF)
kristen = pd.read_csv(PAPER_CLASSIFICATION_KRISTEN)
hongtao_r = hongtao[hongtao.type == 1].doi.tolist()
jeff_r = jeff[jeff.type == 'R'].doi.tolist()
kristen_r = kristen[kristen.type == 'R'].doi.tolist()
print('set of type:\n')
set(hongtao.type), set(jeff.type), set(kristen.type)
print('number of research papers:\n')
len(hongtao_r), len(jeff_r), len(kristen_r)
all_r = hongtao_r + jeff_r + kristen_r

set of type:

number of research papers:



In [6]:
len(all_r)

5813

In [7]:
all_r[0]

'10.1093/hcr/hqab020'

In [8]:
'10.1111/j.1460-2466.1987.tb00988.x' in all_r

False

In [9]:
# paper with type
paper = pd.read_csv(ICA_PAPER_DF)
paper['year'] = np.where(paper.year == 'Progress)', '2022', paper.year)
paper['type'] = np.where(paper.doi.isin(all_r), 'R', 'M')

In [10]:
paper[paper.type=='R'].shape

(5813, 15)

In [11]:
paper[paper.doi == '10.1111/j.1460-2466.1987.tb00988.x']

Unnamed: 0,journal,issueURL,volumn,issue,issueText,month,year,category,title,url,doi,pages,abstract,abstract_para_num,type
2052,Journal of Communication,https://academic.oup.com/joc/issue/37/2?browse...,Volume 37,Issue 2,"Issue 2, June 1987, Pages 2–192",June,1987,Briefly noted,Briefly noted,https://academic.oup.com/joc/article/37/2/191/...,10.1111/j.1460-2466.1987.tb00988.x,191–192,,,M


In [12]:
author = pd.read_csv(AUTHOR_WITH_PRED)
author['year'] = np.where(author.year == 'Progress)', '2022', author.year)
author['type'] = np.where(author.doi.isin(all_r), 'R', 'M')

In [13]:
author[author.doi == '10.1111/j.1460-2466.1987.tb00988.x']

Unnamed: 0,authorID,doi,url,year,title,journal,numberOfAuthors,authorPosition,authorFullName,firstName,...,white,raceHighest,raceSecondHighest,raceDiff,affProcessed,affiliation.2,ROR_AFFNAME,matchMethod,ROR_ID,type
3890,10.1111/j.1460-2466.1987.tb00988.x+nan,10.1111/j.1460-2466.1987.tb00988.x,https://academic.oup.com/joc/article/37/2/191/...,1987,Briefly noted,Journal of Communication,,,,,...,0.384294,0.461375,0.384294,0.077081,,,,,,M


In [14]:
author.columns

Index(['authorID', 'doi', 'url', 'year', 'title', 'journal', 'numberOfAuthors',
       'authorPosition', 'authorFullName', 'firstName', 'lastName',
       'affiliation', 'gscholarLink', 'googleSearch', 'genderize',
       'genderize_prob', 'genderize_basedon', 'genderAccuracy',
       'authorFullName.1', 'firstName.1', 'lastName.1', 'affiliation.1',
       'gscholarLink.1', 'googleSearch.1', 'race', 'racePredAccuracy', 'api',
       'black', 'hispanic', 'white', 'raceHighest', 'raceSecondHighest',
       'raceDiff', 'affProcessed', 'affiliation.2', 'ROR_AFFNAME',
       'matchMethod', 'ROR_ID', 'type'],
      dtype='object')

In [23]:
author[author.type =='R'].shape

(11471, 39)

In [37]:
dois_seem_correct = list(set(author[author.type =='R'].doi))
len(dois_seem_correct)

5813

In [44]:
author[(author.type =='R') & (author['affiliation.2'].isnull())]

Unnamed: 0,authorID,doi,url,year,title,journal,numberOfAuthors,authorPosition,authorFullName,firstName,...,white,raceHighest,raceSecondHighest,raceDiff,affProcessed,affiliation.2,ROR_AFFNAME,matchMethod,ROR_ID,type
170,10.1093/joc/jqab027+14.0,10.1093/joc/jqab027,https://academic.oup.com/joc/article/71/5/803/...,2021,Decolonizing Open Science: Southern Interventions,Journal of Communication,23.0,14.0,Gayle Moana Johnson,Gayle,...,0.608817,0.608817,0.364275,0.244542,,,,,,R
1086,10.1111/jcom.12153+3.0,10.1111/jcom.12153,https://academic.oup.com/joc/article/65/3/443/...,2015,"Parent and Child Media Exposure, Preschooler D...",Journal of Communication,3.0,3.0,,,...,0.384294,0.461375,0.384294,0.077081,,,,,,R
2919,10.1111/j.1460-2466.2000.tb02849.x+1.0,10.1111/j.1460-2466.2000.tb02849.x,https://academic.oup.com/joc/article/50/3/4/41...,2000,"Elderly Lifestyles in the 21st Century: ""Doris...",Journal of Communication,2.0,1.0,Angie Williams,Angie,...,0.465989,0.497633,0.465989,0.031645,,,,,,R
2920,10.1111/j.1460-2466.2000.tb02849.x+2.0,10.1111/j.1460-2466.2000.tb02849.x,https://academic.oup.com/joc/article/50/3/4/41...,2000,"Elderly Lifestyles in the 21st Century: ""Doris...",Journal of Communication,2.0,2.0,Virpi Ylänne-McEwen,Virpi,...,0.924163,0.924163,0.051145,0.873018,,,,,,R
3860,10.1111/j.1460-2466.1987.tb00995.x+1.0,10.1111/j.1460-2466.1987.tb00995.x,https://academic.oup.com/joc/article/37/3/68/4...,1987,Calculating Risk: Radiation and Chernobyl,Journal of Communication,1.0,1.0,Robert Peter Gale,Robert,...,0.882981,0.882981,0.063767,0.819214,,,,,,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12118,10.1111/j.1083-6101.2006.00318.x+3.0,10.1111/j.1083-6101.2006.00318.x,https://academic.oup.com/jcmc/article/12/1/136...,2006,Managing Impressions in a Virtual Environment:...,Journal of Computer-Mediated Communication,4.0,3.0,Tiffany Charles,Tiffany,...,0.341701,0.603965,0.341701,0.262264,,,,,,R
12119,10.1111/j.1083-6101.2006.00318.x+4.0,10.1111/j.1083-6101.2006.00318.x,https://academic.oup.com/jcmc/article/12/1/136...,2006,Managing Impressions in a Virtual Environment:...,Journal of Computer-Mediated Communication,4.0,4.0,Patrice Coleman,Patrice,...,0.567848,0.567848,0.400698,0.167150,,,,,,R
12146,10.1111/j.1083-6101.2006.00305.x+1.0,10.1111/j.1083-6101.2006.00305.x,https://academic.oup.com/jcmc/article/11/4/101...,2006,Gender and the Use of Exclamation Points in Co...,Journal of Computer-Mediated Communication,1.0,1.0,Carol Waseleski,Carol,...,0.990786,0.990786,0.008628,0.982158,,,,,,R
12188,10.1111/j.1083-6101.2006.00021.x+1.0,10.1111/j.1083-6101.2006.00021.x,https://academic.oup.com/jcmc/article/11/2/442...,2006,Ouch!: an Examination of the Self-Representati...,Journal of Computer-Mediated Communication,1.0,1.0,Estelle Thoreau,Estelle,...,0.915146,0.915146,0.053728,0.861418,,,,,,R


In [33]:
dois_to_exclude = list(set(author[(author.type =='R') & (author['affiliation.2'].isnull())].doi))
len(dois_to_exclude)

95

In [27]:
authors_to_study = author[(
    author.type == 'R') & (
    author['affiliation.2'].notnull())]

In [28]:
authors_to_study.shape

(11338, 39)

In [17]:
dois_to_study = list(set(authors_to_study.doi))
papers_to_study = paper[paper.doi.isin(dois_to_study)]

In [18]:
papers_to_study.shape

(5728, 15)

In [29]:
len(dois_to_study)

5728

In [40]:
actually_excluded = [x for x in dois_seem_correct if x not in dois_to_study]
len(actually_excluded)

85

In [41]:
actually_excluded in dois_to_exclude

False

In [42]:
[x for x in actually_excluded if x not in dois_to_exclude]

[]

In [43]:
[x for x in dois_to_exclude if x not in actually_excluded]

['10.1111/j.1460-2466.1984.tb02161.x',
 '10.1111/hcre.12009',
 '10.1111/j.1083-6101.2006.00318.x',
 '10.1111/jcc4.12005',
 '10.1111/jcom.12153',
 '10.1111/j.1468-2885.1999.tb00160.x',
 '10.1093/joc/jqab027',
 '10.1111/hcre.12013',
 '10.1093/jcmc/zmac011',
 '10.1111/j.1468-2885.1999.tb00206.x']

In [45]:
author[author.doi=='10.1111/j.1460-2466.1984.tb02161.x']

Unnamed: 0,authorID,doi,url,year,title,journal,numberOfAuthors,authorPosition,authorFullName,firstName,...,white,raceHighest,raceSecondHighest,raceDiff,affProcessed,affiliation.2,ROR_AFFNAME,matchMethod,ROR_ID,type
4149,10.1111/j.1460-2466.1984.tb02161.x+1.0,10.1111/j.1460-2466.1984.tb02161.x,https://academic.oup.com/joc/article/34/2/73/4...,1984,Family Patterns and Television Viewing as Pred...,Journal of Communication,3.0,1.0,Jerome L. Singer,Jerome,...,0.932566,0.932566,0.050324,0.882243,wanda s rapaczynski jerome l singer is profess...,1 Wanda S. Rapaczynski Jerome L. Singer is Pro...,Yale University,Exact,https://ror.org/03v76x132,R
4150,10.1111/j.1460-2466.1984.tb02161.x+2.0,10.1111/j.1460-2466.1984.tb02161.x,https://academic.oup.com/joc/article/34/2/73/4...,1984,Family Patterns and Television Viewing as Pred...,Journal of Communication,3.0,2.0,Dorothy G. Singer,Dorothy,...,0.932566,0.932566,0.050324,0.882243,dorothy g singer is also professor of psycholo...,2 Dorothy G. Singer is also Professor of Psych...,University of Bridgeport,Exact,https://ror.org/01rf3yp57,R
4151,10.1111/j.1460-2466.1984.tb02161.x+3.0,10.1111/j.1460-2466.1984.tb02161.x,https://academic.oup.com/joc/article/34/2/73/4...,1984,Family Patterns and Television Viewing as Pred...,Journal of Communication,3.0,3.0,Wanda S. Rapaczynski,Wanda,...,0.98964,0.98964,0.006395,0.983245,,,,,,R
