In [1]:
import pandas as pd
import numpy as np
from collections import Counter

## Data collection

In [2]:
# Load the author-level table from the interim data folder.
# NOTE(review): `df` is re-bound to the paper-level table in the
# "Research versus non research" section, so it only refers to author rows until then.
df = pd.read_csv('../data/interim/ica_author_data.csv')

In [3]:
# Preview the first three rows of the author table.
df.head(3)

Unnamed: 0,doi,url,year,title,journal,datePublished,authorFullName,firstName,lastName,numberOfAuthors,authorPosition,affiliation
0,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Isabelle Langrock,Isabelle,Langrock,2.0,1.0,"Annenberg School for Communication, University..."
1,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Sandra González-Bailón,Sandra,González-Bailón,2.0,2.0,"Annenberg School for Communication, University..."
2,10.1093/joc/jqac009,https://academic.oup.com/joc/article/72/3/322/...,2022,Mapping Exposure Diversity: The Divergent Effe...,Journal of Communication,2022-03-16,Pascal Jürgens,Pascal,Jürgens,2.0,1.0,"Department of Communication, Jakob-Welder-Weg ..."


In [4]:
# total number of author rows (first element of the shape tuple)
df.shape

(13603, 12)

In [5]:
# Count papers through their distinct DOIs: duplicates collapse in the set.
all_papers = [*set(df['doi'])]
len(all_papers)

7708

## Research versus non research

In [6]:
# Load the paper-level table.
# NOTE(review): this re-binds `df`, which referred to the author table until here.
df = pd.read_csv('../data/interim/ica_paper_df.csv')

In [7]:
# Preview the first five rows of the paper table (head() defaults to 5).
df.head()

Unnamed: 0,journal,issueURL,volumn,issue,issueText,month,year,category,title,url,doi,pages,abstract,abstract_para_num
0,Journal of Communication,https://academic.oup.com/joc/issue/72/3?browse...,Volume 72,Issue 3,"Issue 3, June 2022, Pages 297–449",June,2022,Articles,The Gender Divide in Wikipedia: Quantifying an...,https://academic.oup.com/joc/article/72/3/297/...,10.1093/joc/jqac004,297–321,Wikipedia has a well-known gender divide affec...,1.0
1,Journal of Communication,https://academic.oup.com/joc/issue/72/3?browse...,Volume 72,Issue 3,"Issue 3, June 2022, Pages 297–449",June,2022,Articles,Mapping Exposure Diversity: The Divergent Effe...,https://academic.oup.com/joc/article/72/3/322/...,10.1093/joc/jqac009,322–344,Diversity is a crucial precondition for a demo...,1.0
2,Journal of Communication,https://academic.oup.com/joc/issue/72/3?browse...,Volume 72,Issue 3,"Issue 3, June 2022, Pages 297–449",June,2022,Articles,Democratic Consequences of Incidental Exposure...,https://academic.oup.com/joc/article/72/3/345/...,10.1093/joc/jqac008,345–373,"In the last two decades, communication researc...",1.0
3,Journal of Communication,https://academic.oup.com/joc/issue/72/3?browse...,Volume 72,Issue 3,"Issue 3, June 2022, Pages 297–449",June,2022,Articles,The Great and Powerful Dr. Oz? Alternative Hea...,https://academic.oup.com/joc/article/72/3/374/...,10.1093/joc/jqac011,374–400,"Cardiothoracic surgeon Dr. Mehmet Oz, until re...",1.0
4,Journal of Communication,https://academic.oup.com/joc/issue/72/3?browse...,Volume 72,Issue 3,"Issue 3, June 2022, Pages 297–449",June,2022,Articles,A Methodological Framework for Analyzing the A...,https://academic.oup.com/joc/article/72/3/401/...,10.1093/joc/jqac013,401–428,Media effects have been studied for decades. H...,1.0


In [8]:
# Map every category label to an issue URL. When a category appears on
# several rows, the URL of the last such row is the one that is kept.
issue_dic = {
    category: issue_url
    for category, issue_url in zip(df['category'], df['issueURL'])
}

In [9]:
# Frequency table of category labels, most frequent first.
cat_df = (
    pd.DataFrame(list(Counter(df['category']).items()), columns=['cat', 'freq'])
    .sort_values(by='freq', ascending=False)
    .reset_index(drop=True)
)
cat_df.head()

Unnamed: 0,cat,freq
0,Articles,1667
1,Original Articles,1367
2,Article,1120
3,Original Article,1102
4,Book Reviews,367


In [10]:
# Load the category table that carries the TO_EXCLUDE / COMMENT columns
# (see the preview in the next cell).
# NOTE(review): `dff` is re-bound to a toy frame in the last section of the notebook.
dff = pd.read_csv('../data/interim/cat_class_raw.csv')

In [11]:
# Preview the first five rows of the category table.
dff.head()

Unnamed: 0,cat,freq,TO_EXCLUDE,COMMENT,issueURL
0,Articles,1667,,,https://academic.oup.com/ccc/issue/12/3?browse...
1,Original Articles,1367,,,https://academic.oup.com/ccc/issue/1/1?browseB...
2,Article,1120,,,https://academic.oup.com/jcmc/issue/3/2?browse...
3,Original Article,1102,,,https://academic.oup.com/ccc/issue/9/1?browseB...
4,Book Reviews,367,True,,https://academic.oup.com/ccc/issue/1/3?browseB...


In [15]:
# Shape of the category rows flagged for exclusion.
# The explicit comparison with True keeps the NaN (unflagged) rows out of the mask.
dff.loc[dff['TO_EXCLUDE'].eq(True)].shape

(113, 5)

In [12]:
# Total number of papers that fall into the excluded categories.
dff.loc[dff['TO_EXCLUDE'].eq(True), 'freq'].sum()

965

In [13]:
# Papers left once the excluded categories are removed.
# Both operands are derived from the objects built above instead of the
# hard-coded 7708 and 965, so the figure follows the input files if they change.
# `== True` is deliberate: TO_EXCLUDE holds NaN for unflagged rows, so the column
# cannot be used directly as a boolean mask. int() keeps the result a plain int.
len(all_papers) - int(dff.loc[dff.TO_EXCLUDE == True, 'freq'].sum())

6743

## Without author names

I originally wrote that 

>Five papers without author names were relabelled as “non-research” after finding they had been incorrectly coded as research papers.

Later, I found that we may have edited the labels directly in the individual result files in the `paper_classification_task` folder. 

## Making sure `.isin()` is not causing trouble here

In the script `get_paper_and_author_with_type.py`, in order to tag each DOI with its corresponding paper type (R or M), I used `.isin()` as in:

>paper['type'] = np.where(paper.doi.isin(all_r), 'R', 'M')

But this can be dangerous. Luckily, it is not causing any trouble here, because `papers[papers.type == 'R'].shape[0] == len(all_r)` holds.

Update: `isin` causes no problems :)

In [89]:
# our individual paper classification results
# (one CSV per person; the cells below rely on each file having `doi` and `type` columns)
hongtao = pd.read_csv('../data/interim/paper_classification_task/paper_classification_hongtao.csv')
jeff = pd.read_csv('../data/interim/paper_classification_task/paper_classification_jeff.csv')
kristen = pd.read_csv('../data/interim/paper_classification_task/paper_classification_kristen.csv')

In [90]:
# DOIs labelled as research. hongtao's file codes research as 1,
# the other two files code it as 'R'.
hongtao_r = hongtao.loc[hongtao['type'] == 1, 'doi'].tolist()
jeff_r = jeff.loc[jeff['type'] == 'R', 'doi'].tolist()
kristen_r = kristen.loc[kristen['type'] == 'R', 'doi'].tolist()
all_r = [*hongtao_r, *jeff_r, *kristen_r]

In [91]:
# Number of DOIs labelled as research across the three files combined.
len(all_r)

5813

In [92]:
# DOIs labelled as non-research. hongtao's file codes these as 0,
# the other two files code them as 'M'.
hongtao_nonr = hongtao.loc[hongtao['type'] == 0, 'doi'].tolist()
jeff_nonr = jeff.loc[jeff['type'] == 'M', 'doi'].tolist()
kristen_nonr = kristen.loc[kristen['type'] == 'M', 'doi'].tolist()
all_nonr = [*hongtao_nonr, *jeff_nonr, *kristen_nonr]

In [93]:
# Distinct label values used in each classification file.
tuple(set(frame['type']) for frame in (hongtao, jeff, kristen))

({0, 1}, {'M', 'R'}, {'M', 'R'})

In [94]:
# Research-paper count per classification file.
tuple(len(dois) for dois in (hongtao_r, jeff_r, kristen_r))

(1859, 1941, 2013)

In [95]:
# Non-research-paper count per classification file.
tuple(len(dois) for dois in (hongtao_nonr, jeff_nonr, kristen_nonr))

(389, 306, 235)

In [96]:
# Stack the three classification tables row-wise into one frame.
# The `type` column stays mixed: 0/1 from hongtao's file, 'M'/'R' from the other two.
withtype = pd.concat([hongtao, jeff, kristen])

In [97]:
# DOI -> label lookup built from the stacked table (labels keep their mixed
# 0/1 and 'M'/'R' coding; a repeated DOI keeps the label of its last row).
typedict = {doi: label for doi, label in zip(withtype['doi'], withtype['type'])}

In [98]:
# Total number of labelled DOIs: research plus non-research over all three files.
sum(
    len(dois)
    for dois in (hongtao_r, jeff_r, kristen_r, hongtao_nonr, jeff_nonr, kristen_nonr)
)

6743

In [99]:
# The row count here should equal the sum of the six list lengths computed above.
withtype.shape

(6743, 16)

In [45]:
# research_papers = withtype[(withtype.type == 1) | (withtype.type == 'R')].doi.tolist()

In [60]:
# Reload the paper-level table under its own name.
papers = pd.read_csv('../data/interim/ica_paper_df.csv')

In [62]:
# (rows, columns) of the paper table.
papers.shape

(7708, 14)

In [65]:
# Tag each paper: 'R' when its DOI appears in the research list, otherwise 'M'.
papers = papers.assign(type=np.where(papers['doi'].isin(all_r), 'R', 'M'))

In [76]:
# Shape of the subset of papers tagged as research.
papers.loc[papers['type'].eq('R')].shape

(5813, 15)

In [100]:
# Sanity check on the isin() tagging: the number of papers tagged 'R'
# must equal the number of DOIs in the research list.
len(papers.loc[papers['type'] == 'R']) == len(all_r)

True

### The following shows that we edited the labels directly in the individual result files in the `paper_classification_task` folder. 

See more details in `without author names` above.

In [83]:
# Reload the author-level table under its own name.
authors = pd.read_csv('../data/interim/ica_author_data.csv')

In [84]:
# Tag each author row with the type of its paper: 'R' when the DOI is in the
# research list, otherwise 'M'.
authors = authors.assign(type=np.where(authors['doi'].isin(all_r), 'R', 'M'))

In [88]:
# Research-paper author rows that have no author name.
authors.loc[authors['type'].eq('R') & authors['authorFullName'].isna()]

Unnamed: 0,doi,url,year,title,journal,datePublished,authorFullName,firstName,lastName,numberOfAuthors,authorPosition,affiliation,type


## Authors without affiliation

In [102]:
# number of author rows belonging to research ('R') papers
authors[authors.type == 'R'].shape

(11471, 13)

In [106]:
# Shape of the research-paper author rows whose affiliation is missing.
authors.loc[authors['type'].eq('R') & authors['affiliation'].isna()].shape

(133, 13)

In [112]:
# Distinct DOIs of research papers that have at least one author row
# with a missing affiliation.
research_but_without_aff_papers = list(set(
    authors.loc[
        (authors['type'] == 'R') & authors['affiliation'].isna(), 'doi'
    ].tolist()
))

In [113]:
# Number of distinct research-paper DOIs with at least one author missing an affiliation.
len(research_but_without_aff_papers)

95

In [115]:
# All DOIs of the paper table, in row order.
# NOTE(review): not referenced again in the cells visible here — confirm it is still needed.
all_dois = papers.doi.tolist()

In [134]:
# Shape of all author rows (with or without an affiliation) that belong to a
# paper having at least one missing affiliation.
authors.loc[authors['doi'].isin(research_but_without_aff_papers)].shape

(167, 13)

In [135]:
# Research author rows left after dropping every row that belongs to a paper
# with at least one missing affiliation. The subtrahend is computed from the
# data instead of hard-coding 167, so it cannot drift from the cell above.
(
    authors[authors.type == 'R'].shape[0]
    - authors[authors.doi.isin(research_but_without_aff_papers)].shape[0]
)

11304

In [136]:
# Research papers left after dropping those with at least one missing
# affiliation. Uses the length of the DOI list instead of the hard-coded 95.
papers[papers.type == 'R'].shape[0] - len(research_but_without_aff_papers)

5718

## Update: using `isin()` causes no problems ;)

In [127]:
# Toy list of strings used to check membership semantics.
totestlist_1 = ['xxxx', 'xxx']

In [128]:
# `in` on a list tests whole-element equality, so the substring 'xx' is not a member.
'xx' in totestlist_1

False

In [129]:
# Toy frame for checking how `Series.isin` treats substrings.
# NOTE(review): this re-binds `dff`, which held the category table earlier in the notebook.
dff = pd.DataFrame({'num_legs': ['xxxx', 'xxx'], 'num_wings': ['xxxxx', 'xxxxxx']})

In [130]:
# Display the toy frame.
dff

Unnamed: 0,num_legs,num_wings
0,xxxx,xxxxx
1,xxx,xxxxxx


In [131]:
# Candidate value: a substring of both `num_legs` entries, but equal to neither.
totestlist = ['xx']

In [132]:
# `isin` compares whole values rather than substrings, so both rows come back False.
dff.num_legs.isin(totestlist)

0    False
1    False
Name: num_legs, dtype: bool