## Aim

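In this notebook, I want to check how many Journal of Communication (JoC) papers indexed on OpenAlex contain DOI information. Short answer: quite a lot — almost all of them (4,524 of the 4,690 records) have a DOI.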

In [36]:
import pandas as pd
import re
import numpy as np
import requests
import json
# Relax pandas' display limits: show up to 500 rows, and never truncate
# the number of columns or the width of a cell's content.
pd.options.display.max_rows = 500
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [37]:
# Load the interim paper table; the path is relative to the notebook's working directory.
# NOTE(review): presumably the OpenAlex export, given the joc_alex naming below — confirm.
paper_csv_path = '../data/interim/paper_df.csv'
comm_papers = pd.read_csv(paper_csv_path)

In [38]:
# Restrict the paper table to rows whose Venue is exactly 'Journal of Communication',
# then renumber the index from 0.
venue_is_joc = comm_papers['Venue'] == 'Journal of Communication'
joc_alex = comm_papers.loc[venue_is_joc].reset_index(drop=True)

In [20]:
joc_alex.shape
# (rows, columns): around 4,700 papers carry the 'Journal of Communication' venue label

(4690, 19)

In [51]:
# How many records carry a DOI? Drop the rows whose DOI is missing and compare
# the row count with the unfiltered shape: almost all of them have one.
joc_alex = joc_alex.dropna(subset=['DOI'])
joc_alex.shape

(4524, 19)

In [61]:
# Strip the literal 'https://doi.org/' resolver prefix from every DOI so that only
# the bare identifier (e.g. '10.1093/joc/...') remains, collected as a plain list.
joc_alex_dois = joc_alex['DOI'].str.replace('https://doi.org/', '', regex=False).tolist()
joc_alex_dois

['10.1093/joc/jqac011',
 '10.1093/joc/jqac007',
 '10.1093/joc/jqac010',
 '10.1093/joc/jqac008',
 '10.1093/joc/jqac009',
 '10.1093/joc/jqac002',
 '10.1093/joc/jqac005',
 '10.1093/joc/jqac006',
 '10.1093/joc/jqac004',
 '10.1093/joc/jqac001',
 '10.1093/joc/jqac003',
 '10.1093/joc/jqab053',
 '10.1093/joc/jqab050',
 '10.1093/joc/jqab054',
 '10.1093/joc/jqab048',
 '10.1093/joc/jqab052',
 '10.1093/joc/jqab051',
 '10.1093/joc/jqab046',
 '10.1093/joc/jqab043',
 '10.1093/joc/jqab023',
 '10.1093/joc/jqab042',
 '10.1093/joc/jqab029',
 '10.1093/joc/jqab044',
 '10.31234/osf.io/7uyn5',
 '10.1093/joc/jqab024',
 '10.1093/joc/jqab041',
 '10.1093/joc/jqab040',
 '10.1093/joc/jqab033',
 '10.1093/joc/jqab039',
 '10.1093/joc/jqab035',
 '10.1093/joc/jqab038',
 '10.1093/joc/jqab037',
 '10.1093/joc/jqab036',
 '10.1093/joc/jqab028',
 '10.1093/joc/jqab027',
 '10.1093/joc/jqaa026',
 '10.1093/joc/jqab016',
 '10.1093/joc/jqab018',
 '10.1093/joc/jqab014',
 '10.1093/joc/jqab032',
 '10.1093/joc/jqab015',
 '10.1093/joc/

In [54]:
# Spot-check one entry deep in the list (index 3000).
# The DOI column is in the form of URLs (https://doi.org/<doi>); joc_alex_dois holds them
# with that prefix removed.
# Note that OpenAlex made a mistake: it also took all papers from "Journal of Communications"
# (with a trailing "s") into account under this venue.
# NOTE(review): "Journal of Communications" seems like a predatory journal — unverified impression.
joc_alex_dois[3000]

'10.1111/j.1460-2466.1984.tb02173.x'

In [55]:
# Load the ICA paper table and preview its first row to see which columns it has.
ica_csv_path = '../data/interim/ica_paper_df.csv'
ica_papers = pd.read_csv(ica_csv_path)
ica_papers.iloc[:1]

Unnamed: 0,journal,volumn,issue,month,year,category,title,url,doi,pages,abstract,abstract_para_num
0,Journal of Communication,Volume 72,Issue 2,April,2022,Articles,Media Systems in the Digital Age: An Empirical Comparison of 30 Countries,https://academic.oup.com/joc/article/72/2/145/6509144,10.1093/joc/jqab054,145–164,,0.0


In [56]:
# Keep only the ICA rows whose journal is exactly 'Journal of Communication',
# renumbering the index from 0.
journal_is_joc = ica_papers['journal'].eq('Journal of Communication')
joc_official = ica_papers.loc[journal_is_joc].reset_index(drop=True)

In [57]:
# (rows, columns) of the Journal of Communication subset of the ICA paper table
joc_official.shape

(3951, 12)

In [58]:
# Look up a single DOI in the official table.
# NOTE(review): presumably a check that older '10.1111/...' DOIs are present — confirm intent.
probe_doi = '10.1111/j.1460-2466.1985.tb02979.x'
joc_official.loc[joc_official['doi'] == probe_doi]

Unnamed: 0,journal,volumn,issue,month,year,category,title,url,doi,pages,abstract,abstract_para_num
2117,Journal of Communication,Volume 35,Issue 4,December,1985,Homo Narrans,Toward a New Political Narrative,https://academic.oup.com/joc/article/35/4/156/4282943,10.1111/j.1460-2466.1985.tb02979.x,156–171,,


In [60]:
# Pull the 'doi' column of the official JoC table out as a plain Python list.
joc_official_dois = joc_official['doi'].to_list()
joc_official_dois

['10.1093/joc/jqab054',
 '10.1093/joc/jqab051',
 '10.1093/joc/jqab052',
 '10.1093/joc/jqac002',
 '10.1093/joc/jqab048',
 '10.1093/joc/jqac001',
 '10.1093/joc/jqab039',
 '10.1093/joc/jqab038',
 '10.1093/joc/jqab040',
 '10.1093/joc/jqab044',
 '10.1093/joc/jqab043',
 '10.1093/joc/jqab024',
 '10.1093/joc/jqab046',
 '10.1093/joc/jqab042',
 '10.1093/joc/jqaa035',
 '10.1093/joc/jqab050',
 '10.1093/joc/jqab020',
 '10.1093/joc/jqab021',
 '10.1093/joc/jqab023',
 '10.1093/joc/jqab034',
 '10.1093/joc/jqab041',
 '10.1093/joc/jqab037',
 '10.1093/joc/jqab035',
 '10.1093/joc/jqaa038',
 '10.1093/joc/jqab033',
 '10.1093/joc/jqab032',
 '10.1093/joc/jqab031',
 '10.1093/joc/jqab030',
 '10.1093/joc/jqab029',
 '10.1093/joc/jqab025',
 '10.1093/joc/jqab027',
 '10.1093/joc/jqab028',
 '10.1093/joc/jqab026',
 '10.1093/joc/jqab014',
 '10.1093/joc/jqab015',
 '10.1093/joc/jqab019',
 '10.1093/joc/jqab016',
 '10.1093/joc/jqab017',
 '10.1093/joc/jqab018',
 '10.1093/joc/jqab022',
 '10.1093/joc/jqaa026',
 '10.1093/joc/jq

In [62]:
# Official JoC DOIs that have no exact string match among the OpenAlex DOIs.
# Membership is tested against a set built once, so each lookup is O(1) on average
# instead of a linear scan of the whole OpenAlex list for every official DOI.
# The order and any duplicates of joc_official_dois are preserved.
# NOTE(review): the match is exact and case-sensitive; DOIs are case-insensitive, so
# confirm both sources use the same casing before trusting the count.
alex_doi_lookup = set(joc_alex_dois)
unique_to_official = [doi for doi in joc_official_dois if doi not in alex_doi_lookup]

In [63]:
# Number of official JoC DOIs that do not appear among the OpenAlex DOIs
len(unique_to_official)

560