In [24]:
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
import pandas as pd
from collections import Counter

In [14]:
pager = Works().filter(
    primary_location={"source": {"id" : "s202381698"}},
    publication_year=">2018",
    is_retracted='true'
    ).paginate(per_page=200)

In [15]:
specified_columns = [
    'id',
    'title',
    'authorships',
    'publication_year',
    'countries_distinct_count',
    'institutions_distinct_count',
    'referenced_works_count',
    'is_retracted'
]

max_limit_pages = -1 # Set to -1 for all pages

output_df = pd.DataFrame(columns=specified_columns)
output_df['is_retracted'] = output_df['is_retracted'].astype(bool)

page_count = 0
for page in pager:
  print(f"Page: {page_count}")
  output_df = pd.concat([
    output_df,
    pd.DataFrame(
      page,
      columns=specified_columns
    )
  ])
  page_count += 1
  if page_count == max_limit_pages:
    break

print(len(output_df))

Page: 0
Page: 1
Page: 2
237


In [16]:
output_df

Unnamed: 0,id,title,authorships,publication_year,countries_distinct_count,institutions_distinct_count,referenced_works_count,is_retracted
0,https://openalex.org/W3134443160,The anti-vaccination infodemic on social media...,"[{'author_position': 'first', 'author': {'id':...",2021,1,1,44,True
1,https://openalex.org/W3014779806,ACC-deaminase producing plant growth promoting...,"[{'author_position': 'first', 'author': {'id':...",2020,2,2,48,True
2,https://openalex.org/W3209586559,Application of local fully Convolutional Neura...,"[{'author_position': 'first', 'author': {'id':...",2021,1,3,16,True
3,https://openalex.org/W3024735271,Terminal drought and heat stress alter physiol...,"[{'author_position': 'first', 'author': {'id':...",2020,3,4,52,True
4,https://openalex.org/W4213453867,Effect of zinc nanoparticles seed priming and ...,"[{'author_position': 'first', 'author': {'id':...",2022,4,10,46,True
...,...,...,...,...,...,...,...,...
32,https://openalex.org/W4292219993,Medication regimen complexity and its associat...,"[{'author_position': 'first', 'author': {'id':...",2022,1,2,34,True
33,https://openalex.org/W2916143521,Identifying developmental trajectories of worl...,"[{'author_position': 'first', 'author': {'id':...",2019,1,1,13,True
34,https://openalex.org/W3130064135,Prevalence of rheumatic heart disease in a maj...,"[{'author_position': 'first', 'author': {'id':...",2021,2,3,25,True
35,https://openalex.org/W3196834656,Analysis on frosting of heat exchanger and num...,"[{'author_position': 'first', 'author': {'id':...",2021,1,1,33,True


In [52]:
output_df.iloc[0]['authorships'][0]

{'author_position': 'first',
 'author': {'id': 'https://openalex.org/A5066021330',
  'display_name': 'Federico Germani',
  'orcid': 'https://orcid.org/0000-0002-5604-0437'},
 'institutions': [{'id': 'https://openalex.org/I202697423',
   'display_name': 'University of Zurich',
   'ror': 'https://ror.org/02crff812',
   'country_code': 'CH',
   'type': 'education',
   'lineage': ['https://openalex.org/I202697423']}],
 'countries': ['CH'],
 'is_corresponding': True,
 'raw_author_name': 'Federico Germani',
 'raw_affiliation_string': 'Institute of Biomedical Ethics and History of Medicine, University of Zurich, Zürich, Switzerland',
 'raw_affiliation_strings': ['Institute of Biomedical Ethics and History of Medicine, University of Zurich, Zürich, Switzerland']}

In [19]:
authors = list(output_df['authorships'].values)

In [22]:
ids = []

for author_dicts in authors:
    for author_dict in author_dicts:
        ids.append(author_dict['author']['id'])


In [29]:
unique_ids = list(set(ids))
count_ids = Counter(ids)


In [42]:
retracted_authors = pd.DataFrame(unique_ids, columns=['ID'])
retracted_authors

Unnamed: 0,ID
0,https://openalex.org/A5046119891
1,https://openalex.org/A5091317710
2,https://openalex.org/A5089407824
3,https://openalex.org/A5035213015
4,https://openalex.org/A5018416840
...,...
1345,https://openalex.org/A5074090595
1346,https://openalex.org/A5003651969
1347,https://openalex.org/A5048632692
1348,https://openalex.org/A5040205022


In [46]:
get_displayName = lambda x : Authors()[x]['display_name']
get_paperCounts = lambda x : count_ids[x]

In [44]:
retracted_authors['DisplayName'] = retracted_authors['ID'].apply(get_displayName)

In [47]:
retracted_authors['Retractions'] = retracted_authors['ID'].apply(get_paperCounts)

In [50]:
retracted_authors.sort_values(by='Retractions', ascending=False)

Unnamed: 0,ID,DisplayName,Retractions
1201,https://openalex.org/A5011637406,Mohammad Javed Ansari,19
672,https://openalex.org/A5068852002,Ali Tan Kee Zuan,18
75,https://openalex.org/A5002674897,Mubshar Hussain,12
964,https://openalex.org/A5063970370,Ahmed M. El-Shehawi,11
500,https://openalex.org/A5019980755,Subhan Danish,9
...,...,...,...
1313,https://openalex.org/A5050770854,Katarzyna Kotwica-Mojzych,1
4,https://openalex.org/A5018416840,Muhammad Iqbal,1
1315,https://openalex.org/A5059438758,Muneeba Haider,1
1316,https://openalex.org/A5085306260,Nida Baig,1


In [53]:
save = True

if save:
  output_df.to_csv('retracted-authors-plos1-2018.csv')
  print('Saved')

Saved


In [64]:
def anyRetractedAuthors(authors : list) -> bool:
    retracted = False

    for author_dict in authors:
        if author_dict['author']['id'] in list(retracted_authors['ID'].values):
            retracted = True
    
    return retracted

In [70]:
def countAuthorRetractions(authors : list) -> int:
    count = 0

    for author_dict in authors:
        if author_dict['author']['id'] in list(retracted_authors['ID'].values):
            row_index = retracted_authors.index.get_loc(retracted_authors[retracted_authors['ID'] == author_dict['author']['id']].index[0])
            count += retracted_authors.iloc[row_index]['Retractions']
    
    return count

In [68]:
def countRetractedAuthors(authors : list) -> int:
    count = 0

    for author_dict in authors:
        if author_dict['author']['id'] in list(retracted_authors['ID'].values):
            count += 1
    
    return count

In [60]:
pager = Works().filter(
    primary_location={"source": {"id" : "s202381698"}},
    publication_year=">2018"
    ).paginate(per_page=200)

In [80]:
specified_columns = [
    'id',
    'title',
    'authorships',
    'publication_year',
    'countries_distinct_count',
    'institutions_distinct_count',
    'referenced_works_count',
    'is_retracted'
]

max_limit_pages = -1 # Set to -1 for all pages

output_df = pd.DataFrame(columns=specified_columns)
output_df['is_retracted'] = output_df['is_retracted'].astype(bool)

page_count = 0
for page in pager:
  print(f"Page: {page_count}")
  output_df = pd.concat([
    output_df,
    pd.DataFrame(
      page,
      columns=specified_columns
    )
  ])
  page_count += 1
  if page_count == max_limit_pages:
    break

print(len(output_df))

Page: 0
Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
Page: 31
Page: 32
Page: 33
Page: 34
Page: 35
Page: 36
Page: 37
Page: 38
Page: 39
Page: 40
Page: 41
Page: 42
Page: 43
Page: 44
Page: 45
Page: 46
Page: 47
Page: 48
Page: 49
10000


In [84]:
output_df[output_df['HasRetractedAuthor']==True]

<class 'pandas.core.frame.DataFrame'>
Index: 105 entries, 5 to 69
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           105 non-null    object
 1   title                        105 non-null    object
 2   authorships                  105 non-null    object
 3   publication_year             105 non-null    object
 4   countries_distinct_count     105 non-null    object
 5   institutions_distinct_count  105 non-null    object
 6   referenced_works_count       105 non-null    object
 7   is_retracted                 105 non-null    bool  
 8   HasRetractedAuthor           105 non-null    bool  
 9   NumberOfRetractedAuthors     105 non-null    int64 
 10  NumberOfAuthorRetractions    105 non-null    int64 
dtypes: bool(2), int64(2), object(7)
memory usage: 8.4+ KB
