In [None]:
!pip install pyalex

Collecting pyalex
  Downloading pyalex-0.14-py3-none-any.whl (10 kB)
Installing collected packages: pyalex
Successfully installed pyalex-0.14


In [None]:
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
import pandas as pd
import numpy as np
import json

# Loading data of retracted authors and institutions

In [None]:
# Shallow-clone (depth 1) the branch that carries the retracted author /
# institution tables read from Data/OpenAlex/ in the next cell.
!git clone --depth=1 --branch add_retracted_authors_institutions https://github.com/j4ck-k/predicting-paper-retractions

Cloning into 'predicting-paper-retractions'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 17 (delta 2), reused 14 (delta 1), pack-reused 0[K
Receiving objects: 100% (17/17), 14.87 MiB | 6.57 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Updating files: 100% (13/13), done.


In [None]:
# Pickled lookup tables from the cloned repository (absolute paths assume the
# clone lives under /content).
# NOTE: unpickling can execute arbitrary code - only load files from a clone you trust.
filename_retracted_authors = (
    '/content/predicting-paper-retractions/Data/OpenAlex/retractedauthors-train-03.pkl'
)
filename_retracted_institutions = (
    '/content/predicting-paper-retractions/Data/OpenAlex/retractedinstitutions-train-03.pkl'
)

retracted_authors = pd.read_pickle(filename_retracted_authors)
retracted_institutions = pd.read_pickle(filename_retracted_institutions)

# Membership sets over the `unique_values` column (the entity IDs) for fast
# "does this ID appear in the table?" checks.
retracted_author_set = set(retracted_authors.unique_values)
retracted_institutions_set = set(retracted_institutions.unique_values)

# ID -> index label of that ID's row in the corresponding table.
retracted_author_key = {
    entity_id: row_label
    for entity_id, row_label in zip(retracted_authors['unique_values'], retracted_authors.index)
}
retracted_institutions_key = {
    entity_id: row_label
    for entity_id, row_label in zip(retracted_institutions['unique_values'], retracted_institutions.index)
}

## Getting source for PLOS ONE

In [None]:
# Look up the OpenAlex source record with ID s202381698 and display it
# serialised as a JSON string (the cell's last expression is its output).
plos_one = Sources()['s202381698']
json.dumps(plos_one)

'{"id": "https://openalex.org/S202381698", "issn_l": "1932-6203", "issn": ["1932-6203"], "display_name": "PloS one", "host_organization": "https://openalex.org/P4310315706", "host_organization_name": "Public Library of Science", "host_organization_lineage": ["https://openalex.org/P4310315706"], "works_count": 281207, "cited_by_count": 8867147, "summary_stats": {"2yr_mean_citedness": 3.4110240033901382, "h_index": 482, "i10_index": 194992}, "is_oa": true, "is_in_doaj": true, "ids": {"openalex": "https://openalex.org/S202381698", "issn_l": "1932-6203", "issn": ["1932-6203"], "mag": "202381698", "wikidata": "https://www.wikidata.org/entity/Q564954", "fatcat": "https://fatcat.wiki/container/s3gm7274mfe6fcs7e3jterqlri"}, "homepage_url": "http://www.plosone.org/", "apc_prices": [{"price": 1805, "currency": "USD"}], "apc_usd": 1805, "country_code": "US", "societies": [], "alternate_titles": ["PLoS ONE", "Public Library of Science one", "PLoS 1"], "abbreviated_title": null, "type": "journal", 

## Example of filtering works and getting pages

In [None]:
# Number of works whose primary location is source s202381698 (PLOS ONE)
# with publication year 2010.
total_count = Works().filter(
    primary_location={"source": {"id" : "s202381698"}},
    publication_year="2010"
    ).count()

# Same query, additionally restricted to works flagged is_retracted.
total_retracted_count = Works().filter(
    is_retracted=True,
    primary_location={"source": {"id" : "s202381698"}},
    publication_year="2010"
    ).count()

print(total_count)
print(total_retracted_count)

6924
22


In [None]:
# Fetch results for the 2010 PLOS ONE query with .get()
# (presumably a single page of records rather than the full result set -
# TODO confirm against the pyalex documentation).
example_page = Works().filter(
    primary_location={"source": {"id" : "s202381698"}},
    publication_year="2010"
    ).get()

# Same query limited to retracted works.
example_retracted_page = Works().filter(
    is_retracted=True,
    primary_location={"source": {"id" : "s202381698"}},
    publication_year="2010"
    ).get()

### Example of processing page into dataframe

In [None]:
# Turn the example page into a DataFrame, keeping only the listed fields of
# each work record.
example_works = pd.DataFrame(
    example_page,
    columns=[
        'id', 'title', 'publication_year', 'authorships',
        'countries_distinct_count', 'institutions_distinct_count',
        'referenced_works_count', 'cited_by_count', 'is_retracted',
    ],
)

# The retracted sample uses exactly the same column selection.
example_retracted_works = pd.DataFrame(
    example_retracted_page,
    columns=list(example_works.columns),
)

# Show the first record as a quick sanity check.
example_works.iloc[0]

id                                              https://openalex.org/W2031611770
title                          FastTree 2 – Approximately Maximum-Likelihood ...
publication_year                                                            2010
authorships                    [{'author_position': 'first', 'author': {'id':...
countries_distinct_count                                                       1
institutions_distinct_count                                                    2
referenced_works_count                                                        30
cited_by_count                                                             10642
is_retracted                                                               False
Name: 0, dtype: object

In [None]:
def authors_distinct_count(work):
  """Return the number of authorship entries on `work`, never less than 1.

  Entries are counted as listed in `work['authorships']`; no de-duplication
  is performed. The floor of 1 means an empty list still yields 1.
  """
  listed = len(work['authorships'])
  return listed if listed > 1 else 1

def any_author_has_retraction(work,retracted_author_set):
  """Tell whether any author of `work` appears in `retracted_author_set`.

  work: mapping with an `authorships` list; each entry carries the author
    ID under ['author']['id'].
  retracted_author_set: container of author IDs supporting `in`.

  Returns True on the first match, False if there is none (including when
  the authorship list is empty).
  """
  for authorship in work['authorships']:
    if authorship['author']['id'] in retracted_author_set:
      return True
  return False

def any_institution_has_retraction(work,retracted_institution_set):
  """Tell whether any institution listed on `work` is in `retracted_institution_set`.

  Every authorship's `institutions` list is scanned; each institution is
  matched by its `id`. Returns True on the first match, otherwise False.
  """
  for authorship in work['authorships']:
    for affiliation in authorship['institutions']:
      if affiliation['id'] in retracted_institution_set:
        return True
  return False

def calc_author_retraction_stats(work,retracted_authors,retracted_author_set,retracted_author_key):
  """Select the rows of `retracted_authors` that belong to authors of `work`.

  work: mapping with an `authorships` list; each entry carries the author
    ID under ['author']['id'].
  retracted_authors: DataFrame with one row per retracted author.
  retracted_author_set: IDs present in the table (used as the filter).
  retracted_author_key: author ID -> index label of that author's row.

  Returns a DataFrame with one row per matching authorship, in authorship
  order (empty if nothing matches). An author listed twice yields two rows.
  """
  matched_labels = [
      retracted_author_key[authorship['author']['id']]
      for authorship in work['authorships']
      if authorship['author']['id'] in retracted_author_set
  ]
  # The key holds index *labels*, so select by label. Positional .iloc only
  # gives the same rows when the table has a default 0..n-1 index; on any
  # other index it picks wrong rows or raises.
  return retracted_authors.loc[matched_labels]

def min_retracted_author_rank(author_retraction_stats):
  """Return the smallest `rank` among a work's retracted-author rows.

  author_retraction_stats: DataFrame of retracted-author rows for one work;
    must expose a `rank` column.

  Returns -inf when the frame has no rows.
  NOTE(review): -inf sorts below every real rank. If a lower rank means more
  retractions, works with no retracted author look like the most extreme
  case - confirm this sentinel is intended.
  """
  if len(author_retraction_stats) > 0:
    return author_retraction_stats['rank'].min()
  return -float('inf')

def has_1pct_retracted_author(author_retraction_stats):
  """Return whether any row's `top1` flag is set (False for an empty frame)."""
  top1_flags = author_retraction_stats['top1']
  return top1_flags.any()

def has_5pct_retracted_author(author_retraction_stats):
  """Return whether any row's `top5` flag is set (False for an empty frame)."""
  top5_flags = author_retraction_stats['top5']
  return top5_flags.any()

def has_10pct_retracted_author(author_retraction_stats):
  """Return whether any row's `top10` flag is set (False for an empty frame)."""
  top10_flags = author_retraction_stats['top10']
  return top10_flags.any()

def top_percentile_retracted_author(author_retraction_stats):
  """Return the largest `percentile` among a work's retracted-author rows.

  Returns 0.0 when the frame has no rows.
  """
  if len(author_retraction_stats) > 0:
    return author_retraction_stats['percentile'].max()
  return 0.0

def frac_author_repeat_offenders(author_retraction_stats,n_authors):
  """Return the share of a work's authors whose `counts` value exceeds 1.

  author_retraction_stats: retracted-author rows for one work (`counts` column).
  n_authors: divisor; the number of authors on the work. Must be non-zero.
  """
  is_repeat_offender = author_retraction_stats['counts'] > 1
  return sum(is_repeat_offender) / n_authors

def calc_institution_retraction_stats(work,retracted_institutions,retracted_institutions_set,retracted_institutions_key):
  """Select the rows of `retracted_institutions` for institutions on `work`.

  work: mapping with an `authorships` list; each entry has an `institutions`
    list whose items carry an `id`.
  retracted_institutions: DataFrame with one row per retracted institution.
  retracted_institutions_set: IDs present in the table (used as the filter).
  retracted_institutions_key: institution ID -> index label of its row.

  Returns a DataFrame with one row per distinct matching institution, in
  order of first appearance on the work (empty if nothing matches).
  """
  # dict.fromkeys de-duplicates while keeping first-seen order, so the row
  # order is reproducible (a set would order rows by hash).
  distinct_ids = dict.fromkeys(
      institution['id']
      for authorship in work['authorships']
      for institution in authorship['institutions']
      if institution['id'] in retracted_institutions_set
  )
  matched_labels = [retracted_institutions_key[inst_id] for inst_id in distinct_ids]
  # The key holds index *labels*, so select by label; positional .iloc is
  # only equivalent when the table has a default 0..n-1 index.
  return retracted_institutions.loc[matched_labels]

def min_retracted_institution_rank(institution_retraction_stats):
  """Return the smallest `rank` among a work's retracted-institution rows.

  Returns -inf when the frame has no rows.
  NOTE(review): -inf sorts below every real rank. If a lower rank means more
  retractions, works with no retracted institution look like the most
  extreme case - confirm this sentinel is intended.
  """
  if len(institution_retraction_stats) > 0:
    return institution_retraction_stats['rank'].min()
  return -float('inf')

def has_1pct_retracted_institution(institution_retraction_stats):
  """Return whether any row's `top1` flag is set (False for an empty frame)."""
  top1_flags = institution_retraction_stats['top1']
  return top1_flags.any()

def has_5pct_retracted_institution(institution_retraction_stats):
  """Return whether any row's `top5` flag is set (False for an empty frame)."""
  top5_flags = institution_retraction_stats['top5']
  return top5_flags.any()

def has_10pct_retracted_institution(institution_retraction_stats):
  """Return whether any row's `top10` flag is set (False for an empty frame)."""
  top10_flags = institution_retraction_stats['top10']
  return top10_flags.any()

def top_percentile_retracted_institution(institution_retraction_stats):
  """Return the largest `percentile` among a work's retracted-institution rows.

  Returns 0.0 when the frame has no rows. The fallback is a float (not the
  int 0) so the value has the same type as a real percentile and matches
  `top_percentile_retracted_author`.
  """
  if len(institution_retraction_stats) == 0:
    return 0.0
  return institution_retraction_stats['percentile'].max()


In [None]:
authors_distinct_count(example_works.iloc[1])

3

In [None]:
example_stats = calc_author_retraction_stats(example_works.iloc[1],retracted_authors,retracted_author_set,retracted_author_key)
example_retracted_stats = calc_author_retraction_stats(example_retracted_works.iloc[6],retracted_authors,retracted_author_set,retracted_author_key)
example_retracted_stats

Unnamed: 0,unique_values,counts,rank,percentile,top1,top5,top10
25483,https://openalex.org/A5074192479,1,24173.0,41.705052,False,False,False
25484,https://openalex.org/A5011293326,1,24173.0,41.705052,False,False,False
25485,https://openalex.org/A5081563738,1,24173.0,41.705052,False,False,False
25479,https://openalex.org/A5086152631,1,24173.0,41.705052,False,False,False
4754,https://openalex.org/A5023140151,2,4773.5,88.490293,False,False,True
25486,https://openalex.org/A5085111204,1,24173.0,41.705052,False,False,False
25488,https://openalex.org/A5067997872,1,24173.0,41.705052,False,False,False
25489,https://openalex.org/A5088340297,1,24173.0,41.705052,False,False,False
25490,https://openalex.org/A5028482826,1,24173.0,41.705052,False,False,False
25491,https://openalex.org/A5003014678,1,24173.0,41.705052,False,False,False


In [None]:
top_percentile_retracted_author(example_stats)

0.0

## Using the pagination feature to get all records

In [None]:
# Page through every work of source s202381698 published 2000-01-01 through
# 2020-12-31, 200 records per page with no overall cap (n_max=None).
# (A single-year variant would filter on publication_year="2010" instead.)
pager = Works().filter(
    primary_location={"source": {"id" : "s202381698"}},
    from_publication_date="2000-01-01",
    to_publication_date="2020-12-31"
    ).paginate(per_page=200,n_max=None)

# Output columns: raw OpenAlex fields first, then the derived author and
# institution features, then the label.
specified_columns = [
    'id', 'title', 'publication_year', 'authorships',
    'countries_distinct_count', 'institutions_distinct_count',
    'referenced_works_count', 'cited_by_count',
    'authors_distinct_count',
    'any_author_has_retraction', 'min_retracted_author_rank',
    'has_1pct_retracted_author', 'has_5pct_retracted_author',
    'has_10pct_retracted_author', 'top_percentile_retracted_author',
    'frac_author_repeat_offenders',
    'any_institution_has_retraction', 'min_retracted_institution_rank',
    'has_1pct_retracted_institution', 'has_5pct_retracted_institution',
    'has_10pct_retracted_institution', 'top_percentile_retracted_institution',
    'is_retracted',
]

max_limit_pages = 2 # Set to -1 for all pages

# Start from an empty frame whose flag columns are already typed as bool.
output_df = pd.DataFrame(columns=specified_columns)
for bool_column in [
    'any_author_has_retraction',
    'has_1pct_retracted_author',
    'has_5pct_retracted_author',
    'has_10pct_retracted_author',
    'any_institution_has_retraction',
    'has_1pct_retracted_institution',
    'has_5pct_retracted_institution',
    'has_10pct_retracted_institution',
    'is_retracted',
]:
  output_df[bool_column] = output_df[bool_column].astype(bool)

# (column, row -> value) pairs, evaluated top to bottom for each page.
# Order matters: the two *_retraction_stats columns and authors_distinct_count
# are read by the features listed after them.
row_features = [
    # Per-work slices of the retraction tables
    ('author_retraction_stats',
     lambda work: calc_author_retraction_stats(
         work, retracted_authors, retracted_author_set, retracted_author_key)),
    ('institution_retraction_stats',
     lambda work: calc_institution_retraction_stats(
         work, retracted_institutions, retracted_institutions_set, retracted_institutions_key)),
    # Author features
    ('authors_distinct_count', authors_distinct_count),
    ('any_author_has_retraction',
     lambda work: any_author_has_retraction(work, retracted_author_set)),
    ('min_retracted_author_rank',
     lambda work: min_retracted_author_rank(work['author_retraction_stats'])),
    ('has_1pct_retracted_author',
     lambda work: has_1pct_retracted_author(work['author_retraction_stats'])),
    ('has_5pct_retracted_author',
     lambda work: has_5pct_retracted_author(work['author_retraction_stats'])),
    ('has_10pct_retracted_author',
     lambda work: has_10pct_retracted_author(work['author_retraction_stats'])),
    ('top_percentile_retracted_author',
     lambda work: top_percentile_retracted_author(work['author_retraction_stats'])),
    ('frac_author_repeat_offenders',
     lambda work: frac_author_repeat_offenders(
         work['author_retraction_stats'], work['authors_distinct_count'])),
    # Institution features
    ('any_institution_has_retraction',
     lambda work: any_institution_has_retraction(work, retracted_institutions_set)),
    ('min_retracted_institution_rank',
     lambda work: min_retracted_institution_rank(work['institution_retraction_stats'])),
    ('has_1pct_retracted_institution',
     lambda work: has_1pct_retracted_institution(work['institution_retraction_stats'])),
    ('has_5pct_retracted_institution',
     lambda work: has_5pct_retracted_institution(work['institution_retraction_stats'])),
    ('has_10pct_retracted_institution',
     lambda work: has_10pct_retracted_institution(work['institution_retraction_stats'])),
    ('top_percentile_retracted_institution',
     lambda work: top_percentile_retracted_institution(work['institution_retraction_stats'])),
]

page_count = 0
for page in pager:
  print(f"Page: {page_count}")

  new_df = pd.DataFrame(page, columns=specified_columns)

  # Skip pages without records (they do not count towards the page limit)
  if len(new_df) == 0:
    continue

  for feature_column, compute_feature in row_features:
    new_df[feature_column] = new_df.apply(compute_feature, axis=1)

  output_df = pd.concat([output_df, new_df[specified_columns]])

  page_count += 1
  if page_count == max_limit_pages:
    break

# The raw authorship records were only needed to derive the features.
output_df = output_df.drop(columns='authorships').reset_index(drop=True)

print(len(output_df))

Page: 0
Page: 1
400


In [None]:
len(new_df)

200

In [None]:
output_df

Unnamed: 0,id,title,publication_year,countries_distinct_count,institutions_distinct_count,referenced_works_count,cited_by_count,authors_distinct_count,any_author_has_retraction,min_retracted_author_rank,...,has_10pct_retracted_author,top_percentile_retracted_author,frac_author_repeat_offenders,any_institution_has_retraction,min_retracted_institution_rank,has_1pct_retracted_institution,has_5pct_retracted_institution,has_10pct_retracted_institution,top_percentile_retracted_institution,is_retracted
0,https://openalex.org/W2056279562,phyloseq: An R Package for Reproducible Intera...,2013,1,1,68,13172,2,False,-inf,...,False,0.000000,0.000000,True,84.0,False,True,True,98.547173,False
1,https://openalex.org/W2031611770,FastTree 2 – Approximately Maximum-Likelihood ...,2010,1,2,30,10642,3,False,-inf,...,False,0.000000,0.000000,True,284.0,False,True,True,95.046385,False
2,https://openalex.org/W1982855075,Pilon: An Integrated Tool for Comprehensive Mi...,2014,2,3,46,6488,11,False,-inf,...,False,0.000000,0.000000,True,843.0,False,False,False,85.261684,False
3,https://openalex.org/W2159474015,"A Robust, Simple Genotyping-by-Sequencing (GBS...",2011,1,3,36,5223,7,True,4773.5,...,True,88.490293,0.142857,True,21.5,True,True,True,99.641169,False
4,https://openalex.org/W1969353942,REVIGO Summarizes and Visualizes Long Lists of...,2011,2,2,25,5117,4,False,-inf,...,False,0.000000,0.000000,True,2327.5,False,False,False,59.277087,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,https://openalex.org/W2047960154,Let-7 MicroRNA Family Is Selectively Secreted ...,2010,1,2,43,528,11,True,24173.0,...,False,41.705052,0.000000,True,4254.5,False,False,False,25.546998,False
396,https://openalex.org/W2048550695,Diversity of Bifidobacteria within the Infant ...,2012,4,5,16,528,17,True,4773.5,...,True,88.490293,0.058824,True,341.5,False,False,True,94.039909,False
397,https://openalex.org/W2115450300,Stool Microbiome and Metabolome Differences be...,2013,1,3,44,528,6,True,24173.0,...,False,41.705052,0.000000,True,522.5,False,False,True,90.871696,False
398,https://openalex.org/W1990950572,Microvesicles Derived from Mesenchymal Stem Ce...,2012,2,2,28,527,8,False,-inf,...,False,0.000000,0.000000,True,522.5,False,False,True,90.871696,False


In [None]:
output_df[output_df['is_retracted'] == True]

Unnamed: 0,id,title,publication_year,countries_distinct_count,institutions_distinct_count,referenced_works_count,cited_by_count,authors_distinct_count,any_author_has_retraction,min_retracted_author_rank,...,has_10pct_retracted_author,top_percentile_retracted_author,frac_author_repeat_offenders,any_institution_has_retraction,min_retracted_institution_rank,has_1pct_retracted_institution,has_5pct_retracted_institution,has_10pct_retracted_institution,top_percentile_retracted_institution,is_retracted


In [None]:
def get_works_dataset(pager,max_limit_pages):
    # Parameter: pager           -> OpenAlex object
    #            max_limit_pages -> How many pages to retrieve (set to -1 for all pages)
    #            is_test_data    -> Remove 'is_retracted' column?
    specified_columns = [
          'id',
          'title',
          'publication_year',
          'authorships',
          'countries_distinct_count',
          'institutions_distinct_count',
          'referenced_works_count',
          'cited_by_count',
          'authors_distinct_count',
          'any_author_has_retraction',
          'min_retracted_author_rank',
          'has_1pct_retracted_author',
          'has_5pct_retracted_author',
          'has_10pct_retracted_author',
          'top_percentile_retracted_author',
          'frac_author_repeat_offenders',
          'any_institution_has_retraction',
          'min_retracted_institution_rank',
          'has_1pct_retracted_institution',
          'has_5pct_retracted_institution',
          'has_10pct_retracted_institution',
          'top_percentile_retracted_institution',
          'is_retracted'
    ]

    output_df = pd.DataFrame(columns=specified_columns)
    output_df['any_author_has_retraction'] = output_df['any_author_has_retraction'].astype(bool)
    output_df['has_1pct_retracted_author'] = output_df['has_1pct_retracted_author'].astype(bool)
    output_df['has_5pct_retracted_author'] = output_df['has_5pct_retracted_author'].astype(bool)
    output_df['has_10pct_retracted_author'] = output_df['has_10pct_retracted_author'].astype(bool)
    output_df['any_institution_has_retraction'] = output_df['any_institution_has_retraction'].astype(bool)
    output_df['has_1pct_retracted_institution'] = output_df['has_1pct_retracted_institution'].astype(bool)
    output_df['has_5pct_retracted_institution'] = output_df['has_5pct_retracted_institution'].astype(bool)
    output_df['has_10pct_retracted_institution'] = output_df['has_10pct_retracted_institution'].astype(bool)
    output_df['is_retracted'] = output_df['is_retracted'].astype(bool)

    page_count = 0
    for page in pager:
      print(f"Page: {page_count}")

      new_df = pd.DataFrame(
          page,
          columns=specified_columns
      )

      # Early stop criteria
      if len(new_df) == 0:
        continue

      ### Calculate statistics for authors and institutions

      # Calculate author retraction statistics
      new_df['author_retraction_stats'] = new_df.apply(
          lambda work: calc_author_retraction_stats(
              work,
              retracted_authors,
              retracted_author_set,
              retracted_author_key
          ),
          axis=1
      )

      # Calculate institution retraction statistics
      new_df['institution_retraction_stats'] = new_df.apply(
          lambda work: calc_institution_retraction_stats(
              work,
              retracted_institutions,
              retracted_institutions_set,
              retracted_institutions_key
          ),
          axis=1
      )

      ### Calculate features for authors

      # Number of distinct authors
      new_df['authors_distinct_count'] = new_df.apply(
          lambda work: authors_distinct_count(work),axis=1
      )

      # Any author has a retraction
      new_df['any_author_has_retraction'] = new_df.apply(
          lambda work: any_author_has_retraction(
              work,retracted_author_set
              ),axis=1
      )

      # Minimum retraction rank of authors
      new_df['min_retracted_author_rank'] = new_df.apply(
          lambda work: min_retracted_author_rank(work['author_retraction_stats']),
          axis=1
      )

      # Do the authors contain a 1% retractor?
      new_df['has_1pct_retracted_author'] = new_df.apply(
          lambda work: has_1pct_retracted_author(work['author_retraction_stats']),
          axis=1
      )

      # Do the authors contain a 5% retractor?
      new_df['has_5pct_retracted_author'] = new_df.apply(
          lambda work: has_5pct_retracted_author(work['author_retraction_stats']),
          axis=1
      )

      # Do the authors contain a 10% retractor?
      new_df['has_10pct_retracted_author'] = new_df.apply(
          lambda work: has_10pct_retracted_author(work['author_retraction_stats']),
          axis=1
      )

      # Top retraction percentile of authors
      new_df['top_percentile_retracted_author'] = new_df.apply(
          lambda work: top_percentile_retracted_author(work['author_retraction_stats']),
          axis=1
      )

      # Fraction of authors with repeat offender retractions
      new_df['frac_author_repeat_offenders'] = new_df.apply(
          lambda work: frac_author_repeat_offenders(
              work['author_retraction_stats'],
              work['authors_distinct_count']
          ),
          axis=1
      )

      ### Calculate features for authors' institutions

      # Any author's institute has a retraction
      new_df['any_institution_has_retraction'] = new_df.apply(
          lambda work: any_institution_has_retraction(
              work,retracted_institutions_set),axis=1
      )

      # Minimum retraction rank of authors' institutions
      new_df['min_retracted_institution_rank'] = new_df.apply(
          lambda work: min_retracted_institution_rank(work['institution_retraction_stats']),
          axis=1
      )

      # Do the authors' institutions contain a 1% retractor?
      new_df['has_1pct_retracted_institution'] = new_df.apply(
          lambda work: has_1pct_retracted_institution(work['institution_retraction_stats']),
          axis=1
      )

      # Do the authors' institutions contain a 5% retractor?
      new_df['has_5pct_retracted_institution'] = new_df.apply(
          lambda work: has_5pct_retracted_institution(work['institution_retraction_stats']),
          axis=1
      )

      # Do the authors' institutions contain a 10% retractor?
      new_df['has_10pct_retracted_institution'] = new_df.apply(
          lambda work: has_10pct_retracted_institution(work['institution_retraction_stats']),
          axis=1
      )

      # Top retraction percentile of authors' institutions
      new_df['top_percentile_retracted_institution'] = new_df.apply(
          lambda work: top_percentile_retracted_institution(work['institution_retraction_stats']),
          axis=1
      )

      output_df = pd.concat([
          output_df,
          new_df[specified_columns]
      ])

      page_count += 1
      if page_count == max_limit_pages:
        break

    output_df.drop('authorships', axis=1, inplace=True)

    output_df.reset_index(drop=True, inplace=True)

    return output_df

## Get data from 2010-2020

In [None]:
# Number of works of source s202381698 (PLOS ONE) published between
# 2020-01-01 and 2020-12-31.
total_count_train = Works().filter(
    primary_location={"source": {"id" : "s202381698"}},
    from_publication_date="2020-01-01",
    to_publication_date="2020-12-31"
    ).count()

print(total_count_train)

16685


In [None]:
# Get a number of journals
jid_plosone = "s202381698"
jid_science = "s3880285"
jid_cell = "s110447773"
jid_naturemedicine = "s203256638"
jid_nature = "s137773608"
jid_journalofhealthcareengineering = "s36625193"
jid_computationalandmathematicalmethodsinmedicine = "s36980176"
jid_journalofbiologicalchemistry = "s140251998"
jid_pnas = "s125754415"

jids = f"{jid_plosone}|{jid_science}|{jid_cell}|{jid_naturemedicine}|{jid_nature}|{jid_journalofhealthcareengineering}|{jid_computationalandmathematicalmethodsinmedicine}|{jid_journalofbiologicalchemistry}|{jid_pnas}"

total_count_train = Works().filter(
    primary_location={"source": {"id" : jids}},
    from_publication_date="2000-01-01",
    to_publication_date="2020-12-31"
    ).count()

print(total_count_train)

619598


In [None]:
# Page through the works of the selected journals published 2010-01-01
# through 2020-12-31, 200 per page with no overall cap (n_max=None).
pager = Works().filter(
    primary_location={"source": {"id" : jids}},
    from_publication_date="2010-01-01",
    to_publication_date="2020-12-31"
    ).paginate(per_page=200,n_max=None)

# -1 is never reached by the page counter, so every page is processed.
works_2010_2020_train  = get_works_dataset(pager,-1)

Page: 0
Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
Page: 31
Page: 32
Page: 33
Page: 34
Page: 35
Page: 36
Page: 37
Page: 38
Page: 39
Page: 40
Page: 41
Page: 42
Page: 43
Page: 44
Page: 45
Page: 46
Page: 47
Page: 48
Page: 49
Page: 50
Page: 51
Page: 52
Page: 53
Page: 54
Page: 55
Page: 56
Page: 57
Page: 58
Page: 59
Page: 60
Page: 61
Page: 62
Page: 63
Page: 64
Page: 65
Page: 66
Page: 67
Page: 68
Page: 69
Page: 70
Page: 71
Page: 72
Page: 73
Page: 74
Page: 75
Page: 76
Page: 77
Page: 78
Page: 79
Page: 80
Page: 81
Page: 82
Page: 83
Page: 84
Page: 85
Page: 86
Page: 87
Page: 88
Page: 89
Page: 90
Page: 91
Page: 92
Page: 93
Page: 94
Page: 95
Page: 96
Page: 97
Page: 98
Page: 99
Page: 100
Page: 101
Page: 102
Page: 103
Page: 104
Page: 105
Page: 106
Page: 107
Page: 108
Page: 109
Page: 110


In [None]:
# Pager over the same journals for 2021-01-01 through 2022-12-31.
# Building the test dataset from it is currently disabled (commented out below).
pager = Works().filter(
    primary_location={"source": {"id" : jids}},
    from_publication_date="2021-01-01",
    to_publication_date="2022-12-31"
    ).paginate(per_page=200,n_max=None)

#works_2021_2022_test  = get_works_dataset(pager,-1)

In [None]:
#works_2000_2020_train

In [None]:
works_2010_2020_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220582 entries, 0 to 220581
Data columns (total 22 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   id                                    220582 non-null  object 
 1   title                                 220558 non-null  object 
 2   publication_year                      220582 non-null  object 
 3   countries_distinct_count              220582 non-null  object 
 4   institutions_distinct_count           220582 non-null  object 
 5   referenced_works_count                220582 non-null  object 
 6   cited_by_count                        220582 non-null  object 
 7   authors_distinct_count                220582 non-null  object 
 8   any_author_has_retraction             220582 non-null  bool   
 9   min_retracted_author_rank             220582 non-null  float64
 10  has_1pct_retracted_author             220582 non-null  bool   
 11  

In [None]:
#works_2021_2022_test.info()

In [None]:
#works_2021_2022_test[works_2021_2022_test['is_retracted'] == True]

In [None]:
# Switch: set to False to re-run the notebook without overwriting the pickle.
do_save = True

if do_save:
  # Written relative to the current working directory
  output_path = 'openalex-data-2010-2020-train-06.pkl'
  works_2010_2020_train.to_pickle(output_path)
  print('Saved')

Saved
